From 312832ff89cf9c9a6b4d5e2a969ea8798b99e9c6 Mon Sep 17 00:00:00 2001 From: steltze Date: Fri, 19 Apr 2024 10:31:04 +0200 Subject: [PATCH 001/103] Initial commit --- hls4ml/backends/__init__.py | 4 + hls4ml/backends/vitis_accelerator/__init__.py | 0 .../vitis_accelerator/passes/__init__.py | 0 .../passes/fifo_depth_optimization.py | 69 ++++++++ .../vitis_accelerator/supported_boards.json | 42 +++++ .../vitis_accelerator_backend.py | 163 ++++++++++++++++++ .../vitis_accelerator_config.py | 162 +++++++++++++++++ 7 files changed, 440 insertions(+) create mode 100644 hls4ml/backends/vitis_accelerator/__init__.py create mode 100644 hls4ml/backends/vitis_accelerator/passes/__init__.py create mode 100644 hls4ml/backends/vitis_accelerator/passes/fifo_depth_optimization.py create mode 100644 hls4ml/backends/vitis_accelerator/supported_boards.json create mode 100644 hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py create mode 100644 hls4ml/backends/vitis_accelerator/vitis_accelerator_config.py diff --git a/hls4ml/backends/__init__.py b/hls4ml/backends/__init__.py index 6396d7815f..91a9272e74 100644 --- a/hls4ml/backends/__init__.py +++ b/hls4ml/backends/__init__.py @@ -2,14 +2,18 @@ from hls4ml.backends.fpga.fpga_backend import FPGABackend # noqa: F401 from hls4ml.backends.quartus.quartus_backend import QuartusBackend from hls4ml.backends.symbolic.symbolic_backend import SymbolicExpressionBackend + from hls4ml.backends.vivado.vivado_backend import VivadoBackend from hls4ml.backends.vivado_accelerator.vivado_accelerator_backend import VivadoAcceleratorBackend from hls4ml.backends.vivado_accelerator.vivado_accelerator_config import VivadoAcceleratorConfig # noqa: F401 from hls4ml.backends.vitis.vitis_backend import VitisBackend # isort: skip +from hls4ml.backends.vitis_accelerator.vitis_accelerator_backend import VitisAcceleratorBackend # isort: skip +from hls4ml.backends.vitis_accelerator.vitis_accelerator_config import VitisAcceleratorConfig # noqa: F401 
register_backend('Vivado', VivadoBackend) register_backend('VivadoAccelerator', VivadoAcceleratorBackend) register_backend('Vitis', VitisBackend) +register_backend('VitisAccelerator', VitisAcceleratorBackend) register_backend('Quartus', QuartusBackend) register_backend('SymbolicExpression', SymbolicExpressionBackend) diff --git a/hls4ml/backends/vitis_accelerator/__init__.py b/hls4ml/backends/vitis_accelerator/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/hls4ml/backends/vitis_accelerator/passes/__init__.py b/hls4ml/backends/vitis_accelerator/passes/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/hls4ml/backends/vitis_accelerator/passes/fifo_depth_optimization.py b/hls4ml/backends/vitis_accelerator/passes/fifo_depth_optimization.py new file mode 100644 index 0000000000..e983ca49fb --- /dev/null +++ b/hls4ml/backends/vitis_accelerator/passes/fifo_depth_optimization.py @@ -0,0 +1,69 @@ +# from hls4ml.backends.vivado.passes.fifo_depth_optimization import ( +# generate_max_depth_file, +# get_vcd_data, +# populate_values, +# set_big_fifos, +# set_fifo_depth, +# ) +# from hls4ml.model.optimizer.optimizer import ConfigurableOptimizerPass, ModelOptimizerPass + + +# class FifoDepthOptimization(ConfigurableOptimizerPass, ModelOptimizerPass): +# def __init__(self): +# self.values = [] + +# def transform(self, model): +# # use `large_fifo_depth = 0` to keep the default fifo depth +# profiling_fifo_depth = getattr(self, 'profiling_fifo_depth', 100_000) + +# # check axi-stream or io-stream, if not one the 2 exit +# if not ( +# model.config.get_config_value('IOType') == 'io_stream' +# or model.config.get_config_value('AcceleratorConfig')['Interface'] == 'axi_stream' +# or model.config.get_config_value('AcceleratorConfig')['Interface'] == 'axi_master' +# ): +# raise Exception( +# 'To use this optimization you have to set `IOType` field to `io_stream` in the HLS config ' +# 'or `axi_stream` or `axi_master` in 
`AcceleratorConfig` interface field' +# ) + +# # initialize all the fifos to 10000 so that they will be automatically implemented in BRAMs and so they will be +# # profiled + +# if profiling_fifo_depth: +# set_big_fifos(model.output_vars, profiling_fifo_depth) + +# data = get_vcd_data(model) + +# for i in range(1, len(data['children'][0]['children'][0]['children'])): +# # wrapper fifos +# populate_values( +# self.values, +# data['children'][0]['children'][0]['children'][i]['name'], +# data['children'][0]['children'][0]['children'][i]['children'][0]['data'], +# data['children'][0]['children'][0]['children'][i]['children'][1]['data'], +# ) + +# n_elem = len(data['children'][0]['children'][0]['children'][0]['children']) +# for i in range(n_elem): +# name = data['children'][0]['children'][0]['children'][0]['children'][i]['name'] +# data_p = data['children'][0]['children'][0]['children'][0]['children'][i]['children'][0]['data'] +# depth = data['children'][0]['children'][0]['children'][0]['children'][i]['children'][1]['data'] +# populate_values(self.values, name, data_p, depth) + +# maxs = [{'name': i['name'], 'max': i['max'], 'depth': i['depth']} for i in self.values] + +# generate_max_depth_file(model, maxs) + +# set_fifo_depth(model, maxs) + +# inp = model.get_input_variables()[0] +# out = model.get_output_variables()[0] +# for x in maxs: +# if 'in_local' in x['name']: +# inp.pragma = (inp.pragma[0], x['max'] + 1) +# elif 'out_local' in x['name']: +# out.pragma = (out.pragma[0], x['max'] + 1) + +# print('[hls4ml] - FIFO optimization completed') +# return False diff --git a/hls4ml/backends/vitis_accelerator/supported_boards.json b/hls4ml/backends/vitis_accelerator/supported_boards.json new file mode 100644 index 0000000000..1279ec22d0 --- /dev/null +++ b/hls4ml/backends/vitis_accelerator/supported_boards.json @@ -0,0 +1,42 @@ +{ + "pynq-z2": { + "part": "xc7z020clg400-1", + "tcl_scripts": {"axi_lite": "axi_lite_design.tcl", "axi_stream": "axi_stream_design.tcl"}, + 
"python_drivers": {"axi_stream": "axi_stream_driver.py"}, + "c_drivers": {} + }, + "zcu102": { + "part": "xczu9eg-ffvb1156-2-e", + "tcl_scripts": { "axi_stream": "axi_stream_design.tcl"}, + "python_drivers": {"axi_stream": "axi_stream_driver.py"}, + "c_drivers": {} + }, + "alveo-u50": { + "part": "xcu50-fsvh2104-2-e", + "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, + "python_drivers": {"axi_stream": "axi_stream_driver.py"}, + "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, + "c_drivers": {} + }, + "alveo-u250": { + "part": "xcu250-figd2104-2L-e", + "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, + "python_drivers": {"axi_stream": "axi_stream_driver.py"}, + "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, + "c_drivers": {} + }, + "alveo-u200": { + "part": "xcu200-fsgd2104-2-e", + "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, + "python_drivers": {"axi_stream": "axi_stream_driver.py"}, + "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, + "c_drivers": {} + }, + "alveo-u280": { + "part": "xcu280-fsvh2892-2L-e", + "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, + "python_drivers": {"axi_stream": "axi_stream_driver.py"}, + "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, + "c_drivers": {} + } +} diff --git a/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py b/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py new file mode 100644 index 0000000000..ccd9521269 --- /dev/null +++ b/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py @@ -0,0 +1,163 @@ +import os + +from hls4ml.backends import VivadoBackend +from hls4ml.model.flow import register_flow +from hls4ml.report import parse_vivado_report + + +class VivadoAcceleratorBackend(VivadoBackend): + def __init__(self): + super(VivadoBackend, self).__init__(name='VivadoAccelerator') + self._register_layer_attributes() + self._register_flows() + + def build( + self, + model, + reset=False, + csim=True, + synth=True, + cosim=False, + validation=False, + 
export=False, + vsynth=False, + fifo_opt=False, + bitfile=False, + ): + # run the VivadoBackend build + super().build( + model, + reset=reset, + csim=csim, + synth=synth, + cosim=cosim, + validation=validation, + export=export, + vsynth=vsynth, + fifo_opt=fifo_opt, + ) + # Get Config to view Board and Platform + from hls4ml.backends import VivadoAcceleratorConfig + + vivado_accelerator_config = VivadoAcceleratorConfig( + model.config, model.get_input_variables(), model.get_output_variables() + ) + # now make a bitfile + if bitfile: + if vivado_accelerator_config.get_board().startswith('alveo'): + self.make_xclbin(model, vivado_accelerator_config.get_platform()) + else: + curr_dir = os.getcwd() + os.chdir(model.config.get_output_dir()) + try: + os.system('vivado -mode batch -source design.tcl') + except Exception: + print("Something went wrong, check the Vivado logs") + os.chdir(curr_dir) + + return parse_vivado_report(model.config.get_output_dir()) + + def make_xclbin(self, model, platform='xilinx_u250_xdma_201830_2'): + """Create the xclbin for the given model and target platform. + + Args: + model (ModelGraph): Compiled and build model. + platform (str, optional): Development/Deployment target platform, must be installed first. + The host machine only requires the deployment target platform. Refer to the Getting Started section of + the Alveo guide. Defaults to 'xilinx_u250_xdma_201830_2'. 
+ """ + curr_dir = os.getcwd() + abs_path_dir = os.path.abspath(model.config.get_output_dir()) + os.chdir(abs_path_dir) + os.makedirs('xo_files', exist_ok=True) + try: + os.system('vivado -mode batch -source design.tcl') + except Exception: + print("Something went wrong, check the Vivado logs") + project_name = model.config.get_project_name() + ip_repo_path = abs_path_dir + '/' + project_name + '_prj' + '/solution1/impl/ip' + os.makedirs('xclbin_files', exist_ok=True) + os.chdir(abs_path_dir + '/xclbin_files') + # TODO Add other platforms + vitis_cmd = ( + "v++ -t hw --platform " + + platform + + " --link ../xo_files/" + + project_name + + "_kernel.xo -o'" + + project_name + + "_kernel.xclbin' --user_ip_repo_paths " + + ip_repo_path + ) + try: + os.system(vitis_cmd) + except Exception: + print("Something went wrong, check the Vitis/Vivado logs") + os.chdir(curr_dir) + + def create_initial_config( + self, + board='pynq-z2', + part=None, + clock_period=5, + clock_uncertainty='12.5%', + io_type='io_parallel', + interface='axi_stream', + driver='python', + input_type='float', + output_type='float', + platform='xilinx_u250_xdma_201830_2', + ): + ''' + Create initial accelerator config with default parameters + + Args: + board: one of the keys defined in supported_boards.json + clock_period: clock period passed to hls project + io_type: io_parallel or io_stream + interface: `axi_stream`: generate hardware designs and drivers which exploit axi stream channels. + `axi_master`: generate hardware designs and drivers which exploit axi master channels. + `axi_lite` : generate hardware designs and drivers which exploit axi lite channels. (Don't use it + to exchange large amount of data) + driver: `python`: generates the python driver to use the accelerator in the PYNQ stack. + `c`: generates the c driver to use the accelerator bare-metal. + input_type: the wrapper input precision. Can be `float` or an `ap_type`. 
Note: VivadoAcceleratorBackend + will round the number of bits used to the next power-of-2 value. + output_type: the wrapper output precision. Can be `float` or an `ap_type`. Note: + VivadoAcceleratorBackend will round the number of bits used to the next power-of-2 value. + platform: development target platform + + Returns: + populated config + ''' + board = board if board is not None else 'pynq-z2' + config = super().create_initial_config(part, clock_period, clock_uncertainty, io_type) + config['AcceleratorConfig'] = {} + config['AcceleratorConfig']['Board'] = board + config['AcceleratorConfig']['Interface'] = interface # axi_stream, axi_master, axi_lite + config['AcceleratorConfig']['Driver'] = driver + config['AcceleratorConfig']['Precision'] = {} + config['AcceleratorConfig']['Precision']['Input'] = {} + config['AcceleratorConfig']['Precision']['Output'] = {} + config['AcceleratorConfig']['Precision']['Input'] = input_type # float, double or ap_fixed + config['AcceleratorConfig']['Precision']['Output'] = output_type # float, double or ap_fixed + if board.startswith('alveo'): + config['AcceleratorConfig']['Platform'] = platform + + return config + + def get_default_flow(self): + return self._default_flow + + def get_writer_flow(self): + return self._writer_flow + + def _register_flows(self): + vivado_ip = 'vivado:ip' + writer_passes = ['make_stamp', 'vivadoaccelerator:write_hls'] + self._writer_flow = register_flow('write', writer_passes, requires=[vivado_ip], backend=self.name) + self._default_flow = vivado_ip + + fifo_depth_opt_passes = ['vivadoaccelerator:fifo_depth_optimization'] + writer_passes + + register_flow('fifo_depth_optimization', fifo_depth_opt_passes, requires=[vivado_ip], backend=self.name) diff --git a/hls4ml/backends/vitis_accelerator/vitis_accelerator_config.py b/hls4ml/backends/vitis_accelerator/vitis_accelerator_config.py new file mode 100644 index 0000000000..7bd931ede3 --- /dev/null +++ 
b/hls4ml/backends/vitis_accelerator/vitis_accelerator_config.py @@ -0,0 +1,162 @@ +import json +import os + +import numpy as np + +from hls4ml.model.layers import FixedPrecisionType, IntegerPrecisionType + + +class VivadoAcceleratorConfig: + def __init__(self, config, model_inputs, model_outputs): + self.config = config.config + self.board = self.config.get('AcceleratorConfig', {}).get('Board', 'pynq-z2') + self.supported_boards = json.load(open(os.path.dirname(__file__) + '/supported_boards.json')) + if self.board in self.supported_boards.keys(): + board_info = self.supported_boards[self.board] + self.part = board_info['part'] + else: + raise Exception('The board does not appear in supported_boards.json file') + + if self.config.get('Part') is not None: + if self.config.get('Part') != self.part: + print( + 'WARNING: You set a Part that does not correspond to the Board you specified. The correct ' + 'Part is now set.' + ) + self.config['Part'] = self.part + accel_config = self.config.get('AcceleratorConfig', None) + if accel_config is not None: + prec = accel_config.get('Precision') + if prec is None: + raise Exception('Precision must be provided in the AcceleratorConfig') + else: + if prec.get('Input') is None or prec.get('Output') is None: + raise Exception('Input and Output fields must be provided in the AcceleratorConfig->Precision') + else: + accel_config = { + 'Precision': {'Input': 'float', 'Output': 'float'}, + 'Driver': 'python', + 'Interface': 'axi_stream', + } + config.config['AcceleratorConfig'] = accel_config + + self.interface = self.config['AcceleratorConfig'].get('Interface', 'axi_stream') # axi_stream, axi_master, axi_lite + self.driver = self.config['AcceleratorConfig'].get('Driver', 'python') # python or c + self.input_type = self.config['AcceleratorConfig']['Precision'].get( + 'Input', 'float' + ) # float, double or ap_fixed + self.output_type = self.config['AcceleratorConfig']['Precision'].get( + 'Output', 'float' + ) # float, double or 
ap_fixed + self.platform = self.config['AcceleratorConfig'].get( + 'Platform', 'xilinx_u250_xdma_201830_2' + ) # Get platform folder name + + assert ( + len(model_inputs) == 1 + ), "Only models with one input tensor are currently supported by VivadoAcceleratorBackend" + assert ( + len(model_outputs) == 1 + ), "Only models with one output tensor are currently supported by VivadoAcceleratorBackend" + self.inp = model_inputs[0] + self.out = model_outputs[0] + inp_axi_t = self.input_type + out_axi_t = self.output_type + + if inp_axi_t not in ['float', 'double']: + self.input_type = self._next_factor8_type(config.backend.convert_precision_string(inp_axi_t)) + if out_axi_t not in ['float', 'double']: + self.output_type = self._next_factor8_type(config.backend.convert_precision_string(out_axi_t)) + + if self.input_type == 'float': + self.input_bitwidth = 32 + elif self.input_type == 'double': + self.input_bitwidth = 64 + else: + self.input_bitwidth = config.backend.convert_precision_string(inp_axi_t).width + + if out_axi_t == 'float': + self.output_bitwidth = 32 + elif out_axi_t == 'double': + self.output_bitwidth = 64 + else: + self.output_bitwidth = config.backend.convert_precision_string(out_axi_t).width + + def _next_factor8_type(self, p): + '''Return a new type with the width rounded to the next factor of 8 up to p's width + Args: + p : IntegerPrecisionType or FixedPrecisionType + Returns: + An IntegerPrecisionType or FixedPrecisionType with the width rounder up to the next factor of 8 + of p's width. Other parameters (fractional bits, extra modes) stay the same. 
+ ''' + W = p.width + newW = int(np.ceil(W / 8) * 8) + if isinstance(p, FixedPrecisionType): + return FixedPrecisionType(newW, p.integer, p.signed, p.rounding_mode, p.saturation_mode, p.saturation_bits) + elif isinstance(p, IntegerPrecisionType): + return IntegerPrecisionType(newW, p.signed) + + def get_io_bitwidth(self): + return self.input_bitwidth, self.output_bitwidth + + def get_corrected_types(self): + return self.input_type, self.output_type, self.inp, self.out + + def get_interface(self): + return self.interface + + def get_board_info(self, board=None): + if board is None: + board = self.board + if board in self.supported_boards.keys(): + return self.supported_boards[board] + else: + raise Exception('The board is still not supported') + + def get_part(self): + return self.part + + def get_driver(self): + return self.driver + + def get_board(self): + return self.board + + def get_platform(self): + return self.platform + + def get_clock_period(self): + return self.clock_period + + def get_driver_path(self): + if self.board.startswith('alveo'): + return '../templates/vivado_accelerator/' + 'alveo/' + self.driver + '_drivers/' + self.get_driver_file() + else: + return '../templates/vivado_accelerator/' + self.board + '/' + self.driver + '_drivers/' + self.get_driver_file() + + def get_driver_file(self): + driver_ext = '.py' if self.driver == 'python' else '.h' + return self.interface + '_driver' + driver_ext + + def get_krnl_rtl_src_dir(self): + return '../templates/vivado_accelerator/' + 'alveo/' + '/krnl_rtl_src' + + def get_input_type(self): + return self.input_type + + def get_output_type(self): + return self.output_type + + def get_tcl_file_path(self): + board_info = self.get_board_info(self.board) + tcl_scripts = board_info.get('tcl_scripts', None) + if tcl_scripts is None: + raise Exception('No tcl scripts definition available for the board in supported_board.json') + tcl_script = tcl_scripts.get(self.interface, None) + if tcl_script is None: + raise 
Exception('No tcl script definition available for the desired interface in supported_board.json') + if self.board.startswith('alveo'): + return '../templates/vivado_accelerator/' + 'alveo/' + '/tcl_scripts/' + tcl_script + else: + return '../templates/vivado_accelerator/' + self.board + '/tcl_scripts/' + tcl_script From d2b5a15bcbd37c04529989bf434c19f031fcc03b Mon Sep 17 00:00:00 2001 From: steltze Date: Fri, 19 Apr 2024 11:13:21 +0200 Subject: [PATCH 002/103] Set change the backend --- .../passes/fifo_depth_optimization.py | 69 ----------- .../vitis_accelerator/supported_boards.json | 34 ------ .../vitis_accelerator_backend.py | 110 +++++++++--------- .../vitis_accelerator_config.py | 2 +- 4 files changed, 56 insertions(+), 159 deletions(-) delete mode 100644 hls4ml/backends/vitis_accelerator/passes/fifo_depth_optimization.py diff --git a/hls4ml/backends/vitis_accelerator/passes/fifo_depth_optimization.py b/hls4ml/backends/vitis_accelerator/passes/fifo_depth_optimization.py deleted file mode 100644 index e983ca49fb..0000000000 --- a/hls4ml/backends/vitis_accelerator/passes/fifo_depth_optimization.py +++ /dev/null @@ -1,69 +0,0 @@ -# from hls4ml.backends.vivado.passes.fifo_depth_optimization import ( -# generate_max_depth_file, -# get_vcd_data, -# populate_values, -# set_big_fifos, -# set_fifo_depth, -# ) -# from hls4ml.model.optimizer.optimizer import ConfigurableOptimizerPass, ModelOptimizerPass - - -# class FifoDepthOptimization(ConfigurableOptimizerPass, ModelOptimizerPass): -# def __init__(self): -# self.values = [] - -# def transform(self, model): -# # use `large_fifo_depth = 0` to keep the default fifo depth -# profiling_fifo_depth = getattr(self, 'profiling_fifo_depth', 100_000) - -# # check axi-stream or io-stream, if not one the 2 exit -# if not ( -# model.config.get_config_value('IOType') == 'io_stream' -# or model.config.get_config_value('AcceleratorConfig')['Interface'] == 'axi_stream' -# or 
model.config.get_config_value('AcceleratorConfig')['Interface'] == 'axi_master' -# ): -# raise Exception( -# 'To use this optimization you have to set `IOType` field to `io_stream` in the HLS config ' -# 'or `axi_stream` or `axi_master` in `AcceleratorConfig` interface field' -# ) - -# # initialize all the fifos to 10000 so that they will be automatically implemented in BRAMs and so they will be -# # profiled - -# if profiling_fifo_depth: -# set_big_fifos(model.output_vars, profiling_fifo_depth) - -# data = get_vcd_data(model) - -# for i in range(1, len(data['children'][0]['children'][0]['children'])): -# # wrapper fifos -# populate_values( -# self.values, -# data['children'][0]['children'][0]['children'][i]['name'], -# data['children'][0]['children'][0]['children'][i]['children'][0]['data'], -# data['children'][0]['children'][0]['children'][i]['children'][1]['data'], -# ) - -# n_elem = len(data['children'][0]['children'][0]['children'][0]['children']) -# for i in range(n_elem): -# name = data['children'][0]['children'][0]['children'][0]['children'][i]['name'] -# data_p = data['children'][0]['children'][0]['children'][0]['children'][i]['children'][0]['data'] -# depth = data['children'][0]['children'][0]['children'][0]['children'][i]['children'][1]['data'] -# populate_values(self.values, name, data_p, depth) - -# maxs = [{'name': i['name'], 'max': i['max'], 'depth': i['depth']} for i in self.values] - -# generate_max_depth_file(model, maxs) - -# set_fifo_depth(model, maxs) - -# inp = model.get_input_variables()[0] -# out = model.get_output_variables()[0] -# for x in maxs: -# if 'in_local' in x['name']: -# inp.pragma = (inp.pragma[0], x['max'] + 1) -# elif 'out_local' in x['name']: -# out.pragma = (out.pragma[0], x['max'] + 1) - -# print('[hls4ml] - FIFO optimization completed') -# return False diff --git a/hls4ml/backends/vitis_accelerator/supported_boards.json b/hls4ml/backends/vitis_accelerator/supported_boards.json index 1279ec22d0..5f44560ccd 100644 --- 
a/hls4ml/backends/vitis_accelerator/supported_boards.json +++ b/hls4ml/backends/vitis_accelerator/supported_boards.json @@ -4,39 +4,5 @@ "tcl_scripts": {"axi_lite": "axi_lite_design.tcl", "axi_stream": "axi_stream_design.tcl"}, "python_drivers": {"axi_stream": "axi_stream_driver.py"}, "c_drivers": {} - }, - "zcu102": { - "part": "xczu9eg-ffvb1156-2-e", - "tcl_scripts": { "axi_stream": "axi_stream_design.tcl"}, - "python_drivers": {"axi_stream": "axi_stream_driver.py"}, - "c_drivers": {} - }, - "alveo-u50": { - "part": "xcu50-fsvh2104-2-e", - "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, - "python_drivers": {"axi_stream": "axi_stream_driver.py"}, - "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, - "c_drivers": {} - }, - "alveo-u250": { - "part": "xcu250-figd2104-2L-e", - "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, - "python_drivers": {"axi_stream": "axi_stream_driver.py"}, - "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, - "c_drivers": {} - }, - "alveo-u200": { - "part": "xcu200-fsgd2104-2-e", - "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, - "python_drivers": {"axi_stream": "axi_stream_driver.py"}, - "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, - "c_drivers": {} - }, - "alveo-u280": { - "part": "xcu280-fsvh2892-2L-e", - "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, - "python_drivers": {"axi_stream": "axi_stream_driver.py"}, - "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, - "c_drivers": {} } } diff --git a/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py b/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py index ccd9521269..4c54e05328 100644 --- a/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py +++ b/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py @@ -1,13 +1,13 @@ import os -from hls4ml.backends import VivadoBackend +from hls4ml.backends import VitisBackend, VivadoBackend from hls4ml.model.flow import register_flow from hls4ml.report import 
parse_vivado_report -class VivadoAcceleratorBackend(VivadoBackend): +class VitisAcceleratorBackend(VitisBackend): def __init__(self): - super(VivadoBackend, self).__init__(name='VivadoAccelerator') + super(VivadoBackend, self).__init__(name='VitisAccelerator') self._register_layer_attributes() self._register_flows() @@ -24,7 +24,7 @@ def build( fifo_opt=False, bitfile=False, ): - # run the VivadoBackend build + # run the VitisBackend build super().build( model, reset=reset, @@ -37,63 +37,63 @@ def build( fifo_opt=fifo_opt, ) # Get Config to view Board and Platform - from hls4ml.backends import VivadoAcceleratorConfig + from hls4ml.backends import VitisAcceleratorConfig - vivado_accelerator_config = VivadoAcceleratorConfig( + vitis_accelerator_config = VitisAcceleratorConfig( model.config, model.get_input_variables(), model.get_output_variables() ) # now make a bitfile if bitfile: - if vivado_accelerator_config.get_board().startswith('alveo'): - self.make_xclbin(model, vivado_accelerator_config.get_platform()) - else: - curr_dir = os.getcwd() - os.chdir(model.config.get_output_dir()) - try: - os.system('vivado -mode batch -source design.tcl') - except Exception: - print("Something went wrong, check the Vivado logs") - os.chdir(curr_dir) + # if vitis_accelerator_config.get_board().startswith('alveo'): + # self.make_xclbin(model, vitis_accelerator_config.get_platform()) + # else: + curr_dir = os.getcwd() + os.chdir(model.config.get_output_dir()) + try: + os.system('vivado -mode batch -source design.tcl') # check if this is accepted as a command + except Exception: + print("Something went wrong, check the Vivado logs") + os.chdir(curr_dir) return parse_vivado_report(model.config.get_output_dir()) - def make_xclbin(self, model, platform='xilinx_u250_xdma_201830_2'): - """Create the xclbin for the given model and target platform. - - Args: - model (ModelGraph): Compiled and build model. 
- platform (str, optional): Development/Deployment target platform, must be installed first. - The host machine only requires the deployment target platform. Refer to the Getting Started section of - the Alveo guide. Defaults to 'xilinx_u250_xdma_201830_2'. - """ - curr_dir = os.getcwd() - abs_path_dir = os.path.abspath(model.config.get_output_dir()) - os.chdir(abs_path_dir) - os.makedirs('xo_files', exist_ok=True) - try: - os.system('vivado -mode batch -source design.tcl') - except Exception: - print("Something went wrong, check the Vivado logs") - project_name = model.config.get_project_name() - ip_repo_path = abs_path_dir + '/' + project_name + '_prj' + '/solution1/impl/ip' - os.makedirs('xclbin_files', exist_ok=True) - os.chdir(abs_path_dir + '/xclbin_files') - # TODO Add other platforms - vitis_cmd = ( - "v++ -t hw --platform " - + platform - + " --link ../xo_files/" - + project_name - + "_kernel.xo -o'" - + project_name - + "_kernel.xclbin' --user_ip_repo_paths " - + ip_repo_path - ) - try: - os.system(vitis_cmd) - except Exception: - print("Something went wrong, check the Vitis/Vivado logs") - os.chdir(curr_dir) + # def make_xclbin(self, model, platform='xilinx_u250_xdma_201830_2'): + # """Create the xclbin for the given model and target platform. + + # Args: + # model (ModelGraph): Compiled and build model. + # platform (str, optional): Development/Deployment target platform, must be installed first. + # The host machine only requires the deployment target platform. Refer to the Getting Started section of + # the Alveo guide. Defaults to 'xilinx_u250_xdma_201830_2'. 
+ # """ + # curr_dir = os.getcwd() + # abs_path_dir = os.path.abspath(model.config.get_output_dir()) + # os.chdir(abs_path_dir) + # os.makedirs('xo_files', exist_ok=True) + # try: + # os.system('vivado -mode batch -source design.tcl') + # except Exception: + # print("Something went wrong, check the Vivado logs") + # project_name = model.config.get_project_name() + # ip_repo_path = abs_path_dir + '/' + project_name + '_prj' + '/solution1/impl/ip' + # os.makedirs('xclbin_files', exist_ok=True) + # os.chdir(abs_path_dir + '/xclbin_files') + # # TODO Add other platforms + # vitis_cmd = ( + # "v++ -t hw --platform " + # + platform + # + " --link ../xo_files/" + # + project_name + # + "_kernel.xo -o'" + # + project_name + # + "_kernel.xclbin' --user_ip_repo_paths " + # + ip_repo_path + # ) + # try: + # os.system(vitis_cmd) + # except Exception: + # print("Something went wrong, check the Vitis/Vivado logs") + # os.chdir(curr_dir) def create_initial_config( self, @@ -141,8 +141,8 @@ def create_initial_config( config['AcceleratorConfig']['Precision']['Output'] = {} config['AcceleratorConfig']['Precision']['Input'] = input_type # float, double or ap_fixed config['AcceleratorConfig']['Precision']['Output'] = output_type # float, double or ap_fixed - if board.startswith('alveo'): - config['AcceleratorConfig']['Platform'] = platform + # if board.startswith('alveo'): + # config['AcceleratorConfig']['Platform'] = platform return config diff --git a/hls4ml/backends/vitis_accelerator/vitis_accelerator_config.py b/hls4ml/backends/vitis_accelerator/vitis_accelerator_config.py index 7bd931ede3..70429ef0d7 100644 --- a/hls4ml/backends/vitis_accelerator/vitis_accelerator_config.py +++ b/hls4ml/backends/vitis_accelerator/vitis_accelerator_config.py @@ -6,7 +6,7 @@ from hls4ml.model.layers import FixedPrecisionType, IntegerPrecisionType -class VivadoAcceleratorConfig: +class VitisAcceleratorConfig: def __init__(self, config, model_inputs, model_outputs): self.config = config.config 
self.board = self.config.get('AcceleratorConfig', {}).get('Board', 'pynq-z2') From 02659dd11e0f63c5d0b2d2c57ce7c371aebd99d8 Mon Sep 17 00:00:00 2001 From: steltze Date: Fri, 19 Apr 2024 11:19:18 +0200 Subject: [PATCH 003/103] Change the accelerator config script --- .../vitis_accelerator/vitis_accelerator_config.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/hls4ml/backends/vitis_accelerator/vitis_accelerator_config.py b/hls4ml/backends/vitis_accelerator/vitis_accelerator_config.py index 70429ef0d7..b0bf4e894b 100644 --- a/hls4ml/backends/vitis_accelerator/vitis_accelerator_config.py +++ b/hls4ml/backends/vitis_accelerator/vitis_accelerator_config.py @@ -54,10 +54,10 @@ def __init__(self, config, model_inputs, model_outputs): assert ( len(model_inputs) == 1 - ), "Only models with one input tensor are currently supported by VivadoAcceleratorBackend" + ), "Only models with one input tensor are currently supported by VitisAcceleratorBackend" assert ( len(model_outputs) == 1 - ), "Only models with one output tensor are currently supported by VivadoAcceleratorBackend" + ), "Only models with one output tensor are currently supported by VitisAcceleratorBackend" self.inp = model_inputs[0] self.out = model_outputs[0] inp_axi_t = self.input_type @@ -131,16 +131,16 @@ def get_clock_period(self): def get_driver_path(self): if self.board.startswith('alveo'): - return '../templates/vivado_accelerator/' + 'alveo/' + self.driver + '_drivers/' + self.get_driver_file() + return '../templates/vitis_accelerator/' + 'alveo/' + self.driver + '_drivers/' + self.get_driver_file() else: - return '../templates/vivado_accelerator/' + self.board + '/' + self.driver + '_drivers/' + self.get_driver_file() + return '../templates/vitis_accelerator/' + self.board + '/' + self.driver + '_drivers/' + self.get_driver_file() def get_driver_file(self): driver_ext = '.py' if self.driver == 'python' else '.h' return self.interface + '_driver' + driver_ext def 
get_krnl_rtl_src_dir(self): - return '../templates/vivado_accelerator/' + 'alveo/' + '/krnl_rtl_src' + return '../templates/vitis_accelerator/' + 'alveo/' + '/krnl_rtl_src' def get_input_type(self): return self.input_type @@ -157,6 +157,6 @@ def get_tcl_file_path(self): if tcl_script is None: raise Exception('No tcl script definition available for the desired interface in supported_board.json') if self.board.startswith('alveo'): - return '../templates/vivado_accelerator/' + 'alveo/' + '/tcl_scripts/' + tcl_script + return '../templates/vitis_accelerator/' + 'alveo/' + '/tcl_scripts/' + tcl_script else: - return '../templates/vivado_accelerator/' + self.board + '/tcl_scripts/' + tcl_script + return '../templates/vitis_accelerator/' + self.board + '/tcl_scripts/' + tcl_script From 56296b6734462efc0cf0c4421abe875e4016cb52 Mon Sep 17 00:00:00 2001 From: steltze Date: Fri, 19 Apr 2024 14:22:53 +0200 Subject: [PATCH 004/103] Set the vitis accelerator template --- .../templates/vitis_accelerator/build_lib.sh | 17 +++++ .../vitis_accelerator/myproject_axi.cpp | 14 ++++ .../vitis_accelerator/myproject_axi.h | 10 +++ .../python_drivers/axi_stream_driver.py | 75 +++++++++++++++++++ .../pynq-z2/tcl_scripts/axi_lite_design.tcl | 26 +++++++ .../pynq-z2/tcl_scripts/axi_stream_design.tcl | 59 +++++++++++++++ .../python_drivers/axi_stream_driver.py | 75 +++++++++++++++++++ .../zcu102/tcl_scripts/axi_stream_design.tcl | 58 ++++++++++++++ 8 files changed, 334 insertions(+) create mode 100644 hls4ml/templates/vitis_accelerator/build_lib.sh create mode 100644 hls4ml/templates/vitis_accelerator/myproject_axi.cpp create mode 100644 hls4ml/templates/vitis_accelerator/myproject_axi.h create mode 100644 hls4ml/templates/vitis_accelerator/pynq-z2/python_drivers/axi_stream_driver.py create mode 100644 hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_lite_design.tcl create mode 100644 hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl create mode 100644 
hls4ml/templates/vitis_accelerator/zcu102/python_drivers/axi_stream_driver.py create mode 100644 hls4ml/templates/vitis_accelerator/zcu102/tcl_scripts/axi_stream_design.tcl diff --git a/hls4ml/templates/vitis_accelerator/build_lib.sh b/hls4ml/templates/vitis_accelerator/build_lib.sh new file mode 100644 index 0000000000..69a2bace57 --- /dev/null +++ b/hls4ml/templates/vitis_accelerator/build_lib.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +CC=g++ +if [[ "$OSTYPE" == "linux-gnu" ]]; then + CFLAGS="-O3 -fPIC -std=c++11 -fno-gnu-unique" +elif [[ "$OSTYPE" == "darwin"* ]]; then + CFLAGS="-O3 -fPIC -std=c++11" +fi +INCFLAGS="-Ifirmware/ap_types/" +PROJECT=myproject +LIB_STAMP=mystamp + +${CC} ${CFLAGS} ${INCFLAGS} -c firmware/${PROJECT}.cpp -o ${PROJECT}.o +${CC} ${CFLAGS} ${INCFLAGS} -c firmware/${PROJECT}_axi.cpp -o ${PROJECT}_axi.o +${CC} ${CFLAGS} ${INCFLAGS} -c ${PROJECT}_bridge.cpp -o ${PROJECT}_bridge.o +${CC} ${CFLAGS} ${INCFLAGS} -shared ${PROJECT}.o ${PROJECT}_axi.o ${PROJECT}_bridge.o -o firmware/${PROJECT}-${LIB_STAMP}.so +rm -f *.o diff --git a/hls4ml/templates/vitis_accelerator/myproject_axi.cpp b/hls4ml/templates/vitis_accelerator/myproject_axi.cpp new file mode 100644 index 0000000000..05797f1f7b --- /dev/null +++ b/hls4ml/templates/vitis_accelerator/myproject_axi.cpp @@ -0,0 +1,14 @@ +// hls-fpga-machine-learning insert include + +void myproject_axi(input_axi_t in[N_IN], output_axi_t out[N_OUT]) { + + // hls-fpga-machine-learning insert interface + + // hls-fpga-machine-learning insert local vars + + // hls-fpga-machine-learning insert enqueue + + // hls-fpga-machine-learning insert call + + // hls-fpga-machine-learning insert dequeue +} diff --git a/hls4ml/templates/vitis_accelerator/myproject_axi.h b/hls4ml/templates/vitis_accelerator/myproject_axi.h new file mode 100644 index 0000000000..a60dab39c4 --- /dev/null +++ b/hls4ml/templates/vitis_accelerator/myproject_axi.h @@ -0,0 +1,10 @@ +#ifndef MYPROJECT_AXI_H_ +#define MYPROJECT_AXI_H_ + +#include +// 
hls-fpga-machine-learning insert include + +// hls-fpga-machine-learning insert definitions + +void myproject_axi(input_axi_t in[N_IN], output_axi_t out[N_OUT]); +#endif diff --git a/hls4ml/templates/vitis_accelerator/pynq-z2/python_drivers/axi_stream_driver.py b/hls4ml/templates/vitis_accelerator/pynq-z2/python_drivers/axi_stream_driver.py new file mode 100644 index 0000000000..1aac79f2d3 --- /dev/null +++ b/hls4ml/templates/vitis_accelerator/pynq-z2/python_drivers/axi_stream_driver.py @@ -0,0 +1,75 @@ +from datetime import datetime + +import numpy as np +from pynq import Overlay, allocate + + +class NeuralNetworkOverlay(Overlay): + def __init__( + self, bitfile_name, x_shape, y_shape, dtype=np.float32, dtbo=None, download=True, ignore_version=False, device=None + ): + super().__init__(bitfile_name, dtbo=None, download=True, ignore_version=False, device=None) + self.sendchannel = self.hier_0.axi_dma_0.sendchannel + self.recvchannel = self.hier_0.axi_dma_0.recvchannel + self.input_buffer = allocate(shape=x_shape, dtype=dtype) + self.output_buffer = allocate(shape=y_shape, dtype=dtype) + + def _print_dt(self, timea, timeb, N): + dt = timeb - timea + dts = dt.seconds + dt.microseconds * 10**-6 + rate = N / dts + print(f"Classified {N} samples in {dts} seconds ({rate} inferences / s)") + return dts, rate + + def predict(self, X, debug=False, profile=False, encode=None, decode=None): + """ + Obtain the predictions of the NN implemented in the FPGA. + Parameters: + - X : the input vector. Should be numpy ndarray. + - dtype : the data type of the elements of the input/output vectors. + Note: it should be set depending on the interface of the accelerator; if it uses 'float' + types for the 'data' AXI-Stream field, 'np.float32' dtype is the correct one to use. + Instead if it uses 'ap_fixed', 'np.intA' is the correct one to use (note that A cannot + any integer value, but it can assume {..., 8, 16, 32, ...} values. Check `numpy` + doc for more info). 
+ In this case the encoding/decoding has to be computed by the PS. For example for + 'ap_fixed<16,6>' type the following 2 functions are the correct one to use for encode/decode + 'float' -> 'ap_fixed<16,6>': + ``` + def encode(xi): + return np.int16(round(xi * 2**10)) # note 2**10 = 2**(A-B) + def decode(yi): + return yi * 2**-10 + encode_v = np.vectorize(encode) # to apply them element-wise + decode_v = np.vectorize(decode) + ``` + - profile : boolean. Set it to `True` to print the performance of the algorithm in term of `inference/s`. + - encode/decode: function pointers. See `dtype` section for more information. + - return: an output array based on `np.ndarray` with a shape equal to `y_shape` and a `dtype` equal to + the namesake parameter. + """ + if profile: + timea = datetime.now() + if encode is not None: + X = encode(X) + self.input_buffer[:] = X + self.sendchannel.transfer(self.input_buffer) + self.recvchannel.transfer(self.output_buffer) + if debug: + print("Transfer OK") + self.sendchannel.wait() + if debug: + print("Send OK") + self.recvchannel.wait() + if debug: + print("Receive OK") + # result = self.output_buffer.copy() + if decode is not None: + self.output_buffer = decode(self.output_buffer) + + if profile: + timeb = datetime.now() + dts, rate = self._print_dt(timea, timeb, len(X)) + return self.output_buffer, dts, rate + else: + return self.output_buffer diff --git a/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_lite_design.tcl b/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_lite_design.tcl new file mode 100644 index 0000000000..4d23da26cc --- /dev/null +++ b/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_lite_design.tcl @@ -0,0 +1,26 @@ +set tcldir [file dirname [info script]] +source [file join $tcldir project.tcl] + +create_project project_1 ${project_name}_vitis_accelerator -part xc7z020clg400-1 -force + +set_property board_part tul.com.tw:pynq-z2:part0:1.0 [current_project] +set_property ip_repo_paths 
${project_name}_prj [current_project] +update_ip_catalog + +# Create Block Designer design +create_bd_design "design_1" +create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 processing_system7_0 +apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" } [get_bd_cells processing_system7_0] +create_bd_cell -type ip -vlnv xilinx.com:hls:${project_name}_axi:1.0 ${project_name}_axi_0 +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/processing_system7_0/M_AXI_GP0} Slave {/${project_name}_axi_0/s_axi_AXILiteS} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins ${project_name}_axi_0/s_axi_AXILiteS] + +make_wrapper -files [get_files ./${project_name}_vitis_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top +add_files -norecurse ./${project_name}_vitis_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v + +reset_run impl_1 +reset_run synth_1 +launch_runs impl_1 -to_step write_bitstream -jobs 6 +wait_on_run -timeout 360 impl_1 + +open_run impl_1 +report_utilization -file util.rpt -hierarchical -hierarchical_percentages diff --git a/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl new file mode 100644 index 0000000000..aa06e8a6d2 --- /dev/null +++ b/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl @@ -0,0 +1,59 @@ +#@todo: try to remove startgroup and endgroup and see if it work +set tcldir [file dirname [info script]] +source [file join $tcldir project.tcl] + +create_project project_1 ${project_name}_vitis_accelerator -part xc7z020clg400-1 -force + +set_property board_part tul.com.tw:pynq-z2:part0:1.0 [current_project] +set_property ip_repo_paths ${project_name}_prj [current_project] 
+update_ip_catalog + +create_bd_design "design_1" + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 processing_system7_0 +endgroup + +apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" } [get_bd_cells processing_system7_0] + +startgroup +set_property -dict [list CONFIG.PCW_USE_S_AXI_HP0 {1}] [get_bd_cells processing_system7_0] +endgroup + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:ip:axi_dma:7.1 axi_dma_0 +endgroup + +set_property -dict [list CONFIG.c_s_axis_s2mm_tdata_width.VALUE_SRC USER] [get_bd_cells axi_dma_0] +set_property -dict [list CONFIG.c_include_sg {0} CONFIG.c_sg_length_width {26} CONFIG.c_sg_include_stscntrl_strm {0} CONFIG.c_m_axi_mm2s_data_width ${bit_width_hls_input} CONFIG.c_m_axis_mm2s_tdata_width ${bit_width_hls_input} CONFIG.c_mm2s_burst_size {256} CONFIG.c_s_axis_s2mm_tdata_width ${bit_width_hls_output} CONFIG.c_s_axis_s2mm_data_width ${bit_width_hls_output} CONFIG.c_s2mm_burst_size {256}] [get_bd_cells axi_dma_0] + +startgroup +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/processing_system7_0/M_AXI_GP0} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins axi_dma_0/S_AXI_LITE] + +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/processing_system7_0/S_AXI_HP0} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins processing_system7_0/S_AXI_HP0] +endgroup + +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/processing_system7_0/FCLK_CLK0 (100 MHz)} Clk_xbar {/processing_system7_0/FCLK_CLK0 (100 MHz)} Master {/axi_dma_0/M_AXI_S2MM} Slave {/processing_system7_0/S_AXI_HP0} ddr_seg {Auto} intc_ip 
{/axi_mem_intercon} master_apm {0}} [get_bd_intf_pins axi_dma_0/M_AXI_S2MM] + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:hls:${project_name}_axi:1.0 ${project_name}_axi_0 +endgroup + +connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] [get_bd_intf_pins ${project_name}_axi_0/in_r] +connect_bd_intf_net [get_bd_intf_pins ${project_name}_axi_0/out_r] [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM] + +apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/processing_system7_0/FCLK_CLK0 (100 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins ${project_name}_axi_0/ap_clk] + +group_bd_cells hier_0 [get_bd_cells axi_dma_0] [get_bd_cells ${project_name}_axi_0] + +make_wrapper -files [get_files ./${project_name}_vitis_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top + +add_files -norecurse ./${project_name}_vitis_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v + +reset_run impl_1 +reset_run synth_1 +launch_runs impl_1 -to_step write_bitstream -jobs 6 +wait_on_run -timeout 360 impl_1 + +open_run impl_1 +report_utilization -file util.rpt -hierarchical -hierarchical_percentages diff --git a/hls4ml/templates/vitis_accelerator/zcu102/python_drivers/axi_stream_driver.py b/hls4ml/templates/vitis_accelerator/zcu102/python_drivers/axi_stream_driver.py new file mode 100644 index 0000000000..1aac79f2d3 --- /dev/null +++ b/hls4ml/templates/vitis_accelerator/zcu102/python_drivers/axi_stream_driver.py @@ -0,0 +1,75 @@ +from datetime import datetime + +import numpy as np +from pynq import Overlay, allocate + + +class NeuralNetworkOverlay(Overlay): + def __init__( + self, bitfile_name, x_shape, y_shape, dtype=np.float32, dtbo=None, download=True, ignore_version=False, device=None + ): + super().__init__(bitfile_name, dtbo=None, download=True, ignore_version=False, device=None) + self.sendchannel = self.hier_0.axi_dma_0.sendchannel + self.recvchannel = self.hier_0.axi_dma_0.recvchannel + self.input_buffer = 
allocate(shape=x_shape, dtype=dtype) + self.output_buffer = allocate(shape=y_shape, dtype=dtype) + + def _print_dt(self, timea, timeb, N): + dt = timeb - timea + dts = dt.seconds + dt.microseconds * 10**-6 + rate = N / dts + print(f"Classified {N} samples in {dts} seconds ({rate} inferences / s)") + return dts, rate + + def predict(self, X, debug=False, profile=False, encode=None, decode=None): + """ + Obtain the predictions of the NN implemented in the FPGA. + Parameters: + - X : the input vector. Should be numpy ndarray. + - dtype : the data type of the elements of the input/output vectors. + Note: it should be set depending on the interface of the accelerator; if it uses 'float' + types for the 'data' AXI-Stream field, 'np.float32' dtype is the correct one to use. + Instead if it uses 'ap_fixed', 'np.intA' is the correct one to use (note that A cannot + any integer value, but it can assume {..., 8, 16, 32, ...} values. Check `numpy` + doc for more info). + In this case the encoding/decoding has to be computed by the PS. For example for + 'ap_fixed<16,6>' type the following 2 functions are the correct one to use for encode/decode + 'float' -> 'ap_fixed<16,6>': + ``` + def encode(xi): + return np.int16(round(xi * 2**10)) # note 2**10 = 2**(A-B) + def decode(yi): + return yi * 2**-10 + encode_v = np.vectorize(encode) # to apply them element-wise + decode_v = np.vectorize(decode) + ``` + - profile : boolean. Set it to `True` to print the performance of the algorithm in term of `inference/s`. + - encode/decode: function pointers. See `dtype` section for more information. + - return: an output array based on `np.ndarray` with a shape equal to `y_shape` and a `dtype` equal to + the namesake parameter. 
+ """ + if profile: + timea = datetime.now() + if encode is not None: + X = encode(X) + self.input_buffer[:] = X + self.sendchannel.transfer(self.input_buffer) + self.recvchannel.transfer(self.output_buffer) + if debug: + print("Transfer OK") + self.sendchannel.wait() + if debug: + print("Send OK") + self.recvchannel.wait() + if debug: + print("Receive OK") + # result = self.output_buffer.copy() + if decode is not None: + self.output_buffer = decode(self.output_buffer) + + if profile: + timeb = datetime.now() + dts, rate = self._print_dt(timea, timeb, len(X)) + return self.output_buffer, dts, rate + else: + return self.output_buffer diff --git a/hls4ml/templates/vitis_accelerator/zcu102/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vitis_accelerator/zcu102/tcl_scripts/axi_stream_design.tcl new file mode 100644 index 0000000000..5d886c6f25 --- /dev/null +++ b/hls4ml/templates/vitis_accelerator/zcu102/tcl_scripts/axi_stream_design.tcl @@ -0,0 +1,58 @@ +#@todo: try to remove startgroup and endgroup and see if it work +set tcldir [file dirname [info script]] +source [file join $tcldir project.tcl] + +create_project project_1 ${project_name}_vivado_accelerator -part xczu9eg-ffvb1156-2-e -force + +set_property board_part xilinx.com:zcu102:part0:3.3 [current_project] +set_property ip_repo_paths ${project_name}_prj [current_project] +update_ip_catalog + +create_bd_design "design_1" +set_property ip_repo_paths ${project_name}_prj/solution1/impl/ip [current_project] +update_ip_catalog + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.3 zynq_ultra_ps_e_0 +endgroup + +apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" } [get_bd_cells zynq_ultra_ps_e_0] + +set_property -dict [list CONFIG.PSU__USE__S_AXI_GP0 {1} CONFIG.PSU__SAXIGP0__DATA_WIDTH {32}] [get_bd_cells zynq_ultra_ps_e_0] + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:ip:axi_dma:7.1 axi_dma_0 +endgroup +set_property -dict [list 
CONFIG.c_m_axi_s2mm_data_width.VALUE_SRC USER CONFIG.c_s_axis_s2mm_tdata_width.VALUE_SRC USER] [get_bd_cells axi_dma_0] +set_property -dict [list CONFIG.c_include_sg {0} CONFIG.c_sg_length_width {26} CONFIG.c_sg_include_stscntrl_strm {0} CONFIG.c_m_axi_mm2s_data_width ${bit_width_hls_input} CONFIG.c_m_axis_mm2s_tdata_width ${bit_width_hls_input} CONFIG.c_mm2s_burst_size {256} CONFIG.c_m_axi_s2mm_data_width ${bit_width_hls_output} CONFIG.c_s_axis_s2mm_tdata_width ${bit_width_hls_output} CONFIG.c_s2mm_burst_size {256}] [get_bd_cells axi_dma_0] + +startgroup +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/zynq_ultra_ps_e_0/M_AXI_HPM0_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins axi_dma_0/S_AXI_LITE] +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/zynq_ultra_ps_e_0/S_AXI_HPC0_FPD} ddr_seg {Auto} intc_ip {New AXI SmartConnect} master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_0/S_AXI_HPC0_FPD] +endgroup + +startgroup +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Clk_xbar {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Master {/axi_dma_0/M_AXI_S2MM} Slave {/zynq_ultra_ps_e_0/S_AXI_HPC0_FPD} ddr_seg {Auto} intc_ip {/axi_smc} master_apm {0}} [get_bd_intf_pins axi_dma_0/M_AXI_S2MM] +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Clk_xbar {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Master {/zynq_ultra_ps_e_0/M_AXI_HPM1_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {/ps8_0_axi_periph} master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_0/M_AXI_HPM1_FPD] +endgroup + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:hls:${project_name}_axi:1.0 ${project_name}_axi_0 +endgroup 
+connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] [get_bd_intf_pins ${project_name}_axi_0/in_r]
+connect_bd_intf_net [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM] [get_bd_intf_pins ${project_name}_axi_0/out_r]
+
+apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins ${project_name}_axi_0/ap_clk]
+group_bd_cells hier_0 [get_bd_cells axi_dma_0] [get_bd_cells ${project_name}_axi_0]
+
+make_wrapper -files [get_files ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top
+
+add_files -norecurse ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v
+
+reset_run impl_1
+reset_run synth_1
+launch_runs impl_1 -to_step write_bitstream -jobs 6
+wait_on_run -timeout 360 impl_1
+
+open_run impl_1
+report_utilization -file util.rpt -hierarchical -hierarchical_percentages

From 7dd01737d79856ce790d78e6ab50597a4ac74131 Mon Sep 17 00:00:00 2001
From: steltze
Date: Fri, 19 Apr 2024 14:41:21 +0200
Subject: [PATCH 005/103] Set vitis accelerator writer

---
 hls4ml/writer/__init__.py                 |   4 +-
 hls4ml/writer/vitis_accelerator_writer.py | 431 ++++++++++++++++++++++
 2 files changed, 434 insertions(+), 1 deletion(-)
 create mode 100644 hls4ml/writer/vitis_accelerator_writer.py

diff --git a/hls4ml/writer/__init__.py b/hls4ml/writer/__init__.py
index f4eed945a1..b97ce99884 100644
--- a/hls4ml/writer/__init__.py
+++ b/hls4ml/writer/__init__.py
@@ -1,6 +1,7 @@
 from hls4ml.writer.quartus_writer import QuartusWriter
 from hls4ml.writer.symbolic_writer import SymbolicExpressionWriter
-from hls4ml.writer.vitis_writer import VitisWriter
+from hls4ml.writer.vitis_accelerator_writer import VitisAcceleratorWriter
+from hls4ml.writer.vitis_writer import VitisWriter
 from hls4ml.writer.vivado_accelerator_writer import VivadoAcceleratorWriter
 from hls4ml.writer.vivado_writer import VivadoWriter
 from
hls4ml.writer.writers import Writer, get_writer, register_writer # noqa: F401 @@ -8,5 +9,6 @@ register_writer('Vivado', VivadoWriter) register_writer('VivadoAccelerator', VivadoAcceleratorWriter) register_writer('Vitis', VitisWriter) +register_writer('VitisAccelerator', VitisAcceleratorWriter) register_writer('Quartus', QuartusWriter) register_writer('SymbolicExpression', SymbolicExpressionWriter) diff --git a/hls4ml/writer/vitis_accelerator_writer.py b/hls4ml/writer/vitis_accelerator_writer.py new file mode 100644 index 0000000000..fed95905e2 --- /dev/null +++ b/hls4ml/writer/vitis_accelerator_writer.py @@ -0,0 +1,431 @@ +import os +from distutils.dir_util import copy_tree +from shutil import copyfile + +# from hls4ml.writer.vivado_writer import VivadoWriter +from hls4ml.writer.vitis_writer import VitisWriter + + +class VitisAcceleratorWriter(VitisWriter): + def __init__(self): + super().__init__() + self.vitis_accelerator_config = None + + def write_axi_wrapper(self, model): + '''Write a top level HLS C++ file to wrap the hls4ml project with AXI interfaces + Args: + model : The ModelGraph to write the wrapper for + ''' + inp_axi_t, out_axi_t, inp, out = self.vitis_accelerator_config.get_corrected_types() + indent = ' ' + + ####################### + # myproject_axi.h + ####################### + + filedir = os.path.dirname(os.path.abspath(__file__)) + f = open(os.path.join(filedir, '../templates/vitis_accelerator/myproject_axi.h')) + fout = open(f'{model.config.get_output_dir()}/firmware/{model.config.get_project_name()}_axi.h', 'w') + + for line in f.readlines(): + if 'MYPROJECT' in line: + newline = line.replace('MYPROJECT', format(model.config.get_project_name().upper())) + elif '// hls-fpga-machine-learning insert include' in line: + newline = f'#include "{model.config.get_project_name()}.h"\n' + elif 'myproject' in line: + newline = line.replace('myproject', model.config.get_project_name()) + elif '// hls-fpga-machine-learning insert definitions' in line: + 
newline = '' + newline += f'static const unsigned N_IN = {inp.size()};\n' + newline += f'static const unsigned N_OUT = {out.size()};\n' + if self.vitis_accelerator_config.get_interface() == 'axi_stream': + newline += f'typedef {inp_axi_t} T_in;\n' + newline += f'typedef {out_axi_t} T_out;\n' + newline += ( + 'typedef struct in_struct {\n' + + indent + + 'T_in data;\n' + + indent + + 'ap_uint<1> last;\n' + + indent + + 'in_struct(const T_in& data, const ap_uint<1>& last){this->data = data; this->last = last;};\n' + + indent + + 'in_struct(){this->data = 0; this->last = 0;};\n' + + indent + + 'friend std::ostream& operator<<(std::ostream& stream, const in_struct& in)\n' + + indent + + '{ return stream << "{ data: " << in.data << ", last: " << in.last << " }" << std::endl; }\n' + + indent + + 'operator float() const {return this->data;}\n' + + indent + + 'operator double() const {return this->data;}\n' + + indent + + 'in_struct(float data) {this->data = data; this->last = 0;}\n' + + indent + + 'in_struct(double data) {this->data = data; this->last = 0;}\n' + + '} input_axi_t;\n' + ) + newline += ( + 'typedef struct out_struct {\n' + + indent + + 'T_out data;\n' + + indent + + 'ap_uint<1> last;\n' + + indent + + 'out_struct(const T_out& data, const ap_uint<1>& last){this->data = data; this->last = last;};\n' + + indent + + 'out_struct(){this->data = 0; this->last = 0;};\n' + + indent + + 'friend std::ostream& operator<<(std::ostream& stream, const out_struct& out)\n' + + indent + + '{ return stream << "{ data: " << out.data << ", last: " << out.last << " }" << std::endl; }\n' + + indent + + 'operator float() const {return this->data;}\n' + + indent + + 'operator double() const {return this->data;}\n' + + indent + + 'out_struct(float data) {this->data = data; this->last = 0;}\n' + + indent + + 'out_struct(double data) {this->data = data; this->last = 0;}\n' + + '} output_axi_t;\n' + ) + else: + newline += f'typedef {inp_axi_t} input_axi_t;\n' + newline += f'typedef 
{out_axi_t} output_axi_t;\n' + else: + newline = line + fout.write(newline) + f.close() + fout.close() + + ####################### + # myproject_axi.cpp + ####################### + + f = open(os.path.join(filedir, '../templates/vitis_accelerator/myproject_axi.cpp')) + fout = open(f'{model.config.get_output_dir()}/firmware/{model.config.get_project_name()}_axi.cpp', 'w') + + io_type = model.config.get_config_value("IOType") + + for line in f.readlines(): + if 'myproject' in line: + newline = line.replace('myproject', model.config.get_project_name()) + elif '// hls-fpga-machine-learning insert include' in line: + newline = f'#include "{model.config.get_project_name()}_axi.h"\n' + elif '// hls-fpga-machine-learning insert local vars' in line: + newline = '' + if self.vitis_accelerator_config.get_interface() == 'axi_stream': + newline += indent + 'bool is_last = false;\n' + if io_type == 'io_parallel': + newline += indent + inp.type.name + ' in_local[N_IN];\n' + newline += indent + out.type.name + ' out_local[N_OUT];\n' + elif io_type == 'io_stream': + newline += indent + 'hls::stream<' + inp.type.name + '> in_local("input_1");\n' + newline += indent + 'hls::stream<' + out.type.name + '> out_local("output_1");\n\n' + newline += indent + '#pragma HLS STREAM variable=in_local depth={}\n'.format( + model.get_input_variables()[0].pragma[1] + ) + newline += indent + '#pragma HLS STREAM variable=out_local depth={}\n'.format( + model.get_output_variables()[0].pragma[1] + ) + elif '// hls-fpga-machine-learning insert call' in line: + newline = indent + f'{model.config.get_project_name()}(in_local, out_local);\n' + elif '// hls-fpga-machine-learning insert interface' in line: + if self.vitis_accelerator_config.get_interface() == 'axi_lite': + newline = '' + newline += indent + '#pragma HLS INTERFACE ap_ctrl_none port=return\n' + newline += indent + '#pragma HLS INTERFACE s_axilite port=in\n' + newline += indent + '#pragma HLS INTERFACE s_axilite port=out\n' + elif 
self.vitis_accelerator_config.get_interface() == 'axi_master': + newline = '' + newline += indent + '#pragma HLS INTERFACE s_axilite port=return bundle=CTRL_BUS\n' + newline += indent + '#pragma HLS INTERFACE m_axi depth={} port=in offset=slave bundle=IN_BUS\n'.format( + model.get_input_variables()[0].pragma[1] + ) + newline += indent + '#pragma HLS INTERFACE m_axi depth={} port=out offset=slave bundle=OUT_BUS\n'.format( + model.get_output_variables()[0].pragma[1] + ) + elif self.vitis_accelerator_config.get_interface() == 'axi_stream': + newline = '' + newline += indent + '#pragma HLS INTERFACE axis port=in\n' + newline += indent + '#pragma HLS INTERFACE axis port=out\n' + newline += indent + '#pragma HLS INTERFACE ap_ctrl_none port=return\n' + if model.config.get_config_value("IOType") == 'io_stream': + newline += indent + '#pragma HLS DATAFLOW\n' + elif '// hls-fpga-machine-learning insert enqueue' in line: + io_type = model.config.get_config_value("IOType") + if io_type == 'io_parallel': + newline = '' + newline += indent + 'for(unsigned i = 0; i < N_IN; i++){\n' + if self.vitis_accelerator_config.get_interface() == 'axi_stream': + newline += indent + indent + '#pragma HLS PIPELINE\n' + newline += indent + indent + 'in_local[i] = in[i].data; // Read input with cast\n' + newline += indent + indent + 'is_last |= (in[i].last == 1)? 
true: false;\n' + else: + newline += indent + indent + '#pragma HLS UNROLL\n' + newline += indent + indent + 'in_local[i] = in[i]; // Read input with cast\n' + newline += indent + '}\n' + elif io_type == 'io_stream': + newline = '' + newline += indent + 'for(unsigned i = 0; i < N_IN / {input_t}::size; ++i) {{\n' + # newline += indent + indent + '#pragma HLS PIPELINE\n' + newline += indent + indent + '{input_t} ctype;\n' + newline += indent + indent + '#pragma HLS DATA_PACK variable=ctype\n' + newline += indent + indent + 'for(unsigned j = 0; j < {input_t}::size; j++) {{\n' + # newline += indent + indent + indent + '#pragma HLS UNROLL\n' + if self.vitis_accelerator_config.get_interface() == 'axi_stream': + newline += ( + indent + + indent + + indent + + 'ctype[j] = typename {input_t}::value_type(in[i * {input_t}::size + j].data);\n' + ) + newline += ( + indent + indent + indent + 'is_last |= (in[i * input_t::size + j].last == 1)? true : false;\n' + ) + else: + newline += ( + indent + + indent + + indent + + 'ctype[j] = typename {input_t}::value_type(in[i * {input_t}::size + j]);\n' + ) + newline += indent + indent + '}}\n' + newline += indent + indent + 'in_local.write(ctype);\n' + newline += indent + '}}\n' + newline = newline.format(input_t=inp.type.name) + elif '// hls-fpga-machine-learning insert dequeue' in line: + io_type = model.config.get_config_value("IOType") + if io_type == 'io_parallel': + newline = '' + newline += indent + 'for(unsigned i = 0; i < N_OUT; i++){\n' + if self.vitis_accelerator_config.get_interface() == 'axi_stream': + newline += indent + indent + '#pragma HLS PIPELINE\n' + newline += indent + indent + 'out[i].data = out_local[i]; // Write output with cast\n' + newline += indent + indent + 'out[i].last = (is_last && (i == N_OUT - 1))? 
true : false;\n' + else: + newline += indent + indent + '#pragma HLS UNROLL\n' + newline += indent + indent + 'out[i] = out_local[i]; // Write output with cast\n' + newline += indent + '}\n' + elif io_type == 'io_stream': + newline = '' + newline += indent + 'for(unsigned i = 0; i < N_OUT / {result_t}::size; ++i) {{\n' + # newline += indent + indent + '#pragma HLS PIPELINE\n' + newline += indent + indent + '{result_t} ctype = out_local.read();\n' + newline += indent + indent + 'for(unsigned j = 0; j < {result_t}::size; j++) {{\n' + # newline += indent + indent + indent + '#pragma HLS UNROLL\n' + if self.vitis_accelerator_config.get_interface() == 'axi_stream': + newline += ( + indent + + indent + + indent + + 'bool last = (is_last && (i * {result_t}::size + j == N_OUT - 1)) ? true : false;\n' + ) + newline += ( + indent + indent + indent + 'out[i * {result_t}::size + j] = output_axi_t(ctype[j], last);\n' + ) + else: + newline += indent + indent + indent + 'out[i * {result_t}::size + j] = output_axi_t(ctype[j]);\n' + newline += indent + indent + '}}\n' + newline += indent + '}}\n' + newline = newline.format(result_t=out.type.name) + else: + newline = line + fout.write(newline) + f.close() + fout.close() + + def modify_build_script(self, model): + ''' + Modify the build_prj.tcl and build_lib.sh scripts to add the extra wrapper files and set the top function + ''' + filedir = os.path.dirname(os.path.abspath(__file__)) + oldfile = f'{model.config.get_output_dir()}/build_prj.tcl' + newfile = f'{model.config.get_output_dir()}/build_prj_axi.tcl' + f = open(oldfile) + fout = open(newfile, 'w') + + for line in f.readlines(): + if 'set_top' in line: + newline = line[:-1] + '_axi\n' # remove the newline from the line end and append _axi for the new top + newline += f'add_files firmware/{model.config.get_project_name()}_axi.cpp -cflags "-std=c++0x"\n' + elif f'{model.config.get_project_name()}_cosim' in line: + newline = line.replace( + 
f'{model.config.get_project_name()}_cosim', + f'{model.config.get_project_name()}_axi_cosim', + ) + elif '${project_name}.tcl' in line: + newline = line.replace('${project_name}.tcl', '${project_name}_axi.tcl') + else: + newline = line + fout.write(newline) + + f.close() + fout.close() + os.rename(newfile, oldfile) + + ################### + # build_lib.sh + ################### + + f = open(os.path.join(filedir, '../templates/vitis_accelerator/build_lib.sh')) + fout = open(f'{model.config.get_output_dir()}/build_lib.sh', 'w') + + for line in f.readlines(): + line = line.replace('myproject', model.config.get_project_name()) + line = line.replace('mystamp', model.config.get_config_value('Stamp')) + + fout.write(line) + f.close() + fout.close() + + def write_wrapper_test(self, model): + ################### + # write myproject_test_wrapper.cpp + ################### + oldfile = f'{model.config.get_output_dir()}/{model.config.get_project_name()}_test.cpp' + newfile = f'{model.config.get_output_dir()}/{model.config.get_project_name()}_test_wrapper.cpp' + + f = open(oldfile) + fout = open(newfile, 'w') + + inp = model.get_input_variables()[0] + out = model.get_output_variables()[0] + + for line in f.readlines(): + if f'{model.config.get_project_name()}.h' in line: + newline = line.replace(f'{model.config.get_project_name()}.h', f'{model.config.get_project_name()}_axi.h') + elif inp.definition_cpp() in line: + newline = line.replace( + inp.definition_cpp(), 'input_axi_t inputs[N_IN]' + ) # TODO instead of replacing strings, how about we use proper variables and their definition? 
+ elif out.definition_cpp() in line: + newline = line.replace(out.definition_cpp(), 'output_axi_t outputs[N_OUT]') + elif 'unsigned short' in line: + newline = '' + elif f'{model.config.get_project_name()}(' in line: + indent_amount = line.split(model.config.get_project_name())[0] + newline = indent_amount + f'{model.config.get_project_name()}_axi(inputs,outputs);\n' + elif inp.size_cpp() in line or inp.name in line or inp.type.name in line: + newline = ( + line.replace(inp.size_cpp(), 'N_IN').replace(inp.name, 'inputs').replace(inp.type.name, 'input_axi_t') + ) + elif out.size_cpp() in line or out.name in line or out.type.name in line: + newline = ( + line.replace(out.size_cpp(), 'N_OUT').replace(out.name, 'outputs').replace(out.type.name, 'output_axi_t') + ) + else: + newline = line + if self.vitis_accelerator_config.get_interface() == 'axi_stream': + if 'nnet::fill_zero' in line: + indent = line.split('n')[0] + newline = indent + 'inputs[N_IN-1].last = 1;\n' + if 'copy_data' in line: + newline = newline.replace('copy_data', 'copy_data_axi') + fout.write(newline) + + f.close() + fout.close() + os.rename(newfile, oldfile) + + ################### + # write myproject_bridge_wrapper.cpp + ################### + oldfile = f'{model.config.get_output_dir()}/{model.config.get_project_name()}_bridge.cpp' + newfile = f'{model.config.get_output_dir()}/{model.config.get_project_name()}_bridge_wrapper.cpp' + + f = open(oldfile) + fout = open(newfile, 'w') + + inp = model.get_input_variables()[0] + out = model.get_output_variables()[0] + + for line in f.readlines(): + if f'{model.config.get_project_name()}.h' in line: + newline = line.replace(f'{model.config.get_project_name()}.h', f'{model.config.get_project_name()}_axi.h') + elif inp.definition_cpp(name_suffix='_ap') in line: + newline = line.replace(inp.definition_cpp(name_suffix='_ap'), f'input_axi_t {inp.name}_ap[N_IN]') + elif out.definition_cpp(name_suffix='_ap') in line: + newline = 
line.replace(out.definition_cpp(name_suffix='_ap'), f'output_axi_t {out.name}_ap[N_OUT]') + elif f'{model.config.get_project_name()}(' in line: + indent_amount = line.split(model.config.get_project_name())[0] + newline = indent_amount + '{}_axi({}_ap,{}_ap);\n'.format( + model.config.get_project_name(), inp.name, out.name + ) + elif inp.size_cpp() in line or inp.name in line or inp.type.name in line: + newline = line.replace(inp.size_cpp(), 'N_IN').replace(inp.type.name, 'input_axi_t') + elif out.size_cpp() in line or out.name in line or out.type.name in line: + newline = line.replace(out.size_cpp(), 'N_OUT').replace(out.type.name, 'output_axi_t') + else: + newline = line + fout.write(newline) + + f.close() + fout.close() + os.rename(newfile, oldfile) + + def write_board_script(self, model): + ''' + Write the tcl scripts and kernel sources to create a Vivado IPI project for the VitisAccelerator + ''' + filedir = os.path.dirname(os.path.abspath(__file__)) + copyfile( + os.path.join(filedir, self.vitis_accelerator_config.get_tcl_file_path()), + f'{model.config.get_output_dir()}/design.tcl', + ) + # Generic alveo board + if self.vitis_accelerator_config.get_board().startswith('alveo'): + src_dir = os.path.join(filedir, self.vitis_accelerator_config.get_krnl_rtl_src_dir()) + dst_dir = os.path.abspath(model.config.get_output_dir()) + '/src' + copy_tree(src_dir, dst_dir) + + ################### + # project.tcl + ################### + f = open(f'{model.config.get_output_dir()}/project.tcl', 'w') + f.write('variable project_name\n') + f.write(f'set project_name "{model.config.get_project_name()}"\n') + f.write('variable backend\n') + f.write('set backend "vitisaccelerator"\n') + f.write('variable part\n') + f.write(f'set part "{self.vitis_accelerator_config.get_part()}"\n') + f.write('variable clock_period\n') + f.write('set clock_period {}\n'.format(model.config.get_config_value('ClockPeriod'))) + f.write('variable clock_uncertainty\n') + f.write('set clock_uncertainty 
{}\n'.format(model.config.get_config_value('ClockUncertainty', '12.5%'))) + f.write('variable version\n') + f.write('set version "{}"\n'.format(model.config.get_config_value('Version', '1.0.0'))) + if self.vitis_accelerator_config.get_interface() == 'axi_stream': + in_bit, out_bit = self.vitis_accelerator_config.get_io_bitwidth() + f.write(f'set bit_width_hls_output {in_bit}\n') + f.write(f'set bit_width_hls_input {out_bit}\n') + f.close() + + def write_driver(self, model): + filedir = os.path.dirname(os.path.abspath(__file__)) + copyfile( + os.path.join(filedir, self.vitis_accelerator_config.get_driver_path()), + ('{}/' + self.vitis_accelerator_config.get_driver_file()).format(model.config.get_output_dir()), + ) + + def write_new_tar(self, model): + os.remove(model.config.get_output_dir() + '.tar.gz') + super().write_tar(model) + + def write_hls(self, model): + """ + Write the HLS project. Calls the VivadoBackend writer, and extra steps for VitisAccelerator/AXI interface + """ + # TODO temporarily move config import here to avoid cyclic dependency, until config is moved to its own package + from hls4ml.backends import VitisAcceleratorConfig + + self.vitis_accelerator_config = VitisAcceleratorConfig( + model.config, model.get_input_variables(), model.get_output_variables() + ) + super().write_hls(model) + self.write_board_script(model) + self.write_driver(model) + self.write_wrapper_test(model) + self.write_axi_wrapper(model) + self.modify_build_script(model) + self.write_new_tar(model) From 6f181b8f2d20ec941fca65f373cf5658991880a6 Mon Sep 17 00:00:00 2001 From: steltze Date: Fri, 19 Apr 2024 14:42:02 +0200 Subject: [PATCH 006/103] Fix writes init --- hls4ml/writer/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/writer/__init__.py b/hls4ml/writer/__init__.py index b97ce99884..759a7115b1 100644 --- a/hls4ml/writer/__init__.py +++ b/hls4ml/writer/__init__.py @@ -1,6 +1,6 @@ from hls4ml.writer.quartus_writer import QuartusWriter 
from hls4ml.writer.symbolic_writer import SymbolicExpressionWriter -from hls4ml.writer.vitis_writer import VitisWrite +from hls4ml.writer.vitis_writer import VitisWriter from hls4ml.writer.vitis_accelerator_writer import VitisAcceleratorWriter from hls4ml.writer.vivado_accelerator_writer import VivadoAcceleratorWriter from hls4ml.writer.vivado_writer import VivadoWriter From bd2e52e4951c8b9eda866518153254131701dfe1 Mon Sep 17 00:00:00 2001 From: steltze Date: Fri, 10 May 2024 10:33:27 +0200 Subject: [PATCH 007/103] Include separable convolution resource implementation --- .../vitis_accelerator_backend.py | 2 +- .../vivado/nnet_utils/nnet_sepconv_stream.h | 84 +++++++++++++++++-- hls4ml/writer/vitis_accelerator_writer.py | 4 +- 3 files changed, 79 insertions(+), 11 deletions(-) diff --git a/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py b/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py index 4c54e05328..85a6d02f2c 100644 --- a/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py +++ b/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py @@ -34,7 +34,7 @@ def build( validation=validation, export=export, vsynth=vsynth, - fifo_opt=fifo_opt, + # fifo_opt=fifo_opt, ) # Get Config to view Board and Platform from hls4ml.backends import VitisAcceleratorConfig diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h index 9c16de1908..93532292d6 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h @@ -4,11 +4,77 @@ #include "hls_stream.h" #include "nnet_common.h" #include "nnet_conv_stream.h" +#include namespace nnet { template -void depthwise_product(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T res[CONFIG_T::n_chan], +void depthwise_product_resource(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T res[CONFIG_T::n_chan], + typename 
CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + + const int nin = CONFIG_T::kernel_size * CONFIG_T::n_chan; + const int nout = CONFIG_T::n_chan; + + const int rufactor = MIN(CONFIG_T::reuse_factor, nin); + // const int multfactor = MIN(nin, CONFIG_T::reuse_factor); + // const int multiplier_limit = DIV_ROUNDUP(nin, multfactor); + const int block_factor = DIV_ROUNDUP(nin, CONFIG_T::reuse_factor); + // const int multscale = multiplier_limit; + + // assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); + // assert((multiplier_limit == block_factor) && "This function is correct only for RF <= N_IN"); + + #pragma HLS function_instantiate variable=weights,biases + //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly + #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + #pragma HLS ARRAY_RESHAPE variable=data block factor=block_factor + + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable=acc factor=block_factor + // std::cout << sizeof(CONFIG_T::n_chan) << std::endl; + +InitAccum: + for (int iacc = 0; iacc < CONFIG_T::n_chan; iacc++) { + #pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + + +int out_index = 0; + +ReuseLoop: + for (int ir = 0; ir < rufactor; ir++) { + #pragma HLS PIPELINE II=1 rewind + + int in_index = ir; + // int w_index = ir; + // int acc_step = 0; + + MultLoop: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + out_index = ((in_index % CONFIG_T::n_chan)); + acc[out_index] += static_cast(CONFIG_T::mult_config::template product::product(data[in_index], weights[in_index])); + + + in_index+=rufactor; + + } + } + +// Cast to "res_t" type +Result: + for (int ires = 0; ires < nout; ires++) { + // #pragma HLS 
UNROLL + res[ires] = cast(acc[ires]); + } +} + +template +void depthwise_product_latency(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T res[CONFIG_T::n_chan], typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { #pragma HLS INLINE @@ -78,9 +144,9 @@ void depthwise_mult_buffer(hls::stream data_window[ #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { - depthwise_product(data, res, weights, biases); + depthwise_product_latency(data, res, weights, biases); } else { - assert("Resource strategy for DepthwiseConv2D is not supported." && false); + depthwise_product_resource(data, res, weights, biases); } CastLoop: @@ -202,10 +268,11 @@ void compute_depthwise_output_buffer_1d(const data_T &in_elem, hls::stream(kernel_data, res_out, + depthwise_product_latency(kernel_data, res_out, weights, biases); } else { - assert("Resource strategy for DepthwiseConv1D is not supported." && false); + depthwise_product_resource(kernel_data, res_out, + weights, biases); } // Pack output @@ -267,10 +334,11 @@ void compute_depthwise_output_buffer_2d(const data_T &in_elem, // Dense multiply #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { - depthwise_product(kernel_data, res_out, + depthwise_product_latency(kernel_data, res_out, weights, biases); } else { - assert("Resource strategy for DepthwiseConv2D is not supported." 
&& false); + depthwise_product_resource(kernel_data, res_out, + weights, biases); } // Pack output @@ -303,4 +371,4 @@ void compute_depthwise_output_buffer_2d(const data_T &in_elem, } } // namespace nnet -#endif +#endif \ No newline at end of file diff --git a/hls4ml/writer/vitis_accelerator_writer.py b/hls4ml/writer/vitis_accelerator_writer.py index fed95905e2..c29f917882 100644 --- a/hls4ml/writer/vitis_accelerator_writer.py +++ b/hls4ml/writer/vitis_accelerator_writer.py @@ -163,7 +163,7 @@ def write_axi_wrapper(self, model): newline += indent + indent + 'is_last |= (in[i].last == 1)? true: false;\n' else: newline += indent + indent + '#pragma HLS UNROLL\n' - newline += indent + indent + 'in_local[i] = in[i]; // Read input with cast\n' + newline += indent + indent + 'in_local[i] = in[i].data; // Read input with cast\n' newline += indent + '}\n' elif io_type == 'io_stream': newline = '' @@ -188,7 +188,7 @@ def write_axi_wrapper(self, model): indent + indent + indent - + 'ctype[j] = typename {input_t}::value_type(in[i * {input_t}::size + j]);\n' + + 'ctype[j] = typename {input_t}::value_type(in[i * {input_t}::size + j].data);\n' ) newline += indent + indent + '}}\n' newline += indent + indent + 'in_local.write(ctype);\n' From b79524062ea0ad776fda50f31a7e5698f016c585 Mon Sep 17 00:00:00 2001 From: steltze Date: Wed, 29 May 2024 10:45:42 +0200 Subject: [PATCH 008/103] Separate depthwise resource strategy to 3 cases --- .../vivado/nnet_utils/nnet_sepconv_stream.h | 156 +++++++++++++++++- 1 file changed, 151 insertions(+), 5 deletions(-) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h index 93532292d6..8d8ff9712e 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h @@ -9,7 +9,7 @@ namespace nnet { template -void depthwise_product_resource(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T 
res[CONFIG_T::n_chan], +void depthwise_product_resource_rf_leq_nchan(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T res[CONFIG_T::n_chan], typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { @@ -34,7 +34,7 @@ void depthwise_product_resource(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_ typename CONFIG_T::accum_t acc[CONFIG_T::n_chan]; #pragma HLS ARRAY_PARTITION variable=acc factor=block_factor - // std::cout << sizeof(CONFIG_T::n_chan) << std::endl; + std::cout << "LEQ IMPLE" << std::endl; InitAccum: for (int iacc = 0; iacc < CONFIG_T::n_chan; iacc++) { @@ -42,6 +42,72 @@ void depthwise_product_resource(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_ acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; } +int out_index = 0; + +ReuseLoop: + for (int ir = 0; ir < rufactor; ir++) { + #pragma HLS PIPELINE II=1 rewind + + int in_index = ir; + out_index = in_index % CONFIG_T::n_chan; + // int w_index = ir; + // int acc_step = 0; + + MultLoop: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + + acc[out_index] += static_cast(CONFIG_T::mult_config::template product::product(data[in_index], weights[in_index])); + + in_index+=rufactor; + + out_index+=rufactor; + out_index -= ((out_index) >= CONFIG_T::n_chan)*CONFIG_T::n_chan; + } + } + +// Cast to "res_t" type +Result: + for (int ires = 0; ires < nout; ires++) { + #pragma HLS UNROLL + res[ires] = cast(acc[ires]); + } +} + + +template +void depthwise_product_resource_rf_gt_nchan_rem0(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T res[CONFIG_T::n_chan], + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + + const int nin = CONFIG_T::kernel_size * CONFIG_T::n_chan; + const int nout = CONFIG_T::n_chan; + + const int rufactor = MIN(CONFIG_T::reuse_factor, nin); + // const int multfactor = MIN(nin, 
CONFIG_T::reuse_factor); + // const int multiplier_limit = DIV_ROUNDUP(nin, multfactor); + const int block_factor = DIV_ROUNDUP(nin, CONFIG_T::reuse_factor); + // const int multscale = multiplier_limit; + + // assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); + // assert((multiplier_limit == block_factor) && "This function is correct only for RF <= N_IN"); + + #pragma HLS function_instantiate variable=weights,biases + //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly + #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + #pragma HLS ARRAY_RESHAPE variable=data block factor=block_factor + + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable=acc factor=block_factor + std::cout << "REM0 IMPLE" << std::endl; + +InitAccum: + for (int iacc = 0; iacc < CONFIG_T::n_chan; iacc++) { + #pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } int out_index = 0; @@ -56,23 +122,87 @@ int out_index = 0; MultLoop: for (int im = 0; im < block_factor; im++) { #pragma HLS UNROLL - out_index = ((in_index % CONFIG_T::n_chan)); + acc[out_index] += static_cast(CONFIG_T::mult_config::template product::product(data[in_index], weights[in_index])); + in_index+=rufactor; + } + out_index++; + out_index -= ((out_index) == CONFIG_T::n_chan)*CONFIG_T::n_chan; + } + +// Cast to "res_t" type +Result: + for (int ires = 0; ires < nout; ires++) { + #pragma HLS UNROLL + res[ires] = cast(acc[ires]); + } +} + +template +void depthwise_product_resource_rf_gt_nchan(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T res[CONFIG_T::n_chan], + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + + const int nin = CONFIG_T::kernel_size * CONFIG_T::n_chan; + const int 
nout = CONFIG_T::n_chan; + + const int rufactor = MIN(CONFIG_T::reuse_factor, nin); + // const int multfactor = MIN(nin, CONFIG_T::reuse_factor); + // const int multiplier_limit = DIV_ROUNDUP(nin, multfactor); + const int block_factor = DIV_ROUNDUP(nin, CONFIG_T::reuse_factor); + // const int multscale = multiplier_limit; + + // assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); + // assert((multiplier_limit == block_factor) && "This function is correct only for RF <= N_IN"); + + #pragma HLS function_instantiate variable=weights,biases + //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly + #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + #pragma HLS ARRAY_RESHAPE variable=data block factor=block_factor + + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable=acc factor=block_factor + std::cout << "GT IMPLE" << std::endl; + +InitAccum: + for (int iacc = 0; iacc < CONFIG_T::n_chan; iacc++) { + #pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + +int out_index = 0; + +ReuseLoop: + for (int ir = 0; ir < rufactor; ir++) { + #pragma HLS PIPELINE II=1 rewind + + int in_index = ir; + // int w_index = ir; + // int acc_step = 0; + + MultLoop: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + + out_index = in_index % CONFIG_T::n_chan; + acc[out_index] += static_cast(CONFIG_T::mult_config::template product::product(data[in_index], weights[in_index])); in_index+=rufactor; - } } // Cast to "res_t" type Result: for (int ires = 0; ires < nout; ires++) { - // #pragma HLS UNROLL + #pragma HLS UNROLL res[ires] = cast(acc[ires]); } } + template void depthwise_product_latency(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T res[CONFIG_T::n_chan], typename CONFIG_T::weight_t 
weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], @@ -124,6 +254,22 @@ void depthwise_product_latency(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_c } } +template +void depthwise_product_resource(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T res[CONFIG_T::n_chan], + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + + #pragma HLS INLINE recursive + + if (CONFIG_T::reuse_factor < CONFIG_T::n_chan) { + depthwise_product_resource_rf_leq_nchan(data, res, weights, biases); + } else if (CONFIG_T::reuse_factor % CONFIG_T::n_chan == 0) { + depthwise_product_resource_rf_gt_nchan_rem0(data, res, weights, biases); + } else { + depthwise_product_resource_rf_gt_nchan(data, res, weights, biases); + } +} + template void depthwise_mult_buffer(hls::stream data_window[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T &res_pack, hls::stream &res_stream, unsigned &outputs_ready, From eeb04d4a9a842914181d5461b45297fa45a447a8 Mon Sep 17 00:00:00 2001 From: steltze Date: Wed, 29 May 2024 11:47:50 +0200 Subject: [PATCH 009/103] Complete vitis accelerator wrapper for io_stream case --- .../vitis_accelerator/myproject_axi.cpp | 2 +- .../vitis_accelerator/myproject_axi.h | 2 +- hls4ml/writer/vitis_accelerator_writer.py | 161 ++++++++++-------- 3 files changed, 94 insertions(+), 71 deletions(-) diff --git a/hls4ml/templates/vitis_accelerator/myproject_axi.cpp b/hls4ml/templates/vitis_accelerator/myproject_axi.cpp index 05797f1f7b..01238643ed 100644 --- a/hls4ml/templates/vitis_accelerator/myproject_axi.cpp +++ b/hls4ml/templates/vitis_accelerator/myproject_axi.cpp @@ -1,6 +1,6 @@ // hls-fpga-machine-learning insert include -void myproject_axi(input_axi_t in[N_IN], output_axi_t out[N_OUT]) { +void myproject_axi(hls::stream< my_pkt > &in, hls::stream< my_pkt > &out) { // hls-fpga-machine-learning insert interface diff --git a/hls4ml/templates/vitis_accelerator/myproject_axi.h 
b/hls4ml/templates/vitis_accelerator/myproject_axi.h index a60dab39c4..d49f98ba14 100644 --- a/hls4ml/templates/vitis_accelerator/myproject_axi.h +++ b/hls4ml/templates/vitis_accelerator/myproject_axi.h @@ -6,5 +6,5 @@ // hls-fpga-machine-learning insert definitions -void myproject_axi(input_axi_t in[N_IN], output_axi_t out[N_OUT]); +void myproject_axi(hls::stream< my_pkt > &in, hls::stream< my_pkt > &out); #endif diff --git a/hls4ml/writer/vitis_accelerator_writer.py b/hls4ml/writer/vitis_accelerator_writer.py index c29f917882..a2270fb610 100644 --- a/hls4ml/writer/vitis_accelerator_writer.py +++ b/hls4ml/writer/vitis_accelerator_writer.py @@ -32,6 +32,7 @@ def write_axi_wrapper(self, model): newline = line.replace('MYPROJECT', format(model.config.get_project_name().upper())) elif '// hls-fpga-machine-learning insert include' in line: newline = f'#include "{model.config.get_project_name()}.h"\n' + newline = '#include "ap_axi_sdata.h' elif 'myproject' in line: newline = line.replace('myproject', model.config.get_project_name()) elif '// hls-fpga-machine-learning insert definitions' in line: @@ -39,57 +40,58 @@ def write_axi_wrapper(self, model): newline += f'static const unsigned N_IN = {inp.size()};\n' newline += f'static const unsigned N_OUT = {out.size()};\n' if self.vitis_accelerator_config.get_interface() == 'axi_stream': - newline += f'typedef {inp_axi_t} T_in;\n' - newline += f'typedef {out_axi_t} T_out;\n' - newline += ( - 'typedef struct in_struct {\n' - + indent - + 'T_in data;\n' - + indent - + 'ap_uint<1> last;\n' - + indent - + 'in_struct(const T_in& data, const ap_uint<1>& last){this->data = data; this->last = last;};\n' - + indent - + 'in_struct(){this->data = 0; this->last = 0;};\n' - + indent - + 'friend std::ostream& operator<<(std::ostream& stream, const in_struct& in)\n' - + indent - + '{ return stream << "{ data: " << in.data << ", last: " << in.last << " }" << std::endl; }\n' - + indent - + 'operator float() const {return this->data;}\n' - + 
indent - + 'operator double() const {return this->data;}\n' - + indent - + 'in_struct(float data) {this->data = data; this->last = 0;}\n' - + indent - + 'in_struct(double data) {this->data = data; this->last = 0;}\n' - + '} input_axi_t;\n' - ) - newline += ( - 'typedef struct out_struct {\n' - + indent - + 'T_out data;\n' - + indent - + 'ap_uint<1> last;\n' - + indent - + 'out_struct(const T_out& data, const ap_uint<1>& last){this->data = data; this->last = last;};\n' - + indent - + 'out_struct(){this->data = 0; this->last = 0;};\n' - + indent - + 'friend std::ostream& operator<<(std::ostream& stream, const out_struct& out)\n' - + indent - + '{ return stream << "{ data: " << out.data << ", last: " << out.last << " }" << std::endl; }\n' - + indent - + 'operator float() const {return this->data;}\n' - + indent - + 'operator double() const {return this->data;}\n' - + indent - + 'out_struct(float data) {this->data = data; this->last = 0;}\n' - + indent - + 'out_struct(double data) {this->data = data; this->last = 0;}\n' - + '} output_axi_t;\n' - ) - else: + newline += f'typedef hls::axis<{inp_axi_t}, 0, 0, 0> my_pkt;;\n' + # newline += f'typedef {inp_axi_t} T_in;\n' + # newline += f'typedef {out_axi_t} T_out;\n' + # newline += ( + # 'typedef struct in_struct {\n' + # + indent + # + 'T_in data;\n' + # + indent + # + 'ap_uint<1> last;\n' + # + indent + # + 'in_struct(const T_in& data, const ap_uint<1>& last){this->data = data; this->last = last;};\n' + # + indent + # + 'in_struct(){this->data = 0; this->last = 0;};\n' + # + indent + # + 'friend std::ostream& operator<<(std::ostream& stream, const in_struct& in)\n' + # + indent + # + '{ return stream << "{ data: " << in.data << ", last: " << in.last << " }" << std::endl; }\n' + # + indent + # + 'operator float() const {return this->data;}\n' + # + indent + # + 'operator double() const {return this->data;}\n' + # + indent + # + 'in_struct(float data) {this->data = data; this->last = 0;}\n' + # + indent + # + 
'in_struct(double data) {this->data = data; this->last = 0;}\n' + # + '} input_axi_t;\n' + # ) + # newline += ( + # 'typedef struct out_struct {\n' + # + indent + # + 'T_out data;\n' + # + indent + # + 'ap_uint<1> last;\n' + # + indent + # + 'out_struct(const T_out& data, const ap_uint<1>& last){this->data = data; this->last = last;};\n' + # + indent + # + 'out_struct(){this->data = 0; this->last = 0;};\n' + # + indent + # + 'friend std::ostream& operator<<(std::ostream& stream, const out_struct& out)\n' + # + indent + # + '{ return stream << "{ data: " << out.data << ", last: " << out.last << " }" << std::endl; }\n' + # + indent + # + 'operator float() const {return this->data;}\n' + # + indent + # + 'operator double() const {return this->data;}\n' + # + indent + # + 'out_struct(float data) {this->data = data; this->last = 0;}\n' + # + indent + # + 'out_struct(double data) {this->data = data; this->last = 0;}\n' + # + '} output_axi_t;\n' + # ) + else: # TODO: handle this case newline += f'typedef {inp_axi_t} input_axi_t;\n' newline += f'typedef {out_axi_t} output_axi_t;\n' else: @@ -114,9 +116,9 @@ def write_axi_wrapper(self, model): newline = f'#include "{model.config.get_project_name()}_axi.h"\n' elif '// hls-fpga-machine-learning insert local vars' in line: newline = '' - if self.vitis_accelerator_config.get_interface() == 'axi_stream': - newline += indent + 'bool is_last = false;\n' - if io_type == 'io_parallel': + # if self.vitis_accelerator_config.get_interface() == 'axi_stream': + # newline += indent + 'bool is_last = false;\n' + if io_type == 'io_parallel': # TODO: handle io_parallel newline += indent + inp.type.name + ' in_local[N_IN];\n' newline += indent + out.type.name + ' out_local[N_OUT];\n' elif io_type == 'io_stream': @@ -131,12 +133,12 @@ def write_axi_wrapper(self, model): elif '// hls-fpga-machine-learning insert call' in line: newline = indent + f'{model.config.get_project_name()}(in_local, out_local);\n' elif '// hls-fpga-machine-learning 
insert interface' in line: - if self.vitis_accelerator_config.get_interface() == 'axi_lite': + if self.vitis_accelerator_config.get_interface() == 'axi_lite': # TODO: handle axi_lite newline = '' newline += indent + '#pragma HLS INTERFACE ap_ctrl_none port=return\n' newline += indent + '#pragma HLS INTERFACE s_axilite port=in\n' newline += indent + '#pragma HLS INTERFACE s_axilite port=out\n' - elif self.vitis_accelerator_config.get_interface() == 'axi_master': + elif self.vitis_accelerator_config.get_interface() == 'axi_master': # TODO: handle axi_master newline = '' newline += indent + '#pragma HLS INTERFACE s_axilite port=return bundle=CTRL_BUS\n' newline += indent + '#pragma HLS INTERFACE m_axi depth={} port=in offset=slave bundle=IN_BUS\n'.format( @@ -154,7 +156,7 @@ def write_axi_wrapper(self, model): newline += indent + '#pragma HLS DATAFLOW\n' elif '// hls-fpga-machine-learning insert enqueue' in line: io_type = model.config.get_config_value("IOType") - if io_type == 'io_parallel': + if io_type == 'io_parallel': # TODO: handle io_parallel newline = '' newline += indent + 'for(unsigned i = 0; i < N_IN; i++){\n' if self.vitis_accelerator_config.get_interface() == 'axi_stream': @@ -166,24 +168,37 @@ def write_axi_wrapper(self, model): newline += indent + indent + 'in_local[i] = in[i].data; // Read input with cast\n' newline += indent + '}\n' elif io_type == 'io_stream': + newline = '' + newline += indent + 'my_pkt tmp_a;\n' + + newline = '' + newline += indent + 'my_pkt tmp_b;\n' + newline = '' newline += indent + 'for(unsigned i = 0; i < N_IN / {input_t}::size; ++i) {{\n' - # newline += indent + indent + '#pragma HLS PIPELINE\n' + # newline += indent + indent + '#pragma HLS PIPELINE\n' # TODO: check if needed newline += indent + indent + '{input_t} ctype;\n' - newline += indent + indent + '#pragma HLS DATA_PACK variable=ctype\n' + # newline += indent + indent + '#pragma HLS DATA_PACK variable=ctype\n' + # newline += indent + indent + 'pragma HLS aggregate 
variable=ctype compact=auto' # TODO: check if needed newline += indent + indent + 'for(unsigned j = 0; j < {input_t}::size; j++) {{\n' - # newline += indent + indent + indent + '#pragma HLS UNROLL\n' + # newline += indent + indent + indent + '#pragma HLS UNROLL\n' # TODO: check if needed if self.vitis_accelerator_config.get_interface() == 'axi_stream': newline += ( indent + indent + indent - + 'ctype[j] = typename {input_t}::value_type(in[i * {input_t}::size + j].data);\n' + + 'in.read(tmp_a);\n' ) newline += ( - indent + indent + indent + 'is_last |= (in[i * input_t::size + j].last == 1)? true : false;\n' + indent + + indent + + indent + + 'ctype[j] = tmp_a.data;\n' ) - else: + # newline += ( + # indent + indent + indent + 'is_last |= (in[i * input_t::size + j].last == 1)? true : false;\n' + # ) + else: # TODO: handle this case newline += ( indent + indent @@ -196,7 +211,7 @@ def write_axi_wrapper(self, model): newline = newline.format(input_t=inp.type.name) elif '// hls-fpga-machine-learning insert dequeue' in line: io_type = model.config.get_config_value("IOType") - if io_type == 'io_parallel': + if io_type == 'io_parallel': # TODO: handle this case newline = '' newline += indent + 'for(unsigned i = 0; i < N_OUT; i++){\n' if self.vitis_accelerator_config.get_interface() == 'axi_stream': @@ -215,14 +230,22 @@ def write_axi_wrapper(self, model): newline += indent + indent + 'for(unsigned j = 0; j < {result_t}::size; j++) {{\n' # newline += indent + indent + indent + '#pragma HLS UNROLL\n' if self.vitis_accelerator_config.get_interface() == 'axi_stream': + # newline += ( + # indent + # + indent + # + indent + # + 'bool last = (is_last && (i * {result_t}::size + j == N_OUT - 1)) ? true : false;\n' + # ) newline += ( - indent - + indent - + indent - + 'bool last = (is_last && (i * {result_t}::size + j == N_OUT - 1)) ? 
true : false;\n' + indent + indent + indent + f'tmp_b.data = ({inp_axi_t}) (ctype[j]);\n' ) + + newline += ( + indent + indent + indent + 'if(tmp_a.last == 1) {tmp_b.last = (((i+1)*(j+1))==N_OUT);}\n' + ) + newline += ( - indent + indent + indent + 'out[i * {result_t}::size + j] = output_axi_t(ctype[j], last);\n' + indent + indent + indent + 'out.write(tmp_b);\n' ) else: newline += indent + indent + indent + 'out[i * {result_t}::size + j] = output_axi_t(ctype[j]);\n' From 7e47c859fd0ccf3030f99f84267df76cc2b9f343 Mon Sep 17 00:00:00 2001 From: steltze Date: Fri, 31 May 2024 16:45:28 +0200 Subject: [PATCH 010/103] Fix call to wrong backend writer --- .../backends/vitis_accelerator/vitis_accelerator_backend.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py b/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py index 85a6d02f2c..2e3de9a1cd 100644 --- a/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py +++ b/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py @@ -154,10 +154,10 @@ def get_writer_flow(self): def _register_flows(self): vivado_ip = 'vivado:ip' - writer_passes = ['make_stamp', 'vivadoaccelerator:write_hls'] + writer_passes = ['make_stamp', 'vitisaccelerator:write_hls'] self._writer_flow = register_flow('write', writer_passes, requires=[vivado_ip], backend=self.name) self._default_flow = vivado_ip - fifo_depth_opt_passes = ['vivadoaccelerator:fifo_depth_optimization'] + writer_passes + # fifo_depth_opt_passes = ['vivadoaccelerator:fifo_depth_optimization'] + writer_passes - register_flow('fifo_depth_optimization', fifo_depth_opt_passes, requires=[vivado_ip], backend=self.name) + # register_flow('fifo_depth_optimization', fifo_depth_opt_passes, requires=[vivado_ip], backend=self.name) From 5a2a38fe56f425c22156659da7bd1508f5263a5c Mon Sep 17 00:00:00 2001 From: steltze Date: Fri, 31 May 2024 17:55:15 +0200 Subject: [PATCH 011/103] Fix vitis 
accelerator writer --- hls4ml/writer/vitis_accelerator_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/writer/vitis_accelerator_writer.py b/hls4ml/writer/vitis_accelerator_writer.py index a2270fb610..382ff658ad 100644 --- a/hls4ml/writer/vitis_accelerator_writer.py +++ b/hls4ml/writer/vitis_accelerator_writer.py @@ -241,7 +241,7 @@ def write_axi_wrapper(self, model): ) newline += ( - indent + indent + indent + 'if(tmp_a.last == 1) {tmp_b.last = (((i+1)*(j+1))==N_OUT);}\n' + indent + indent + indent + 'if(tmp_a.last == 1) {{tmp_b.last = (((i+1)*(j+1))==N_OUT);}}\n' ) newline += ( From 99f9429f8aef1b47d73897a48cef4a2b688fb0d6 Mon Sep 17 00:00:00 2001 From: steltze Date: Tue, 11 Jun 2024 11:16:02 +0200 Subject: [PATCH 012/103] Fix include in axi wrapper header file writer --- .../templates/vivado/ap_types/ap_axi_sdata.h | 441 ++++++++++++++++++ .../vivado/nnet_utils/nnet_sepconv_stream.h | 6 +- hls4ml/writer/vitis_accelerator_writer.py | 4 +- 3 files changed, 445 insertions(+), 6 deletions(-) create mode 100755 hls4ml/templates/vivado/ap_types/ap_axi_sdata.h diff --git a/hls4ml/templates/vivado/ap_types/ap_axi_sdata.h b/hls4ml/templates/vivado/ap_types/ap_axi_sdata.h new file mode 100755 index 0000000000..e01c8a8cd1 --- /dev/null +++ b/hls4ml/templates/vivado/ap_types/ap_axi_sdata.h @@ -0,0 +1,441 @@ +// 67d7842dbbe25473c3c32b93c0da8047785f30d78e8a024de1b57352245f9689 +/***************************************************************************** + * + * Author: Xilinx, Inc. + * + * This text contains proprietary, confidential information of + * Xilinx, Inc. , is distributed by under license from Xilinx, + * Inc., and may be used, copied and/or disclosed only pursuant to + * the terms of a valid license agreement with Xilinx, Inc. + * + * XILINX IS PROVIDING THIS DESIGN, CODE, OR INFORMATION "AS IS" + * AS A COURTESY TO YOU, SOLELY FOR USE IN DEVELOPING PROGRAMS AND + * SOLUTIONS FOR XILINX DEVICES. 
BY PROVIDING THIS DESIGN, CODE, + * OR INFORMATION AS ONE POSSIBLE IMPLEMENTATION OF THIS FEATURE, + * APPLICATION OR STANDARD, XILINX IS MAKING NO REPRESENTATION + * THAT THIS IMPLEMENTATION IS FREE FROM ANY CLAIMS OF INFRINGEMENT, + * AND YOU ARE RESPONSIBLE FOR OBTAINING ANY RIGHTS YOU MAY REQUIRE + * FOR YOUR IMPLEMENTATION. XILINX EXPRESSLY DISCLAIMS ANY + * WARRANTY WHATSOEVER WITH RESPECT TO THE ADEQUACY OF THE + * IMPLEMENTATION, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OR + * REPRESENTATIONS THAT THIS IMPLEMENTATION IS FREE FROM CLAIMS OF + * INFRINGEMENT, IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE. + * + * Xilinx products are not intended for use in life support appliances, + * devices, or systems. Use in such applications is expressly prohibited. + * +#- (c) Copyright 2011-2022 Xilinx, Inc. All rights reserved. +#- +#- This file contains confidential and proprietary information +#- of Xilinx, Inc. and is protected under U.S. and +#- international copyright and other intellectual property +#- laws. +#- +#- DISCLAIMER +#- This disclaimer is not a license and does not grant any +#- rights to the materials distributed herewith. 
Except as +#- otherwise provided in a valid license issued to you by +#- Xilinx, and to the maximum extent permitted by applicable +#- law: (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND +#- WITH ALL FAULTS, AND XILINX HEREBY DISCLAIMS ALL WARRANTIES +#- AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY, INCLUDING +#- BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON- +#- INFRINGEMENT, OR FITNESS FOR ANY PARTICULAR PURPOSE; and +#- (2) Xilinx shall not be liable (whether in contract or tort, +#- including negligence, or under any other theory of +#- liability) for any loss or damage of any kind or nature +#- related to, arising under or in connection with these +#- materials, including for any direct, or any indirect, +#- special, incidental, or consequential loss or damage +#- (including loss of data, profits, goodwill, or any type of +#- loss or damage suffered as a result of any action brought +#- by a third party) even if such damage or loss was +#- reasonably foreseeable or Xilinx had been advised of the +#- possibility of the same. +#- +#- CRITICAL APPLICATIONS +#- Xilinx products are not designed or intended to be fail- +#- safe, or for use in any application requiring fail-safe +#- performance, such as life-support or safety devices or +#- systems, Class III medical devices, nuclear facilities, +#- applications related to the deployment of airbags, or any +#- other applications that could lead to death, personal +#- injury, or severe property or environmental damage +#- (individually and collectively, "Critical +#- Applications"). Customer assumes the sole risk and +#- liability of any use of Xilinx products in Critical +#- Applications, subject only to applicable laws and +#- regulations governing limitations on product liability. +#- +#- THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS +#- PART OF THIS FILE AT ALL TIMES. 
+#- ************************************************************************ + + * + *****************************************************************************/ + +/* + * This file contains the definition of the data types for AXI streaming. + * ap_axi_s is a signed interpretation of the AXI stream + * ap_axi_u is an unsigned interpretation of the AXI stream + */ + +#ifndef __AP__AXI_SDATA__ +#define __AP__AXI_SDATA__ + +#include +#include "ap_int.h" +//#include "ap_fixed.h" +template +struct ap_fixed; +template +struct ap_ufixed; + +namespace hls { + +template constexpr std::size_t bitwidth = sizeof(T) * CHAR_BIT; + +template constexpr std::size_t bitwidth> = W; +template constexpr std::size_t bitwidth> = W; +template +constexpr std::size_t bitwidth> = _AP_W; +template +constexpr std::size_t bitwidth> = _AP_W; + +template +constexpr std::size_t bytewidth = (bitwidth + CHAR_BIT - 1) / CHAR_BIT; + +template struct axis { + static constexpr std::size_t NewWUser = (WUser == 0) ? 1 : WUser; + static constexpr std::size_t NewWId = (WId == 0) ? 1 : WId; + static constexpr std::size_t NewWDest = (WDest == 0) ? 1 : WDest; + T data; + ap_uint> keep; + ap_uint> strb; + ap_uint user; + ap_uint<1> last; + ap_uint id; + ap_uint dest; + + ap_uint *get_user_ptr() { +#pragma HLS inline + return (WUser == 0) ? nullptr : &user; + } + ap_uint *get_id_ptr() { +#pragma HLS inline + return (WId == 0) ? nullptr : &id; + } + ap_uint *get_dest_ptr() { +#pragma HLS inline + return (WDest == 0) ? nullptr : &dest; + } +}; + +} // namespace hls + +template +using ap_axis = hls::axis, WUser, WId, WDest>; + +template +using ap_axiu = hls::axis, WUser, WId, WDest>; + +// Isolate out qdma_axis from hls::axis for special APIs. 
+template +struct qdma_axis; + +template struct qdma_axis { + // private: + static constexpr std::size_t kBytes = (WData + 7) / 8; + + ap_uint data; + ap_uint keep; + ap_uint<1> strb; + ap_uint<1> user; + ap_uint<1> last; + ap_uint<1> id; + ap_uint<1> dest; + + ap_uint<1> *get_strb_ptr() { +#pragma HLS inline + return nullptr; + } + ap_uint<1> *get_user_ptr() { +#pragma HLS inline + return nullptr; + } + ap_uint<1> *get_id_ptr() { +#pragma HLS inline + return nullptr; + } + ap_uint<1> *get_dest_ptr() { +#pragma HLS inline + return nullptr; + } + + // public: + ap_uint get_data() const { +#pragma HLS inline + return data; + } + ap_uint get_keep() const { +#pragma HLS inline + return keep; + } + ap_uint<1> get_last() const { +#pragma HLS inline + return last; + } + + void set_data(const ap_uint &d) { +#pragma HLS inline + data = d; + } + void set_keep(const ap_uint &k) { +#pragma HLS inline + keep = k; + } + void set_last(const ap_uint<1> &l) { +#pragma HLS inline + last = l; + } + void keep_all() { +#pragma HLS inline + ap_uint k = 0; + keep = ~k; + } + + qdma_axis() { +#pragma HLS inline + ; + } + qdma_axis(ap_uint d) : data(d) { +#pragma HLS inline + ; + } + qdma_axis(ap_uint d, ap_uint k) : data(d), keep(k) { +#pragma HLS inline + ; + } + qdma_axis(ap_uint d, ap_uint k, ap_uint<1> l) + : data(d), keep(k), last(l) { +#pragma HLS inline + ; + } + qdma_axis(const qdma_axis &d) + : data(d.data), keep(d.keep), last(d.last) { +#pragma HLS inline + ; + } + qdma_axis &operator=(const qdma_axis &d) { +#pragma HLS inline + data = d.data; + keep = d.keep; + last = d.last; + return *this; + } +}; + +#ifdef AESL_SYN +#if ((__clang_major__ != 3) || (__clang_minor__ != 1)) +#include "hls_stream.h" +namespace hls { + +template +class stream> final { + typedef axis __STREAM_T__; + +public: + /// Constructors + INLINE stream() {} + + INLINE stream(const char *name) { (void)name; } + + /// Make copy constructor and assignment operator private +private: + INLINE stream(const 
stream<__STREAM_T__> &chn) : V(chn.V) {} + +public: + /// Overload >> and << operators to implement read() and write() + INLINE void operator>>(__STREAM_T__ &rdata) { read(rdata); } + + INLINE void operator<<(const __STREAM_T__ &wdata) { write(wdata); } + + /// empty & full + bool empty() { +#pragma HLS inline + bool tmp = __fpga_axis_valid(&V.data, &V.keep, &V.strb, V.get_user_ptr(), + &V.last, V.get_id_ptr(), V.get_dest_ptr()); + return !tmp; + } + + bool full() { +#pragma HLS inline + bool tmp = __fpga_axis_ready(&V.data, &V.keep, &V.strb, V.get_user_ptr(), + &V.last, V.get_id_ptr(), V.get_dest_ptr()); + return !tmp; + } + + /// Blocking read + void read(__STREAM_T__ &dout) { +#pragma HLS inline + __STREAM_T__ tmp; + __fpga_axis_pop(&V.data, &V.keep, &V.strb, V.get_user_ptr(), &V.last, + V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, &tmp.keep, + &tmp.strb, tmp.get_user_ptr(), &tmp.last, tmp.get_id_ptr(), + tmp.get_dest_ptr()); + dout = tmp; + } + + __STREAM_T__ read() { +#pragma HLS inline + __STREAM_T__ tmp; + __fpga_axis_pop(&V.data, &V.keep, &V.strb, V.get_user_ptr(), &V.last, + V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, &tmp.keep, + &tmp.strb, tmp.get_user_ptr(), &tmp.last, tmp.get_id_ptr(), + tmp.get_dest_ptr()); + return tmp; + } + + /// Blocking write + void write(const __STREAM_T__ &din) { +#pragma HLS inline + __STREAM_T__ tmp = din; + __fpga_axis_push(&V.data, &V.keep, &V.strb, V.get_user_ptr(), &V.last, + V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, &tmp.keep, + &tmp.strb, tmp.get_user_ptr(), &tmp.last, tmp.get_id_ptr(), + tmp.get_dest_ptr()); + } + + /// Non-Blocking read + bool read_nb(__STREAM_T__ &dout) { +#pragma HLS inline + __STREAM_T__ tmp; + if (__fpga_axis_nb_pop(&V.data, &V.keep, &V.strb, V.get_user_ptr(), &V.last, + V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, + &tmp.keep, &tmp.strb, tmp.get_user_ptr(), + &tmp.last, tmp.get_id_ptr(), tmp.get_dest_ptr())) { + dout = tmp; + return true; + } else { + return false; + } + } + + /// 
Non-Blocking write + bool write_nb(const __STREAM_T__ &in) { +#pragma HLS inline + __STREAM_T__ tmp = in; + bool full_n = __fpga_axis_nb_push( + &V.data, &V.keep, &V.strb, V.get_user_ptr(), &V.last, V.get_id_ptr(), + V.get_dest_ptr(), &tmp.data, &tmp.keep, &tmp.strb, tmp.get_user_ptr(), + &tmp.last, tmp.get_id_ptr(), tmp.get_dest_ptr()); + return full_n; + } + +private: + __STREAM_T__ V NO_CTOR; +}; + +// specialization for qdma +template +class stream> { + typedef qdma_axis __STREAM_T__; + +public: + /// Constructors + INLINE stream() {} + + INLINE stream(const char *name) { (void)name; } + + /// Make copy constructor and assignment operator private +private: + INLINE stream(const stream<__STREAM_T__> &chn) : V(chn.V) {} + +public: + /// Overload >> and << operators to implement read() and write() + INLINE void operator>>(__STREAM_T__ &rdata) { read(rdata); } + + INLINE void operator<<(const __STREAM_T__ &wdata) { write(wdata); } + + /// empty & full + bool empty() { +#pragma HLS inline + bool tmp = __fpga_axis_valid(&V.data, &V.keep, V.get_strb_ptr(), V.get_user_ptr(), + &V.last, V.get_id_ptr(), V.get_dest_ptr()); + return !tmp; + } + + bool full() { +#pragma HLS inline + bool tmp = __fpga_axis_ready(&V.data, &V.keep, V.get_strb_ptr(), V.get_user_ptr(), + &V.last, V.get_id_ptr(), V.get_dest_ptr()); + return !tmp; + } + + /// Blocking read + void read(__STREAM_T__ &dout) { +#pragma HLS inline + __STREAM_T__ tmp; + __fpga_axis_pop(&V.data, &V.keep, V.get_strb_ptr(), V.get_user_ptr(), + &V.last, V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, + &tmp.keep, tmp.get_strb_ptr(), tmp.get_user_ptr(), + &tmp.last, tmp.get_id_ptr(), tmp.get_dest_ptr()); + dout = tmp; + } + + __STREAM_T__ read() { +#pragma HLS inline + __STREAM_T__ tmp; + __fpga_axis_pop(&V.data, &V.keep, V.get_strb_ptr(), V.get_user_ptr(), &V.last, + V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, &tmp.keep, + tmp.get_strb_ptr(), tmp.get_user_ptr(), &tmp.last, tmp.get_id_ptr(), + tmp.get_dest_ptr()); + return 
tmp; + } + + /// Blocking write + void write(const __STREAM_T__ &din) { +#pragma HLS inline + __STREAM_T__ tmp = din; + __fpga_axis_push(&V.data, &V.keep, V.get_strb_ptr(), V.get_user_ptr(), &V.last, + V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, &tmp.keep, + tmp.get_strb_ptr(), tmp.get_user_ptr(), &tmp.last, tmp.get_id_ptr(), + tmp.get_dest_ptr()); + } + + /// Non-Blocking read + bool read_nb(__STREAM_T__ &dout) { +#pragma HLS inline + __STREAM_T__ tmp; + + if (__fpga_axis_nb_pop(&V.data, &V.keep, &V.strb, V.get_user_ptr(), &V.last, + V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, + &tmp.keep, &tmp.strb, tmp.get_user_ptr(), + &tmp.last, tmp.get_id_ptr(), tmp.get_dest_ptr())) { + dout = tmp; + return true; + } else { + return false; + } + } + + /// Non-Blocking write + bool write_nb(const __STREAM_T__ &in) { +#pragma HLS inline + __STREAM_T__ tmp = in; + bool full_n = __fpga_axis_nb_push( + &V.data, &V.keep, V.get_strb_ptr(), V.get_user_ptr(), &V.last, V.get_id_ptr(), + V.get_dest_ptr(), &tmp.data, &tmp.keep, tmp.get_strb_ptr(), tmp.get_user_ptr(), + &tmp.last, tmp.get_id_ptr(), tmp.get_dest_ptr()); + return full_n; + } + +private: + __STREAM_T__ V NO_CTOR; +}; + +} // namespace hls +#endif +#endif +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h index 8d8ff9712e..462bf2571b 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h @@ -34,7 +34,7 @@ void depthwise_product_resource_rf_leq_nchan(data_T data[CONFIG_T::kernel_size * typename CONFIG_T::accum_t acc[CONFIG_T::n_chan]; #pragma HLS ARRAY_PARTITION variable=acc factor=block_factor - std::cout << "LEQ IMPLE" << std::endl; + // std::cout << "LEQ IMPLE" << std::endl; InitAccum: for (int iacc = 0; iacc < CONFIG_T::n_chan; iacc++) { @@ -101,7 +101,7 @@ void depthwise_product_resource_rf_gt_nchan_rem0(data_T data[CONFIG_T::kernel_si typename 
CONFIG_T::accum_t acc[CONFIG_T::n_chan]; #pragma HLS ARRAY_PARTITION variable=acc factor=block_factor - std::cout << "REM0 IMPLE" << std::endl; + // std::cout << "REM0 IMPLE" << std::endl; InitAccum: for (int iacc = 0; iacc < CONFIG_T::n_chan; iacc++) { @@ -165,7 +165,7 @@ void depthwise_product_resource_rf_gt_nchan(data_T data[CONFIG_T::kernel_size * typename CONFIG_T::accum_t acc[CONFIG_T::n_chan]; #pragma HLS ARRAY_PARTITION variable=acc factor=block_factor - std::cout << "GT IMPLE" << std::endl; + // std::cout << "GT IMPLE" << std::endl; InitAccum: for (int iacc = 0; iacc < CONFIG_T::n_chan; iacc++) { diff --git a/hls4ml/writer/vitis_accelerator_writer.py b/hls4ml/writer/vitis_accelerator_writer.py index 382ff658ad..a6510d4733 100644 --- a/hls4ml/writer/vitis_accelerator_writer.py +++ b/hls4ml/writer/vitis_accelerator_writer.py @@ -32,7 +32,7 @@ def write_axi_wrapper(self, model): newline = line.replace('MYPROJECT', format(model.config.get_project_name().upper())) elif '// hls-fpga-machine-learning insert include' in line: newline = f'#include "{model.config.get_project_name()}.h"\n' - newline = '#include "ap_axi_sdata.h' + newline += '#include "ap_axi_sdata.h"\n' elif 'myproject' in line: newline = line.replace('myproject', model.config.get_project_name()) elif '// hls-fpga-machine-learning insert definitions' in line: @@ -171,10 +171,8 @@ def write_axi_wrapper(self, model): newline = '' newline += indent + 'my_pkt tmp_a;\n' - newline = '' newline += indent + 'my_pkt tmp_b;\n' - newline = '' newline += indent + 'for(unsigned i = 0; i < N_IN / {input_t}::size; ++i) {{\n' # newline += indent + indent + '#pragma HLS PIPELINE\n' # TODO: check if needed newline += indent + indent + '{input_t} ctype;\n' From b9609dc5ed5636506be1cf02f708a1542e0bf158 Mon Sep 17 00:00:00 2001 From: steltze Date: Tue, 11 Jun 2024 15:08:20 +0200 Subject: [PATCH 013/103] Change python-cpp bridge writer --- hls4ml/writer/vitis_accelerator_writer.py | 27 ++++++++++++++--------- 1 file 
changed, 17 insertions(+), 10 deletions(-) diff --git a/hls4ml/writer/vitis_accelerator_writer.py b/hls4ml/writer/vitis_accelerator_writer.py index a6510d4733..650cf77100 100644 --- a/hls4ml/writer/vitis_accelerator_writer.py +++ b/hls4ml/writer/vitis_accelerator_writer.py @@ -318,10 +318,10 @@ def write_wrapper_test(self, model): newline = line.replace(f'{model.config.get_project_name()}.h', f'{model.config.get_project_name()}_axi.h') elif inp.definition_cpp() in line: newline = line.replace( - inp.definition_cpp(), 'input_axi_t inputs[N_IN]' + inp.definition_cpp(), 'hls::stream< my_pkt > inputs' ) # TODO instead of replacing strings, how about we use proper variables and their definition? elif out.definition_cpp() in line: - newline = line.replace(out.definition_cpp(), 'output_axi_t outputs[N_OUT]') + newline = line.replace(out.definition_cpp(), 'hls::stream< my_pkt > outputs') elif 'unsigned short' in line: newline = '' elif f'{model.config.get_project_name()}(' in line: @@ -329,11 +329,11 @@ def write_wrapper_test(self, model): newline = indent_amount + f'{model.config.get_project_name()}_axi(inputs,outputs);\n' elif inp.size_cpp() in line or inp.name in line or inp.type.name in line: newline = ( - line.replace(inp.size_cpp(), 'N_IN').replace(inp.name, 'inputs').replace(inp.type.name, 'input_axi_t') + line.replace(inp.size_cpp(), 'N_IN').replace(inp.name, 'inputs').replace(inp.type.name, 'hls::stream< my_pkt >') ) elif out.size_cpp() in line or out.name in line or out.type.name in line: newline = ( - line.replace(out.size_cpp(), 'N_OUT').replace(out.name, 'outputs').replace(out.type.name, 'output_axi_t') + line.replace(out.size_cpp(), 'N_OUT').replace(out.name, 'outputs').replace(out.type.name, 'hls::stream< my_pkt >') ) else: newline = line @@ -365,18 +365,25 @@ def write_wrapper_test(self, model): if f'{model.config.get_project_name()}.h' in line: newline = line.replace(f'{model.config.get_project_name()}.h', f'{model.config.get_project_name()}_axi.h') elif 
inp.definition_cpp(name_suffix='_ap') in line: - newline = line.replace(inp.definition_cpp(name_suffix='_ap'), f'input_axi_t {inp.name}_ap[N_IN]') + newline = line.replace(inp.definition_cpp(name_suffix='_ap'), f'hls::stream< my_pkt > {inp.name}_ap') elif out.definition_cpp(name_suffix='_ap') in line: - newline = line.replace(out.definition_cpp(name_suffix='_ap'), f'output_axi_t {out.name}_ap[N_OUT]') + newline = line.replace(out.definition_cpp(name_suffix='_ap'), f'hls::stream< my_pkt > {out.name}_ap') elif f'{model.config.get_project_name()}(' in line: indent_amount = line.split(model.config.get_project_name())[0] newline = indent_amount + '{}_axi({}_ap,{}_ap);\n'.format( model.config.get_project_name(), inp.name, out.name ) - elif inp.size_cpp() in line or inp.name in line or inp.type.name in line: - newline = line.replace(inp.size_cpp(), 'N_IN').replace(inp.type.name, 'input_axi_t') - elif out.size_cpp() in line or out.name in line or out.type.name in line: - newline = line.replace(out.size_cpp(), 'N_OUT').replace(out.type.name, 'output_axi_t') + # elif inp.size_cpp() in line or inp.name in line or inp.type.name in line: + # newline = line.replace(inp.size_cpp(), 'N_IN').replace(inp.type.name, 'hls::stream< my_pkt >') + # elif out.size_cpp() in line or out.name in line or out.type.name in line: + # newline = line.replace(out.size_cpp(), 'N_OUT').replace(out.type.name, 'hls::stream< my_pkt >') + + elif ("nnet::convert_data Date: Tue, 11 Jun 2024 15:13:27 +0200 Subject: [PATCH 014/103] Fix tlast handling in axis wrapper writer --- hls4ml/writer/vitis_accelerator_writer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hls4ml/writer/vitis_accelerator_writer.py b/hls4ml/writer/vitis_accelerator_writer.py index 650cf77100..76013fcf6e 100644 --- a/hls4ml/writer/vitis_accelerator_writer.py +++ b/hls4ml/writer/vitis_accelerator_writer.py @@ -206,6 +206,8 @@ def write_axi_wrapper(self, model): newline += indent + indent + '}}\n' newline += indent + indent + 
'in_local.write(ctype);\n' newline += indent + '}}\n' + newline += indent + 'tmp_b = tmp_a;\n' + newline += indent + 'tmp_b.last = 0;\n' newline = newline.format(input_t=inp.type.name) elif '// hls-fpga-machine-learning insert dequeue' in line: io_type = model.config.get_config_value("IOType") From 014a7b2ec43a9218b3ec5c39d1aa6ea17b6f750f Mon Sep 17 00:00:00 2001 From: steltze Date: Tue, 11 Jun 2024 17:33:51 +0200 Subject: [PATCH 015/103] Extend convert_data to handle stream type, use that for the bridge --- .../templates/vivado/nnet_utils/nnet_helpers.h | 16 ++++++++++++++++ hls4ml/writer/vitis_accelerator_writer.py | 17 ++++++----------- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h index b8c2a48d19..3938af347c 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h @@ -2,6 +2,7 @@ #define NNET_HELPERS_H #include "hls_stream.h" +#include "ap_axi_sdata.h" #include #include #include @@ -161,6 +162,21 @@ template void convert_data(hls::stre } } +template void convert_data(srcType *src, hls::stream> &dst) { + for (size_t i = 0; i < SIZE; i++) { + hls::axis ctype; + ctype.data = dstType(src[i]); + dst.write(ctype); + } +} + +template void convert_data(hls::stream> &src, dstType *dst) { + for (size_t i = 0; i < SIZE; i++) { + hls::axis ctype = src.read(); + dst[i] = dstType(ctype.data); + } +} + extern bool trace_enabled; extern std::map *trace_outputs; extern size_t trace_type_size; diff --git a/hls4ml/writer/vitis_accelerator_writer.py b/hls4ml/writer/vitis_accelerator_writer.py index 76013fcf6e..cd9e349b4b 100644 --- a/hls4ml/writer/vitis_accelerator_writer.py +++ b/hls4ml/writer/vitis_accelerator_writer.py @@ -308,6 +308,8 @@ def write_wrapper_test(self, model): ################### oldfile = f'{model.config.get_output_dir()}/{model.config.get_project_name()}_test.cpp' newfile = 
f'{model.config.get_output_dir()}/{model.config.get_project_name()}_test_wrapper.cpp' + + inp_axi_t, out_axi_t, inp, out = self.vitis_accelerator_config.get_corrected_types() f = open(oldfile) fout = open(newfile, 'w') @@ -375,17 +377,10 @@ def write_wrapper_test(self, model): newline = indent_amount + '{}_axi({}_ap,{}_ap);\n'.format( model.config.get_project_name(), inp.name, out.name ) - # elif inp.size_cpp() in line or inp.name in line or inp.type.name in line: - # newline = line.replace(inp.size_cpp(), 'N_IN').replace(inp.type.name, 'hls::stream< my_pkt >') - # elif out.size_cpp() in line or out.name in line or out.type.name in line: - # newline = line.replace(out.size_cpp(), 'N_OUT').replace(out.type.name, 'hls::stream< my_pkt >') - - elif ("nnet::convert_data Date: Fri, 14 Jun 2024 10:55:07 +0200 Subject: [PATCH 016/103] Add zcu102 to the supported boards json --- .../vitis_accelerator/supported_boards.json | 6 ++ hls4ml/writer/vitis_accelerator_writer.py | 56 ------------------- 2 files changed, 6 insertions(+), 56 deletions(-) diff --git a/hls4ml/backends/vitis_accelerator/supported_boards.json b/hls4ml/backends/vitis_accelerator/supported_boards.json index 5f44560ccd..4a54ea2924 100644 --- a/hls4ml/backends/vitis_accelerator/supported_boards.json +++ b/hls4ml/backends/vitis_accelerator/supported_boards.json @@ -4,5 +4,11 @@ "tcl_scripts": {"axi_lite": "axi_lite_design.tcl", "axi_stream": "axi_stream_design.tcl"}, "python_drivers": {"axi_stream": "axi_stream_driver.py"}, "c_drivers": {} + }, + "zcu102": { + "part": "xczu9eg-ffvb1156-2-e", + "tcl_scripts": { "axi_stream": "axi_stream_design.tcl"}, + "python_drivers": {"axi_stream": "axi_stream_driver.py"}, + "c_drivers": {} } } diff --git a/hls4ml/writer/vitis_accelerator_writer.py b/hls4ml/writer/vitis_accelerator_writer.py index cd9e349b4b..70573bb5c2 100644 --- a/hls4ml/writer/vitis_accelerator_writer.py +++ b/hls4ml/writer/vitis_accelerator_writer.py @@ -41,56 +41,6 @@ def write_axi_wrapper(self, model): 
newline += f'static const unsigned N_OUT = {out.size()};\n' if self.vitis_accelerator_config.get_interface() == 'axi_stream': newline += f'typedef hls::axis<{inp_axi_t}, 0, 0, 0> my_pkt;;\n' - # newline += f'typedef {inp_axi_t} T_in;\n' - # newline += f'typedef {out_axi_t} T_out;\n' - # newline += ( - # 'typedef struct in_struct {\n' - # + indent - # + 'T_in data;\n' - # + indent - # + 'ap_uint<1> last;\n' - # + indent - # + 'in_struct(const T_in& data, const ap_uint<1>& last){this->data = data; this->last = last;};\n' - # + indent - # + 'in_struct(){this->data = 0; this->last = 0;};\n' - # + indent - # + 'friend std::ostream& operator<<(std::ostream& stream, const in_struct& in)\n' - # + indent - # + '{ return stream << "{ data: " << in.data << ", last: " << in.last << " }" << std::endl; }\n' - # + indent - # + 'operator float() const {return this->data;}\n' - # + indent - # + 'operator double() const {return this->data;}\n' - # + indent - # + 'in_struct(float data) {this->data = data; this->last = 0;}\n' - # + indent - # + 'in_struct(double data) {this->data = data; this->last = 0;}\n' - # + '} input_axi_t;\n' - # ) - # newline += ( - # 'typedef struct out_struct {\n' - # + indent - # + 'T_out data;\n' - # + indent - # + 'ap_uint<1> last;\n' - # + indent - # + 'out_struct(const T_out& data, const ap_uint<1>& last){this->data = data; this->last = last;};\n' - # + indent - # + 'out_struct(){this->data = 0; this->last = 0;};\n' - # + indent - # + 'friend std::ostream& operator<<(std::ostream& stream, const out_struct& out)\n' - # + indent - # + '{ return stream << "{ data: " << out.data << ", last: " << out.last << " }" << std::endl; }\n' - # + indent - # + 'operator float() const {return this->data;}\n' - # + indent - # + 'operator double() const {return this->data;}\n' - # + indent - # + 'out_struct(float data) {this->data = data; this->last = 0;}\n' - # + indent - # + 'out_struct(double data) {this->data = data; this->last = 0;}\n' - # + '} output_axi_t;\n' - # ) 
else: # TODO: handle this case newline += f'typedef {inp_axi_t} input_axi_t;\n' newline += f'typedef {out_axi_t} output_axi_t;\n' @@ -230,12 +180,6 @@ def write_axi_wrapper(self, model): newline += indent + indent + 'for(unsigned j = 0; j < {result_t}::size; j++) {{\n' # newline += indent + indent + indent + '#pragma HLS UNROLL\n' if self.vitis_accelerator_config.get_interface() == 'axi_stream': - # newline += ( - # indent - # + indent - # + indent - # + 'bool last = (is_last && (i * {result_t}::size + j == N_OUT - 1)) ? true : false;\n' - # ) newline += ( indent + indent + indent + f'tmp_b.data = ({inp_axi_t}) (ctype[j]);\n' ) From 290896b73cfaf35d941734640642a81e56d014f9 Mon Sep 17 00:00:00 2001 From: steltze Date: Thu, 20 Jun 2024 14:40:08 +0200 Subject: [PATCH 017/103] Fix some c synthesis warnings --- hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h | 2 +- hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h | 2 +- hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h | 2 +- hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h | 2 +- hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h | 4 +++- 5 files changed, 7 insertions(+), 5 deletions(-) diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h index 20b6fecb49..aad5d9a430 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h @@ -86,7 +86,7 @@ void separable_conv_1d_cl(hls::stream &data, hls::stream &res, #pragma HLS DATAFLOW hls::stream depthwise_res; - unsigned res_depth = CONFIG_T::depthwise_config::out_width; + const unsigned res_depth = CONFIG_T::depthwise_config::out_width; #pragma HLS STREAM variable=depthwise_res depth=res_depth depthwise_conv_1d_buffer_cl(data, depthwise_res, diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h index 
a3747990e0..a119fb9e2a 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h @@ -120,7 +120,7 @@ void separable_conv_2d_cl(hls::stream &data, hls::stream &res, #pragma HLS DATAFLOW hls::stream depthwise_res; - unsigned res_depth = CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width; + const unsigned res_depth = CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width; #pragma HLS STREAM variable=depthwise_res depth=res_depth depthwise_conv_2d_buffer_cl(data, depthwise_res, diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h index 254fc5067b..ce097399c0 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h @@ -106,7 +106,7 @@ void separable_conv_1d_cl(hls::stream &data, hls::stream &res, #pragma HLS DATAFLOW hls::stream depthwise_res; - unsigned res_depth = CONFIG_T::depthwise_config::out_width; + const unsigned res_depth = CONFIG_T::depthwise_config::out_width; #pragma HLS STREAM variable=depthwise_res depth=res_depth depthwise_conv_1d_cl(data, depthwise_res, depthwise_weights, diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h index d56ed6d9a4..c4e0654890 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h @@ -130,7 +130,7 @@ void separable_conv_2d_cl(hls::stream &data, hls::stream &res, #pragma HLS DATAFLOW hls::stream depthwise_res; - unsigned res_depth = CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width; + const unsigned res_depth = CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width; #pragma HLS STREAM variable=depthwise_res depth=res_depth 
depthwise_conv_2d_cl(data, depthwise_res, depthwise_weights, diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h index 462bf2571b..dea028d53b 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h @@ -207,7 +207,7 @@ template void depthwise_product_latency(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T res[CONFIG_T::n_chan], typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { - #pragma HLS INLINE + // #pragma HLS INLINE typename CONFIG_T::accum_t mult[CONFIG_T::kernel_size * CONFIG_T::n_chan]; typename CONFIG_T::accum_t acc[CONFIG_T::n_chan]; @@ -239,8 +239,10 @@ void depthwise_product_latency(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_c // Accumulate multiplication result Accum1: for (int ii = 0; ii < CONFIG_T::kernel_size; ii++) { + // #pragma HLS PIPELINE II=1 rewind Accum2: for (int jj = 0; jj < CONFIG_T::n_chan; jj++) { + // #pragma HLS UNROLL int index = ii * CONFIG_T::n_chan + jj; acc[jj] += mult[index]; } From c9dfcf267395c8f4a6313175f554fce88ab8b973 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Fri, 12 Apr 2024 01:41:43 +0200 Subject: [PATCH 018/103] Group more tests per YAML to reduce the number of envs created --- .gitlab-ci.yml | 2 ++ test/pytest/generate_ci_yaml.py | 19 ++++++++++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 5a491d2f7b..a4aa6d507a 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -6,6 +6,8 @@ stages: generator: stage: generate image: python:3.8-alpine + variables: + N_TESTS_PER_YAML: 5 tags: - k8s-default before_script: diff --git a/test/pytest/generate_ci_yaml.py b/test/pytest/generate_ci_yaml.py index 107cabdbbb..7a495267ab 100644 --- a/test/pytest/generate_ci_yaml.py +++ b/test/pytest/generate_ci_yaml.py @@ 
-1,4 +1,6 @@ import glob +import itertools +import os import yaml @@ -15,6 +17,14 @@ EXAMPLEMODEL: {} """ +n_test_files_per_yml = int(os.environ.get('N_TESTS_PER_YAML', 4)) + + +def batched(iterable, chunk_size): + iterator = iter(iterable) + while chunk := tuple(itertools.islice(iterator, chunk_size)): + yield chunk + def uses_example_model(test_filename): with open(test_filename) as f: @@ -24,9 +34,12 @@ def uses_example_model(test_filename): yml = None tests = glob.glob('test_*.py') -for test in tests: - name = test.replace('test_', '').replace('.py', '') - new_yml = yaml.safe_load(template.format(name, f'test_{name}.py', int(uses_example_model(test)))) +for test_batch in batched(tests, n_test_files_per_yml): + name = '+'.join([test.replace('test_', '').replace('.py', '') for test in test_batch]) + test_files = ' '.join(list(test_batch)) + uses_example_models = int(any([uses_example_model(test) for test in test_batch])) + + new_yml = yaml.safe_load(template.format(name, test_files, uses_example_models)) if yml is None: yml = new_yml else: From d3b8e20f9af537e4325619ccb3f3d619b3fe667b Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Tue, 26 Mar 2024 00:22:41 +0100 Subject: [PATCH 019/103] Support negative_slope in quantized_relu --- hls4ml/converters/keras/qkeras.py | 4 ++++ hls4ml/model/profiling.py | 1 + hls4ml/utils/config.py | 7 ++++-- test/pytest/test_qkeras.py | 38 +++++++++++++++++++++++++++++++ 4 files changed, 48 insertions(+), 2 deletions(-) diff --git a/hls4ml/converters/keras/qkeras.py b/hls4ml/converters/keras/qkeras.py index a8038da46d..e610177196 100644 --- a/hls4ml/converters/keras/qkeras.py +++ b/hls4ml/converters/keras/qkeras.py @@ -166,6 +166,10 @@ def parse_qactivation_layer(keras_layer, input_names, input_shapes, data_reader) layer['slope_prec'] = FixedPrecisionType(width=2, integer=0, signed=False) layer['shift_prec'] = FixedPrecisionType(width=2, integer=0, signed=False) layer['activation'] = 
activation_config['class_name'].replace('quantized_', 'hard_') + elif activation_config['class_name'] == 'quantized_relu' and activation_config['config']['negative_slope'] != 0: + layer['class_name'] = 'LeakyReLU' + layer['activation'] = activation_config['class_name'].replace('quantized_', 'leaky_') + layer['activ_param'] = activation_config['config']['negative_slope'] else: layer['class_name'] = 'Activation' layer['activation'] = activation_config['class_name'].replace('quantized_', '') diff --git a/hls4ml/model/profiling.py b/hls4ml/model/profiling.py index 904ecc3d35..7cdef74ff3 100644 --- a/hls4ml/model/profiling.py +++ b/hls4ml/model/profiling.py @@ -589,6 +589,7 @@ def get_ymodel_keras(keras_model, X): name = layer.name if ( hasattr(layer, "activation") + and layer.activation is not None and layer.activation.__name__ != "linear" and not isinstance(layer, (keras.layers.Activation, qkeras.qlayers.QActivation)) ): diff --git a/hls4ml/utils/config.py b/hls4ml/utils/config.py index 5d7ca1ae72..7294dcf6fe 100644 --- a/hls4ml/utils/config.py +++ b/hls4ml/utils/config.py @@ -79,8 +79,11 @@ def _get_precision_from_quantizer(quantizer): rnd = "AP_RND_CONV" overflow = "AP_SAT" if quantizer['class_name'] in ('quantized_relu', 'quantized_relu_po2'): - signed = False - integer -= 1 + if quantizer['config']['negative_slope'] != 0.0: + signed = True + else: + signed = False + integer -= 1 elif quantizer['class_name'] == 'quantized_tanh': overflow = "AP_SAT_SYM" if quantizer['config']['symmetric'] else "AP_SAT" integer = 1 diff --git a/test/pytest/test_qkeras.py b/test/pytest/test_qkeras.py index 1812776684..61a2b15a4a 100644 --- a/test/pytest/test_qkeras.py +++ b/test/pytest/test_qkeras.py @@ -307,6 +307,44 @@ def test_quantizer(randX_1000_1, quantizer, backend, io_type): np.testing.assert_array_equal(y_qkeras, y_hls4ml) +@pytest.mark.parametrize( + 'quantizer', + [ + (quantized_relu(4, negative_slope=0.5)), + (quantized_relu(8, 4, negative_slope=1.0)), + 
(quantized_relu(10, 2, negative_slope=0.25)), + ], +) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +@pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) +def test_relu_negative_slope(randX_1000_1, quantizer, backend, io_type): + ''' + Test a a transformation of quantized_relu with negative_slope to leaky_relu activation layer. + ''' + X = randX_1000_1 + X = -X # Make it negative so leaky relu does something + X = np.round(X * 2**10) * 2**-10 # make it an exact ap_fixed<16,6> + model = Sequential() + model.add(QActivation(input_shape=(1,), activation=quantizer, name='quantizer')) + model.compile() + + config = hls4ml.utils.config_from_keras_model(model, granularity='name') + output_dir = str( + test_root_path + / 'hls4mlprj_qkeras_leaky_relu_{}_{}_neg_slope_{}_{}_{}'.format( + quantizer.bits, quantizer.integer, quantizer.negative_slope, backend, io_type + ) + ) + hls_model = hls4ml.converters.convert_from_keras_model( + model, hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type + ) + hls_model.compile() + + y_qkeras = model.predict(X) + y_hls4ml = hls_model.predict(X) + np.testing.assert_allclose(y_hls4ml, y_qkeras, rtol=1e-5, atol=0) + + @pytest.mark.parametrize( 'weight_quantizer,activation_quantizer,', [ From b32984fa7265f501bd225f71c5b1798f21caba87 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 28 Mar 2024 20:57:43 +0000 Subject: [PATCH 020/103] [pre-commit.ci] auto fixes from pre-commit hooks --- test/pytest/test_qkeras.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/pytest/test_qkeras.py b/test/pytest/test_qkeras.py index 61a2b15a4a..f068e4e503 100644 --- a/test/pytest/test_qkeras.py +++ b/test/pytest/test_qkeras.py @@ -322,7 +322,7 @@ def test_relu_negative_slope(randX_1000_1, quantizer, backend, io_type): Test a a transformation of quantized_relu with negative_slope to leaky_relu activation layer. 
''' X = randX_1000_1 - X = -X # Make it negative so leaky relu does something + X = -X # Make it negative so leaky relu does something X = np.round(X * 2**10) * 2**-10 # make it an exact ap_fixed<16,6> model = Sequential() model.add(QActivation(input_shape=(1,), activation=quantizer, name='quantizer')) From 98273a044b11e8e17b122ab55056dbc8d444a5ae Mon Sep 17 00:00:00 2001 From: Vladimir Date: Tue, 16 Apr 2024 21:30:43 +0200 Subject: [PATCH 021/103] Fix activation check in profiling --- hls4ml/model/profiling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hls4ml/model/profiling.py b/hls4ml/model/profiling.py index 7cdef74ff3..84a83de23e 100644 --- a/hls4ml/model/profiling.py +++ b/hls4ml/model/profiling.py @@ -588,10 +588,10 @@ def get_ymodel_keras(keras_model, X): # Note that if the layer is a standalone activation layer then skip this name = layer.name if ( - hasattr(layer, "activation") + hasattr(layer, 'activation') and layer.activation is not None - and layer.activation.__name__ != "linear" and not isinstance(layer, (keras.layers.Activation, qkeras.qlayers.QActivation)) + and layer.activation.__name__ != 'linear' ): tmp_activation = layer.activation layer.activation = None From 1640c4bcd2f73a0cf0ee9772cded4f863b82c703 Mon Sep 17 00:00:00 2001 From: dgburnette <36940078+dgburnette@users.noreply.github.com> Date: Mon, 15 Apr 2024 07:12:17 -0700 Subject: [PATCH 022/103] Stage initial set of changes for the Catapult backend (#956) * Stage initial set of changes for the Catapult backend * applied some changes for issues reported by pre-commit. But pre-commit still reorders backends/__init__.py incorrectly * final changes for clean pre-commit * minor edits * Checkin * Add file * pre-commit format * add in nnet_utils files * format changes for pre-commit * run flows by netlist type * update design pragmas on some blocks. 
cleaned up TCL script * move AC submodules under hls4ml/templates/catapult * merged in latest changes from mainline * remove bad submodules * recreate AC submodules in hls4ml/templates/catapult * pre-commit fixes * pre-commit fixes * turn on Catapult backend testing * removed io_parallel testing for Catapult backend * add Catapult * added Catapult * added Catapult * added Catapult to some pytests * Added concept of ProjectDir to distinguish the project directory of the HLS tool from the ProjectName which is used for the cpp file and top function name * better handling of c++ testbench data files. enhanced directory naming. * fix syntax * workaround from Giuseppe * Add concept of ProjectDir for Catapult which is different from ProjectName that gets used for the top function name and the cpp files * add new file from Giuseppe * improvements to project management, reporting and testbench * include new file in generation of parameters.h * add hard_tanh for io_parallel. formatting * Full path to the header nnet_helpers.h is necessary in the include (check if this is not an issue with recent versions of Catapult) * Avoid ceiling function from the math library: ceil(n/d) ---> (n+d-1)/n * These are mostly workarounds for the BUP synyhesis of a testing model (should these changes make in something more general?) * revert format back to what clang-format currently enforces * simplification from Giuesspe * Fixes for bottom-up handling of libraries * pre-commit format fixes * fix loops * consolidate prj scripts * cleanup pragmas * switch from using ssh to https for submodules * fix include path for non-catapult install * update pytest environment * CL 1100381 * CL 1098112 * roll in latest changes. 
turn off Catapult variants of test_binary_cnn and test_cnn_mnist_qkeras for now * fix test failure * disable Catapult test for pytorch until it is supported * disable Catapult for pytorch tests * Simpler submodule initialization for CI --------- Co-authored-by: David Burnette Co-authored-by: Giuseppe Di Guglielmo Co-authored-by: Jovan Mitrevski Co-authored-by: Vladimir Loncar --- .gitmodules | 9 + hls4ml/backends/__init__.py | 3 + hls4ml/backends/catapult/__init__.py | 0 hls4ml/backends/catapult/catapult_backend.py | 515 ++++++++ hls4ml/backends/catapult/passes/__init__.py | 0 .../catapult/passes/broadcast_stream.py | 117 ++ .../backends/catapult/passes/conv_same_pad.py | 109 ++ .../backends/catapult/passes/conv_stream.py | 52 + .../catapult/passes/convolution_templates.py | 508 ++++++++ .../catapult/passes/convolution_winograd.py | 175 +++ .../catapult/passes/core_templates.py | 216 ++++ .../passes/fifo_depth_optimization.py | 104 ++ .../catapult/passes/garnet_templates.py | 249 ++++ .../catapult/passes/merge_templates.py | 106 ++ hls4ml/backends/catapult/passes/pointwise.py | 92 ++ .../catapult/passes/pooling_templates.py | 109 ++ .../catapult/passes/quantization_templates.py | 36 + .../catapult/passes/recurrent_templates.py | 175 +++ .../catapult/passes/reshaping_templates.py | 132 ++ .../catapult/passes/resource_strategy.py | 48 + .../catapult/passes/transform_types.py | 52 + hls4ml/backends/fpga/fpga_types.py | 65 + hls4ml/converters/__init__.py | 6 +- hls4ml/model/graph.py | 6 + hls4ml/model/layers.py | 1 + hls4ml/model/profiling.py | 5 +- hls4ml/report/__init__.py | 3 + hls4ml/report/catapult_report.py | 256 ++++ hls4ml/templates/catapult/ac_math | 1 + hls4ml/templates/catapult/ac_simutils | 1 + hls4ml/templates/catapult/ac_types | 1 + hls4ml/templates/catapult/build_lib.sh | 21 + hls4ml/templates/catapult/build_prj.tcl | 356 ++++++ hls4ml/templates/catapult/catapult_synth.tcl | 3 + hls4ml/templates/catapult/firmware/defines.h | 15 + 
.../templates/catapult/firmware/myproject.cpp | 29 + .../templates/catapult/firmware/myproject.h | 15 + .../templates/catapult/firmware/parameters.h | 15 + .../templates/catapult/myproject_bridge.cpp | 72 ++ hls4ml/templates/catapult/myproject_test.cpp | 164 +++ .../catapult/nnet_utils/ap_shift_reg.h | 136 ++ .../templates/catapult/nnet_utils/hls_math.h | 24 + .../catapult/nnet_utils/nnet_activation.h | 1107 +++++++++++++++++ .../nnet_utils/nnet_activation_stream.h | 922 ++++++++++++++ .../catapult/nnet_utils/nnet_array.h | 52 + .../catapult/nnet_utils/nnet_batchnorm.h | 127 ++ .../nnet_utils/nnet_batchnorm_stream.h | 113 ++ .../catapult/nnet_utils/nnet_code_gen.h | 32 + .../catapult/nnet_utils/nnet_common.h | 66 + .../catapult/nnet_utils/nnet_conv1d.h | 62 + .../catapult/nnet_utils/nnet_conv1d_latency.h | 198 +++ .../nnet_utils/nnet_conv1d_resource.h | 241 ++++ .../catapult/nnet_utils/nnet_conv1d_stream.h | 94 ++ .../catapult/nnet_utils/nnet_conv2d.h | 84 ++ .../catapult/nnet_utils/nnet_conv2d_latency.h | 392 ++++++ .../nnet_utils/nnet_conv2d_resource.h | 275 ++++ .../catapult/nnet_utils/nnet_conv2d_stream.h | 117 ++ .../catapult/nnet_utils/nnet_conv_stream.h | 398 ++++++ .../catapult/nnet_utils/nnet_dense.h | 49 + .../nnet_utils/nnet_dense_compressed.h | 106 ++ .../catapult/nnet_utils/nnet_dense_latency.h | 92 ++ .../catapult/nnet_utils/nnet_dense_resource.h | 262 ++++ .../catapult/nnet_utils/nnet_dense_stream.h | 72 ++ .../catapult/nnet_utils/nnet_embed.h | 47 + .../catapult/nnet_utils/nnet_embed_stream.h | 34 + .../catapult/nnet_utils/nnet_garnet.h | 816 ++++++++++++ .../catapult/nnet_utils/nnet_helpers.h | 461 +++++++ .../catapult/nnet_utils/nnet_image.h | 41 + .../catapult/nnet_utils/nnet_image_stream.h | 66 + .../templates/catapult/nnet_utils/nnet_math.h | 178 +++ .../catapult/nnet_utils/nnet_merge.h | 232 ++++ .../catapult/nnet_utils/nnet_merge_stream.h | 380 ++++++ .../templates/catapult/nnet_utils/nnet_mult.h | 127 ++ 
.../catapult/nnet_utils/nnet_padding.h | 145 +++ .../catapult/nnet_utils/nnet_padding_stream.h | 95 ++ .../catapult/nnet_utils/nnet_pooling.h | 362 ++++++ .../catapult/nnet_utils/nnet_pooling_stream.h | 601 +++++++++ .../nnet_utils/nnet_recr_activations.h | 56 + .../catapult/nnet_utils/nnet_recurrent.h | 572 +++++++++ .../nnet_utils/nnet_sepconv1d_stream.h | 127 ++ .../catapult/nnet_utils/nnet_sepconv2d.h | 82 ++ .../nnet_utils/nnet_sepconv2d_stream.h | 152 +++ .../catapult/nnet_utils/nnet_sepconv_stream.h | 315 +++++ .../catapult/nnet_utils/nnet_stream.h | 156 +++ .../catapult/nnet_utils/nnet_types.h | 64 + .../templates/vivado_accelerator/build_lib.sh | 0 hls4ml/writer/__init__.py | 2 + hls4ml/writer/catapult_writer.py | 929 ++++++++++++++ test/pytest/ci-template.yml | 3 +- test/pytest/test_activations.py | 2 +- test/pytest/test_batchnorm.py | 2 +- test/pytest/test_batchnorm_pytorch.py | 2 +- test/pytest/test_clone_flatten.py | 2 +- test/pytest/test_cnn_mnist.py | 2 +- test/pytest/test_conv1d.py | 4 + test/pytest/test_embed.py | 4 +- test/pytest/test_globalpooling.py | 4 +- test/pytest/test_keras_h5_loader.py | 2 +- test/pytest/test_keras_nested_model.py | 4 +- test/pytest/test_pointwiseconv.py | 4 + test/pytest/test_pooling.py | 4 +- test/pytest/test_repack_stream.py | 4 +- test/pytest/test_reshape.py | 2 +- test/pytest/test_sepconv1d.py | 2 +- test/pytest/test_sepconv2d.py | 2 +- test/pytest/test_softmax.py | 4 +- test/pytest/test_softsign.py | 2 +- test/pytest/test_upsampling.py | 2 +- test/pytest/test_zeropadding.py | 2 +- 109 files changed, 14932 insertions(+), 30 deletions(-) create mode 100644 hls4ml/backends/catapult/__init__.py create mode 100644 hls4ml/backends/catapult/catapult_backend.py create mode 100644 hls4ml/backends/catapult/passes/__init__.py create mode 100644 hls4ml/backends/catapult/passes/broadcast_stream.py create mode 100755 hls4ml/backends/catapult/passes/conv_same_pad.py create mode 100755 hls4ml/backends/catapult/passes/conv_stream.py 
create mode 100755 hls4ml/backends/catapult/passes/convolution_templates.py create mode 100644 hls4ml/backends/catapult/passes/convolution_winograd.py create mode 100755 hls4ml/backends/catapult/passes/core_templates.py create mode 100755 hls4ml/backends/catapult/passes/fifo_depth_optimization.py create mode 100755 hls4ml/backends/catapult/passes/garnet_templates.py create mode 100755 hls4ml/backends/catapult/passes/merge_templates.py create mode 100755 hls4ml/backends/catapult/passes/pointwise.py create mode 100755 hls4ml/backends/catapult/passes/pooling_templates.py create mode 100755 hls4ml/backends/catapult/passes/quantization_templates.py create mode 100755 hls4ml/backends/catapult/passes/recurrent_templates.py create mode 100755 hls4ml/backends/catapult/passes/reshaping_templates.py create mode 100755 hls4ml/backends/catapult/passes/resource_strategy.py create mode 100755 hls4ml/backends/catapult/passes/transform_types.py create mode 100755 hls4ml/report/catapult_report.py create mode 160000 hls4ml/templates/catapult/ac_math create mode 160000 hls4ml/templates/catapult/ac_simutils create mode 160000 hls4ml/templates/catapult/ac_types create mode 100755 hls4ml/templates/catapult/build_lib.sh create mode 100755 hls4ml/templates/catapult/build_prj.tcl create mode 100644 hls4ml/templates/catapult/catapult_synth.tcl create mode 100755 hls4ml/templates/catapult/firmware/defines.h create mode 100755 hls4ml/templates/catapult/firmware/myproject.cpp create mode 100755 hls4ml/templates/catapult/firmware/myproject.h create mode 100755 hls4ml/templates/catapult/firmware/parameters.h create mode 100755 hls4ml/templates/catapult/myproject_bridge.cpp create mode 100755 hls4ml/templates/catapult/myproject_test.cpp create mode 100644 hls4ml/templates/catapult/nnet_utils/ap_shift_reg.h create mode 100755 hls4ml/templates/catapult/nnet_utils/hls_math.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_activation.h create mode 100644 
hls4ml/templates/catapult/nnet_utils/nnet_activation_stream.h create mode 100755 hls4ml/templates/catapult/nnet_utils/nnet_array.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_batchnorm.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_batchnorm_stream.h create mode 100755 hls4ml/templates/catapult/nnet_utils/nnet_code_gen.h create mode 100755 hls4ml/templates/catapult/nnet_utils/nnet_common.h create mode 100755 hls4ml/templates/catapult/nnet_utils/nnet_conv1d.h create mode 100755 hls4ml/templates/catapult/nnet_utils/nnet_conv1d_latency.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_conv1d_resource.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_conv1d_stream.h create mode 100755 hls4ml/templates/catapult/nnet_utils/nnet_conv2d.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_conv2d_latency.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_conv2d_resource.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_conv2d_stream.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_conv_stream.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_dense.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_dense_compressed.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_dense_latency.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_dense_resource.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_dense_stream.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_embed.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_embed_stream.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_garnet.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_helpers.h create mode 100755 hls4ml/templates/catapult/nnet_utils/nnet_image.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_image_stream.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_math.h create mode 
100644 hls4ml/templates/catapult/nnet_utils/nnet_merge.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_merge_stream.h create mode 100755 hls4ml/templates/catapult/nnet_utils/nnet_mult.h create mode 100755 hls4ml/templates/catapult/nnet_utils/nnet_padding.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_padding_stream.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_pooling.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_pooling_stream.h create mode 100755 hls4ml/templates/catapult/nnet_utils/nnet_recr_activations.h create mode 100755 hls4ml/templates/catapult/nnet_utils/nnet_recurrent.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_sepconv1d_stream.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_sepconv2d.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_sepconv2d_stream.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_sepconv_stream.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_stream.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_types.h mode change 100644 => 100755 hls4ml/templates/vivado_accelerator/build_lib.sh create mode 100755 hls4ml/writer/catapult_writer.py diff --git a/.gitmodules b/.gitmodules index 3513213a23..98c3df68fd 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,12 @@ [submodule "example-models"] path = example-models url = https://github.com/hls-fpga-machine-learning/example-models.git +[submodule "hls4ml/templates/catapult/ac_types"] + path = hls4ml/templates/catapult/ac_types + url = https://github.com/hlslibs/ac_types.git +[submodule "hls4ml/templates/catapult/ac_simutils"] + path = hls4ml/templates/catapult/ac_simutils + url = https://github.com/hlslibs/ac_simutils.git +[submodule "hls4ml/templates/catapult/ac_math"] + path = hls4ml/templates/catapult/ac_math + url = https://github.com/hlslibs/ac_math.git diff --git a/hls4ml/backends/__init__.py b/hls4ml/backends/__init__.py index 
91a9272e74..f1eebd3c1f 100644 --- a/hls4ml/backends/__init__.py +++ b/hls4ml/backends/__init__.py @@ -7,6 +7,8 @@ from hls4ml.backends.vivado_accelerator.vivado_accelerator_backend import VivadoAcceleratorBackend from hls4ml.backends.vivado_accelerator.vivado_accelerator_config import VivadoAcceleratorConfig # noqa: F401 +from hls4ml.backends.catapult.catapult_backend import CatapultBackend # isort: skip + from hls4ml.backends.vitis.vitis_backend import VitisBackend # isort: skip from hls4ml.backends.vitis_accelerator.vitis_accelerator_backend import VitisAcceleratorBackend # isort: skip from hls4ml.backends.vitis_accelerator.vitis_accelerator_config import VitisAcceleratorConfig # noqa: F401 @@ -16,4 +18,5 @@ register_backend('Vitis', VitisBackend) register_backend('VitisAccelerator', VitisAcceleratorBackend) register_backend('Quartus', QuartusBackend) +register_backend('Catapult', CatapultBackend) register_backend('SymbolicExpression', SymbolicExpressionBackend) diff --git a/hls4ml/backends/catapult/__init__.py b/hls4ml/backends/catapult/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/hls4ml/backends/catapult/catapult_backend.py b/hls4ml/backends/catapult/catapult_backend.py new file mode 100644 index 0000000000..5556154dcb --- /dev/null +++ b/hls4ml/backends/catapult/catapult_backend.py @@ -0,0 +1,515 @@ +import os +import sys + +import numpy as np + +from hls4ml.backends import FPGABackend +from hls4ml.backends.fpga.fpga_types import ACTypeConverter, CatapultArrayVariableConverter, HLSTypeConverter +from hls4ml.model.attributes import ChoiceAttribute, ConfigurableAttribute, TypeAttribute +from hls4ml.model.flow import register_flow +from hls4ml.model.layers import ( + GRU, + LSTM, + Conv1D, + Conv2D, + Dense, + DepthwiseConv2D, + Embedding, + GarNet, + GarNetStack, + GlobalPooling1D, + GlobalPooling2D, + Layer, + Pooling1D, + Pooling2D, + SeparableConv1D, + SeparableConv2D, + SimpleRNN, + Softmax, +) +from hls4ml.model.optimizer 
import get_backend_passes, layer_optimizer +from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType, PackedType +from hls4ml.report import parse_catapult_report +from hls4ml.utils.fixed_point_utils import ceil_log2 + + +class CatapultBackend(FPGABackend): + def __init__(self): + super().__init__('Catapult') + self._register_layer_attributes() + self._register_flows() + + def _register_layer_attributes(self): + # Add RNN-specific attributes, recurrent_reuse_factor and static implementation + rnn_layers = [ + SimpleRNN, + LSTM, + GRU, + ] + + for layer in rnn_layers: + attrs = self.attribute_map.get(layer, []) + attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1)) + attrs.append(ConfigurableAttribute('static', value_type=bool, default=True)) + attrs.append(ConfigurableAttribute('table_size', default=1024)) + attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8))) + self.attribute_map[layer] = attrs + + # Add ParallelizationFactor to Conv1D/2D + pf_layers = [ + Conv1D, + Conv2D, + ] + + for layer in pf_layers: + attrs = self.attribute_map.get(layer, []) + attrs.append(ConfigurableAttribute('parallelization_factor', default=1)) + self.attribute_map[layer] = attrs + + # Add ConvImplementation to Convolution+Pooling layers + cnn_layers = [Conv1D, Conv2D, SeparableConv1D, SeparableConv2D, DepthwiseConv2D, Pooling1D, Pooling2D] + + for layer in cnn_layers: + attrs = self.attribute_map.get(layer, []) + # attrs.append(ConfigurableAttribute('conv_implementation', value_type=str, default='LineBuffer')) + attrs.append(ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded'], default='LineBuffer')) + self.attribute_map[layer] = attrs + + sep_conv_layers = [SeparableConv1D, SeparableConv2D] + for layer in sep_conv_layers: + attrs = self.attribute_map.get(layer, []) + attrs.append(TypeAttribute('dw_output', default=FixedPrecisionType(18, 8))) + self.attribute_map[layer] = attrs + + def 
_register_flows(self): + initializers = self._get_layer_initializers() + init_flow = register_flow('init_layers', initializers, requires=['optimize'], backend=self.name) + + streaming_passes = [ + 'catapult:reshape_stream', + 'catapult:clone_output', + 'catapult:insert_zero_padding_before_conv1d', + 'catapult:insert_zero_padding_before_conv2d', + 'catapult:broadcast_stream', + ] + streaming_flow = register_flow('streaming', streaming_passes, requires=[init_flow], backend=self.name) + + quantization_passes = [ + 'catapult:merge_batch_norm_quantized_tanh', + 'catapult:quantize_dense_output', + 'fuse_consecutive_batch_normalization', + 'catapult:xnor_pooling', + ] + quantization_flow = register_flow('quantization', quantization_passes, requires=[init_flow], backend=self.name) + + optimization_passes = [ + 'catapult:remove_final_reshape', + 'catapult:optimize_pointwise_conv', + 'catapult:inplace_parallel_reshape', + 'catapult:inplace_stream_flatten', + 'catapult:skip_softmax', + 'catapult:fix_softmax_table_size', + ] + optimization_flow = register_flow('optimize', optimization_passes, requires=[init_flow], backend=self.name) + + catapult_types = [ + 'catapult:transform_types', + 'catapult:register_bram_weights', + 'catapult:generate_conv_streaming_instructions', + 'catapult:apply_resource_strategy', + 'catapult:generate_conv_im2col', + ] + catapult_types_flow = register_flow('specific_types', catapult_types, requires=[init_flow], backend=self.name) + + templates = self._get_layer_templates() + template_flow = register_flow('apply_templates', self._get_layer_templates, requires=[init_flow], backend=self.name) + + writer_passes = ['make_stamp', 'catapult:write_hls'] + self._writer_flow = register_flow('write', writer_passes, requires=['catapult:ip'], backend=self.name) + + fifo_depth_opt_passes = [ + 'catapult:fifo_depth_optimization' + ] + writer_passes # After optimization, a new project will be written + + register_flow('fifo_depth_optimization', 
fifo_depth_opt_passes, requires=[self._writer_flow], backend=self.name) + + all_passes = get_backend_passes(self.name) + + extras = [ + # Ideally this should be empty + opt_pass + for opt_pass in all_passes + if opt_pass + not in initializers + + streaming_passes + + quantization_passes + + optimization_passes + + catapult_types + + templates + + writer_passes + + fifo_depth_opt_passes + ] + + if len(extras) > 0: + extras_flow = register_flow('extras', extras, requires=[init_flow], backend=self.name) + else: + extras_flow = None + + ip_flow_requirements = [ + 'optimize', + init_flow, + streaming_flow, + quantization_flow, + optimization_flow, + catapult_types_flow, + extras_flow, + template_flow, + ] + ip_flow_requirements = list(filter(None, ip_flow_requirements)) + + self._default_flow = register_flow('ip', None, requires=ip_flow_requirements, backend=self.name) + + def get_default_flow(self): + return self._default_flow + + def get_writer_flow(self): + return self._writer_flow + + def create_initial_config( + self, + tech='fpga', + part='xcku115-flvb2104-2-i', + asiclibs='nangate-45nm', + fifo=None, + clock_period=5, + io_type='io_parallel', + ): + config = {} + + config['Technology'] = tech + if tech == 'fpga': + config['Part'] = part if part is not None else 'xcvu13p-flga2577-2-e' + else: + config['ASICLibs'] = asiclibs if asiclibs is not None else 'nangate-45nm' + config['ClockPeriod'] = clock_period + config['FIFO'] = fifo + config['IOType'] = io_type + config['HLSConfig'] = {} + + return config + + def build( + self, + model, + reset=False, + csim=True, + synth=True, + cosim=False, + validation=False, + vhdl=False, + verilog=True, + export=False, + vsynth=False, + fifo_opt=False, + bitfile=False, + ran_frame=5, + sw_opt=False, + power=False, + da=False, + bup=False, + ): + # print(f'ran_frame value: {ran_frame}') # Add this line for debugging + catapult_exe = 'catapult' + if 'linux' in sys.platform: + cmd = 'command -v ' + catapult_exe + ' > /dev/null' + 
found = os.system(cmd) + if found != 0: + catapult_exe = os.getenv('MGC_HOME') + '/bin/catapult' + cmd = 'command -v ' + catapult_exe + ' > /dev/null' + found = os.system(cmd) + if found != 0: + catapult_exe = os.getenv('CATAPULT_HOME') + '/bin/catapult' + cmd = 'command -v ' + catapult_exe + ' > /dev/null' + if found != 0: + raise Exception('Catapult HLS installation not found. Make sure "catapult" is on PATH.') + + curr_dir = os.getcwd() + # this execution moves into the hls4ml-generated "output_dir" and runs the build_prj.tcl script. + os.chdir(model.config.get_output_dir()) + ccs_args = f'"reset={reset} csim={csim} synth={synth} cosim={cosim} validation={validation}' + ccs_args += f' export={export} vsynth={vsynth} fifo_opt={fifo_opt} bitfile={bitfile} ran_frame={ran_frame}' + ccs_args += f' sw_opt={sw_opt} power={power} da={da} vhdl={vhdl} verilog={verilog} bup={bup}"' + ccs_invoke = catapult_exe + ' -product ultra -shell -f build_prj.tcl -eval \'set ::argv ' + ccs_args + '\'' + print(ccs_invoke) + os.system(ccs_invoke) + os.chdir(curr_dir) + + return parse_catapult_report(model.config.get_output_dir()) + + def _validate_conv_strategy(self, layer): + if layer.model.config.pipeline_style.lower() != 'dataflow': + print(f'WARNING: Layer {layer.name} requires "dataflow" pipeline style. 
Switching to "dataflow" pipeline style.') + layer.model.config.pipeline_style = 'dataflow' + + @layer_optimizer(Layer) + def init_base_layer(self, layer): + reuse_factor = layer.model.config.get_reuse_factor(layer) + layer.set_attr('reuse_factor', reuse_factor) + + target_cycles = layer.model.config.get_target_cycles(layer) + layer.set_attr('target_cycles', target_cycles) + + @layer_optimizer(Dense) + def init_dense(self, layer): + index_t = IntegerPrecisionType(width=1, signed=False) + compression = layer.model.config.get_compression(layer) + if layer.model.config.is_resource_strategy(layer): + n_in, n_out = self.get_layer_mult_size(layer) + self.set_target_reuse_factor(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + if compression: + layer.set_attr('strategy', 'compressed') + index_t = layer.get_weights('weight').type.index_precision + else: + layer.set_attr('strategy', 'resource') + else: + layer.set_attr('strategy', 'latency') + layer.set_attr('index_t', NamedType(f'layer{layer.index}_index', index_t)) + + # TODO consolidate these functions into a single `init_conv` + @layer_optimizer(Conv1D) + def init_conv1d(self, layer): + if len(layer.weights['weight'].data.shape) == 2: # This can happen if we assign weights of Dense layer to 1x1 Conv1D + layer.weights['weight'].data = np.expand_dims(layer.weights['weight'].data, axis=(0, 1)) + + if layer.model.config.is_resource_strategy(layer): + layer.set_attr('strategy', 'resource') + n_in, n_out = self.get_layer_mult_size(layer) + self.set_target_reuse_factor(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + else: + layer.set_attr('strategy', 'latency') + + out_width = layer.get_output_variable().shape[0] + chosen_pf = layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', 1) + valid_pf = self.get_valid_conv_partition_splits(1, out_width) + if chosen_pf not in valid_pf: + closest_pf = self.get_closest_reuse_factor(valid_pf, chosen_pf) + valid_pf_str = ','.join(map(str, 
valid_pf)) + print( + f'WARNING: Invalid ParallelizationFactor={chosen_pf} in layer "{layer.name}".' + f'Using ParallelizationFactor={closest_pf} instead. Valid ParallelizationFactor(s): {valid_pf_str}.' + ) + else: + closest_pf = chosen_pf + layer.set_attr('n_partitions', out_width // closest_pf) + + layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) + + self._validate_conv_strategy(layer) + + @layer_optimizer(SeparableConv1D) + def init_sepconv1d(self, layer): + if layer.model.config.is_resource_strategy(layer): + layer.set_attr('strategy', 'resource') + n_in, n_out = self.get_layer_mult_size(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + else: + layer.set_attr('strategy', 'latency') + + layer.set_attr( + 'n_partitions', 1 + ) # TODO Once we have SeparableConv implementation for io_parallel this should be set properly + layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) + + # Set the output type of the depthwise phase + dw_out_precision, _ = layer.model.config.get_precision(layer, 'dw_output') + dw_out_name = layer.name + '_dw_out_t' + if layer.model.config.get_config_value('IOType') == 'io_stream': + dw_output_t = PackedType(dw_out_name, dw_out_precision, layer.get_attr('n_chan'), n_pack=1) + else: + dw_output_t = NamedType(dw_out_name, dw_out_precision) + layer.set_attr('dw_output_t', dw_output_t) + + @layer_optimizer(Conv2D) + def init_conv2d(self, layer): + if len(layer.weights['weight'].data.shape) == 2: # This can happen if we assign weights of Dense layer to 1x1 Conv2D + layer.weights['weight'].data = np.expand_dims(layer.weights['weight'].data, axis=(0, 1)) + + if layer.model.config.is_resource_strategy(layer): + layer.set_attr('strategy', 'resource') + self.set_target_reuse_factor(layer) + n_in, n_out = self.get_layer_mult_size(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + else: + layer.set_attr('strategy', 'latency') + + out_height = 
layer.get_output_variable().shape[0] + out_width = layer.get_output_variable().shape[1] + chosen_pf = layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', 1) + valid_pf = self.get_valid_conv_partition_splits(out_height, out_width) + if chosen_pf not in valid_pf: + closest_pf = self.get_closest_reuse_factor(valid_pf, chosen_pf) + valid_pf_str = ','.join(map(str, valid_pf)) + print( + f'WARNING: Invalid ParallelizationFactor={chosen_pf} in layer "{layer.name}".' + f'Using ParallelizationFactor={closest_pf} instead. Valid ParallelizationFactor(s): {valid_pf_str}.' + ) + else: + closest_pf = chosen_pf + layer.set_attr('n_partitions', out_height * out_width // closest_pf) + + layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) + + self._validate_conv_strategy(layer) + + @layer_optimizer(SeparableConv2D) + def init_sepconv2d(self, layer): + if layer.model.config.is_resource_strategy(layer): + layer.set_attr('strategy', 'resource') + n_in, n_out = self.get_layer_mult_size(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + else: + layer.set_attr('strategy', 'latency') + + layer.set_attr( + 'n_partitions', 1 + ) # TODO Once we have SeparableConv implementation for io_parallel this should be set properly + layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) + + # Set the output type of the depthwise phase + dw_out_precision, _ = layer.model.config.get_precision(layer, 'dw_output') + dw_out_name = layer.name + '_dw_out_t' + if layer.model.config.get_config_value('IOType') == 'io_stream': + dw_output_t = PackedType(dw_out_name, dw_out_precision, layer.get_attr('n_chan'), n_pack=1) + else: + dw_output_t = NamedType(dw_out_name, dw_out_precision) + layer.set_attr('dw_output_t', dw_output_t) + + @layer_optimizer(DepthwiseConv2D) + def init_depconv2d(self, layer): + if layer.model.config.is_resource_strategy(layer): + layer.set_attr('strategy', 'resource') + n_in, n_out = 
self.get_layer_mult_size(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + else: + layer.set_attr('strategy', 'latency') + + layer.set_attr( + 'n_partitions', 1 + ) # TODO Once we have SeparableConv implementation for io_parallel this should be set properly + layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) + + # Set the output type of the depthwise phase + dw_out_precision, _ = layer.model.config.get_precision(layer, 'dw_output') + dw_out_name = layer.name + '_dw_out_t' + if layer.model.config.get_config_value('IOType') == 'io_stream': + dw_output_t = PackedType(dw_out_name, dw_out_precision, layer.get_attr('n_chan'), n_pack=1) + else: + dw_output_t = NamedType(dw_out_name, dw_out_precision) + layer.set_attr('dw_output_t', dw_output_t) + + def _set_pooling_accum_t(self, layer, pool_size): + extra_bits = ceil_log2(pool_size) + accum_t = layer.get_attr('accum_t') + accum_t.precision.width += extra_bits * 2 + if isinstance(accum_t.precision, FixedPrecisionType): + accum_t.precision.integer += extra_bits + + @layer_optimizer(Pooling1D) + def init_pooling1d(self, layer): + pool_size = layer.get_attr('pool_width') + self._set_pooling_accum_t(layer, pool_size) + + layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) + + @layer_optimizer(Pooling2D) + def init_pooling2d(self, layer): + pool_size = layer.get_attr('pool_height') * layer.get_attr('pool_width') + self._set_pooling_accum_t(layer, pool_size) + + layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) + + @layer_optimizer(GlobalPooling1D) + def init_global_pooling1d(self, layer): + pool_size = layer.get_attr('n_in') + self._set_pooling_accum_t(layer, pool_size) + + @layer_optimizer(GlobalPooling2D) + def init_global_pooling2d(self, layer): + pool_size = layer.get_attr('in_height') * layer.get_attr('in_width') + self._set_pooling_accum_t(layer, pool_size) + + @layer_optimizer(Softmax) + 
def init_softmax(self, layer): + if layer.model.config.get_config_value('IOType') == 'io_parallel': + assert ( + len(layer.get_input_variable().shape) == 1 + ), 'Softmax with io_parallel strategy cannot be used on multidimensional tensors.' + + @layer_optimizer(Embedding) + def init_embed(self, layer): + if layer.attributes['n_in'] is None: + raise Exception('Input length of Embedding layer must be specified.') + + @layer_optimizer(LSTM) + def init_lstm(self, layer): + # TODO Allow getting recurrent reuse factor from the config + reuse_factor = layer.model.config.get_reuse_factor(layer) + layer.set_attr('recurrent_reuse_factor', reuse_factor) + + if layer.model.config.is_resource_strategy(layer): + n_in, n_out, n_in_recr, n_out_recr = self.get_layer_mult_size(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + self.set_closest_reuse_factor(layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor') + layer.set_attr('strategy', 'resource') + else: + layer.set_attr('strategy', 'latency') + + layer.set_attr('index_t', NamedType(f'layer{layer.index}_index', IntegerPrecisionType(width=1, signed=False))) + + @layer_optimizer(GRU) + def init_gru(self, layer): + reuse_factor = layer.model.config.get_reuse_factor(layer) + layer.set_attr('recurrent_reuse_factor', reuse_factor) + + if layer.model.config.is_resource_strategy(layer): + n_in, n_out, n_in_recr, n_out_recr = self.get_layer_mult_size(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + self.set_closest_reuse_factor(layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor') + layer.set_attr('strategy', 'resource') + else: + layer.set_attr('strategy', 'latency') + + layer.set_attr('index_t', NamedType(f'layer{layer.index}_index', IntegerPrecisionType(width=1, signed=False))) + + @layer_optimizer(GarNet) + def init_garnet(self, layer): + reuse_factor = layer.attributes['reuse_factor'] + + var_converter = CatapultArrayVariableConverter( + 
type_converter=HLSTypeConverter(precision_converter=ACTypeConverter()) + ) + + # A bit controversial but we are going to set the partitioning of the input here + in_layer = layer.model.graph[layer.inputs[0]] + in_var = layer.get_input_variable(layer.inputs[0]) + partition_factor = in_var.shape[1] * (in_var.shape[0] // reuse_factor) + in_pragma = ('partition', 'cyclic', partition_factor) + new_in_var = var_converter.convert(in_var, pragma=in_pragma) + in_layer.set_attr(layer.inputs[0], new_in_var) + + if layer.attributes['collapse']: + out_pragma = 'partition' + else: + partition_factor = layer._output_features * (layer.attributes['n_vertices'] // reuse_factor) + out_pragma = ('partition', 'cyclic', partition_factor) + + out_name, out_var = next(iter(layer.variables.items())) + new_out_var = var_converter.convert(out_var, pragma=out_pragma) + + layer.set_attr(out_name, new_out_var) + + @layer_optimizer(GarNetStack) + def init_garnet_stack(self, layer): + self.init_garnet(layer) diff --git a/hls4ml/backends/catapult/passes/__init__.py b/hls4ml/backends/catapult/passes/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/hls4ml/backends/catapult/passes/broadcast_stream.py b/hls4ml/backends/catapult/passes/broadcast_stream.py new file mode 100644 index 0000000000..97019e074b --- /dev/null +++ b/hls4ml/backends/catapult/passes/broadcast_stream.py @@ -0,0 +1,117 @@ +import numpy as np + +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import Concatenate, Layer, Merge, register_layer +from hls4ml.model.optimizer import OptimizerPass + + +class Broadcast(Layer): + '''Inserted between layers for broadcasting.''' + + def initialize(self): + shape = self.attributes['target_shape'] + if shape[0] is None: + shape = shape[1:] + dims = [f'N_SIZE_{i}_{self.index}' for i in range(1, len(shape) + 1)] + self.add_output_variable(shape, dims) + + +broadcast_function_template = 
'nnet::broadcast_stream<{input_t}, {output_t}, {config}>({input}, {output});' +broadcast_config_template = """struct config{index} : nnet::broadcast_config {{ + static const unsigned in_width = {in_width}; + static const unsigned in_height = {in_height}; + static const unsigned in_chan = {in_chan}; + static const unsigned out_width = {out_width}; + static const unsigned out_height = {out_height}; + static const unsigned out_chan = {out_chan}; +}};\n""" +broadcast_include_list = ['nnet_utils/nnet_stream.h'] + + +class BroadcastConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Broadcast) + self.template = broadcast_config_template + + def format(self, node): + params = self._default_config_params(node) + params['in_height'] = node.get_input_variable().shape[0] + params['in_width'] = node.get_input_variable().shape[1] + params['in_chan'] = node.get_input_variable().shape[2] + params['out_height'] = node.get_output_variable().shape[0] + params['out_width'] = node.get_output_variable().shape[1] + params['out_chan'] = node.get_output_variable().shape[2] + + return self.template.format(**params) + + +class BroadcastFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(Broadcast, include_header=broadcast_include_list) + self.template = broadcast_function_template + + def format(self, node): + params = self._default_function_params(node) + return self.template.format(**params) + + +def register_broadcast_stream(backend): + # Register the layer types to the layer map + register_layer('Broadcast', Broadcast) + + # Register the optimization passes + backend.register_pass('broadcast_stream', BroadcastStream) + + # Register template passes + backend.register_template(BroadcastConfigTemplate) + backend.register_template(BroadcastFunctionTemplate) + + +class BroadcastStream(OptimizerPass): + def match(self, node): + if isinstance(node, Merge) and not isinstance(node, Concatenate): + inp1 = 
node.get_input_variable(node.inputs[0]) + inp2 = node.get_input_variable(node.inputs[1]) + return inp1.shape != inp2.shape + else: + return False + + def transform(self, model, node): + if model.config.backend.name not in ['Catapult'] or model.config.get_config_value('IOType') != 'io_stream': + return False + + inp = [node.get_input_variable(inp_name) for inp_name in node.inputs] + + if np.prod(inp[0].shape) > np.prod(inp[1].shape): + idx = 1 + attrs = {'target_shape': inp[0].shape} + else: + idx = 0 + attrs = {'target_shape': inp[1].shape} + + def supported_broadcast(inp_shape, target_shape): + # Must be (H, W, C) + if not len(inp_shape) == 3: + return False + # Supported: (1, 1, C) -> (H, W, C) + if inp_shape[0] == inp_shape[1] == 1 and inp_shape[2] == target_shape[2]: + return True + # Supported: (H, W, 1) -> (H, W, C) + if inp_shape[2] == 1 and inp_shape[0] == target_shape[0] and inp_shape[1] == target_shape[1]: + return True + return False + + brdcst_inp = node.inputs[idx] + inp_shape = node.get_input_variable(brdcst_inp).shape + target_shape = attrs['target_shape'] + if not supported_broadcast(inp_shape, target_shape): + raise RuntimeError( + f'Unsupported broadcast type for stream: {inp_shape} -> {target_shape};' + + 'Only (1, 1, C) -> (H, W, C) and (H, W, 1) -> (H, W, C) currently supported' + ) + brdcst_out = 'broadcast_' + brdcst_inp + brdcst_layer = model.make_node('Broadcast', brdcst_out, attrs, [brdcst_inp].copy()) + model.insert_node(brdcst_layer, before=node, input_idx=idx) + node.inputs[idx] = brdcst_out + + return True diff --git a/hls4ml/backends/catapult/passes/conv_same_pad.py b/hls4ml/backends/catapult/passes/conv_same_pad.py new file mode 100755 index 0000000000..bb8354a3d0 --- /dev/null +++ b/hls4ml/backends/catapult/passes/conv_same_pad.py @@ -0,0 +1,109 @@ +from hls4ml.model.layers import Conv1D, Conv2D, SeparableConv1D, SeparableConv2D +from hls4ml.model.optimizer import OptimizerPass + + +class 
InsertZeroPaddingBeforeConv1D(OptimizerPass): + name = 'insert_zero_padding_before_conv1d' + + def match(self, node): + is_match = ( + isinstance(node, (Conv1D, SeparableConv1D)) + and ((node.get_attr('padding') == 'same') or (node.get_attr('padding') == 'causal')) + and node.get_attr('filt_width') != 1 + ) + return is_match + + def transform(self, model, node): + if model.config.get_config_value('IOType') != 'io_stream': + return False + + # Get the padding parameters from Conv1D layer + pad_left = node.get_attr('pad_left') + pad_right = node.get_attr('pad_right') + + # Check if no padding needs to be done + if pad_left == pad_right == 0: + return False + + out_width = pad_left + node.get_attr('in_width') + pad_right + + attrs = { + 'pad_left': pad_left, + 'pad_right': pad_right, + 'in_width': node.get_attr('in_width'), + 'out_width': out_width, + 'n_chan': node.get_attr('n_chan'), + 'data_format': node.get_attr('data_format', 'channels_last'), + } + + # Switch Conv1D layer padding to 'valid' + node.set_attr('padding', 'valid') + node.set_attr('pad_left', 0) + node.set_attr('pad_right', 0) + node.set_attr('in_width', out_width) + + # Insert new ZeroPadding1D node above Conv1D + padding_layer = model.make_node('ZeroPadding1D', 'zp1d_' + node.name, attrs, node.inputs.copy()) + padding_layer.get_output_variable().type.precision = node.get_input_variable().type.precision + model.insert_node(padding_layer) + + return True + + +class InsertZeroPaddingBeforeConv2D(OptimizerPass): + name = 'insert_zero_padding_before_conv2d' + + def match(self, node): + is_match = ( + isinstance(node, (Conv2D, SeparableConv2D)) + and node.get_attr('padding') == 'same' + and node.get_attr('filt_height') != 1 + and node.get_attr('filt_width') != 1 + ) + return is_match + + def transform(self, model, node): + if model.config.get_config_value('IOType') != 'io_stream': + return False + + # Get the padding parameters from Conv2D layer + pad_top = node.get_attr('pad_top') + pad_bottom = 
node.get_attr('pad_bottom') + pad_left = node.get_attr('pad_left') + pad_right = node.get_attr('pad_right') + + # Check if no padding neeeds to be done + if pad_top == pad_bottom == pad_left == pad_right == 0: + return False + + out_height = pad_top + node.get_attr('in_height') + pad_bottom + out_width = pad_left + node.get_attr('in_width') + pad_right + + attrs = { + 'pad_top': pad_top, + 'pad_bottom': pad_bottom, + 'pad_left': pad_left, + 'pad_right': pad_right, + 'in_height': node.get_attr('in_height'), + 'in_width': node.get_attr('in_width'), + 'out_height': out_height, + 'out_width': out_width, + 'n_chan': node.get_attr('n_chan'), + 'data_format': node.get_attr('data_format', 'channels_last'), + } + + # Switch Conv2D layer padding to 'valid' + node.set_attr('padding', 'valid') + node.set_attr('pad_top', 0) + node.set_attr('pad_bottom', 0) + node.set_attr('pad_left', 0) + node.set_attr('pad_right', 0) + node.set_attr('in_height', out_height) + node.set_attr('in_width', out_width) + + # Insert new ZeroPadding2D node above Conv2D + padding_layer = model.make_node('ZeroPadding2D', 'zp2d_' + node.name, attrs, node.inputs.copy()) + padding_layer.get_output_variable().type.precision = node.get_input_variable().type.precision + model.insert_node(padding_layer, before=node) + + return True diff --git a/hls4ml/backends/catapult/passes/conv_stream.py b/hls4ml/backends/catapult/passes/conv_stream.py new file mode 100755 index 0000000000..e0bb853d83 --- /dev/null +++ b/hls4ml/backends/catapult/passes/conv_stream.py @@ -0,0 +1,52 @@ +from hls4ml.model.layers import Conv1D, Conv2D, SeparableConv1D, SeparableConv2D +from hls4ml.model.optimizer import OptimizerPass + + +class GenerateConvStreamingInstructions(OptimizerPass): + '''Generates the instructions for streaming implementation of CNNs''' + + def match(self, node): + return isinstance(node, (Conv1D, SeparableConv1D, Conv2D, SeparableConv2D)) + + def transform(self, model, node): + node_class = node.__class__.__name__ + 
if '1D' in node_class: + self._generate_1d_instructions(node) + elif '2D' in node_class: + self._generate_2d_instructions(node) + else: + raise Exception(f'Cannot generate instructions for node {node.name} ({node_class})') + + def _generate_1d_instructions(self, node): + if node.model.config.get_config_value('IOType') == 'io_stream': + min_w, instructions = node.model.config.backend.compute_conv1d_instructions( + node.get_input_variable().shape[0], + node.get_input_variable().shape[1], + node.get_attr('filt_width'), + node.get_attr('stride_width'), + ) + instructions_str = ','.join(str(i) for i in instructions) + node.set_attr('min_width', min_w) + node.set_attr('instructions', instructions_str) + else: + # these are unused; just put dummy values + node.set_attr('min_width', node.get_attr('in_width')) + node.set_attr('instructions', '0') + + def _generate_2d_instructions(self, node): + if node.model.config.get_config_value('IOType') == 'io_stream': + min_h, min_w, instructions = node.model.config.backend.compute_conv2d_instructions( + node.get_input_variable().shape[0], + node.get_input_variable().shape[1], + node.get_input_variable().shape[2], + node.get_attr('filt_height'), + node.get_attr('stride_height'), + ) + instructions_str = ','.join(str(i) for i in instructions) + node.set_attr('min_height', min_h) + node.set_attr('min_width', min_w) + node.set_attr('instructions', instructions_str) + else: + node.set_attr('min_height', node.get_attr('in_height')) + node.set_attr('min_width', node.get_attr('in_width')) + node.set_attr('instructions', '0') diff --git a/hls4ml/backends/catapult/passes/convolution_templates.py b/hls4ml/backends/catapult/passes/convolution_templates.py new file mode 100755 index 0000000000..8014a4ac8e --- /dev/null +++ b/hls4ml/backends/catapult/passes/convolution_templates.py @@ -0,0 +1,508 @@ +from hls4ml.backends.backend import get_backend +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from 
hls4ml.model.layers import ( + Conv1D, + Conv2D, + Conv2DBatchnorm, + DepthwiseConv1D, + DepthwiseConv2D, + SeparableConv1D, + SeparableConv2D, +) + +# Shared multiplication template + +conv_mult_config_template = """struct config{index}_mult : nnet::dense_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned reuse_factor = {reuse}; + static const unsigned strategy = nnet::{strategy}; + static const unsigned n_zeros = {nzeros}; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in * n_out, reuse_factor) - n_zeros / reuse_factor; + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef {weight_t.name} weight_t; + template + using product = nnet::product::{product_type}; +}};\n""" + +# Conv1D templates + +conv1d_config_template = """struct config{index} : nnet::conv1d_config {{ + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; + static const unsigned in_width = {in_width}; + static const unsigned n_chan = {n_chan}; + static const unsigned filt_width = {filt_width}; + static const unsigned kernel_size = filt_width; + static const unsigned n_filt = {n_filt}; + static const unsigned stride_width = {stride_width}; + static const unsigned dilation = {dilation}; + static const unsigned out_width = {out_width}; + static const unsigned reuse_factor = {reuse}; + static const unsigned n_zeros = {nzeros}; + static const unsigned multiplier_limit = + DIV_ROUNDUP(kernel_size * n_chan * n_filt, reuse_factor) - n_zeros / reuse_factor; + static const bool store_weights_in_bram = false; + static const unsigned strategy = nnet::{strategy}; + static const nnet::conv_implementation implementation = nnet::conv_implementation::{implementation}; + static const unsigned min_width = {min_width}; + static const ac_int pixels[min_width]; + static const unsigned n_partitions = {n_partitions}; + static const unsigned n_pixels = out_width / n_partitions; + template + 
using fill_buffer = nnet::{fill_fn}; + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef {weight_t.name} weight_t; + typedef {config_t} mult_config; + template + using scale_index = nnet::{scale_index_type}; +}}; +// really this allocation of pixels array ought to be in a .cpp file +#ifndef INCLUDED_MC_TESTBENCH_H +const ac_int config{index}::pixels[] = {{{instructions}}}; +#endif\n""" + +conv1d_function_template = 'nnet::conv_1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' +depthconv1d_function_template = ( + 'nnet::depthwise_conv_1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' +) + +conv1d_include_list = ['nnet_utils/nnet_conv1d.h', 'nnet_utils/nnet_conv1d_stream.h'] + + +class Conv1DConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__((Conv1D, DepthwiseConv1D)) + self.template = conv1d_config_template + self.mult_template = conv_mult_config_template + + def format(self, node): + params = self._default_config_params(node) + params['dilation'] = node.get_attr('dilation', 1) + params['nzeros'] = node.get_weights('weight').nzeros + + params['config_t'] = f'config{node.index}_mult' + if node.get_attr('in_width') == node.get_attr('min_width'): + params['scale_index_type'] = 'scale_index_unscaled' + else: + params['scale_index_type'] = 'scale_index_regular' + + if node.model.config.get_config_value('IOType') == 'io_parallel': + params['fill_fn'] = f'fill_buffer_{node.index}' + else: + params['fill_fn'] = 'FillConv1DBuffer' + + conv_config = self.template.format(**params) + + mult_params = self._default_config_params(node) + mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_width') + mult_params['n_out'] = node.get_attr('n_filt') + mult_params['nzeros'] = node.get_weights('weight').nzeros + mult_params['product_type'] = get_backend('catapult').product_type( + node.get_input_variable().type.precision, 
node.get_weights('weight').type.precision + ) + mult_config = self.mult_template.format(**mult_params) + + return mult_config + '\n' + conv_config + + +class Conv1DFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(Conv1D, include_header=conv1d_include_list) + self.template = conv1d_function_template + + def format(self, node): + params = self._default_function_params(node) + params['data_format'] = 'cf' if node.get_attr('data_format') == 'channels_first' else 'cl' + params['w'] = node.get_weights('weight').name + params['b'] = node.get_weights('bias').name + + return self.template.format(**params) + + +class DepthwiseConv1DFunctionTemplate(Conv1DFunctionTemplate): + def __init__(self): + super(Conv1DFunctionTemplate, self).__init__(DepthwiseConv1D, include_header=sepconv1d_include_list) + self.template = depthconv1d_function_template + + +# Conv2D Templates + +conv2d_config_template = """struct config{index} : nnet::conv2d_config {{ + static const unsigned pad_top = {pad_top}; + static const unsigned pad_bottom = {pad_bottom}; + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; + static const unsigned in_height = {in_height}; + static const unsigned in_width = {in_width}; + static const unsigned n_chan = {n_chan}; + static const unsigned filt_height = {filt_height}; + static const unsigned filt_width = {filt_width}; + static const unsigned kernel_size = filt_height * filt_width; + static const unsigned n_filt = {n_filt}; + static const unsigned stride_height = {stride_height}; + static const unsigned stride_width = {stride_width}; + static const unsigned out_height = {out_height}; + static const unsigned out_width = {out_width}; + static const unsigned reuse_factor = {reuse}; + static const unsigned n_zeros = {nzeros}; + static const unsigned multiplier_limit = + DIV_ROUNDUP(kernel_size * n_chan * n_filt, reuse_factor) - n_zeros / reuse_factor; + static const bool store_weights_in_bram = 
false; + static const unsigned strategy = nnet::{strategy}; + static const nnet::conv_implementation implementation = nnet::conv_implementation::{implementation}; + static const unsigned min_height = {min_height}; + static const unsigned min_width = {min_width}; + static const ac_int pixels[min_height * min_width]; + static const unsigned n_partitions = {n_partitions}; + static const unsigned n_pixels = out_height * out_width / n_partitions; + template + using fill_buffer = nnet::{fill_fn}; + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef {weight_t.name} weight_t; + typedef {config_t} mult_config; + template + using scale_index_height = nnet::{scale_index_height_type}; + template + using scale_index_width = nnet::{scale_index_width_type}; +}}; +// really this allocation of pixels array ought to be in a .cpp file +#ifndef INCLUDED_MC_TESTBENCH_H +const ac_int config{index}::pixels[] = {{{instructions}}}; +#endif\n""" + +conv2d_function_template = 'nnet::conv_2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' +depthconv2d_function_template = ( + 'nnet::depthwise_conv_2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' +) + +conv2d_include_list = ['nnet_utils/nnet_conv2d.h', 'nnet_utils/nnet_conv2d_stream.h'] + + +class Conv2DConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__((Conv2D, Conv2DBatchnorm, DepthwiseConv2D)) + self.template = conv2d_config_template + self.mult_template = conv_mult_config_template + + def format(self, node): + params = self._default_config_params(node) + params['dilation'] = node.get_attr('dilation', 1) + params['nzeros'] = node.get_weights('weight').nzeros + + params['config_t'] = f'config{node.index}_mult' + + if node.get_attr('in_height') == node.get_attr('min_height'): + params['scale_index_height_type'] = 'scale_index_unscaled' + else: + params['scale_index_height_type'] = 'scale_index_regular' + + if 
node.get_attr('in_width') == node.get_attr('min_width'): + params['scale_index_width_type'] = 'scale_index_unscaled' + else: + params['scale_index_width_type'] = 'scale_index_regular' + + if node.model.config.get_config_value('IOType') == 'io_parallel': + params['fill_fn'] = f'fill_buffer_{node.index}' + else: + params['fill_fn'] = 'FillConv2DBuffer' + + conv_config = self.template.format(**params) + + mult_params = self._default_config_params(node) + mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_height') * node.get_attr('filt_width') + mult_params['n_out'] = node.get_attr('n_filt') + mult_params['nzeros'] = node.get_weights('weight').nzeros + mult_params['product_type'] = get_backend('catapult').product_type( + node.get_input_variable().type.precision, node.get_weights('weight').type.precision + ) + mult_config = self.mult_template.format(**mult_params) + + return mult_config + '\n' + conv_config + + +class Conv2DFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__((Conv2D, Conv2DBatchnorm), include_header=conv2d_include_list) + self.template = conv2d_function_template + + def format(self, node): + params = self._default_function_params(node) + params['data_format'] = 'cf' if node.get_attr('data_format') == 'channels_first' else 'cl' + params['w'] = node.get_weights('weight').name + params['b'] = node.get_weights('bias').name + + return self.template.format(**params) + + +class DepthwiseConv2DFunctionTemplate(Conv2DFunctionTemplate): + def __init__(self): + super(Conv2DFunctionTemplate, self).__init__(DepthwiseConv2D, include_header=sepconv2d_include_list) + self.template = depthconv2d_function_template + + +# SeparableConv1D/2D Templates + +sepconv_config_template = """struct config{index} {{ + typedef {depthwise_config} depthwise_config; + typedef {pointwise_config} pointwise_config; +}};\n""" + +sepconv1d_function_template = ( + 'nnet::separable_conv_1d_{data_format}<{input_t}, {dw_output_t}, {output_t}, 
{config}>(' + '{input}, {output}, {d}, {p}, {z}, {b});' +) +sepconv2d_function_template = ( + 'nnet::separable_conv_2d_{data_format}<{input_t}, {dw_output_t}, {output_t}, {config}>(' + '{input}, {output}, {d}, {p}, {z}, {b});' +) + +sepconv1d_include_list = ['nnet_utils/nnet_conv1d.h', 'nnet_utils/nnet_sepconv1d_stream.h'] +sepconv2d_include_list = ['nnet_utils/nnet_conv2d.h', 'nnet_utils/nnet_sepconv2d.h', 'nnet_utils/nnet_sepconv2d_stream.h'] + + +class SeparableConv1DConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(SeparableConv1D) + self.template = sepconv_config_template + self.depthwise_template = conv1d_config_template + self.pointwise_template = conv1d_config_template + self.depthwise_mult_template = conv_mult_config_template + self.pointwise_mult_template = conv_mult_config_template + + def format(self, node): + # Separable master config + params = {} + params['index'] = node.index + params['depthwise_config'] = f'config{node.index}_depthwise' + params['pointwise_config'] = f'config{node.index}_pointwise' + sep_config = self.template.format(**params) + + # Depthwise config + params = self._default_config_params(node) + # Override bias and bias_t since these are zeros in depthwise step of SepConv1D + params['bias'] = params['zero_bias'] + params['bias_t'] = params['zero_bias_t'] + params['n_filt'] = params['n_chan'] # In depthwise step n_chan == n_filt + params['dilation'] = node.get_attr('dilation', 1) + params['nzeros'] = node.get_weights('depthwise').nzeros + params['index'] = str(node.index) + '_depthwise' + params['weight_t'] = node.get_weights('depthwise').type + params['fill_fn'] = 'FillConv1DBuffer' + + if node.get_attr('unscaled'): + params['scale_index_type'] = 'scale_index_unscaled' + else: + params['scale_index_type'] = 'scale_index_regular' + + params['config_t'] = f'config{node.index}_depthwise_mult' + depthwise_config = self.depthwise_template.format(**params) + + # Depthwise mult config + mult_params = 
self._default_config_params(node) + mult_params['index'] = str(node.index) + '_depthwise' + mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_width') + mult_params['n_out'] = node.get_attr('n_chan') + mult_params['nzeros'] = node.get_weights('depthwise').nzeros + mult_params['weight_t'] = node.get_weights('depthwise').type + mult_params['product_type'] = get_backend('catapult').product_type( + node.get_input_variable().type.precision, node.get_weights('depthwise').type.precision + ) + depthwise_mult_config = self.depthwise_mult_template.format(**mult_params) + + # Pointwise config + params = self._default_config_params(node) + if node.get_attr('data_format') == 'channels_last': + params['in_width'] = node.get_output_variable().shape[0] + else: + params['in_width'] = node.get_output_variable().shape[1] + + params['filt_width'] = 1 + params['stride_width'] = 1 + params['dilation'] = node.get_attr('dilation', 1) + params['nzeros'] = node.get_weights('pointwise').nzeros + params['index'] = str(node.index) + '_pointwise' + params['weight_t'] = node.get_weights('pointwise').type + params['min_width'] = params['in_width'] + params['instructions'] = '0' + params['fill_fn'] = 'FillConv1DBuffer' + + if node.get_attr('unscaled'): + params['scale_index_type'] = 'scale_index_unscaled' + else: + params['scale_index_type'] = 'scale_index_regular' + + params['config_t'] = f'config{node.index}_pointwise_mult' + pointwise_config = self.pointwise_template.format(**params) + + # Pointwise mult config + mult_params = self._default_config_params(node) + mult_params['index'] = str(node.index) + '_pointwise' + mult_params['n_in'] = node.get_attr('n_chan') + mult_params['n_out'] = node.get_attr('n_filt') + mult_params['nzeros'] = node.get_weights('pointwise').nzeros + mult_params['weight_t'] = node.get_weights('pointwise').type + mult_params['product_type'] = get_backend('catapult').product_type( + node.get_input_variable().type.precision, 
node.get_weights('pointwise').type.precision + ) + pointwise_mult_config = self.pointwise_mult_template.format(**mult_params) + + return ( + depthwise_mult_config + + '\n' + + depthwise_config + + '\n' + + pointwise_mult_config + + '\n' + + pointwise_config + + '\n' + + sep_config + ) + + +class SeparableConv1DFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(SeparableConv1D, include_header=sepconv1d_include_list) + self.template = sepconv1d_function_template + + def format(self, node): + params = self._default_function_params(node) + params['dw_output_t'] = node.get_attr('dw_output_t').name + params['data_format'] = 'cf' if node.get_attr('data_format') == 'channels_first' else 'cl' + params['d'] = node.get_weights('depthwise').name + params['p'] = node.get_weights('pointwise').name + params['b'] = node.get_weights('bias').name + params['z'] = node.get_weights('zero_bias').name + + return self.template.format(**params) + + +class SeparableConv2DConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(SeparableConv2D) + self.template = sepconv_config_template + self.depthwise_template = conv2d_config_template + self.pointwise_template = conv2d_config_template + self.depthwise_mult_template = conv_mult_config_template + self.pointwise_mult_template = conv_mult_config_template + + def format(self, node): + # Separable master config + params = {} + params['index'] = node.index + params['depthwise_config'] = f'config{node.index}_depthwise' + params['pointwise_config'] = f'config{node.index}_pointwise' + sep_config = self.template.format(**params) + + # Depthwise config + params = self._default_config_params(node) + # Override bias and bias_t since these are zeros in depthwise step of SepConv2D + params['bias'] = params['zero_bias'] + params['bias_t'] = params['zero_bias_t'] + params['n_filt'] = params['n_chan'] # In depthwise step n_chan == n_filt + params['dilation'] = node.get_attr('dilation', 1) + params['nzeros'] = 
node.get_weights('depthwise').nzeros + params['index'] = str(node.index) + '_depthwise' + params['weight_t'] = node.get_weights('depthwise').type + params['fill_fn'] = 'FillConv2DBuffer' + + if node.get_attr('unscaled_h'): + params['scale_index_height_type'] = 'scale_index_unscaled' + else: + params['scale_index_height_type'] = 'scale_index_regular' + + if node.get_attr('unscaled_w'): + params['scale_index_width_type'] = 'scale_index_unscaled' + else: + params['scale_index_width_type'] = 'scale_index_regular' + + params['config_t'] = f'config{node.index}_depthwise_mult' + depthwise_config = self.depthwise_template.format(**params) + + # Depthwise mult config + mult_params = self._default_config_params(node) + mult_params['index'] = str(node.index) + '_depthwise' + mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_height') * node.get_attr('filt_width') + mult_params['n_out'] = node.get_attr('n_chan') + mult_params['nzeros'] = node.get_weights('depthwise').nzeros + mult_params['weight_t'] = node.get_weights('depthwise').type + mult_params['product_type'] = get_backend('catapult').product_type( + node.get_input_variable().type.precision, node.get_weights('depthwise').type.precision + ) + depthwise_mult_config = self.depthwise_mult_template.format(**mult_params) + + # Pointwise config + params = self._default_config_params(node) + if node.get_attr('data_format') == 'channels_last': + params['in_height'] = node.get_output_variable().shape[0] + params['in_width'] = node.get_output_variable().shape[1] + else: + params['in_height'] = node.get_output_variable().shape[1] + params['in_width'] = node.get_output_variable().shape[2] + + params['filt_height'] = params['filt_width'] = 1 + params['stride_height'] = params['stride_width'] = 1 + params['dilation'] = node.get_attr('dilation', 1) + params['nzeros'] = node.get_weights('pointwise').nzeros + params['index'] = str(node.index) + '_pointwise' + params['weight_t'] = node.get_weights('pointwise').type + 
params['min_height'] = params['in_height'] + params['min_width'] = params['in_width'] + params['instructions'] = '0' + params['fill_fn'] = 'FillConv2DBuffer' + + if node.get_attr('unscaled_h'): + params['scale_index_height_type'] = 'scale_index_unscaled' + else: + params['scale_index_height_type'] = 'scale_index_regular' + + if node.get_attr('unscaled_w'): + params['scale_index_width_type'] = 'scale_index_unscaled' + else: + params['scale_index_width_type'] = 'scale_index_regular' + params['config_t'] = f'config{node.index}_pointwise_mult' + pointwise_config = self.pointwise_template.format(**params) + + # Pointwise mult config + mult_params = self._default_config_params(node) + mult_params['index'] = str(node.index) + '_pointwise' + mult_params['n_in'] = node.get_attr('n_chan') + mult_params['n_out'] = node.get_attr('n_filt') + mult_params['nzeros'] = node.get_weights('pointwise').nzeros + mult_params['weight_t'] = node.get_weights('pointwise').type + mult_params['product_type'] = get_backend('catapult').product_type( + node.get_input_variable().type.precision, node.get_weights('pointwise').type.precision + ) + pointwise_mult_config = self.pointwise_mult_template.format(**mult_params) + + return ( + depthwise_mult_config + + '\n' + + depthwise_config + + '\n' + + pointwise_mult_config + + '\n' + + pointwise_config + + '\n' + + sep_config + ) + + +class SeparableConv2DFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(SeparableConv2D, include_header=sepconv2d_include_list) + self.template = sepconv2d_function_template + + def format(self, node): + params = self._default_function_params(node) + params['dw_output_t'] = node.get_attr('dw_output_t').name + params['data_format'] = 'cf' if node.get_attr('data_format') == 'channels_first' else 'cl' + params['d'] = node.get_weights('depthwise').name + params['p'] = node.get_weights('pointwise').name + params['b'] = node.get_weights('bias').name + params['z'] = 
node.get_weights('zero_bias').name + + return self.template.format(**params) diff --git a/hls4ml/backends/catapult/passes/convolution_winograd.py b/hls4ml/backends/catapult/passes/convolution_winograd.py new file mode 100644 index 0000000000..8b25ab41b8 --- /dev/null +++ b/hls4ml/backends/catapult/passes/convolution_winograd.py @@ -0,0 +1,175 @@ +import math + +import numpy as np + +from hls4ml.model.layers import Conv1D, Conv2D +from hls4ml.model.optimizer import OptimizerPass + + +class ApplyWinogradKernelTransformation(OptimizerPass): + ''' + Transforms the weights of a Conv2D kernel to a format suitable for Winograd convolution + For further information, refer to Lavin & Gray, 2015 - Fast Algorithms for Convolutional Neural Networks + ''' + + def match(self, node): + node_matches = isinstance(node, (Conv1D, Conv2D)) + + # This optimizer works only after the Resource Strategy Optimizer, since order of transposition matters + weights_transformed = node.get_attr('_weights_transposed', False) is True + + # User opted for Winograd + implementation_is_winograd = ( + node.get_attr('implementation', 'combination') == 'combination' + or node.get_attr('implementation', 'combination') == 'winograd' + ) + + parallel_io_type = node.model.config.get_config_value('IOType') == 'io_parallel' + + # Winograd algorithm-specific conditions + if isinstance(node, Conv1D): + # Winograd only applies to specific kernel sizes + # Current implementation only supports fs = 3; easily extendable to other filter sizes + filter_size_matches = node.get_attr('filt_width', 3) == 3 + + # Winograd's minimal filtering algorithm doesn't work with stride != 1 + stride_is_one = node.get_attr('stride_width', 1) == 1 + + # HLS Compiler fails to pipeline the entire component if Winograd loop only executes once + loop_itr_gt_one = node.get_attr('out_width') > 2 + + winograd_conditions = filter_size_matches and stride_is_one and loop_itr_gt_one and parallel_io_type + + elif isinstance(node, (Conv2D)): + #
Winograd only applies to specific kernel sizes + # Current implementation only supports fs = 3; easily extendable to other filter sizes + filter_size_matches = node.get_attr('filt_height', 3) == 3 and node.get_attr('filt_width', 3) == 3 + + # Winograd's minimal filtering algorithm doesn't work with stride != 1 + stride_is_one = node.get_attr('stride_height', 1) == 1 and node.get_attr('stride_width', 1) == 1 + + # HLS Compiler fails to pipeline the entire component if Winograd loop only executes once + loop_itr_gt_one = node.get_attr('out_height') > 2 and node.get_attr('out_width') > 2 + + padding_is_equal = node.get_attr('pad_top', 0) == node.get_attr('pad_bottom', 0) and node.get_attr( + 'pad_left', 0 + ) == node.get_attr('pad_right', 0) + + winograd_conditions = ( + filter_size_matches and stride_is_one and padding_is_equal and loop_itr_gt_one and parallel_io_type + ) + + else: + winograd_conditions = False + + # Check any previous transformations + already_transformed = node.get_attr('_winograd_transformation_applied', False) is True + + if not winograd_conditions and node.get_attr('implementation', 'combination') == 'winograd': + raise RuntimeError( + 'Not possible to use Winograd algorithm with current architecture.
' + 'Please set implementation to im2col or combination' + ) + + return ( + node_matches + and weights_transformed + and winograd_conditions + and not already_transformed + and implementation_is_winograd + ) + + def transform(self, model, node): + if isinstance(node, Conv1D): + if node.get_attr('filt_width', 3) == 3: + # First, transpose to a format suitable for the Winograd algorithm (F, C, W) + # Note, this assumes a format post-resource strategy optimizer, that is (F, W, C) + # Therefore, (F, W, C) => (F, C, W) + node.weights['weight'].data = np.transpose(node.weights['weight'].data, axes=[0, 2, 1]) + + # Temporary copy of data + weights = node.weights['weight'].data + + # Expand weight dimensionality (3) => (4) + node.weights['weight'].data = np.zeros((weights.shape[0], weights.shape[1], 4)) + + # Transformation matrices for 3x1 kernels + G = np.array([[1, 0, 0], [0.5, 0.5, 0.5], [0.5, -0.5, 0.5], [0, 0, 1]]) + + # Transformation GfG' + for filter in range(0, weights.data.shape[0]): + for channel in range(0, weights.data.shape[1]): + node.weights['weight'].data[filter][channel] = np.matmul(G, weights[filter][channel]) + node.weights['weight'].data_length = node.weights['weight'].data.size + + # Winograd's minimal filtering algorithm transforms the weight matrix + # This transformation consists of addition and division (by 2&4) of the weight matrix + # Therefore, increase precision (if needed), to accommodate for new weights + # This error is only noticeable for low precisions, such as those used with QKeras + + # Integer precision is only updated if it exceeds the one defined in hls4ml config + maximum_value_rounded = int(math.ceil(np.abs(node.weights['weight'].data).max())) + if maximum_value_rounded.bit_length() + 1 > node.weights['weight'].type.precision.integer: + node.weights['weight'].type.precision.integer = maximum_value_rounded.bit_length() + 1 + node.weights['weight'].type.precision.width += ( + maximum_value_rounded.bit_length() + 1 -
node.weights['weight'].type.precision.integer + ) + + # Fractional precision is increased by 2 bits (division by 4), + # for low-precision (less than 8) fractional weights + if node.weights['weight'].type.precision.fractional < 8: + node.weights['weight'].type.precision.width += 2 + + # Modified kernel size + node.set_attr('impl_filt_width', 4) + + elif isinstance(node, Conv2D): + if node.get_attr('filt_height', 3) == 3 and node.get_attr('filt_width', 3) == 3: + # First, transpose to a format suitable for the Winograd algorithm (F, C, H, W) + # Note, this assumes a format post-resource strategy optimizer, that is (F, H, W, C) + # Therefore, (F, H, W, C) => (F, C, H, W) + node.weights['weight'].data = np.transpose(node.weights['weight'].data, axes=[0, 3, 1, 2]) + + # Temporary copy of data + weights = node.weights['weight'].data + + # Expand weight dimensionality (3x3) => (4x4) + node.weights['weight'].data = np.zeros((weights.shape[0], weights.shape[1], 4, 4)) + + # Transformation matrices for 3x3 kernels + G = np.array([[1, 0, 0], [0.5, 0.5, 0.5], [0.5, -0.5, 0.5], [0, 0, 1]]) + GT = np.array([[1, 0.5, 0.5, 0], [0, 0.5, -0.5, 0], [0, 0.5, 0.5, 1]]) + + # Transformation GfG' + for filter in range(0, weights.data.shape[0]): + for channel in range(0, weights.data.shape[1]): + node.weights['weight'].data[filter][channel] = np.matmul(np.matmul(G, weights[filter][channel]), GT) + node.weights['weight'].data_length = node.weights['weight'].data.size + + # Winograd's minimal filtering algorithm transforms the weight matrix + # This transformation consists of addition and division (by 2&4) of the weight matrix + # Therefore, increase precision (if needed), to accommodate for new weights + # This error is only noticeable for low precisions, such as those used with QKeras + + # Integer precision is only updated if it exceeds the one defined in hls4ml config + maximum_value_rounded = int(math.ceil(np.abs(node.weights['weight'].data).max())) + if
maximum_value_rounded.bit_length() + 1 > node.weights['weight'].type.precision.integer: + node.weights['weight'].type.precision.integer = maximum_value_rounded.bit_length() + 1 + node.weights['weight'].type.precision.width += ( + maximum_value_rounded.bit_length() + 1 - node.weights['weight'].type.precision.integer + ) + + # Fractional precision is increased by 2 bits (division by 4), + # for low-precision (less than 8) fractional weights + if node.weights['weight'].type.precision.fractional < 8: + node.weights['weight'].type.precision.width += 2 + + # Modified kernel size + node.set_attr('impl_filt_height', 4) + node.set_attr('impl_filt_width', 4) + else: + raise Exception(f'Unexpected layer {node.class_name} with Winograd kernel optimizer') + + node.set_attr('_winograd_transformation_applied', True) + + return False diff --git a/hls4ml/backends/catapult/passes/core_templates.py b/hls4ml/backends/catapult/passes/core_templates.py new file mode 100755 index 0000000000..2088923428 --- /dev/null +++ b/hls4ml/backends/catapult/passes/core_templates.py @@ -0,0 +1,216 @@ +from hls4ml.backends.backend import get_backend +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import Activation, BatchNormalization, Dense, HardActivation, ParametrizedActivation, PReLU, Softmax + +# Dense templates + +dense_config_template = """struct config{index} : nnet::dense_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned io_type = nnet::{iotype}; + static const unsigned strategy = nnet::{strategy}; + static const unsigned reuse_factor = {reuse}; + static const unsigned n_zeros = {nzeros}; + static const unsigned n_nonzeros = {nonzeros}; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in * n_out, reuse_factor) - n_zeros / reuse_factor; + static const bool store_weights_in_bram = false; + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef 
{weight_t.name} weight_t; + typedef {index_t.name} index_t; + template + using product = nnet::product::{product_type}; +}};\n""" + +dense_function_template = 'nnet::dense<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' + +dense_include_list = ['nnet_utils/nnet_dense.h', 'nnet_utils/nnet_dense_compressed.h', 'nnet_utils/nnet_dense_stream.h'] + + +class DenseConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Dense) + self.template = dense_config_template + + def format(self, node): + params = self._default_config_params(node) + params['nzeros'] = node.get_weights('weight').nzeros + params['nonzeros'] = node.get_weights('weight').nonzeros + params['product_type'] = get_backend('catapult').product_type( + node.get_input_variable().type.precision, node.get_weights('weight').type.precision + ) + + return self.template.format(**params) + + +class DenseFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(Dense, include_header=dense_include_list) + self.template = dense_function_template + + def format(self, node): + params = self._default_function_params(node) + params['w'] = node.get_weights('weight').name + params['b'] = node.get_weights('bias').name + + return self.template.format(**params) + + +# BatchNormalization templates + +batchnorm_config_template = """struct config{index} : nnet::batchnorm_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_filt = {n_filt}; + static const unsigned n_scale_bias = (n_filt == -1) ? 
n_in : n_filt; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in, reuse_factor); + static const bool store_weights_in_bram = false; + typedef {bias_t.name} bias_t; + typedef {scale_t.name} scale_t; + template + using product = nnet::product::{product_type}; +}};\n""" + +batchnorm_function_template = 'nnet::normalize<{input_t}, {output_t}, {config}>({input}, {output}, {scale}, {bias});' + +batchnorm_include_list = ['nnet_utils/nnet_batchnorm.h', 'nnet_utils/nnet_batchnorm_stream.h'] + + +class BatchNormalizationConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(BatchNormalization) + self.template = batchnorm_config_template + + def format(self, node): + params = self._default_config_params(node) + params['n_in'] = node.get_input_variable().size_cpp() + params['product_type'] = get_backend('catapult').product_type( + node.get_input_variable().type.precision, node.get_weights('scale').type.precision + ) + + return self.template.format(**params) + + +class BatchNormalizationFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(BatchNormalization, include_header=batchnorm_include_list) + self.template = batchnorm_function_template + + def format(self, node): + params = self._default_function_params(node) + params['scale'] = node.get_weights('scale').name + params['bias'] = node.get_weights('bias').name + + return self.template.format(**params) + + +# Activation templates + +activ_config_template = """struct {type}_config{index} : nnet::activ_config {{ + static const unsigned n_in = {n_in}; + static const unsigned table_size = {table_size}; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + typedef {table_t.name} table_t; +}};\n""" + +hard_activ_config_template = """struct {type}_config{index} {{ + static const unsigned n_in = {n_in}; + static const {slope_t.name} 
slope; + static const {shift_t.name} shift; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; +}}; +// really this allocation of pixels array ought to be in a .cpp file +#ifndef INCLUDED_MC_TESTBENCH_H +const {slope_t.name} {type}_config{index}::slope = {slope}; +const {shift_t.name} {type}_config{index}::shift = {shift}; +#endif\n""" + +softmax_config_template = """struct {type}_config{index} : nnet::activ_config {{ + static const unsigned n_in = {n_in}; + static const unsigned table_size = {table_size}; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + static const unsigned axis = {axis}; + static const nnet::softmax_implementation implementation = nnet::softmax_implementation::{implementation}; + typedef {exp_table_t.name} exp_table_t; + typedef {inv_table_t.name} inv_table_t; +}};\n""" + +activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {output});' +param_activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {param}, {output});' + +activ_include_list = ['nnet_utils/nnet_activation.h', 'nnet_utils/nnet_activation_stream.h'] + + +class ActivationConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__((Activation, ParametrizedActivation, PReLU)) + self.template = activ_config_template + + def format(self, node): + params = self._default_config_params(node) + params['type'] = node.get_attr('activation') + + return self.template.format(**params) + + +class HardActivationConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(HardActivation) + self.template = hard_activ_config_template + + def format(self, node): + params = self._default_config_params(node) + params['type'] = node.get_attr('activation') + + return self.template.format(**params) + + +class SoftmaxConfigTemplate(ActivationConfigTemplate): + def __init__(self): + 
super(ActivationConfigTemplate, self).__init__(Softmax) # Skip ActivationConfigTemplate's __init__ + self.template = softmax_config_template + + +class ActivationFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__((Activation, HardActivation, Softmax), include_header=activ_include_list) + self.template = activ_function_template + + def format(self, node): + params = self._default_function_params(node) + params['activation'] = node.get_attr('activation').lower() + params['config'] = '{}_config{}'.format(node.get_attr('activation'), node.index) + + return self.template.format(**params) + + +class ParametrizedActivationFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(ParametrizedActivation, include_header=activ_include_list) + self.template = param_activ_function_template + + def format(self, node): + params = self._default_function_params(node) + params['activation'] = node._get_act_function_name() + params['param'] = node.get_attr('activ_param', 1.0) + params['config'] = '{}_config{}'.format(node.get_attr('activation'), node.index) + + return self.template.format(**params) + + +class PReLUFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(PReLU, include_header=activ_include_list) + self.template = param_activ_function_template + + def format(self, node): + params = self._default_function_params(node) + params['activation'] = node.get_attr('activation').lower() + params['param'] = node.get_weights('alpha').name + params['config'] = '{}_config{}'.format(node.get_attr('activation'), node.index) + + return self.template.format(**params) diff --git a/hls4ml/backends/catapult/passes/fifo_depth_optimization.py b/hls4ml/backends/catapult/passes/fifo_depth_optimization.py new file mode 100755 index 0000000000..4d92e98de1 --- /dev/null +++ b/hls4ml/backends/catapult/passes/fifo_depth_optimization.py @@ -0,0 +1,104 @@ +import json + +from pyDigitalWaveTools.vcd.parser import VcdParser + 
+from hls4ml.model.optimizer.optimizer import ConfigurableOptimizerPass, ModelOptimizerPass + + +def populate_values(values, name, data, depth): + def get_values(x): + return int(x[1][1:], 2) + + values.append({'name': name, 'data': [], 'max': 0, 'depth': 0}) + values[-1]['data'] = [get_values(x) for x in data] + values[-1]['max'] = max(values[-1]['data']) + values[-1]['depth'] = int(depth[0][1][1:], 2) + return values + + +def set_big_fifos(vars_to_profile, profiling_fifo_depth): + for v in vars_to_profile.values(): + if v.pragma: + v.pragma = (v.pragma[0], profiling_fifo_depth) + + +def get_vcd_data(model): + model.write() + model.build(reset=False, csim=True, synth=True, cosim=True, validation=False, export=False, vsynth=False, fifo_opt=True) + + with open( + model.config.get_output_dir() + + '/' + + model.config.get_project_name() + + '_prj' + + '/solution1/sim/verilog/fifo_opt.vcd' + ) as vcd_file: + vcd = VcdParser() + vcd.parse(vcd_file) + data = vcd.scope.toJson() + return data + + +def generate_max_depth_file(model, maxs): + with open(model.config.get_output_dir() + '/max_depth.json', 'w') as f: + json.dump(maxs, f, indent=4) + + +def set_fifo_depth(model, maxs): + for v in model.output_vars.values(): + if v.pragma: + filtered_max = [x['max'] for x in maxs if v.name in x['name']] + if len(filtered_max) == 0: + continue + if len(filtered_max) > 1: + print('WARNING! 
Check names of FIFOs') + v.pragma = (v.pragma[0], filtered_max[0] + 1) + + +class FifoDepthOptimization(ConfigurableOptimizerPass, ModelOptimizerPass): + def __init__(self): + self.values = [] + + def transform(self, model): + # use `large_fifo_depth = 0` to keep the default fifo depth + profiling_fifo_depth = getattr(self, 'profiling_fifo_depth', 100_000) + + # check axi-stream or io-stream, if not one the 2 exit + if not (model.config.get_config_value('IOType') == 'io_stream'): + raise RuntimeError('To use this optimization you have to set `IOType` field to `io_stream` in the HLS config') + + # initialize all the fifos to `profiling_fifo_depth` so that they will be automatically implemented in BRAMs + # and so they will be profiled + if profiling_fifo_depth: + vars_to_profile = { + k: v + for k, v in model.output_vars.items() + if v != model.get_output_variables()[0] and v != model.get_input_variables()[0] + } + + set_big_fifos(vars_to_profile, profiling_fifo_depth) + + data = get_vcd_data(model) + + if len(data['children']) == 0: + print( + "FIFO depth optimization found no FIFOs implemented using BRAMs in the design, no optimization is possible." 
+ ) + print("Consider increasing profiling_fifo_depth.") + return False + + n_elem = len(data['children'][0]['children'][0]['children']) + for i in range(n_elem): + name = data['children'][0]['children'][0]['children'][i]['name'] + data_p = data['children'][0]['children'][0]['children'][i]['children'][0]['data'] + depth = data['children'][0]['children'][0]['children'][i]['children'][1]['data'] + populate_values(self.values, name, data_p, depth) + + maxs = [{'name': i['name'], 'max': i['max'], 'depth': i['depth']} for i in self.values] + + generate_max_depth_file(model, maxs) + + set_fifo_depth(model, maxs) + + print('[hls4ml] - FIFO optimization completed') + return False diff --git a/hls4ml/backends/catapult/passes/garnet_templates.py b/hls4ml/backends/catapult/passes/garnet_templates.py new file mode 100755 index 0000000000..f73f627683 --- /dev/null +++ b/hls4ml/backends/catapult/passes/garnet_templates.py @@ -0,0 +1,249 @@ +import numpy as np + +from hls4ml.backends.fpga.fpga_types import ACTypeConverter +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import GarNet, GarNetStack +from hls4ml.model.types import FixedPrecisionType + +# GarNet templates + +garnet_common_config_template = """ + static const unsigned n_vertices = {n_vertices}; + static const unsigned n_vertices_width = {n_vertices_width}; + static const unsigned n_in_features = {n_in_features}; + static const unsigned distance_width = {distance_width}; + static const unsigned output_collapse = {collapse_type}; + static const bool mean_by_nvert = {mean_by_nvert}; + + typedef {norm_t} norm_t; + typedef ac_fixed<{distance_width}, {distance_nint}, true, AC_TRN, AC_SAT> distance_t; + typedef {edge_weight_t} edge_weight_t; + typedef {edge_weight_aggr_t} edge_weight_aggr_t; + typedef {aggr_t} aggr_t; + typedef {output_t} output_t; + + static const unsigned reuse_factor = {reuse}; + static const unsigned log2_reuse_factor = {log2_reuse}; +""" + 
+garnet_config_template = """struct config{index} : nnet::garnet_config {{""" +garnet_config_template += garnet_common_config_template +garnet_config_template += """ + static const unsigned n_propagate = {n_propagate}; + static const unsigned n_aggregators = {n_aggregators}; + static const unsigned n_out_features = {n_out_features}; + + typedef {input_transform_weights_t} input_transform_weights_t; + typedef {input_transform_biases_t} input_transform_biases_t; + typedef {aggregator_distance_weights_t} aggregator_distance_weights_t; + typedef {aggregator_distance_biases_t} aggregator_distance_biases_t; + typedef {output_transform_weights_t} output_transform_weights_t; + typedef {output_transform_biases_t} output_transform_biases_t; + + static const input_transform_weights_t (&input_transform_weights)[{input_transform_weights_size}]; + static const input_transform_biases_t (&input_transform_biases)[{input_transform_biases_size}]; + static const aggregator_distance_weights_t (&aggregator_distance_weights)[{aggregator_distance_weights_size}]; + static const aggregator_distance_biases_t (&aggregator_distance_biases)[{aggregator_distance_biases_size}]; + static const output_transform_weights_t (&output_transform_weights)[{output_transform_weights_size}]; + static const output_transform_biases_t (&output_transform_biases)[{output_transform_biases_size}]; + + typedef config{index} base_t; +}}; + +const config{index}::input_transform_weights_t (&config{index}::input_transform_weights)[{input_transform_weights_size}] = {input_transform_weights}; +const config{index}::input_transform_biases_t (&config{index}::input_transform_biases)[{input_transform_biases_size}] = {input_transform_biases}; +const config{index}::aggregator_distance_weights_t (&config{index}::aggregator_distance_weights)[{aggregator_distance_weights_size}] = {aggregator_distance_weights}; +const config{index}::aggregator_distance_biases_t 
(&config{index}::aggregator_distance_biases)[{aggregator_distance_biases_size}] = {aggregator_distance_biases}; +const config{index}::output_transform_weights_t (&config{index}::output_transform_weights)[{output_transform_weights_size}] = {output_transform_weights}; +const config{index}::output_transform_biases_t (&config{index}::output_transform_biases)[{output_transform_biases_size}] = {output_transform_biases}; +""" # noqa: E501 + +garnet_function_template = ( + 'nnet::garnet{impl}<{input_t}, {integer_input_t}, {output_t}, {config}>({input}, {nvtx}, {output});' +) + +garnet_include_list = ['nnet_utils/nnet_garnet.h'] + + +class GarNetConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(GarNet) + self.template = (garnet_config_template,) + + def get_transforms_config(self, node, params): + params['n_in_features'] = node.attributes['n_in_features'] + params['n_propagate'] = node.attributes['n_propagate'] + params['n_aggregators'] = node.get_weights('aggregator_distance_biases').shape[0] + params['n_out_features'] = node.get_weights('output_transform_biases').shape[0] + + for wname, weights in node.weights.items(): + params[wname] = weights.name + params[f'{wname}_t'] = weights.type.name + params[f'{wname}_size'] = weights.data_length + + def format(self, node): + params = self._default_config_params(node) + + params['n_vertices'] = node.attributes['n_vertices'] + params['n_vertices_width'] = int(np.log2(params['n_vertices'])) + params['distance_width'] = 12 + params['distance_nint'] = min(4, params['distance_width'] - 6) # this is tuned + params['log2_reuse'] = int(np.log2(params['reuse'])) + + # Define default precisions for various internal arrays (can be overridden from the config file) + # We always give 10 digits for the subintegral part + fwidth = 10 + # Integral precision for aggr_t depends on how large the temporary sum for weighed feature mean will be + aggr_intw = max(params['log2_reuse'], params['n_vertices_width'] - 
params['log2_reuse']) + 3 # safety factor 2**3 + aggr_w = aggr_intw + fwidth + # edge_weight_aggr_t does not need the safety factor + ew_aggr_intw = aggr_intw - 3 + ew_aggr_w = ew_aggr_intw + fwidth + # Integral precision for norm is fixed to 4 + norm_intw = 4 + norm_w = norm_intw + fwidth + + vspecs = [ + ('edge_weight', FixedPrecisionType(10, 0, signed=False)), + ('edge_weight_aggr', FixedPrecisionType(ew_aggr_w, ew_aggr_intw, signed=False)), + ('aggr', FixedPrecisionType(aggr_w, aggr_intw)), + ('norm', FixedPrecisionType(norm_w, norm_intw, signed=False)), + ] + precision_converter = ACTypeConverter() + for vname, default_precision in vspecs: + params[f'{vname}_t'], type_name = node.model.config.get_precision(node, var=vname) + if type_name.endswith('default_t'): + params[f'{vname}_t'] = precision_converter.convert(default_precision).definition_cpp() + else: + params[f'{vname}_t'] = precision_converter.convert(params[f'{vname}_t']).definition_cpp() + params['output_t'] = node.get_output_variable().type.name + + if node.attributes['collapse'] in ['mean', 'max']: + params['collapse_type'] = 'collapse_{}'.format(node.attributes['collapse']) + else: + params['collapse_type'] = 'no_collapse' + + params['mean_by_nvert'] = str(node.attributes['mean_by_nvert']).lower() + + self.get_transforms_config(node, params) + + return self.template[0].format(**params) + + +class GarNetFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(GarNet, include_header=garnet_include_list) + self.template = garnet_function_template + + def format(self, node): + params = self._default_function_params(node) + + data = node.get_input_variable(node.inputs[0]) + integer_input = node.get_input_variable(node.inputs[1]) + params['input_t'] = data.type.name + params['input'] = data.name + + params['integer_input_t'] = integer_input.type.name + params['nvtx'] = integer_input.name + + if node.ref_impl: + params['impl'] = '_ref' + else: + params['impl'] = '' + + return 
self.template.format(**params) + + +# GarNetStack Templates + +garnet_stack_base_config_template = """struct config{index}_base : nnet::garnet_config {{""" +garnet_stack_base_config_template += garnet_common_config_template +garnet_stack_base_config_template += """ + static const bool is_stack = true; + + typedef config{index}_base base_t; +}}; + +struct config{index} : config{index}_base {{ + static const unsigned n_sublayers = {n_sublayers}; + + template<int il> + struct sublayer_t : config{index}_base {{}}; +}}; + +{sublayer_configs} +""" + +garnet_stack_sublayer_config_template = """template<> +struct config{index}::sublayer_t<{il}> : config{index}_base {{ + static const unsigned n_in_features = {n_in_features}; + static const unsigned n_propagate = {n_propagate}; + static const unsigned n_aggregators = {n_aggregators}; + static const unsigned n_out_features = {n_out_features}; + + typedef {input_transform_weights_t} input_transform_weights_t; + typedef {input_transform_biases_t} input_transform_biases_t; + typedef {aggregator_distance_weights_t} aggregator_distance_weights_t; + typedef {aggregator_distance_biases_t} aggregator_distance_biases_t; + typedef {output_transform_biases_t} output_transform_biases_t; + + static const input_transform_weights_t (&input_transform_weights)[{input_transform_weights_size}]; + static const input_transform_biases_t (&input_transform_biases)[{input_transform_biases_size}]; + static const aggregator_distance_weights_t (&aggregator_distance_weights)[{aggregator_distance_weights_size}]; + static const aggregator_distance_biases_t (&aggregator_distance_biases)[{aggregator_distance_biases_size}]; + static const output_transform_biases_t (&output_transform_biases)[{output_transform_biases_size}]; + + typedef config{index}::sublayer_t<{next}> next_layer_t; +}}; + +const config{index}::sublayer_t<{il}>::input_transform_weights_t (&config{index}::sublayer_t<{il}>::input_transform_weights)[{input_transform_weights_size}] =
{input_transform_weights}; +const config{index}::sublayer_t<{il}>::input_transform_biases_t (&config{index}::sublayer_t<{il}>::input_transform_biases)[{input_transform_biases_size}] = {input_transform_biases}; +const config{index}::sublayer_t<{il}>::aggregator_distance_weights_t (&config{index}::sublayer_t<{il}>::aggregator_distance_weights)[{aggregator_distance_weights_size}] = {aggregator_distance_weights}; +const config{index}::sublayer_t<{il}>::aggregator_distance_biases_t (&config{index}::sublayer_t<{il}>::aggregator_distance_biases)[{aggregator_distance_biases_size}] = {aggregator_distance_biases}; +const config{index}::sublayer_t<{il}>::output_transform_biases_t (&config{index}::sublayer_t<{il}>::output_transform_biases)[{output_transform_biases_size}] = {output_transform_biases}; +""" # noqa: E501 + +garnet_stack_config_template = (garnet_stack_base_config_template, garnet_stack_sublayer_config_template) +garnet_stack_function_template = ( + 'nnet::garnet_stack<{input_t}, {integer_input_t}, {output_t}, {config}>({input}, {nvtx}, {output});' +) + + +class GarNetStackConfigTemplate(GarNetConfigTemplate): + def __init__(self): + super(GarNetConfigTemplate, self).__init__(GarNetStack) + self.template = garnet_stack_config_template + + def get_transforms_config(self, node, params): + _, sublayer_template = self.template + + params['n_sublayers'] = node.attributes['n_sublayers'] + params['n_in_features'] = node.attributes['n_in_features'][0] + params['n_out_features'] = node.attributes['n_out_features'][-1] + + sublayer_configs = [] + for il in range(node.attributes['n_sublayers'] - 1, -1, -1): + sub_params = {'index': node.index, 'il': il} + + for p in ['n_in_features', 'n_propagate', 'n_aggregators', 'n_out_features']: + sub_params[p] = node.attributes[p][il] + + for wname, weights in node._sublayer_weights[il].items(): + sub_params[wname] = weights.name + sub_params[f'{wname}_t'] = weights.type.name + sub_params[f'{wname}_size'] = weights.data_length + + if il 
!= node.attributes['n_sublayers'] - 1: + sub_params['next'] = il + 1 + else: + sub_params['next'] = 0 + + sublayer_configs.append(sublayer_template.format(**sub_params)) + + params['sublayer_configs'] = '\n'.join(sublayer_configs) + + +class GarNetStackFunctionTemplate(GarNetFunctionTemplate): + def __init__(self): + super(GarNetFunctionTemplate, self).__init__(GarNetStack, include_header=garnet_include_list) + self.template = garnet_stack_function_template diff --git a/hls4ml/backends/catapult/passes/merge_templates.py b/hls4ml/backends/catapult/passes/merge_templates.py new file mode 100755 index 0000000000..ff6928679c --- /dev/null +++ b/hls4ml/backends/catapult/passes/merge_templates.py @@ -0,0 +1,106 @@ +from hls4ml.backends.backend import get_backend +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import Concatenate, Dot, Merge + +# Merge templates + +merge_config_template = """struct config{index} : nnet::merge_config {{ + static const unsigned n_elem = {n_elem}; +}};\n""" + +merge_function_template = 'nnet::{merge}<{input1_t}, {input2_t}, {output_t}, {config}>({input1}, {input2}, {output});' + +merge_include_list = ['nnet_utils/nnet_merge.h', 'nnet_utils/nnet_merge_stream.h'] + + +class MergeConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Merge) + self.template = merge_config_template + + def format(self, node): + params = self._default_config_params(node) + params['n_elem'] = node.get_input_variable(node.inputs[0]).size_cpp() + + return self.template.format(**params) + + +class MergeFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__((Merge, Concatenate, Dot), include_header=merge_include_list) + self.template = merge_function_template + + def format(self, node): + params = {} + params['merge'] = node.get_attr('op').lower() + params['config'] = f'config{node.index}' + params['input1_t'] = node.get_input_variable(node.inputs[0]).type.name + 
params['input2_t'] = node.get_input_variable(node.inputs[1]).type.name + params['output_t'] = node.get_output_variable().type.name + params['input1'] = node.get_input_variable(node.inputs[0]).name + params['input2'] = node.get_input_variable(node.inputs[1]).name + params['output'] = node.get_output_variable().name + + return self.template.format(**params) + + +# Dot templates + +dot_config_template = """struct config{index} : nnet::dot_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned reuse_factor = {reuse}; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in, reuse_factor); + typedef {accum_t.name} accum_t; + template<class x_T, class y_T> + using product = nnet::product::{product_type}<x_T, y_T>; +}};\n""" + + +class DotConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Dot) + self.template = dot_config_template + + def format(self, node): + inp1 = node.get_input_variable(node.inputs[0]) + inp2 = node.get_input_variable(node.inputs[1]) + params = self._default_config_params(node) + params['n_out'] = 1 + params['n_in'] = inp1.shape[0] + params['product_type'] = get_backend('catapult').product_type(inp1.type.precision, inp2.type.precision) + + return self.template.format(**params) + + +# Concatenate templates + +concat_config_template = """struct config{index} : nnet::concat_config {{ + static const unsigned n_elem1_0 = {n_elem1_0}; + static const unsigned n_elem1_1 = {n_elem1_1}; + static const unsigned n_elem1_2 = {n_elem1_2}; + static const unsigned n_elem2_0 = {n_elem2_0}; + static const unsigned n_elem2_1 = {n_elem2_1}; + static const unsigned n_elem2_2 = {n_elem2_2}; + + static const int axis = {axis}; +}};\n""" + + +class ConcatenateConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Concatenate) + self.template = concat_config_template + + def format(self, node): + params = self._default_config_params(node) + for i in range(3): + params.setdefault(f'n_elem1_{i}', 0)
+ params.setdefault(f'n_elem2_{i}', 0) + inp1 = node.get_input_variable(node.inputs[0]) + inp2 = node.get_input_variable(node.inputs[1]) + for i, (s1, s2) in enumerate(zip(inp1.shape, inp2.shape)): + params[f'n_elem1_{i}'] = s1 + params[f'n_elem2_{i}'] = s2 + + return self.template.format(**params) diff --git a/hls4ml/backends/catapult/passes/pointwise.py b/hls4ml/backends/catapult/passes/pointwise.py new file mode 100755 index 0000000000..2dd982b5d4 --- /dev/null +++ b/hls4ml/backends/catapult/passes/pointwise.py @@ -0,0 +1,92 @@ +from copy import copy + +import numpy as np + +from hls4ml.backends.catapult.passes.convolution_templates import ( + Conv1DConfigTemplate, + Conv1DFunctionTemplate, + Conv2DConfigTemplate, + Conv2DFunctionTemplate, + conv1d_config_template, + conv2d_config_template, + conv_mult_config_template, +) +from hls4ml.backends.fpga.fpga_layers import PointwiseConv1D, PointwiseConv2D +from hls4ml.model.layers import register_layer +from hls4ml.model.optimizer import OptimizerPass + +pointwise_conv1d_function_template = ( + 'nnet::pointwise_conv_1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' +) +pointwise_conv2d_function_template = ( + 'nnet::pointwise_conv_2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' +) + +sepconv1d_include_list = ['nnet_utils/nnet_conv1d.h', 'nnet_utils/nnet_sepconv1d_stream.h'] +sepconv2d_include_list = ['nnet_utils/nnet_conv2d.h', 'nnet_utils/nnet_sepconv2d_stream.h'] + + +class PointwiseConv1DConfigTemplate(Conv1DConfigTemplate): + def __init__(self): + super(Conv1DConfigTemplate, self).__init__(PointwiseConv1D) + self.template = conv1d_config_template + self.mult_template = conv_mult_config_template + + +class PointwiseConv1DFunctionTemplate(Conv1DFunctionTemplate): + def __init__(self): + super(Conv1DFunctionTemplate, self).__init__(PointwiseConv1D, include_header=sepconv1d_include_list) + self.template = pointwise_conv1d_function_template + + +class 
PointwiseConv2DConfigTemplate(Conv2DConfigTemplate): + def __init__(self): + super(Conv2DConfigTemplate, self).__init__(PointwiseConv2D) + self.template = conv2d_config_template + self.mult_template = conv_mult_config_template + + +class PointwiseConv2DFunctionTemplate(Conv2DFunctionTemplate): + def __init__(self): + super(Conv2DFunctionTemplate, self).__init__(PointwiseConv2D, include_header=sepconv2d_include_list) + self.template = pointwise_conv2d_function_template + + +def register_pointwise(backend): + # Register the layer types to the layer map + register_layer('PointwiseConv1D', PointwiseConv1D) + register_layer('PointwiseConv2D', PointwiseConv2D) + + # Register the optimization passes + backend.register_pass('optimize_pointwise_conv', OptimizePointwiseConv) + + # Register template passes + backend.register_template(PointwiseConv1DConfigTemplate) + backend.register_template(PointwiseConv1DFunctionTemplate) + backend.register_template(PointwiseConv2DConfigTemplate) + backend.register_template(PointwiseConv2DFunctionTemplate) + + +class OptimizePointwiseConv(OptimizerPass): + def match(self, node): + return ( + node.class_name in ('Conv1D', 'Conv2D') + and node.get_attr('filt_height', 1) == 1 + and node.get_attr('filt_width') == 1 + ) + + def transform(self, model, node): + dim = node.__class__.__name__[-2:] # '1D' or '2D' + pw_node = model.make_node('PointwiseConv' + dim, node.name, copy(node.attributes), node.inputs.copy()) + if len(node.weights['weight'].data.shape) == 2: # This can happen if we assign weights of Dense layer to 1x1 Conv2D + expand_axis = tuple(range(int(dim[0]))) + pw_node.weights['weight'].data = np.expand_dims(node.weights['weight'].data, axis=expand_axis) + pw_node.weights['bias'].data = node.weights['bias'].data + # Set strategy to ensure lowercase string is passed to the template + if model.config.is_resource_strategy(pw_node): + pw_node.set_attr('strategy', 'resource') + else: + pw_node.set_attr('strategy', 'latency') + 
model.replace_node(node, pw_node) + + return True diff --git a/hls4ml/backends/catapult/passes/pooling_templates.py b/hls4ml/backends/catapult/passes/pooling_templates.py new file mode 100755 index 0000000000..77205a5df7 --- /dev/null +++ b/hls4ml/backends/catapult/passes/pooling_templates.py @@ -0,0 +1,109 @@ +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import GlobalPooling1D, GlobalPooling2D, Pooling1D, Pooling2D + +# Pooling templates + +pooling1d_config_template = """struct config{index} : nnet::pooling1d_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned n_filt = {n_filt}; + static const unsigned pool_width = {pool_width}; + + static const unsigned filt_width = pool_width; + static const unsigned n_chan = n_filt; + + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; + static const bool count_pad = {count_pad}; + static const unsigned stride_width = {stride_width}; + static const nnet::Pool_Op pool_op = nnet::{pool_op}; + static const nnet::conv_implementation implementation = nnet::conv_implementation::{implementation}; + static const unsigned reuse_factor = {reuse}; + typedef {accum_t.name} accum_t; +}};\n""" + +pooling2d_config_template = """struct config{index} : nnet::pooling2d_config {{ + static const unsigned in_height = {in_height}; + static const unsigned in_width = {in_width}; + static const unsigned n_filt = {n_filt}; + static const unsigned stride_height = {stride_height}; + static const unsigned stride_width = {stride_width}; + static const unsigned pool_height = {pool_height}; + static const unsigned pool_width = {pool_width}; + + static const unsigned filt_height = pool_height; + static const unsigned filt_width = pool_width; + static const unsigned n_chan = n_filt; + + static const unsigned out_height = {out_height}; + static const unsigned out_width = {out_width}; + static const 
unsigned pad_top = {pad_top}; + static const unsigned pad_bottom = {pad_bottom}; + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; + static const bool count_pad = {count_pad}; + static const nnet::Pool_Op pool_op = nnet::{pool_op}; + static const nnet::conv_implementation implementation = nnet::conv_implementation::{implementation}; + static const unsigned reuse_factor = {reuse}; + typedef {accum_t.name} accum_t; +}};\n""" + +global_pooling1d_config_template = """struct config{index} : nnet::pooling1d_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_filt = {n_filt}; + static const nnet::Pool_Op pool_op = nnet::{pool_op}; + static const unsigned reuse_factor = {reuse}; + typedef {accum_t.name} accum_t; +}};\n""" + +global_pooling2d_config_template = """struct config{index} : nnet::pooling2d_config {{ + static const unsigned in_height = {in_height}; + static const unsigned in_width = {in_width}; + static const unsigned n_filt = {n_filt}; + static const nnet::Pool_Op pool_op = nnet::{pool_op}; + static const unsigned reuse_factor = {reuse}; + typedef {accum_t.name} accum_t; +}};\n""" + +pooling1d_function_template = 'nnet::pooling1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' +pooling2d_function_template = 'nnet::pooling2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' +global_pooling1d_function_template = ( + 'nnet::global_pooling1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' +) +global_pooling2d_function_template = ( + 'nnet::global_pooling2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' +) + +pooling_include_list = ['nnet_utils/nnet_pooling.h', 'nnet_utils/nnet_pooling_stream.h'] + + +class PoolingConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__((Pooling1D, Pooling2D, GlobalPooling1D, GlobalPooling2D)) + self.templates = { + 'Pooling1D': pooling1d_config_template, + 
'Pooling2D': pooling2d_config_template, + 'GlobalPooling1D': global_pooling1d_config_template, + 'GlobalPooling2D': global_pooling2d_config_template, + } + + def format(self, node): + params = self._default_config_params(node) + return self.templates[node.class_name].format(**params) + + +class PoolingFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__((Pooling1D, Pooling2D, GlobalPooling1D, GlobalPooling2D), include_header=pooling_include_list) + self.templates = { + 'Pooling1D': pooling1d_function_template, + 'Pooling2D': pooling2d_function_template, + 'GlobalPooling1D': global_pooling1d_function_template, + 'GlobalPooling2D': global_pooling2d_function_template, + } + + def format(self, node): + params = self._default_function_params(node) + params['data_format'] = 'cf' if node.get_attr('data_format') == 'channels_first' else 'cl' + + return self.templates[node.class_name].format(**params) diff --git a/hls4ml/backends/catapult/passes/quantization_templates.py b/hls4ml/backends/catapult/passes/quantization_templates.py new file mode 100755 index 0000000000..7086b187f9 --- /dev/null +++ b/hls4ml/backends/catapult/passes/quantization_templates.py @@ -0,0 +1,36 @@ +from hls4ml.backends.backend import get_backend +from hls4ml.backends.catapult.passes.core_templates import ( + batchnorm_config_template, + batchnorm_function_template, + batchnorm_include_list, +) +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.optimizer.passes.qkeras import ApplyAlpha + + +class ApplyAlphaConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(ApplyAlpha) + self.template = batchnorm_config_template + + def format(self, node): + params = self._default_config_params(node) + params['n_in'] = node.get_input_variable().size_cpp() + params['product_type'] = get_backend('catapult').product_type( + node.get_input_variable().type.precision, node.get_weights('scale').type.precision + ) + + return 
self.template.format(**params) + + +class ApplyAlphaFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(ApplyAlpha, include_header=batchnorm_include_list) + self.template = batchnorm_function_template + + def format(self, node): + params = self._default_function_params(node) + params['scale'] = node.get_weights('scale').name + params['bias'] = node.get_weights('bias').name + + return self.template.format(**params) diff --git a/hls4ml/backends/catapult/passes/recurrent_templates.py b/hls4ml/backends/catapult/passes/recurrent_templates.py new file mode 100755 index 0000000000..4079f25721 --- /dev/null +++ b/hls4ml/backends/catapult/passes/recurrent_templates.py @@ -0,0 +1,175 @@ +from hls4ml.backends.backend import get_backend +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import GRU, LSTM + +# recurrent multiplication template + +recr_mult_config_template = """struct config{index} : nnet::dense_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned strategy = nnet::{strategy}; + static const unsigned reuse_factor = {reuse}; + static const unsigned n_zeros = {nzeros}; + static const unsigned n_nonzeros = {nonzeros}; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in * n_out, reuse_factor) - n_zeros / reuse_factor; + static const bool store_weights_in_bram = false; + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef {weight_t.name} weight_t; + typedef {index_t.name} index_t; + template<class x_T, class y_T> + using product = nnet::product::{product_type}<x_T, y_T>; +}};\n""" + +# activation templates + +activ_config_template = """struct {type}_config{index} : nnet::activ_config {{ + static const unsigned n_in = {n_in}; + static const unsigned table_size = {table_size}; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + typedef {table_t.name} table_t; +}};\n""" + 
+recr_activ_config_template = """struct {type}_config{index}_recr : nnet::activ_config {{ + static const unsigned n_in = {n_in}; + static const unsigned table_size = {table_size}; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + typedef {table_t.name} table_t; +}};\n""" + +# LSTM + GRU templates + +recr_config_template = """struct config{index} : nnet::{recr_type}_config {{ + typedef {accum_t.name} accum_t; + typedef {weight_t.name} weight_t; // Matrix + typedef {bias_t.name} bias_t; // Vector + typedef {config_mult_t1} mult_config1; + typedef {config_mult_t2} mult_config2; + typedef {recr_act_t} ACT_CONFIG_{RECR_TYPE}; + template<class x_T, class y_T, class config_T> + using activation_recr = nnet::activation::{recurrent_activation}<x_T, y_T, config_T>; + typedef {act_t} ACT_CONFIG_T; + template<class x_T, class y_T, class config_T> + using activation = nnet::activation::{activation}<x_T, y_T, config_T>; + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned n_state = {n_state}; + static const unsigned n_sequence = {n_sequence}; + static const unsigned n_sequence_out = {n_sequence_out}; + static const unsigned io_type = nnet::{strategy}; + static const unsigned reuse_factor = {reuse}; + static const bool store_weights_in_bram = false; + static const bool use_static = {static}; +}};\n""" + +recr_function_template = 'nnet::{recr_type}_stack<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {wr}, {b}, {br});' + +recr_include_list = ['nnet_utils/nnet_recurrent.h'] + + +class RecurrentConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__((LSTM, GRU)) + self.template = recr_config_template + self.act_template = activ_config_template + self.recr_act_template = recr_activ_config_template + self.mult1_template = recr_mult_config_template + self.mult2_template = recr_mult_config_template + + def format(self, node): + params = self._default_config_params(node) + + params['n_in'] = node.get_input_variable().dim_names[1] + params['n_sequence'] = 
node.get_input_variable().dim_names[0] + if node.get_attr('return_sequences'): + params['n_sequence_out'] = node.get_output_variable().dim_names[0] + params['n_state'] = node.get_output_variable().dim_names[1] + params['n_out'] = node.get_output_variable().dim_names[1] + else: + params['n_sequence_out'] = 1 + params['n_state'] = node.get_output_variable().dim_names[0] + params['n_out'] = node.get_output_variable().dim_names[0] + params['config_mult_t1'] = f'config{node.index}_1' + params['config_mult_t2'] = f'config{node.index}_2' + params['recr_act_t'] = '{}_config{}_recr'.format(node.get_attr('recurrent_activation'), node.index) + params['act_t'] = '{}_config{}'.format(node.get_attr('activation'), node.index) + params['strategy'] = node.get_attr('strategy') + params['static'] = 'true' if node.attributes['static'] else 'false' + params['recr_type'] = node.class_name.lower() + params['RECR_TYPE'] = node.class_name + + if node.class_name == 'LSTM': + n_recr_mult = 4 + else: # GRU + n_recr_mult = 3 + + recr_config = self.template.format(**params) + + act_params = self._default_config_params(node) + recr_act_params = self._default_config_params(node) + + act_params['type'] = node.get_attr('activation') + recr_act_params['type'] = node.get_attr('recurrent_activation') + if node.get_attr('return_sequences'): + act_params['n_in'] = node.get_output_variable().dim_names[1] + recr_act_params['n_in'] = node.get_output_variable().dim_names[1] + ' * %i' % (n_recr_mult - 1) + else: + act_params['n_in'] = node.get_output_variable().dim_names[0] + recr_act_params['n_in'] = node.get_output_variable().dim_names[0] + ' * %i' % (n_recr_mult - 1) + + act_config = self.act_template.format(**act_params) + recr_act_config = self.recr_act_template.format(**recr_act_params) + + mult_params1 = self._default_config_params(node) + mult_params2 = self._default_config_params(node) + + mult_params1['n_in'] = node.get_input_variable().dim_names[1] + if node.get_attr('return_sequences'): + 
mult_params1['n_out'] = node.get_output_variable().dim_names[1] + ' * %i' % n_recr_mult + else: + mult_params1['n_out'] = node.get_output_variable().dim_names[0] + ' * %i' % n_recr_mult + mult_params1['product_type'] = get_backend('catapult').product_type( + node.get_input_variable().type.precision, node.get_weights('weight').type.precision + ) + mult_params1['reuse'] = params['reuse'] + mult_params1['index'] = str(node.index) + '_1' + mult_params1['nzeros'] = node.get_weights('weight').nzeros + mult_params1['nonzeros'] = node.get_weights('weight').nonzeros + if node.get_attr('return_sequences'): + mult_params2['n_in'] = node.get_output_variable().dim_names[1] + mult_params2['n_out'] = node.get_output_variable().dim_names[1] + ' * %i' % n_recr_mult + else: + mult_params2['n_in'] = node.get_output_variable().dim_names[0] + mult_params2['n_out'] = node.get_output_variable().dim_names[0] + ' * %i' % n_recr_mult + mult_params2['product_type'] = get_backend('catapult').product_type( + node.get_input_variable().type.precision, node.get_weights('recurrent_weight').type.precision + ) + mult_params2['reuse'] = node.attributes['recurrent_reuse_factor'] + mult_params2['index'] = str(node.index) + '_2' + mult_params2['nzeros'] = node.get_weights('recurrent_weight').nzeros + mult_params2['nonzeros'] = node.get_weights('recurrent_weight').nonzeros + + mult_config1 = self.mult1_template.format(**mult_params1) + mult_config2 = self.mult2_template.format(**mult_params2) + + return mult_config1 + '\n' + mult_config2 + '\n' + recr_act_config + '\n' + act_config + '\n' + recr_config + + +class RecurrentFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__((LSTM, GRU), include_header=recr_include_list) + self.template = recr_function_template + + def format(self, node): + params = self._default_function_params(node) + params['w'] = node.get_weights('weight').name + params['b'] = node.get_weights('bias').name + params['wr'] = 
node.get_weights('recurrent_weight').name + params['br'] = node.get_weights('recurrent_bias').name + params['activation'] = node.get_attr('activation') + params['recurrent_activation'] = node.get_attr('recurrent_activation') + params['recr_type'] = node.class_name.lower() + + return self.template.format(**params) diff --git a/hls4ml/backends/catapult/passes/reshaping_templates.py b/hls4ml/backends/catapult/passes/reshaping_templates.py new file mode 100755 index 0000000000..ec6705eb29 --- /dev/null +++ b/hls4ml/backends/catapult/passes/reshaping_templates.py @@ -0,0 +1,132 @@ +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import Resize, Transpose, ZeroPadding1D, ZeroPadding2D + +# ZeroPadding templates + +zeropad1d_config_template = """struct config{index} : nnet::padding1d_config {{ + static const unsigned in_width = {in_width}; + static const unsigned n_chan = {n_chan}; + static const unsigned out_width = {out_width}; + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; +}};\n""" + +zeropad2d_config_template = """struct config{index} : nnet::padding2d_config {{ + static const unsigned in_height = {in_height}; + static const unsigned in_width = {in_width}; + static const unsigned n_chan = {n_chan}; + static const unsigned out_height = {out_height}; + static const unsigned out_width = {out_width}; + static const unsigned pad_top = {pad_top}; + static const unsigned pad_bottom = {pad_bottom}; + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; +}};\n""" + +zeropad1d_function_template = 'nnet::zeropad1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' +zeropad2d_function_template = 'nnet::zeropad2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' + +padding_include_list = ['nnet_utils/nnet_padding.h', 'nnet_utils/nnet_padding_stream.h'] + + +class 
ZeroPaddingConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__((ZeroPadding1D, ZeroPadding2D)) + self.templates = { + 'ZeroPadding1D': zeropad1d_config_template, + 'ZeroPadding2D': zeropad2d_config_template, + } + + def format(self, node): + params = self._default_config_params(node) + return self.templates[node.class_name].format(**params) + + +class ZeroPaddingFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__((ZeroPadding1D, ZeroPadding2D), include_header=padding_include_list) + self.templates = { + 'ZeroPadding1D': zeropad1d_function_template, + 'ZeroPadding2D': zeropad2d_function_template, + } + + def format(self, node): + params = self._default_function_params(node) + params['data_format'] = 'cf' if node.get_attr('data_format') == 'channels_first' else 'cl' + + return self.templates[node.class_name].format(**params) + + +# Resize templates + +resize_config_template = """struct config{index} : nnet::resize_config {{ + static const unsigned height = {in_height}; + static const unsigned width = {in_width}; + static const unsigned n_chan = {n_chan}; + static const unsigned new_height = {out_height}; + static const unsigned new_width = {out_width}; +}};\n""" + +resize_function_template = 'nnet::resize_{algorithm}<{input_t}, {config}>({input}, {output});' + +resize_include_list = ['nnet_utils/nnet_image.h', 'nnet_utils/nnet_image_stream.h'] + + +class ResizeConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Resize) + self.template = resize_config_template + + def format(self, node): + params = self._default_config_params(node) + + return self.template.format(**params) + + +class ResizeFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(Resize, include_header=resize_include_list) + self.template = resize_function_template + + def format(self, node): + params = self._default_function_params(node) + params['algorithm'] = node.get_attr('algorithm') + + return 
self.template.format(**params) + + +# Transpose templates + +transpose_config_template = """struct config{index} : nnet::transpose_config {{ + static const unsigned depth = {depth}; + static const unsigned height = {height}; + static const unsigned width = {width}; + static constexpr unsigned perm[3] = {{{perm_str}}}; +}};\n""" + +transpose_function_template = 'nnet::transpose_{dim}<{input_t}, {output_t}, {config}>({input}, {output});' + +transpose_include_list = ['nnet_utils/nnet_array.h', 'nnet_utils/nnet_stream.h'] + + +class TransposeConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Transpose) + self.template = transpose_config_template + + def format(self, node): + params = self._default_config_params(node) + + return self.template.format(**params) + + +class TransposeFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(Transpose, include_header=transpose_include_list) + self.template = transpose_function_template + + def format(self, node): + params = self._default_function_params(node) + params['dim'] = node.get_attr('dim') + + return self.template.format(**params) diff --git a/hls4ml/backends/catapult/passes/resource_strategy.py b/hls4ml/backends/catapult/passes/resource_strategy.py new file mode 100755 index 0000000000..63e6e0b4db --- /dev/null +++ b/hls4ml/backends/catapult/passes/resource_strategy.py @@ -0,0 +1,48 @@ +import numpy as np + +from hls4ml.model.layers import GRU, LSTM, Conv1D, Conv2D, Dense, SeparableConv1D, SeparableConv2D +from hls4ml.model.optimizer import OptimizerPass + + +class ApplyResourceStrategy(OptimizerPass): + '''Transposes the weights to use the dense_resource matrix multiply routine''' + + def match(self, node): + node_matches = isinstance(node, (Dense, Conv1D, SeparableConv1D, Conv2D, SeparableConv2D, LSTM, GRU)) + is_resource_strategy = node.get_attr('strategy', '').lower() == 'resource' + already_transformed = node.get_attr('_weights_transposed', False) is True + + 
return node_matches and is_resource_strategy and not already_transformed + + def transform(self, model, node): + if isinstance(node, Dense): + node.weights['weight'].data = np.transpose(node.weights['weight'].data) + elif isinstance(node, Conv1D): + node.weights['weight'].data = np.transpose(node.weights['weight'].data, axes=[2, 0, 1]) # (W,C,F) => (F,W,C) + elif isinstance(node, SeparableConv1D): + node.weights['depthwise'].data = np.transpose( + node.weights['depthwise'].data, axes=[2, 0, 1] + ) # (W,C,F) => (F,W,C) + node.weights['pointwise'].data = np.transpose( + node.weights['pointwise'].data, axes=[2, 0, 1] + ) # (W,C,F) => (F,W,C) + elif isinstance(node, Conv2D): + node.weights['weight'].data = np.transpose( + node.weights['weight'].data, axes=[3, 0, 1, 2] + ) # (H,W,C,F) => (F,H,W,C) + elif isinstance(node, SeparableConv2D): + node.weights['depthwise'].data = np.transpose( + node.weights['depthwise'].data, axes=[3, 0, 1, 2] + ) # (H,W,C,F) => (F,H,W,C) + node.weights['pointwise'].data = np.transpose( + node.weights['pointwise'].data, axes=[3, 0, 1, 2] + ) # (H,W,C,F) => (F,H,W,C) + elif isinstance(node, (LSTM, GRU)): + node.weights['weight'].data = np.transpose(node.weights['weight'].data) + node.weights['recurrent_weight'].data = np.transpose(node.weights['recurrent_weight'].data) + else: + raise Exception(f'Unexpected layer {node.class_name} with resource strategy') + + node.set_attr('_weights_transposed', True) + + return False diff --git a/hls4ml/backends/catapult/passes/transform_types.py b/hls4ml/backends/catapult/passes/transform_types.py new file mode 100755 index 0000000000..4ef3548cb6 --- /dev/null +++ b/hls4ml/backends/catapult/passes/transform_types.py @@ -0,0 +1,52 @@ +from hls4ml.backends.fpga.fpga_types import ( + ACTypeConverter, + CatapultArrayVariableConverter, + CatapultInplaceArrayVariableConverter, + CatapultInplaceStreamVariableConverter, + CatapultStreamVariableConverter, + HLSTypeConverter, + StaticWeightVariableConverter, +) +from 
hls4ml.model.optimizer import GlobalOptimizerPass +from hls4ml.model.types import InplaceTensorVariable + + +class TransformTypes(GlobalOptimizerPass): + def __init__(self): + self.type_converter = HLSTypeConverter(precision_converter=ACTypeConverter()) + self.array_var_converter = CatapultArrayVariableConverter(type_converter=self.type_converter) + self.inplace_array_var_converter = CatapultInplaceArrayVariableConverter(type_converter=self.type_converter) + self.stream_var_converter = CatapultStreamVariableConverter(type_converter=self.type_converter) + self.inplace_stream_var_converter = CatapultInplaceStreamVariableConverter(type_converter=self.type_converter) + self.weight_var_converter = StaticWeightVariableConverter(type_converter=self.type_converter) + + def transform(self, model, node): + io_type = node.model.config.get_config_value('IOType') + + for out_name, var in node.variables.items(): + if io_type == 'io_stream': + if isinstance(var, InplaceTensorVariable): + new_var = self.inplace_stream_var_converter.convert(var) + else: + new_var = self.stream_var_converter.convert(var) + elif io_type == 'io_serial': + new_var = self.array_var_converter.convert(var, pragma='stream') + elif io_type == 'io_parallel': + if out_name in node.model.inputs: + new_var = self.array_var_converter.convert(var, pragma='reshape') + elif isinstance(var, InplaceTensorVariable): + new_var = self.inplace_array_var_converter.convert(var, pragma='') + else: + new_var = self.array_var_converter.convert(var, pragma='partition') + else: + raise Exception(f'Unknown IOType {io_type} in {node.name} ({node.__class__.__name__})') + + node.set_attr(out_name, new_var) + + for w_name, weight in node.weights.items(): + new_weight = self.weight_var_converter.convert(weight) + node.set_attr(w_name, new_weight) + + for t_name, type in node.types.items(): + new_type = self.type_converter.convert(type) + node.set_attr(t_name, new_type) diff --git a/hls4ml/backends/fpga/fpga_types.py 
b/hls4ml/backends/fpga/fpga_types.py index c5327dab8c..408f1320e4 100644 --- a/hls4ml/backends/fpga/fpga_types.py +++ b/hls4ml/backends/fpga/fpga_types.py @@ -248,6 +248,13 @@ def definition_cpp(self, name_suffix='', as_reference=False): ) +class CatapultArrayVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + return '{type} {name}{suffix}[{shape}] /* {pragma} */'.format( + type=self.type.name, name=self.name, suffix=name_suffix, shape=self.size_cpp(), pragma=self.pragma + ) + + class VivadoInplaceArrayVariableDefinition(VariableDefinition): def definition_cpp(self): return f'auto& {self.name} = {self.input_var.name}' @@ -258,6 +265,11 @@ def definition_cpp(self): return f'auto& {self.name} = {self.input_var.name}' +class CatapultInplaceArrayVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'auto& {self.name} = {self.input_var.name}' + + class ArrayVariableConverter: def __init__(self, type_converter, prefix, definition_cls): self.type_converter = type_converter @@ -285,6 +297,11 @@ def __init__(self, type_converter): super().__init__(type_converter=type_converter, prefix='Quartus', definition_cls=QuartusArrayVariableDefinition) +class CatapultArrayVariableConverter(ArrayVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='Catapult', definition_cls=CatapultArrayVariableDefinition) + + class VivadoInplaceArrayVariableConverter(ArrayVariableConverter): def __init__(self, type_converter): super().__init__(type_converter=type_converter, prefix='Vivado', definition_cls=VivadoInplaceArrayVariableDefinition) @@ -297,6 +314,13 @@ def __init__(self, type_converter): ) +class CatapultInplaceArrayVariableConverter(ArrayVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='Catapult', definition_cls=CatapultInplaceArrayVariableDefinition + ) + + # endregion # region 
StructMemberVariable @@ -309,6 +333,13 @@ def definition_cpp(self, name_suffix='', as_reference=False): ) +class CatapultStructMemberVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + return '{type} {name}{suffix}[{shape}]'.format( + type=self.type.name, name=self.member_name, suffix=name_suffix, shape=self.size_cpp() + ) + + class StructMemberVariableConverter: def __init__(self, type_converter, prefix, definition_cls): self.type_converter = type_converter @@ -338,6 +369,13 @@ def __init__(self, type_converter): ) +class CatapultStructMemberVariableConverter(StructMemberVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='Catapult', definition_cls=CatapultStructMemberVariableDefinition + ) + + # endregion # region StreamVariable @@ -371,6 +409,21 @@ def definition_cpp(self): return f'auto& {self.name} = {self.input_var.name}' +class CatapultStreamVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + if as_reference: # Function parameter + return f'ac_channel<{self.type.name}> &{self.name}{name_suffix}' + else: # Declaration (string name arg not implemented in ac_channel) + return 'ac_channel<{type}> {name}{suffix}/*("{name}")*/'.format( + type=self.type.name, name=self.name, suffix=name_suffix + ) + + +class CatapultInplaceStreamVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'auto& {self.name} = {self.input_var.name}' + + class StreamVariableConverter: def __init__(self, type_converter, prefix, definition_cls): self.type_converter = type_converter @@ -402,6 +455,11 @@ def __init__(self, type_converter): super().__init__(type_converter=type_converter, prefix='Quartus', definition_cls=QuartusStreamVariableDefinition) +class CatapultStreamVariableConverter(StreamVariableConverter): + def __init__(self, type_converter): + 
super().__init__(type_converter=type_converter, prefix='Catapult', definition_cls=CatapultStreamVariableDefinition) + + # endregion # region InplaceStreamVariable @@ -435,6 +493,13 @@ def __init__(self, type_converter): ) +class CatapultInplaceStreamVariableConverter(InplaceStreamVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='Catapult', definition_cls=CatapultInplaceStreamVariableDefinition + ) + + # endregion # region WeightsVariable diff --git a/hls4ml/converters/__init__.py b/hls4ml/converters/__init__.py index b69dbec0f0..3bd6d06c3b 100644 --- a/hls4ml/converters/__init__.py +++ b/hls4ml/converters/__init__.py @@ -196,7 +196,7 @@ def convert_from_keras_model( output_data_tb (str, optional): String representing the path of output data in .npy or .dat format that will be used during csim and cosim. backend (str, optional): Name of the backend to use, e.g., 'Vivado' - or 'Quartus'. + or 'Quartus' or 'Catapult'. board (str, optional): One of target boards specified in `supported_board.json` file. If set to `None` a default device of a backend will be used. See documentation of the backend used. part (str, optional): The FPGA part. If set to `None` a default part of a backend will be used. @@ -258,7 +258,7 @@ def convert_from_pytorch_model( used during csim and cosim. Defaults to None. output_data_tb (str, optional): String representing the path of output data in .npy or .dat format that will be used during csim and cosim. Defaults to None. - backend (str, optional): Name of the backend to use, e.g., 'Vivado' or 'Quartus'. Defaults to 'Vivado'. + backend (str, optional): Name of the backend to use, e.g., 'Vivado' or 'Quartus' or 'Catapult'. Defaults to 'Vivado'. board (str, optional): One of target boards specified in `supported_board.json` file. If set to `None` a default device of a backend will be used. See documentation of the backend used. part (str, optional): The FPGA part. 
If set to `None` a default part of a backend will be used. @@ -332,7 +332,7 @@ def convert_from_onnx_model( output_data_tb (str, optional): String representing the path of output data in .npy or .dat format that will be used during csim and cosim. backend (str, optional): Name of the backend to use, e.g., 'Vivado' - or 'Quartus'. + or 'Quartus' or 'Catapult'. board (str, optional): One of target boards specified in `supported_board.json` file. If set to `None` a default device of a backend will be used. See documentation of the backend used. part (str, optional): The FPGA part. If set to `None` a default part of a backend will be used. diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index a6b5c29e89..04ec33294d 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -60,6 +60,12 @@ def get_config_value(self, key, default=None): def get_project_name(self): return self.get_config_value('ProjectName') + def get_project_dir(self): + if self.get_config_value('ProjectDir') is not None: + return self.get_config_value('ProjectDir') + else: + return self.get_config_value('ProjectName') + '_prj' + def get_output_dir(self): return self.get_config_value('OutputDir') diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index b74918f642..de191baa40 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -560,6 +560,7 @@ def initialize(self): if self.model.config.is_resource_strategy(self) and self.model.config.backend.name in [ 'Vivado', 'VivadoAccelerator', + 'Catapult', ]: self.weights['weight'].data_unquantized = np.transpose(folded_weights, axes=[3, 0, 1, 2]) self.weights['weight'].data = self.get_attr('weight_quantizer')(self.weights['weight'].data_unquantized) diff --git a/hls4ml/model/profiling.py b/hls4ml/model/profiling.py index 84a83de23e..9560699405 100644 --- a/hls4ml/model/profiling.py +++ b/hls4ml/model/profiling.py @@ -588,8 +588,9 @@ def get_ymodel_keras(keras_model, X): # Note that if the layer is a standalone activation 
layer then skip this name = layer.name if ( - hasattr(layer, 'activation') - and layer.activation is not None + hasattr(layer, "activation") + and hasattr(layer.activation, "__name__") + and layer.activation.__name__ != "linear" and not isinstance(layer, (keras.layers.Activation, qkeras.qlayers.QActivation)) and layer.activation.__name__ != 'linear' ): diff --git a/hls4ml/report/__init__.py b/hls4ml/report/__init__.py index b73558f6ee..3c9b7707b7 100644 --- a/hls4ml/report/__init__.py +++ b/hls4ml/report/__init__.py @@ -1,3 +1,6 @@ +from hls4ml.report.catapult_report import parse_catapult_report # noqa: F401 +from hls4ml.report.catapult_report import qofr # noqa: F401 +from hls4ml.report.catapult_report import read_catapult_report # noqa: F401 from hls4ml.report.quartus_report import parse_quartus_report # noqa: F401 from hls4ml.report.quartus_report import read_quartus_report # noqa: F401 from hls4ml.report.vivado_report import parse_vivado_report # noqa: F401 diff --git a/hls4ml/report/catapult_report.py b/hls4ml/report/catapult_report.py new file mode 100755 index 0000000000..563a3a7594 --- /dev/null +++ b/hls4ml/report/catapult_report.py @@ -0,0 +1,256 @@ +import os +import re + +import yaml + + +def read_catapult_report(hls_dir, full_report=False): + if not os.path.exists(hls_dir): + print(f'Path {hls_dir} does not exist. Exiting.') + return + + prj_dir = None + top_func_name = None + + if os.path.isfile(hls_dir + '/build_prj.tcl'): + prj_dir, top_func_name = _parse_build_script(hls_dir + '/build_prj.tcl') + print('Prj Dir:', prj_dir) + print('Top func name:', top_func_name) + + if prj_dir is None or top_func_name is None: + print('Unable to read project data. Exiting.') + return + + sln_dir = hls_dir + '/' + prj_dir + if not os.path.exists(sln_dir): + print(f'Project {prj_dir} does not exist. 
Rerun "hls4ml build -p {hls_dir}".') + return + + solutions = _find_solutions(sln_dir, hls_dir) + + for sln in solutions: + print(f'Reports for solution "{sln}":\n') + _find_reports(sln_dir + '/' + sln, top_func_name, full_report) + + +def _parse_build_script(script_path): + prj_dir = None + top_func_name = None + + with open(script_path) as f: + for line in f.readlines(): + if 'project new' in line: + prj_dir = line.split()[-1] + if 'set design_top' in line: + top_func_name = line.split()[-1] + + return prj_dir, top_func_name + + +def _find_solutions(sln_dir, hls_dir): + solutions = [] + prj_dir, top_func_name = _parse_build_script(hls_dir + '/build_prj.tcl') + for path in os.listdir(sln_dir): + # check if current path is a dir + if os.path.isdir(os.path.join(sln_dir, path)): + pathstring = str(path) + if top_func_name in pathstring: + solutions.append(pathstring) + return solutions + + +def _find_reports(sln_dir, top_func_name, full_report=False): + csim_file = sln_dir + '/../../tb_data/csim_results.log' + if os.path.isfile(csim_file): + _show_csim_report(csim_file) + else: + print('C simulation report not found.') + + syn_file = sln_dir + '/rtl.rpt' + if os.path.isfile(syn_file): + _show_synth_report(syn_file, full_report) + else: + print('Synthesis report not found.') + + cosim_file = sln_dir + f'/sim/report/{top_func_name}_cosim.rpt' + if os.path.isfile(cosim_file): + _show_cosim_report(cosim_file) + else: + print('Co-simulation report not found.') + + timing_report = sln_dir + '/vivado_concat_v/timing_summary_synth.rpt' + if os.path.isfile(timing_report): + _show_timing_report(timing_report) + else: + print('Timing synthesis report not found.') + + utilization_report = sln_dir + '/vivado_concat_v/utilization_synth.rpt' + if os.path.isfile(utilization_report): + _show_utilization_report(utilization_report) + else: + print('Utilization synthesis report not found.') + + +def _show_csim_report(csim_file): + with open(csim_file) as f: + print('C SIMULATION 
RESULT:') + print(f.read()) + + +def _show_synth_report(synth_file, full_report=False): + with open(synth_file) as f: + print('SYNTHESIS REPORT:') + for line in f.readlines()[2:]: + if not full_report and '* DSP48' in line: + break + print(line, end='') + + +def _show_cosim_report(cosim_file): + with open(cosim_file) as f: + print('CO-SIMULATION RESULT:') + print(f.read()) + + +def _show_timing_report(timing_report): + with open(timing_report) as f: + print('TIMING REPORT:') + print(f.read()) + + +def _show_utilization_report(utilization_report): + with open(utilization_report) as f: + print('UTILIZATION REPORT:') + print(f.read()) + + +def _get_abs_and_percentage_values(unparsed_cell): + return int(unparsed_cell.split('(')[0]), float(unparsed_cell.split('(')[1].replace('%', '').replace(')', '')) + + +def parse_catapult_report(output_dir): + if not os.path.exists(output_dir): + print(f'Project OutputDir {output_dir} does not exist. Exiting.') + return + + # Read the YAML config file to determine the project settings + with open(output_dir + '/hls4ml_config.yml') as yfile: + ydata = yaml.safe_load(yfile) + + if not ydata['ProjectDir'] is None: + ProjectDir = ydata['ProjectDir'] + else: + ProjectDir = ydata['ProjectName'] + '_prj' + ProjectName = ydata['ProjectName'] + + sln_dir = output_dir + '/' + ProjectDir + if not os.path.exists(sln_dir): + print(f'Project {ProjectDir} does not exist. Rerun "hls4ml build -p {output_dir}".') + return + + solutions = _find_solutions(sln_dir, output_dir) + if len(solutions) > 1: + print(f'WARNING: Found {len(solutions)} solution(s) in {sln_dir}. 
Using the first solution.') + + report = {} + + sim_file = output_dir + '/tb_data/csim_results.log' + if os.path.isfile(sim_file): + csim_results = [] + with open(sim_file) as f: + for line in f.readlines(): + csim_results.append([r for r in line.split()]) + report['CSimResults'] = csim_results + + util_report_file = output_dir + '/' + ProjectDir + '/' + solutions[0] + '/vivado_concat_v/utilization_synth.rpt' + if os.path.isfile(util_report_file): + util_report = {} + a = 0 + with open(util_report_file) as f: + for line in f.readlines(): + # Sometimes, phrases such as 'CLB Registers' can show up in the non-tabular sections of the report + if '|' in line: + if ('CLB LUTs' in line) and (a == 0): + a += 1 + util_report['LUT'] = line.split('|')[2].strip() + elif ('CLB Registers' in line) and (a == 1): + a += 1 + util_report['FF'] = line.split('|')[2].strip() + elif ('RAMB18 ' in line) and (a == 2): + a += 1 + util_report['BRAM_18K'] = line.split('|')[2].strip() + elif ('DSPs' in line) and (a == 3): + a += 1 + util_report['DSP48E'] = line.split('|')[2].strip() + elif ('URAM' in line) and (a == 4): + a += 1 + util_report['URAM'] = line.split('|')[2].strip() + report['UtilizationReport'] = util_report + else: + print('Utilization report not found.') + + timing_report_file = output_dir + '/' + ProjectDir + '/' + solutions[0] + '/vivado_concat_v/timing_summary_synth.rpt' + if os.path.isfile(timing_report_file): + timing_report = {} + with open(timing_report_file) as f: + while not re.search('WNS', next(f)): + pass + # skip the successive line + next(f) + result = next(f).split() + + timing_report['WNS'] = float(result[0]) + timing_report['TNS'] = float(result[1]) + timing_report['WHS'] = float(result[4]) + timing_report['THS'] = float(result[5]) + timing_report['WPWS'] = float(result[8]) + timing_report['TPWS'] = float(result[9]) + + report['TimingReport'] = timing_report + else: + print('Timing report not found.') + + latest_prj_dir = 
get_latest_project_prj_directory(output_dir, ProjectDir) + latest_ver_dir = get_latest_project_version_directory(latest_prj_dir, ProjectName) + file_path = os.path.join(latest_ver_dir, 'nnet_layer_results.txt') + print('Results in nnet_layer_results.txt from:', file_path) + + # Initialize the array + report['PerLayerQOFR'] = [] + # Open the file and read its contents + with open(file_path) as file: + # Read each line and append it to the list + for line in file: + report['PerLayerQOFR'].append(line.strip()) # strip() removes leading/trailing + + return report + + +def get_latest_project_version_directory(base_path, ProjectName): + versions = [d for d in os.listdir(base_path) if d.startswith(ProjectName + '.v')] + if not versions: + raise FileNotFoundError('Error: No versions found.') + latest_version = max(versions) + return os.path.join(base_path, latest_version) + + +def get_latest_project_prj_directory(base_path, ProjectDir): + versions = [d for d in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, d)) and d.startswith(ProjectDir)] + if not versions: + raise FileNotFoundError('Error: No versions found.') + latest_version = max(versions) + return os.path.join(base_path, latest_version) + + +def qofr(report): + # Access the PerLayerQOFR list from the report dictionary + PerLayerQOFR = report.get('PerLayerQOFR', []) + + # Check if the list is not empty + if PerLayerQOFR: + # print('Results in nnet_layer_results.txt:') + # Iterate over each line in the list and print it + for line in PerLayerQOFR: + print(line) + else: + print('No results found in nnet_layer_results.txt') diff --git a/hls4ml/templates/catapult/ac_math b/hls4ml/templates/catapult/ac_math new file mode 160000 index 0000000000..3696be957d --- /dev/null +++ b/hls4ml/templates/catapult/ac_math @@ -0,0 +1 @@ +Subproject commit 3696be957d0b0fa0a285f90382d75c8a521d77ee diff --git a/hls4ml/templates/catapult/ac_simutils b/hls4ml/templates/catapult/ac_simutils new file mode 160000 index 
0000000000..9dfe23415c --- /dev/null +++ b/hls4ml/templates/catapult/ac_simutils @@ -0,0 +1 @@ +Subproject commit 9dfe23415cf670ed7c990d9a6a237d06a5a62e57 diff --git a/hls4ml/templates/catapult/ac_types b/hls4ml/templates/catapult/ac_types new file mode 160000 index 0000000000..134dcb1a05 --- /dev/null +++ b/hls4ml/templates/catapult/ac_types @@ -0,0 +1 @@ +Subproject commit 134dcb1a05e16f242de593b9c9a33f6aa08c66e6 diff --git a/hls4ml/templates/catapult/build_lib.sh b/hls4ml/templates/catapult/build_lib.sh new file mode 100755 index 0000000000..2c7a11c626 --- /dev/null +++ b/hls4ml/templates/catapult/build_lib.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +CC=g++ +if [[ "$OSTYPE" == "linux-gnu" ]]; then + CFLAGS="-O3 -fPIC -std=c++11 -fno-gnu-unique" +elif [[ "$OSTYPE" == "linux"* ]]; then + CFLAGS="-O3 -fPIC -std=c++11 -fno-gnu-unique -Wno-pragmas" +elif [[ "$OSTYPE" == "darwin"* ]]; then + CFLAGS="-O3 -fPIC -std=c++11" +fi +LDFLAGS= + +# Pick up AC libraries from Catapult install first +INCFLAGS="-I$MGC_HOME/shared/include -I$MGC_HOME/shared/include/nnet_utils -Ifirmware/ac_types/include -Ifirmware/ac_math/include -Ifirmware/ac_simutils/include -Ifirmware/nnet_utils" +PROJECT=myproject +LIB_STAMP=mystamp + +${CC} ${CFLAGS} ${INCFLAGS} -c firmware/${PROJECT}.cpp -o ${PROJECT}.o +${CC} ${CFLAGS} ${INCFLAGS} -c ${PROJECT}_bridge.cpp -o ${PROJECT}_bridge.o +${CC} ${CFLAGS} ${INCFLAGS} -shared ${PROJECT}.o ${PROJECT}_bridge.o -o firmware/${PROJECT}-${LIB_STAMP}.so +rm -f *.o diff --git a/hls4ml/templates/catapult/build_prj.tcl b/hls4ml/templates/catapult/build_prj.tcl new file mode 100755 index 0000000000..7ee4d640dd --- /dev/null +++ b/hls4ml/templates/catapult/build_prj.tcl @@ -0,0 +1,356 @@ +################# +# HLS4ML +################# +array set opt { + reset 0 + csim 0 + synth 1 + cosim 0 + validation 0 + vhdl 1 + verilog 1 + export 0 + vsynth 0 + bitfile 0 + fifo_opt 0 + ran_frame 2 + sw_opt 0 + power 0 + da 0 + bup 0 +} + +# Get pathname to this script to use as 
dereference path for relative file pathnames +set sfd [file dirname [info script]] + +if { [info exists ::argv] } { + foreach arg $::argv { + foreach {optname optval} [split $arg '='] {} + if { [info exists opt($optname)] } { + if {[string is integer -strict $optval]} { + set opt($optname) $optval + } else { + set opt($optname) [string is true -strict $optval] + } + } + } +} + +puts "***** INVOKE OPTIONS *****" +foreach x [lsort [array names opt]] { + puts "[format { %-20s %s} $x $opt($x)]" +} +puts "" + +proc report_time { op_name time_start time_end } { + set time_taken [expr $time_end - $time_start] + set time_s [expr ($time_taken / 1000) % 60] + set time_m [expr ($time_taken / (1000*60)) % 60] + set time_h [expr ($time_taken / (1000*60*60)) % 24] + puts "***** ${op_name} COMPLETED IN ${time_h}h${time_m}m${time_s}s *****" +} + +proc setup_xilinx_part { part } { + # Map Xilinx PART into Catapult library names + set part_sav $part + set libname [lindex [library get /CONFIG/PARAMETERS/Vivado/PARAMETERS/Xilinx/PARAMETERS/*/PARAMETERS/*/PARAMETERS/$part/LIBRARIES/*/NAME -match glob -ret v] 0] + puts "Library Name: $libname" + if { [llength $libname] == 1 } { + set libpath [library get /CONFIG/PARAMETERS/Vivado/PARAMETERS/Xilinx/PARAMETERS/*/PARAMETERS/*/PARAMETERS/$part/LIBRARIES/*/NAME -match glob -ret p] + puts "Library Path: $libpath" + if { [regexp {/CONFIG/PARAMETERS/(\S+)/PARAMETERS/(\S+)/PARAMETERS/(\S+)/PARAMETERS/(\S+)/PARAMETERS/(\S+)/.*} $libpath dummy rtltool vendor family speed part] } { + solution library add $libname -- -rtlsyntool $rtltool -vendor $vendor -family $family -speed $speed -part $part_sav + } else { + solution library add $libname -- -rtlsyntool Vivado + } + } else { + logfile message "Could not find specific Xilinx base library for part '$part'. 
Using KINTEX-u\n" warning + solution library add mgc_Xilinx-KINTEX-u-2_beh -- -rtlsyntool Vivado -manufacturer Xilinx -family KINTEX-u -speed -2 -part xcku115-flvb2104-2-i + } + solution library add Xilinx_RAMS + solution library add Xilinx_ROMS + solution library add Xilinx_FIFO +} + + +proc setup_asic_libs { args } { + set do_saed 0 + foreach lib $args { + solution library add $lib -- -rtlsyntool DesignCompiler + if { [lsearch -exact {saed32hvt_tt0p78v125c_beh saed32lvt_tt0p78v125c_beh saed32rvt_tt0p78v125c_beh} $lib] != -1 } { + set do_saed 1 + } + } + solution library add ccs_sample_mem + solution library add ccs_sample_rom + solution library add hls4ml_lib + go libraries + + # special exception for SAED32 for use in power estimation + if { $do_saed } { + # SAED32 selected - enable DC settings to access Liberty data for power estimation + source [application get /SYSTEM/ENV_MGC_HOME]/pkgs/siflibs/saed/setup_saedlib.tcl + } +} + +options set Input/CppStandard {c++17} +options set Input/CompilerFlags -DRANDOM_FRAMES=$opt(ran_frame) +options set Input/SearchPath {$MGC_HOME/shared/include/nnet_utils} -append +options set ComponentLibs/SearchPath {$MGC_HOME/shared/pkgs/ccs_hls4ml} -append + +if {$opt(reset)} { + project load CATAPULT_DIR.ccs + go new +} else { + project new -name CATAPULT_DIR +} + +#-------------------------------------------------------- +# Configure Catapult Options +# downgrade HIER-10 +options set Message/ErrorOverride HIER-10 -remove +solution options set Message/ErrorOverride HIER-10 -remove + +if {$opt(vhdl)} { + options set Output/OutputVHDL true +} else { + options set Output/OutputVHDL false +} +if {$opt(verilog)} { + options set Output/OutputVerilog true +} else { + options set Output/OutputVerilog false +} + +#-------------------------------------------------------- +# Configure Catapult Flows +if { [info exists ::env(XILINX_PCL_CACHE)] } { +options set /Flows/Vivado/PCL_CACHE $::env(XILINX_PCL_CACHE) +solution options set 
/Flows/Vivado/PCL_CACHE $::env(XILINX_PCL_CACHE) +} + +# Turn on HLS4ML flow (wrapped in a cache so that older Catapult installs still work) +catch {flow package require /HLS4ML} + +# Turn on SCVerify flow +flow package require /SCVerify +# flow package option set /SCVerify/INVOKE_ARGS {$sfd/firmware/weights $sfd/tb_data/tb_input_features.dat $sfd/tb_data/tb_output_predictions.dat} +#hls-fpga-machine-learning insert invoke_args + +# Turn on VSCode flow +# flow package require /VSCode +# To launch VSCode on the C++ HLS design: +# cd my-Catapult-test +# code Catapult.code-workspace + +#-------------------------------------------------------- +# Start of HLS script +set design_top myproject +solution file add $sfd/firmware/myproject.cpp +solution file add $sfd/myproject_test.cpp -exclude true + +# Parse parameters.h to determine config info to control directives/pragmas +set IOType io_stream +if { ![file exists $sfd/firmware/parameters.h] } { + logfile message "Could not locate firmware/parameters.h. Unable to determine network configuration.\n" warning +} else { + set pf [open "$sfd/firmware/parameters.h" "r"] + while {![eof $pf]} { + gets $pf line + if { [string match {*io_type = nnet::io_stream*} $line] } { + set IOType io_stream + break + } + } + close $pf +} + +if { $IOType == "io_stream" } { +solution options set Architectural/DefaultRegisterThreshold 2050 +} +directive set -RESET_CLEARS_ALL_REGS no +# Constrain arrays to map to memory only over a certain size +directive set -MEM_MAP_THRESHOLD [expr 2048 * 16 + 1] +# The following line gets modified by the backend writer +set hls_clock_period 5 + +go analyze + +# NORMAL TOP DOWN FLOW +if { ! 
$opt(bup) } { + +go compile + +if {$opt(csim)} { + puts "***** C SIMULATION *****" + set time_start [clock clicks -milliseconds] + flow run /SCVerify/launch_make ./scverify/Verify_orig_cxx_osci.mk {} SIMTOOL=osci sim + set time_end [clock clicks -milliseconds] + report_time "C SIMULATION" $time_start $time_end +} + +puts "***** SETTING TECHNOLOGY LIBRARIES *****" +#hls-fpga-machine-learning insert techlibs + +directive set -CLOCKS [list clk [list -CLOCK_PERIOD $hls_clock_period -CLOCK_EDGE rising -CLOCK_OFFSET 0.000000 -CLOCK_UNCERTAINTY 0.0 -RESET_KIND sync -RESET_SYNC_NAME rst -RESET_SYNC_ACTIVE high -RESET_ASYNC_NAME arst_n -RESET_ASYNC_ACTIVE low -ENABLE_NAME {} -ENABLE_ACTIVE high]] + +if {$opt(synth)} { + puts "***** C/RTL SYNTHESIS *****" + set time_start [clock clicks -milliseconds] + + go assembly + + go architect + + go allocate + + go schedule + + go extract + set time_end [clock clicks -milliseconds] + report_time "C/RTL SYNTHESIS" $time_start $time_end +} + +# BOTTOM-UP FLOW +} else { + # Start at 'go analyze' + go analyze + + # Build the design bottom-up + directive set -CLOCKS [list clk [list -CLOCK_PERIOD $hls_clock_period -CLOCK_EDGE rising -CLOCK_OFFSET 0.000000 -CLOCK_UNCERTAINTY 0.0 -RESET_KIND sync -RESET_SYNC_NAME rst -RESET_SYNC_ACTIVE high -RESET_ASYNC_NAME arst_n -RESET_ASYNC_ACTIVE low -ENABLE_NAME {} -ENABLE_ACTIVE high]] + + set blocks [solution get /HIERCONFIG/USER_HBS/*/RESOLVED_NAME -match glob -rec 1 -ret v -state analyze] + set bu_mappings {} + set top [lindex $blocks 0] + foreach block [lreverse [lrange $blocks 1 end]] { + # skip blocks that are net nnet:: functions + if { [string match {nnet::*} $block] == 0 } { continue } + go analyze + solution design set $block -top + go compile + solution library remove * + puts "***** SETTING TECHNOLOGY LIBRARIES *****" +#hls-fpga-machine-learning insert techlibs + go extract + set block_soln "[solution get /TOP/name -checkpath 0].[solution get /VERSION -checkpath 0]" + lappend bu_mappings 
[solution get /CAT_DIR] /$top/$block "\[Block\] $block_soln" + } + + # Move to top design + go analyze + solution design set $top -top + go compile + + if {$opt(csim)} { + puts "***** C SIMULATION *****" + set time_start [clock clicks -milliseconds] + flow run /SCVerify/launch_make ./scverify/Verify_orig_cxx_osci.mk {} SIMTOOL=osci sim + set time_end [clock clicks -milliseconds] + report_time "C SIMULATION" $time_start $time_end + } + foreach {d i l} $bu_mappings { + logfile message "solution options set ComponentLibs/SearchPath $d -append\n" info + solution options set ComponentLibs/SearchPath $d -append + } + + # Add bottom-up blocks + puts "***** SETTING TECHNOLOGY LIBRARIES *****" + solution library remove * +#hls-fpga-machine-learning insert techlibs + # need to revert back to go compile + go compile + foreach {d i l} $bu_mappings { + logfile message "solution library add [list $l]\n" info + eval solution library add [list $l] + } + go libraries + + # Map to bottom-up blocks + foreach {d i l} $bu_mappings { + # Make sure block exists + set cnt [directive get $i/* -match glob -checkpath 0 -ret p] + if { $cnt != {} } { + logfile message "directive set $i -MAP_TO_MODULE [list $l]\n" info + eval directive set $i -MAP_TO_MODULE [list $l] + } + } + go assembly + set design [solution get -name] + logfile message "Adjusting FIFO_DEPTH for top-level interconnect channels\n" warning + # FIFO interconnect between layers + foreach ch_fifo_m2m [directive get -match glob -checkpath 0 -ret p $design/*_out:cns/MAP_TO_MODULE] { + set ch_fifo [join [lrange [split $ch_fifo_m2m '/'] 0 end-1] /]/FIFO_DEPTH + logfile message "directive set -match glob $ch_fifo 1\n" info + directive set -match glob "$ch_fifo" 1 + } + # For bypass paths - the depth will likely need to be larger than 1 + foreach ch_fifo_m2m [directive get -match glob -checkpath 0 -ret p $design/*_cpy*:cns/MAP_TO_MODULE] { + set ch_fifo [join [lrange [split $ch_fifo_m2m '/'] 0 end-1] /]/FIFO_DEPTH + logfile message 
"Bypass FIFO '$ch_fifo' depth set to 1 - larger value may be required to prevent deadlock\n" warning + logfile message "directive set -match glob $ch_fifo 1\n" info + directive set -match glob "$ch_fifo" 1 + } + go architect + go allocate + go schedule + go dpfsm + go extract +} + +project save + +if {$opt(cosim) || $opt(validation)} { + if {$opt(verilog)} { + flow run /SCVerify/launch_make ./scverify/Verify_rtl_v_msim.mk {} SIMTOOL=msim sim + } + if {$opt(vhdl)} { + flow run /SCVerify/launch_make ./scverify/Verify_rtl_vhdl_msim.mk {} SIMTOOL=msim sim + } +} + +if {$opt(export)} { + puts "***** EXPORT IP *****" + set time_start [clock clicks -milliseconds] +# Not yet implemented. Do we need to include value of $version ? +# flow package option set /Vivado/BoardPart xilinx.com:zcu102:part0:3.1 +# flow package option set /Vivado/IP_Taxonomy {/Catapult} +# flow run /Vivado/launch_package_ip -shell ./vivado_concat_v/concat_v_package_ip.tcl + set time_end [clock clicks -milliseconds] + report_time "EXPORT IP" $time_start $time_end +} +if {$opt(sw_opt)} { + puts "***** Pre Power Optimization *****" + go switching + if {$opt(verilog)} { + flow run /PowerAnalysis/report_pre_pwropt_Verilog + } + if {$opt(vhdl)} { + flow run /PowerAnalysis/report_pre_pwropt_VHDL + } +} + +if {$opt(power)} { + puts "***** Power Optimization *****" + go power +} + +if {$opt(vsynth)} { + puts "***** VIVADO SYNTHESIS *****" + set time_start [clock clicks -milliseconds] + flow run /Vivado/synthesize -shell vivado_concat_v/concat_rtl.v.xv + set time_end [clock clicks -milliseconds] + report_time "VIVADO SYNTHESIS" $time_start $time_end +} + +if {$opt(bitfile)} { + puts "***** Option bitfile not supported yet *****" +} + +if {$opt(da)} { + puts "***** Launching DA *****" + flow run /DesignAnalyzer/launch +} + +if { [catch {flow package present /HLS4ML}] == 0 } { + flow run /HLS4ML/collect_reports +} diff --git a/hls4ml/templates/catapult/catapult_synth.tcl 
b/hls4ml/templates/catapult/catapult_synth.tcl new file mode 100644 index 0000000000..6d80a33ef5 --- /dev/null +++ b/hls4ml/templates/catapult/catapult_synth.tcl @@ -0,0 +1,3 @@ +add_files myproject_prj/solution1/syn/vhdl +synth_design -top myproject -part xcku115-flvb2104-2-i +report_utilization -file vivado_synth.rpt diff --git a/hls4ml/templates/catapult/firmware/defines.h b/hls4ml/templates/catapult/firmware/defines.h new file mode 100755 index 0000000000..c5601779e4 --- /dev/null +++ b/hls4ml/templates/catapult/firmware/defines.h @@ -0,0 +1,15 @@ +#ifndef DEFINES_H_ +#define DEFINES_H_ + +#include "nnet_utils/nnet_types.h" +#include +#include +#include +#include +#include + +// hls-fpga-machine-learning insert numbers + +// hls-fpga-machine-learning insert layer-precision + +#endif diff --git a/hls4ml/templates/catapult/firmware/myproject.cpp b/hls4ml/templates/catapult/firmware/myproject.cpp new file mode 100755 index 0000000000..bdb0570f8b --- /dev/null +++ b/hls4ml/templates/catapult/firmware/myproject.cpp @@ -0,0 +1,29 @@ +#include + +#include "myproject.h" +#include "parameters.h" + +#include + +#pragma hls_design top +// hls-fpga-machine-learning insert IFSynPragmas +void CCS_BLOCK(myproject)( + // hls-fpga-machine-learning insert header +) { + + // hls-fpga-machine-learning insert IO + +#ifndef __SYNTHESIS__ + static bool loaded_weights = false; + if (!loaded_weights) { + // hls-fpga-machine-learning insert load weights + loaded_weights = true; + } +#endif + + // **************************************** + // NETWORK INSTANTIATION + // **************************************** + + // hls-fpga-machine-learning insert layers +} diff --git a/hls4ml/templates/catapult/firmware/myproject.h b/hls4ml/templates/catapult/firmware/myproject.h new file mode 100755 index 0000000000..dd73c3e807 --- /dev/null +++ b/hls4ml/templates/catapult/firmware/myproject.h @@ -0,0 +1,15 @@ +#ifndef MYPROJECT_H_ +#define MYPROJECT_H_ + +#include +#include +#include + +#include 
"defines.h" + +// Prototype of top level function for C-synthesis +void myproject( + // hls-fpga-machine-learning insert header +); + +#endif diff --git a/hls4ml/templates/catapult/firmware/parameters.h b/hls4ml/templates/catapult/firmware/parameters.h new file mode 100755 index 0000000000..2915c145c8 --- /dev/null +++ b/hls4ml/templates/catapult/firmware/parameters.h @@ -0,0 +1,15 @@ +#ifndef PARAMETERS_H_ +#define PARAMETERS_H_ + +#include +#include + +#include "nnet_utils/nnet_code_gen.h" +#include "nnet_utils/nnet_helpers.h" +// hls-fpga-machine-learning insert includes + +// hls-fpga-machine-learning insert weights + +// hls-fpga-machine-learning insert layer-config + +#endif diff --git a/hls4ml/templates/catapult/myproject_bridge.cpp b/hls4ml/templates/catapult/myproject_bridge.cpp new file mode 100755 index 0000000000..f1326a1faf --- /dev/null +++ b/hls4ml/templates/catapult/myproject_bridge.cpp @@ -0,0 +1,72 @@ +#ifndef MYPROJECT_BRIDGE_H_ +#define MYPROJECT_BRIDGE_H_ + +#include "firmware/myproject.h" +#include "nnet_helpers.h" +#include +#include + +static std::string s_weights_dir = "weights"; + +const char *get_weights_dir() { return s_weights_dir.c_str(); } + +// hls-fpga-machine-learning insert bram + +// hls-fpga-machine-learning insert declare weights + +namespace nnet { +bool trace_enabled = false; +std::map *trace_outputs = NULL; +size_t trace_type_size = sizeof(double); +} // namespace nnet + +extern "C" { + +struct trace_data { + const char *name; + void *data; +}; + +void allocate_trace_storage(size_t element_size) { + nnet::trace_enabled = true; + nnet::trace_outputs = new std::map; + nnet::trace_type_size = element_size; + // hls-fpga-machine-learning insert trace_outputs +} + +void free_trace_storage() { + for (std::map::iterator i = nnet::trace_outputs->begin(); i != nnet::trace_outputs->end(); i++) { + void *ptr = i->second; + free(ptr); + } + nnet::trace_outputs->clear(); + delete nnet::trace_outputs; + nnet::trace_outputs = NULL; + 
nnet::trace_enabled = false; +} + +void collect_trace_output(struct trace_data *c_trace_outputs) { + int ii = 0; + for (std::map::iterator i = nnet::trace_outputs->begin(); i != nnet::trace_outputs->end(); i++) { + c_trace_outputs[ii].name = i->first.c_str(); + c_trace_outputs[ii].data = i->second; + ii++; + } +} + +// Wrapper of top level function for Python bridge +void myproject_float( + // hls-fpga-machine-learning insert header #float +) { + + // hls-fpga-machine-learning insert wrapper #float +} + +void myproject_double( + // hls-fpga-machine-learning insert header #double +) { + // hls-fpga-machine-learning insert wrapper #double +} +} + +#endif diff --git a/hls4ml/templates/catapult/myproject_test.cpp b/hls4ml/templates/catapult/myproject_test.cpp new file mode 100755 index 0000000000..66b87f6741 --- /dev/null +++ b/hls4ml/templates/catapult/myproject_test.cpp @@ -0,0 +1,164 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static std::string s_weights_dir; + +const char *get_weights_dir() { return s_weights_dir.c_str(); } + +#include "firmware/myproject.h" +#include "nnet_utils/nnet_helpers.h" +// #include "firmware/parameters.h" + +#include + +// hls-fpga-machine-learning insert bram + +#define CHECKPOINT 5000 + +#ifndef RANDOM_FRAMES +#define RANDOM_FRAMES 1 +#endif + +// hls-fpga-machine-learning insert declare weights + +namespace nnet { +bool trace_enabled = true; +std::map *trace_outputs = NULL; +size_t trace_type_size = sizeof(double); +} // namespace nnet + +CCS_MAIN(int argc, char *argv[]) { + if (argc < 2) { + std::cerr << "Error - too few arguments" << std::endl; + std::cerr << "Usage: " << argv[0] << " " << std::endl; + std::cerr << "Where: - string pathname to directory containing wN.txt and bN.txt files" + << std::endl; + std::cerr << " - string pathname to tb_input_features.dat (optional)" << std::endl; + std::cerr << " - string pathname to tb_output_predictions.dat (optional)" << std::endl; + 
std::cerr << std::endl; + std::cerr << "If no testbench input/prediction data provided, random input data will be generated" << std::endl; + CCS_RETURN(-1); + } + s_weights_dir = argv[1]; + std::cout << " Weights directory: " << s_weights_dir << std::endl; + + std::string tb_in; + std::string tb_out; + std::ifstream fin; + std::ifstream fpr; + bool use_random = false; + if (argc == 2) { + std::cout << "No testbench files provided - Using random input data" << std::endl; + use_random = true; + } else { + tb_in = argv[2]; + tb_out = argv[3]; + std::cout << " Test Feature Data: " << tb_in << std::endl; + std::cout << " Test Predictions : " << tb_out << std::endl; + + // load input data from text file + fin.open(tb_in); + // load predictions from text file + fpr.open(tb_out); + if (!fin.is_open() || !fpr.is_open()) { + use_random = true; + } + } + +#ifdef RTL_SIM + std::string RESULTS_LOG = "tb_data/rtl_cosim_results.log"; +#else + std::string RESULTS_LOG = "tb_data/csim_results.log"; +#endif + std::ofstream fout(RESULTS_LOG); + +#ifndef __SYNTHESIS__ + static bool loaded_weights = false; + if (!loaded_weights) { + // hls-fpga-machine-learning insert load weights + loaded_weights = true; + } +#endif + std::string iline; + std::string pline; + int e = 0; + + if (!use_random) { + while (std::getline(fin, iline) && std::getline(fpr, pline)) { + if (e % CHECKPOINT == 0) + std::cout << "Processing input " << e << std::endl; + char *cstr = const_cast(iline.c_str()); + char *current; + std::vector in; + current = strtok(cstr, " "); + while (current != NULL) { + in.push_back(atof(current)); + current = strtok(NULL, " "); + } + cstr = const_cast(pline.c_str()); + std::vector pr; + current = strtok(cstr, " "); + while (current != NULL) { + pr.push_back(atof(current)); + current = strtok(NULL, " "); + } + // std::cout << " Input feature map size = " << in.size() << " Output predictions size = " << pr.size() << + // std::endl; + + // hls-fpga-machine-learning insert data + + // 
hls-fpga-machine-learning insert top-level-function + + if (e % CHECKPOINT == 0) { + std::cout << "Predictions" << std::endl; + // hls-fpga-machine-learning insert predictions + std::cout << "Quantized predictions" << std::endl; + // hls-fpga-machine-learning insert quantized + } + e++; + + // hls-fpga-machine-learning insert tb-output + } + if (fin.is_open()) { + fin.close(); + } + if (fpr.is_open()) { + fpr.close(); + } + } else { + std::cout << "INFO: Unable to open input/predictions file(s) so feeding random values" << std::endl; + std::cout << "Number of Frames Passed from the tcl= " << RANDOM_FRAMES << std::endl; + + if (RANDOM_FRAMES > 0) { + for (unsigned int k = 0; k < RANDOM_FRAMES; k++) { + // hls-fpga-machine-learning insert random + + // hls-fpga-machine-learning insert top-level-function + + // hls-fpga-machine-learning insert output + + // hls-fpga-machine-learning insert tb-output + } + } else { + // hls-fpga-machine-learning insert zero + + // hls-fpga-machine-learning insert top-level-function + + // hls-fpga-machine-learning insert output + + // hls-fpga-machine-learning insert tb-output + } + } + + fout.close(); + std::cout << "INFO: Saved inference results to file: " << RESULTS_LOG << std::endl; + + return 0; +} diff --git a/hls4ml/templates/catapult/nnet_utils/ap_shift_reg.h b/hls4ml/templates/catapult/nnet_utils/ap_shift_reg.h new file mode 100644 index 0000000000..0645efa73f --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/ap_shift_reg.h @@ -0,0 +1,136 @@ +/* +#- (c) Copyright 2011-2019 Xilinx, Inc. All rights reserved. +#- +#- This file contains confidential and proprietary information +#- of Xilinx, Inc. and is protected under U.S. and +#- international copyright and other intellectual property +#- laws. +#- +#- DISCLAIMER +#- This disclaimer is not a license and does not grant any +#- rights to the materials distributed herewith. 
Except as +#- otherwise provided in a valid license issued to you by +#- Xilinx, and to the maximum extent permitted by applicable +#- law: (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND +#- WITH ALL FAULTS, AND XILINX HEREBY DISCLAIMS ALL WARRANTIES +#- AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY, INCLUDING +#- BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON- +#- INFRINGEMENT, OR FITNESS FOR ANY PARTICULAR PURPOSE; and +#- (2) Xilinx shall not be liable (whether in contract or tort, +#- including negligence, or under any other theory of +#- liability) for any loss or damage of any kind or nature +#- related to, arising under or in connection with these +#- materials, including for any direct, or any indirect, +#- special, incidental, or consequential loss or damage +#- (including loss of data, profits, goodwill, or any type of +#- loss or damage suffered as a result of any action brought +#- by a third party) even if such damage or loss was +#- reasonably foreseeable or Xilinx had been advised of the +#- possibility of the same. +#- +#- CRITICAL APPLICATIONS +#- Xilinx products are not designed or intended to be fail- +#- safe, or for use in any application requiring fail-safe +#- performance, such as life-support or safety devices or +#- systems, Class III medical devices, nuclear facilities, +#- applications related to the deployment of airbags, or any +#- other applications that could lead to death, personal +#- injury, or severe property or environmental damage +#- (individually and collectively, "Critical +#- Applications"). Customer assumes the sole risk and +#- liability of any use of Xilinx products in Critical +#- Applications, subject only to applicable laws and +#- regulations governing limitations on product liability. +#- +#- THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS +#- PART OF THIS FILE AT ALL TIMES. 
+#- ************************************************************************ + + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __SIM_AP_SHIFT_REG_H__ +#define __SIM_AP_SHIFT_REG_H__ + +/* + * This file contains a C++ model of shift register. + * It defines C level simulation model. + */ +#ifndef __cplusplus +#error C++ is required to include this header file +#else + +#ifndef __SYNTHESIS__ +#include +#endif + +////////////////////////////////////////////// +// C level simulation model for ap_shift_reg +////////////////////////////////////////////// +template class ap_shift_reg { + public: + /// Constructors + ap_shift_reg() { + for (unsigned int i = 0; i < __SHIFT_DEPTH__; i++) { + __SHIFT_T__ dummy; + Array[i] = dummy; // uninitialize so Catapult does not add a reset + } + } + ap_shift_reg(const char *name) {} + /// Destructor + virtual ~ap_shift_reg() {} + + private: + /// Make copy constructor and assignment operator private + ap_shift_reg(const ap_shift_reg<__SHIFT_T__, __SHIFT_DEPTH__> &shreg) { + for (unsigned i = 0; i < __SHIFT_DEPTH__; ++i) + Array[i] = shreg.Array[i]; + } + + ap_shift_reg &operator=(const ap_shift_reg<__SHIFT_T__, __SHIFT_DEPTH__> &shreg) { + for (unsigned i = 0; i < __SHIFT_DEPTH__; ++i) + Array[i] = shreg.Array[i]; + return *this; + } + + public: + // Shift the queue, push to back and read from a given address. 
+ __SHIFT_T__ shift(__SHIFT_T__ DataIn, unsigned int Addr = __SHIFT_DEPTH__ - 1, bool Enable = true) { +#ifndef __SYNTHESIS__ + assert(Addr < __SHIFT_DEPTH__ && "Out-of-bound shift is found in ap_shift_reg."); +#endif + __SHIFT_T__ ret = Array[Addr]; + if (Enable) { + for (unsigned int i = __SHIFT_DEPTH__ - 1; i > 0; --i) + Array[i] = Array[i - 1]; + Array[0] = DataIn; + } + return ret; + } + + // Read from a given address. + __SHIFT_T__ read(unsigned int Addr = __SHIFT_DEPTH__ - 1) const { +#ifndef __SYNTHESIS__ + assert(Addr < __SHIFT_DEPTH__ && "Out-of-bound read is found in ap_shift_reg."); +#endif + return Array[Addr]; + } + + protected: + __SHIFT_T__ Array[__SHIFT_DEPTH__]; +}; + +#endif //__cplusplus + +#endif //__SIM_AP_SHIFT_REG_H__ diff --git a/hls4ml/templates/catapult/nnet_utils/hls_math.h b/hls4ml/templates/catapult/nnet_utils/hls_math.h new file mode 100755 index 0000000000..ea05fe122a --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/hls_math.h @@ -0,0 +1,24 @@ +#ifndef X_HLS_MATH_H +#define X_HLS_MATH_H + +#include "ac_fixed.h" +#include + +namespace hls { + +template static T exp(const T x) { return (T)std::exp(x.to_double()); } + +template T sin(T x) { return (T)std::sin(x.to_double()); }; + +template T cos(T x) { return (T)std::cos(x.to_double()); }; + +template T asin(T x) { return (T)std::asin(x.to_double()); }; + +template T acos(T x) { return (T)std::acos(x.to_double()); }; + +template T atan(T x) { return (T)std::atan(x.to_double()); }; + +template T atan2(T x, T y) { return (T)hls::atan2(x.to_double(), y.to_double()); }; + +} // namespace hls +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_activation.h b/hls4ml/templates/catapult/nnet_utils/nnet_activation.h new file mode 100644 index 0000000000..f08e75a0d6 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_activation.h @@ -0,0 +1,1107 @@ + +// Change History: +// 2022-06-30 dgburnette - Cleaned up code to separate AC Math from LUT code. 
+// Added LUT dump to text file. +// Activation functions not implemented in AC Math will assert. +// 2022-06-28 dgburnette - Replaced AP Types with AC Datatypes. +// Commented out all Vivado pragmas. +// Added Catapult hierarchy pragmas. +// Started replacement of activation functions with +// AC Math piecewise-linear versions. + +#ifndef NNET_ACTIVATION_H_ +#define NNET_ACTIVATION_H_ + +// Define this macro to switch the implementations of certain activiation functions +// from the original HLS4ML look-up table approach to using the piecewise-linear approximation +// functions in AC Math. +#define USE_AC_MATH 1 + +#if !defined(USE_AC_MATH) && !defined(__SYNTHESIS__) +// Define a macro that causes the look-up table generation code to dump out text files +// of the array contents. +// #define BUILD_TABLE_FILE 1 +#endif + +#include "ac_fixed.h" +#include "ac_std_float.h" +#include "nnet_common.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nnet { + +struct activ_config { + // IO size + static const unsigned n_in = 10; + + // Internal info + static const unsigned table_size = 1024; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + + // Internal data type definitions + typedef ac_fixed<18, 8, true> table_t; +}; + +// ************************************************* +// LINEAR Activation -- See Issue 53 +// ************************************************* +template void linear(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + //#pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + res[ii] = data[ii]; + } +} + +// ************************************************* +// RELU Activation +// ************************************************* +template void relu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + //#pragma HLS PIPELINE + + data_T datareg; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + 
datareg = data[ii]; +#ifndef USE_AC_MATH + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = 0; +#else + ac_math::ac_relu(datareg, res[ii]); +#endif + } +} + +template +void relu_max(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + //#pragma HLS PIPELINE + data_T datareg; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg < 0) + res[ii] = 0; + else if (datareg > MAX_INT) + res[ii] = MAX_INT; + else + res[ii] = datareg; + } +} + +template void relu6(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + relu_max(data, res); +} + +template void relu1(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + relu_max(data, res); +} + +// ************************************************* +// Sigmoid Activation +// ************************************************* + +template +void ac_sigmoid_pwl_wrapper(const ac_fixed(&input) /*[K]*/, + ac_fixed(&output) /*[K]*/) { + ac_fixed tmp; //[K]; + ac_math::ac_sigmoid_pwl(input, tmp); + output = tmp; +} + +inline float sigmoid_fcn_float(float input) { return 1.0 / (1 + std::exp(-input)); } + +template void init_sigmoid_table(typename CONFIG_T::table_t table_out[N_TABLE]) { +#ifdef BUILD_TABLE_FILE + char filename[1024]; + sprintf(filename, "sigmoid_table%d.tab", N_TABLE); + FILE *f = fopen(filename, "w"); + fprintf(f, "// init_sigmoid_table()\n"); +#endif + // Default logistic sigmoid function: + // result = 1/(1+e^(-x)) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to +8) + float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = sigmoid_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; +#ifdef BUILD_TABLE_FILE + fprintf(f, "%32.31f", sigmoid_fcn_float(in_val)); + if (ii < N_TABLE - 1) + fprintf(f, ","); + fprintf(f, " 
// sigmoid(%32.31f)", in_val); + fprintf(f, "\n"); +#endif + } +#ifdef BUILD_TABLE_FILE + fclose(f); +#endif +} + +#ifndef USE_AC_MATH + +template +void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t sigmoid_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t sigmoid_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_sigmoid_table(sigmoid_table); + initialized = true; + } + + //#pragma HLS PIPELINE + + // Index into the lookup table based on data + int data_round; + int index; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii].to_double() * (int)CONFIG_T::table_size / 16; + index = data_round + 8 * (int)CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = (int)CONFIG_T::table_size - 1; + res[ii] = (res_T)sigmoid_table[index]; + } +} + +#else + +template +void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + // res[ii] = ac_math::ac_sigmoid_pwl(data[ii]); + ac_sigmoid_pwl_wrapper(data[ii], res[ii]); + } +} + +#endif + +// ************************************************* +// Softmax Activation +// ************************************************* + +enum class softmax_implementation { latency = 0, legacy = 1, stable = 2 }; + +inline float exp_fcn_float(float input) { return std::exp(input); } + +template inline float softmax_real_val_from_idx(unsigned i) { + // Treat the index as the top N bits + static constexpr int N = ceillog2(CONFIG_T::table_size); // number of address bits for table + data_T x(0); + // CATAPULT_PORT + // x(x.width-1, x.width-N) = i; + ac_int tmp = i; + x.template set_slc(x.width - N, tmp); + return (float)x.to_double(); +} + +template inline unsigned softmax_idx_from_real_val(data_T x) { + // Slice the top N 
bits to get an index into the table + static constexpr int N = ceillog2(CONFIG_T::table_size); // number of address bits for table + // CATAPULT_PORT + // ac_int y = x(x.width-1, x.width-N); // slice the top N bits of input + // return (unsigned) y(N-1, 0); + ac_int y = x.template slc(x.width - N); // slice the top N bits of input + return (unsigned)y.template slc(0); +} + +template +void init_exp_table(typename CONFIG_T::exp_table_t table_out[CONFIG_T::table_size]) { +#ifdef BUILD_TABLE_FILE + char filename[1024]; + sprintf(filename, "exp_table%d.tab", CONFIG_T::table_size); + FILE *f = fopen(filename, "w"); + fprintf(f, "// init_exp_table()\n"); +#endif + // The template data_T is the data type used to address the table + for (unsigned i = 0; i < CONFIG_T::table_size; i++) { + // Slicing bits for address is going to round towards 0, so take the central value + float x = softmax_real_val_from_idx(i); + typename CONFIG_T::exp_table_t exp_x = exp_fcn_float(x); + table_out[i] = exp_x; +#ifdef BUILD_TABLE_FILE + fprintf(f, "%32.31f", exp_fcn_float(x)); + if (i < CONFIG_T::table_size - 1) + fprintf(f, ","); + fprintf(f, " // exp(%32.31f)", x); + fprintf(f, "\n"); +#endif + } +#ifdef BUILD_TABLE_FILE + fclose(f); +#endif +} + +template +void init_invert_table(typename CONFIG_T::inv_table_t table_out[CONFIG_T::table_size]) { +#ifdef BUILD_TABLE_FILE + char filename[1024]; + sprintf(filename, "invert_table%d.tab", CONFIG_T::table_size); + FILE *f = fopen(filename, "w"); + fprintf(f, "// init_invert_table()\n"); +#endif + // The template data_T is the data type used to address the table + for (unsigned i = 0; i < CONFIG_T::table_size; i++) { + float x = softmax_real_val_from_idx(i); +#ifdef __SYNTHESIS__ + // hack for now to get through the flow + typename CONFIG_T::inv_table_t inv_x = 1 + x; +#else + typename CONFIG_T::inv_table_t inv_x = 1 / x; +#endif + table_out[i] = inv_x; +#ifdef BUILD_TABLE_FILE + if (x > 0.0) + fprintf(f, "%32.31f", (1.0 / x)); + else + fprintf(f, 
"%32.31f", 0.0); + if (i < CONFIG_T::table_size - 1) + fprintf(f, ","); + fprintf(f, " // 1/(%32.31f)", x); + fprintf(f, "\n"); +#endif + } +#ifdef BUILD_TABLE_FILE + fclose(f); +#endif +} + +#ifndef USE_AC_MATH + +template +void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + //#pragma HLS pipeline + // Initialize the lookup tables +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; + +#endif + if (!initialized) { + // Note we are exponentiating the inputs, which have type data_T + init_exp_table(exp_table); + // Note we are inverting the exponentials, which have type exp_table_t + init_invert_table(invert_table); + initialized = true; + } + + // Calculate all the e^x's + typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; + //#pragma HLS array_partition variable=exp_res complete + typename CONFIG_T::exp_table_t exp_sum(0); + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + //#pragma HLS unroll + unsigned x = softmax_idx_from_real_val(data[i]); + exp_res[i] = exp_table[x]; + } + + // Explicitly sum the results with an adder tree. 
+ // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + exp_sum = + reduce>(exp_res, op_add); + + typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_idx_from_real_val(exp_sum)]; + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + //#pragma HLS unroll + res[i] = exp_res[i] * inv_exp_sum; + } +} + +template +void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + //#pragma HLS pipeline + // Initialize the lookup tables +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; + +#endif + if (!initialized) { + // Note we are exponentiating the inputs, which have type data_T + init_exp_table(exp_table); + // Note we are inverting the exponentials, which have type exp_table_t + init_invert_table(invert_table); + initialized = true; + } + + // Find the max and compute all delta(x_i, x_max) + Op_max op_max; + data_T x_max = reduce>(data, op_max); + + // For the diffs, use the same type as the input but force rounding and saturation + ac_fixed d_xi_xmax[CONFIG_T::n_in]; + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + //#pragma HLS unroll + d_xi_xmax[i] = data[i] - x_max; + } + + // Calculate all the e^x's + typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; + //#pragma HLS array_partition variable=exp_res complete + typename CONFIG_T::exp_table_t exp_sum(0); + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + //#pragma HLS unroll + unsigned x = softmax_idx_from_real_val(d_xi_xmax[i]); + exp_res[i] = exp_table[x]; + } + + // Explicitly sum the results with an adder tree. 
+ // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + exp_sum = + reduce>(exp_res, op_add); + + typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_idx_from_real_val(exp_sum)]; + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + //#pragma HLS unroll + res[i] = exp_res[i] * inv_exp_sum; + } +} + +#endif + +template void init_exp_table_legacy(typename CONFIG_T::table_t table_out[N_TABLE]) { +#ifdef BUILD_TABLE_FILE + char filename[1024]; + sprintf(filename, "exp_table_legacy%d.tab", N_TABLE); + FILE *f = fopen(filename, "w"); + fprintf(f, "// init_exp_table_legacy()\n"); +#endif + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to +8) + float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = exp_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; +#ifdef BUILD_TABLE_FILE + fprintf(f, "%32.31f", exp_fcn_float(in_val)); + if (ii < N_TABLE - 1) + fprintf(f, ","); + fprintf(f, " // exp(%32.31f)", in_val); + fprintf(f, "\n"); +#endif + } +#ifdef BUILD_TABLE_FILE + fclose(f); +#endif +} + +template void init_invert_table_legacy(typename CONFIG_T::table_t table_out[N_TABLE]) { +#ifdef BUILD_TABLE_FILE + char filename[1024]; + sprintf(filename, "invert_table_legacy%d.tab", N_TABLE); + FILE *f = fopen(filename, "w"); + fprintf(f, "// init_invert_table_legacy()\n"); +#endif + // Inversion function: + // result = 1/x + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range 0 to +64) + float in_val = 64.0 * ii / float(N_TABLE); + // Next, compute lookup table function + if (in_val > 0.0) + table_out[ii] = 1.0 / in_val; + else + table_out[ii] = 0.0; +#ifdef BUILD_TABLE_FILE + if (in_val > 0.0) + fprintf(f, 
"%32.31f", (1.0 / in_val)); + else + fprintf(f, "%32.31f", 0.0); + if (ii < N_TABLE - 1) + fprintf(f, ","); + fprintf(f, " // 1/%32.31f", in_val); + fprintf(f, "\n"); +#endif + } +#ifdef BUILD_TABLE_FILE + fclose(f); +#endif +} + +#ifndef USE_AC_MATH + +template +void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::table_t invert_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_exp_table_legacy(exp_table); + init_invert_table_legacy(invert_table); + initialized = true; + } + + //#pragma HLS PIPELINE + + // Index into the lookup table based on data for exponentials + typename CONFIG_T::table_t exp_res[CONFIG_T::n_in]; // different, independent, fixed point precision + typename CONFIG_T::table_t exp_diff_res; // different, independent, fixed point precision + data_T data_cache[CONFIG_T::n_in]; + int data_round; + int index; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_cache[ii] = data[ii]; + exp_res[ii] = 0; + } + + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + for (int jj = 0; jj < CONFIG_T::n_in; jj++) { + if (ii == jj) + exp_diff_res = 1; + else { + // CATAPULT_PORT + // data_round = (data_cache[jj]-data_cache[ii])*CONFIG_T::table_size/16; + auto tmp_data_round = (data_cache[jj] - data_cache[ii]) * CONFIG_T::table_size / 16; + data_round = tmp_data_round.to_int(); + index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + exp_diff_res = exp_table[index]; + } + exp_res[ii] += exp_diff_res; + } + } + + // Second loop to invert + for (int ii = 0; ii < CONFIG_T::n_in; ii++) 
{ + // CATAPULT_PORT + // int exp_res_index = exp_res[ii]*CONFIG_T::table_size/64; + auto tmp_exp_res_index = exp_res[ii] * CONFIG_T::table_size / 64; + int exp_res_index = tmp_exp_res_index.to_int(); + if (exp_res_index < 0) + exp_res_index = 0; + if (exp_res_index > CONFIG_T::table_size - 1) + exp_res_index = CONFIG_T::table_size - 1; + // typename CONFIG_T::table_t exp_res_invert = invert_table[exp_res_index]; + res[ii] = (res_T)invert_table[exp_res_index]; + } +} + +template +void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + //#pragma HLS PIPELINE + switch (CONFIG_T::implementation) { + case softmax_implementation::latency: + softmax_latency(data, res); + break; + case softmax_implementation::stable: + softmax_stable(data, res); + break; + case softmax_implementation::legacy: + softmax_legacy(data, res); + break; + } +} + +#else +// This is a workaround to help the template deduction to work correctly and fix the inconsistency that HLS4ML expects +// softmax output to be signed but AC Math softmax knows it is always unsigned +template +void ac_softmax_pwl_wrapper(const ac_fixed (&input)[K], ac_fixed (&output)[K]) { + ac_fixed tmp[K]; + ac_math::ac_softmax_pwl(input, tmp); + for (unsigned int x = 0; x < K; x++) + output[x] = tmp[x]; +} + +template +void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + data_T data_copy[CONFIG_T::n_in]; + res_T res_copy[CONFIG_T::n_in]; +// workaround for the array passing - alternative is to change the signature of all of the functions to reference-of-array +COPY_IN_ARRAY: + for (unsigned i = 0; i < CONFIG_T::n_in; i++) + data_copy[i] = data[i]; + ac_softmax_pwl_wrapper(data_copy, res_copy); +COPY_OUT_ARRAY: + for (unsigned i = 0; i < CONFIG_T::n_in; i++) + res[i] = res_copy[i]; +} + +#endif + +// ************************************************* +// TanH Activation +// ************************************************* +template void init_tanh_table(typename CONFIG_T::table_t 
table_out[N_TABLE]) { +#ifdef BUILD_TABLE_FILE + char filename[1024]; + sprintf(filename, "tanh_table%d.tab", N_TABLE); + FILE *f = fopen(filename, "w"); + fprintf(f, "// init_tanh_table()\n"); +#endif + // Implement tanh lookup + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -4 to +4) + float in_val = 2 * 4.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = tanh(in_val); + // std::cout << "Tanh: Lookup table Index: " << ii<< " In Value: " << in_val << " Result: " << real_val << + // std::endl; + table_out[ii] = real_val; +#ifdef BUILD_TABLE_FILE + fprintf(f, "%32.31f", tanh(in_val)); + if (ii < N_TABLE - 1) + fprintf(f, ","); + fprintf(f, " // tanh(%32.31f)", in_val); + fprintf(f, "\n"); +#endif + } +#ifdef BUILD_TABLE_FILE + fclose(f); +#endif +} + +#ifndef USE_AC_MATH + +template void tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t tanh_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t tanh_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_tanh_table(tanh_table); + initialized = true; + } + + //#pragma HLS PIPELINE + + // Index into the lookup table based on data + int data_round; + int index; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii].to_double() * (int)CONFIG_T::table_size / 8; + index = data_round + 4 * (int)CONFIG_T::table_size / 8; + // std::cout << "Input: " << data[ii] << " Round: " << data_round << " Index: " << index << std::endl; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = (int)CONFIG_T::table_size - 1; + res[ii] = (res_T)tanh_table[index]; + } +} + +#else + +template void tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + for (int ii = 0; 
ii < CONFIG_T::n_in; ii++) { + // res[ii] = ac_math::ac_tanh_pwl(data[ii]); + ac_math::ac_tanh_pwl(data[ii], res[ii]); + } +} + +#endif + +// ************************************************* +// Hard sigmoid Activation +// ************************************************* +template +void hard_sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + //#pragma HLS PIPELINE + + data_T datareg; + data_T slope = (data_T)0.2; + data_T shift = (data_T)0.5; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = slope * data[ii] + shift; + if (datareg > 1) + datareg = 1; + else if (datareg < 0) + datareg = 0; + res[ii] = datareg; + } +} + +// ************************************************* +// Hard TanH Activation +// ************************************************* +template +void hard_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + //#pragma HLS PIPELINE + + data_T datareg; + data_T slope = (data_T)0.2; + data_T shift = (data_T)0.5; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto sigmoid = CONFIG_T::slope * data[ii] + CONFIG_T::shift; + if (sigmoid > 1) + datareg = 1; + else if (sigmoid < 0) + datareg = 0; + res[ii] = 2 * sigmoid - 1; + } +} + +// ************************************************* +// Leaky RELU Activation +// ************************************************* +template +void leaky_relu(data_T data[CONFIG_T::n_in], data_T alpha, res_T res[CONFIG_T::n_in]) { + //#pragma HLS PIPELINE + + data_T datareg; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = alpha * datareg; + } +} + +// ************************************************* +// Thresholded RELU Activation +// ************************************************* +template +void thresholded_relu(data_T data[CONFIG_T::n_in], data_T theta, res_T res[CONFIG_T::n_in]) { + //#pragma HLS PIPELINE + + data_T datareg; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if 
(datareg > theta) + res[ii] = datareg; + else + res[ii] = 0; + } +} + +// ************************************************* +// Softplus Activation +// ************************************************* +inline float softplus_fcn_float(float input) { return std::log(std::exp(input) + 1.); } + +template void init_softplus_table(typename CONFIG_T::table_t table_out[N_TABLE]) { +#ifdef BUILD_TABLE_FILE + char filename[1024]; + sprintf(filename, "softplus_table%d.tab", N_TABLE); + FILE *f = fopen(filename, "w"); + fprintf(f, "// init_softplus_table()\n"); +#endif + // Default softplus function: + // result = log(exp(x) + 1) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to +8) + float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = softplus_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; +#ifdef BUILD_TABLE_FILE + fprintf(f, "%32.31f", softplus_fcn_float(in_val)); + if (ii < N_TABLE - 1) + fprintf(f, ","); + fprintf(f, " // softplus(%32.31f)", in_val); + fprintf(f, "\n"); +#endif + } +#ifdef BUILD_TABLE_FILE + fclose(f); +#endif +} + +#ifndef USE_AC_MATH + +template +void softplus(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t softplus_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t softplus_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_softplus_table(softplus_table); + initialized = true; + } + + //#pragma HLS PIPELINE + + // Index into the lookup table based on data + int data_round; + int index; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii].to_double() * (int)CONFIG_T::table_size / 16; + 
index = data_round + 8 * (int)CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = (int)CONFIG_T::table_size - 1; + res[ii] = (res_T)softplus_table[index]; + } +} + +#else +template +void ac_softplus_pwl_wrapper(const ac_fixed(&input), ac_fixed(&output)) { + ac_fixed tmp; + ac_math::ac_softplus_pwl(input, tmp); + output = tmp; +} + +template +void softplus(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + ac_softplus_pwl_wrapper(data[ii], res[ii]); + } +} + +#endif + +// ************************************************* +// Softsign Activation +// ************************************************* +inline float softsign_fcn_float(float input) { return input / (std::abs(input) + 1.); } + +template void init_softsign_table(typename CONFIG_T::table_t table_out[N_TABLE]) { +#ifdef BUILD_TABLE_FILE + char filename[1024]; + sprintf(filename, "softsign_table%d.tab", N_TABLE); + FILE *f = fopen(filename, "w"); + fprintf(f, "// init_softsign_table()\n"); +#endif + // Default softsign function: + // result = x / (abs(x) + 1) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to +8) + float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = softsign_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; +#ifdef BUILD_TABLE_FILE + fprintf(f, "%32.31f", softsign_fcn_float(in_val)); + if (ii < N_TABLE - 1) + fprintf(f, ","); + fprintf(f, " // softsign(%32.31f)", in_val); + fprintf(f, "\n"); +#endif + } +#ifdef BUILD_TABLE_FILE + fclose(f); +#endif +} + +#ifndef USE_AC_MATH + +template +void softsign(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = 
false; + typename CONFIG_T::table_t softsign_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t softsign_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_softsign_table(softsign_table); + initialized = true; + } + + //#pragma HLS PIPELINE + + // Index into the lookup table based on data + int data_round; + int index; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii].to_double() * (int)CONFIG_T::table_size / 16; + index = data_round + 8 * (int)CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = (int)CONFIG_T::table_size - 1; + res[ii] = (res_T)softsign_table[index]; + } +} + +#else + +template +void softsign(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + // res[ii] = ac_math::ac_softsign_pwl(data[ii]); + ac_math::ac_softsign_pwl(data[ii], res[ii]); + } +} + +#endif + +// ************************************************* +// ELU Activation +// ************************************************* +inline float elu_fcn_float(float input) { return std::exp(input) - 1.; } + +template void init_elu_table(typename CONFIG_T::table_t table_out[N_TABLE]) { +#ifdef BUILD_TABLE_FILE + char filename[1024]; + sprintf(filename, "elu_table%d.tab", N_TABLE); + FILE *f = fopen(filename, "w"); + fprintf(f, "// init_elu_table()\n"); +#endif + // Default ELU function: + // result = alpha * (e^(x) - 1) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to 0) + float in_val = -8.0 * ii / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = elu_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; +#ifdef BUILD_TABLE_FILE + fprintf(f, "%32.31f", elu_fcn_float(in_val)); + if (ii < N_TABLE - 1) + 
fprintf(f, ","); + fprintf(f, " // elu(%32.31f)", in_val); + fprintf(f, "\n"); +#endif + } +#ifdef BUILD_TABLE_FILE + fclose(f); +#endif +} + +#ifndef USE_AC_MATH + +template +void elu(data_T data[CONFIG_T::n_in], const res_T alpha, res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t elu_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t elu_table[CONFIG_T::table_size]; +#endif + + if (!initialized) { + init_elu_table(elu_table); + initialized = true; + } + + //#pragma HLS PIPELINE + + data_T datareg; + // Index into the lookup table based on data + int index; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg >= 0) { + res[ii] = datareg; + } else { + index = datareg.to_double() * (int)CONFIG_T::table_size / -8; + if (index > CONFIG_T::table_size - 1) + index = (int)CONFIG_T::table_size - 1; + res[ii] = alpha * elu_table[index]; + } + } +} + +#else + +template +void elu(data_T data[CONFIG_T::n_in], const res_T alpha, res_T res[CONFIG_T::n_in]) { + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + ac_math::ac_elu_pwl(data[ii], res[ii], alpha); + } +} + +#endif + +// ************************************************* +// SELU Activation +// ************************************************* +inline float selu_fcn_float(float input) { + return 1.0507009873554804934193349852946 * (1.6732632423543772848170429916717 * (std::exp(input) - 1.)); +} + +template void init_selu_table(typename CONFIG_T::table_t table_out[N_TABLE]) { +#ifdef BUILD_TABLE_FILE + char filename[1024]; + sprintf(filename, "selu_table%d.tab", N_TABLE); + FILE *f = fopen(filename, "w"); + fprintf(f, "// init_selu_table()\n"); +#endif + // Default SELU function: + // result = 1.05 * (1.673 * (e^(x) - 1)) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to 0) + float 
in_val = -8.0 * ii / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = selu_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; +#ifdef BUILD_TABLE_FILE + fprintf(f, "%32.31f", selu_fcn_float(in_val)); + if (ii < N_TABLE - 1) + fprintf(f, ","); + fprintf(f, " // selu(%32.31f)", in_val); + fprintf(f, "\n"); +#endif + } +#ifdef BUILD_TABLE_FILE + fclose(f); +#endif +} + +#ifndef USE_AC_MATH + +template void selu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t selu_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t selu_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_selu_table(selu_table); + initialized = true; + } + + //#pragma HLS PIPELINE + + data_T datareg; + // Index into the lookup table based on data + int index; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg >= 0) { + res[ii] = res_T(1.0507009873554804934193349852946) * datareg; + } else { + index = datareg.to_double() * (int)CONFIG_T::table_size / -8; + if (index > CONFIG_T::table_size - 1) + index = (int)CONFIG_T::table_size - 1; + res[ii] = selu_table[index]; + } + } +} + +#else + +template void selu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + res[ii] = ac_math::ac_selu_pwl(data[ii]); + } +} + +#endif + +// ************************************************* +// PReLU Activation +// ************************************************* +template +void prelu(data_T data[CONFIG_T::n_in], data_T alpha[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + //#pragma HLS PIPELINE + + data_T datareg; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg > 0) + res[ii] = 
datareg; + else + res[ii] = alpha[ii] * datareg; + } +} + +// ************************************************* +// Binary TanH Activation +// ************************************************* +template +void binary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + //#pragma HLS PIPELINE + + data_T datareg; + res_T cache; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg > 0) + cache = 1; + else + cache = -1; + + res[ii] = (res_T)cache; + } +} + +// ************************************************* +// Ternary TanH Activation +// ************************************************* +template +void ternary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + + //#pragma HLS PIPELINE + + data_T datareg; + res_T cache; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = 2 * data[ii]; + if (datareg > 1) + cache = 1; + else if (datareg > -1 && datareg <= 1) + cache = 0; + else + cache = -1; + + res[ii] = (res_T)cache; + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_activation_stream.h new file mode 100644 index 0000000000..509560bc2b --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_activation_stream.h @@ -0,0 +1,922 @@ + +// Change History: +// 2022-06-30 dgburnette - Cleaned up code to separate AC Math from LUT code. +// Activation functions not implemented in AC Math will assert. +// 2022-06-28 dgburnette - Replaced AP Types with AC Datatypes. 
+ +#ifndef NNET_ACTIVATION_STREAM_H_ +#define NNET_ACTIVATION_STREAM_H_ + +#include "ac_channel.h" +#include "ac_fixed.h" +#include "nnet_activation.h" +#include "nnet_common.h" +#include "nnet_stream.h" +#include "nnet_types.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nnet { + +// ************************************************* +// LINEAR Activation +// ************************************************* +// Adding this to work around problem with Catapult and SR model where the output channel appears to be inout +template void linear(ac_channel &data, ac_channel &res) { +LinearActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + LinearPackLoop: + for (int j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + out_data[j] = in_data[j]; + } + + res.write(out_data); + } +} + +// ************************************************* +// RELU Activation +// ************************************************* +template void relu(ac_channel &data, ac_channel &res) { +ReLUActLoop: + for (unsigned int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + ReLUPackLoop: + for (unsigned int j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL +#ifndef USE_AC_MATH + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = 0; +#else + ac_math::ac_relu(in_data[j], out_data[j]); +#endif + } + + res.write(out_data); + } +} + +// ************************************************* +// Sigmoid Activation +// ************************************************* +#ifndef USE_AC_MATH + +template void sigmoid(ac_channel &data, ac_channel &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename 
CONFIG_T::table_t sigmoid_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t sigmoid_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_sigmoid_table(sigmoid_table); + initialized = true; + } + +SigmoidActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + SigmoidPackLoop: + for (int j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + int data_round = in_data[j].to_double() * (int)CONFIG_T::table_size / 16; + int index = data_round + 8 * (int)CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = (int)CONFIG_T::table_size - 1; + out_data[j] = sigmoid_table[index]; + } + + res.write(out_data); + } +} + +#else + +template void sigmoid(ac_channel &data, ac_channel &res) { +SigmoidActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + data_T in_data = data.read(); + res_T out_data; + SigmoidPackLoop: + for (int j = 0; j < res_T::size; j++) { + // ac_math::ac_sigmoid_pwl(in_data[j], out_data[j]); + ac_sigmoid_pwl_wrapper(in_data[j], out_data[j]); + } + res.write(out_data); + } +} + +#endif + +// ************************************************* +// Softmax Activation +// ************************************************* + +#ifndef USE_AC_MATH + +template +void softmax_latency(ac_channel &data, ac_channel &res) { + // Initialize the lookup tables +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; + +#endif + if (!initialized) { + // Note we are exponentiating the inputs, 
which have type data_T + init_exp_table(exp_table); + // Note we are inverting the exponentials, which have type exp_table_t + init_invert_table(invert_table); + initialized = true; + } + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_T::size, CONFIG_T::reuse_factor); + constexpr unsigned ii = data_T::size / multiplier_limit; + (void)ii; + + // Calculate all the e^x's + typename CONFIG_T::exp_table_t exp_res[data_T::size]; + //#pragma HLS array_partition variable=exp_res complete + typename CONFIG_T::exp_table_t exp_sum(0); + +SoftmaxExpLoop: + for (unsigned i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + //#pragma HLS PIPELINE II=ii + + data_T in_pack = data.read(); + SoftmaxExpPackLoop: + for (unsigned j = 0; j < data_T::size; j++) { + //#pragma HLS UNROLL + unsigned x = softmax_idx_from_real_val(in_pack[j]); + exp_res[j] = exp_table[x]; + } + + // Explicitly sum the results with an adder tree. + // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + exp_sum = + reduce>(exp_res, op_add); + + typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_idx_from_real_val(exp_sum)]; + + res_T out_pack; + //#pragma HLS DATA_PACK variable=out_pack + SoftmaxInvPackLoop: + for (unsigned j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + //#pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + out_pack[j] = exp_res[j] * inv_exp_sum; + } + res.write(out_pack); + } +} + +template +void softmax_stable(ac_channel &data, ac_channel &res) { + // Initialize the lookup tables +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; + +#endif + if (!initialized) { + // 
Note we are exponentiating the inputs, which have type data_T + init_exp_table(exp_table); + // Note we are inverting the exponentials, which have type exp_table_t + init_invert_table(invert_table); + initialized = true; + } + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_T::size, CONFIG_T::reuse_factor); + constexpr unsigned ii = data_T::size / multiplier_limit; + (void)ii; + + typename data_T::value_type data_array[data_T::size]; + //#pragma HLS ARRAY_PARTITION variable=data_array complete + + if constexpr (ii == 1) { + } + if constexpr (ii != 1) { + // future enhancement for Catapult + } +SoftmaxArrayLoop: + for (unsigned i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + //#pragma HLS PIPELINE II=ii + + data_T in_pack = data.read(); + SoftmaxArrayPackLoop: + for (unsigned j = 0; j < data_T::size; j++) { + //#pragma HLS UNROLL + data_array[j] = in_pack[j]; + } + + // Find the max and compute all delta(x_i, x_max) + Op_max op_max; + typename data_T::value_type x_max = + reduce>(data_array, op_max); + + // For the diffs, use the same type as the input but force rounding and saturation + ac_fixed d_xi_xmax[data_T::size]; + for (unsigned j = 0; j < data_T::size; j++) { + //#pragma HLS UNROLL + d_xi_xmax[j] = data_array[j] - x_max; + } + + // Calculate all the e^x's + typename CONFIG_T::exp_table_t exp_res[data_T::size]; + //#pragma HLS ARRAY_PARTITION variable=exp_res complete + typename CONFIG_T::exp_table_t exp_sum(0); + for (unsigned j = 0; j < data_T::size; j++) { + //#pragma HLS UNROLL + unsigned x = softmax_idx_from_real_val(d_xi_xmax[j]); + exp_res[j] = exp_table[x]; + } + + // Explicitly sum the results with an adder tree. 
+ // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + exp_sum = + reduce>(exp_res, op_add); + + typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_idx_from_real_val(exp_sum)]; + + res_T out_pack; + //#pragma HLS DATA_PACK variable=out_pack + SoftmaxInvPackLoop: + for (unsigned j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + //#pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + out_pack[j] = exp_res[j] * inv_exp_sum; + } + res.write(out_pack); + } +} + +template +void softmax_legacy(ac_channel &data, ac_channel &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::table_t invert_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_exp_table_legacy(exp_table); + init_invert_table_legacy(invert_table); + initialized = true; + } + + // Index into the lookup table based on data for exponentials + typename CONFIG_T::table_t exp_res[data_T::size]; + typename CONFIG_T::table_t exp_diff_res; + typename data_T::value_type data_cache[data_T::size]; + +SoftmaxInitLoop: + for (unsigned s = 0; s < CONFIG_T::n_in / data_T::size; s++) { + //#pragma HLS PIPELINE + data_T in_pack = data.read(); + SoftmaxInitPackLoop: + for (unsigned j = 0; j < data_T::size; j++) { + //#pragma HLS UNROLL + data_cache[j] = in_pack[j]; + exp_res[j] = 0; + } + + SoftmaxExpLoop: + for (int i = 0; i < data_T::size; i++) { + //#pragma HLS UNROLL + SoftmaxExpInner: + for (int j = 0; j < data_T::size; j++) { + //#pragma HLS UNROLL + + if (i == j) { + exp_diff_res = 1; + } else { + int data_round = + (data_cache[j].to_double() - data_cache[i].to_double()) * (int)CONFIG_T::table_size / 16; + int 
index = data_round + 8 * (int)CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = (int)CONFIG_T::table_size - 1; + exp_diff_res = exp_table[index]; + } + + exp_res[i] += exp_diff_res; + } + } + + res_T out_pack; + //#pragma HLS DATA_PACK variable=out_pack + SoftmaxInvPackLoop: + for (unsigned j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + + int exp_res_index = exp_res[j].to_double() * (int)CONFIG_T::table_size / 64; + if (exp_res_index < 0) + exp_res_index = 0; + if (exp_res_index > CONFIG_T::table_size - 1) + exp_res_index = (int)CONFIG_T::table_size - 1; + + out_pack[j] = (typename res_T::value_type)invert_table[exp_res_index]; + } + res.write(out_pack); + } +} + +template void softmax(ac_channel &data, ac_channel &res) { + assert(CONFIG_T::axis == -1); + + switch (CONFIG_T::implementation) { + case softmax_implementation::latency: + softmax_latency(data, res); + break; + case softmax_implementation::stable: + softmax_stable(data, res); + break; + case softmax_implementation::legacy: + softmax_legacy(data, res); + break; + } +} + +#else + +template void softmax(ac_channel &data, ac_channel &res) { + typename data_T::value_type data_cache[data_T::size]; + typename res_T::value_type res_cache[res_T::size]; +SoftmaxInitLoop: + for (unsigned s = 0; s < CONFIG_T::n_in / data_T::size; s++) { + data_T in_pack = data.read(); + + SoftmaxInitPackLoop: + for (unsigned j = 0; j < data_T::size; j++) { + data_cache[j] = in_pack[j]; + } + + res_T out_pack; + // ac_math::ac_softmax_pwl(data_cache,res_cache); + ac_softmax_pwl_wrapper(data_cache, res_cache); + + SoftmaxResPackLoop: + for (unsigned j = 0; j < res_T::size; j++) { + out_pack[j] = res_cache[j]; + } + + res.write(out_pack); + } +} + +#endif + +// ************************************************* +// TanH Activation +// ************************************************* + +#ifndef USE_AC_MATH + +template void tanh(ac_channel &data, ac_channel &res) { + // 
Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t tanh_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t tanh_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_tanh_table(tanh_table); + initialized = true; + } + +TanHActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + TanHPackLoop: + for (int j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + int data_round = in_data[j].to_double() * (int)CONFIG_T::table_size / 8; + int index = data_round + 4 * (int)CONFIG_T::table_size / 8; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = (int)CONFIG_T::table_size - 1; + out_data[j] = tanh_table[index]; + } + + res.write(out_data); + } +} + +#else + +template void tanh(ac_channel &data, ac_channel &res) { +TanHActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + TanHPackLoop: + for (int j = 0; j < res_T::size; j++) { + // int data_round = in_data[j]*CONFIG_T::table_size/8; + ac_math::ac_tanh_pwl(in_data[j], out_data[j]); + } + res.write(out_data); + } +} + +#endif + +// ************************************************* +// Hard sigmoid Activation +// ************************************************* + +template void hard_sigmoid(ac_channel &data, ac_channel &res) { + typename data_T::value_type slope = (typename data_T::value_type)0.2; + typename data_T::value_type shift = (typename data_T::value_type)0.5; + +HardSigmoidActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + HardSigmoidPackLoop: + for (int j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + typename 
data_T::value_type datareg = slope * in_data[j] + shift; + if (datareg > 1) + datareg = 1; + else if (datareg < 0) + datareg = 0; + out_data[j] = datareg; + } + + res.write(out_data); + } +} + +// ************************************************* +// Hard TanH Activation +// ************************************************* + +template void hard_tanh(ac_channel &data, ac_channel &res) { + // typename data_T::value_type slope = (typename data_T::value_type) 0.2; + // typename data_T::value_type shift = (typename data_T::value_type) 0.5; + +HardTanhActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + + HardTanhPackLoop: + for (int j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + auto sigmoid = CONFIG_T::slope * in_data[j] + CONFIG_T::shift; + if (sigmoid > 1) + sigmoid = 1; + else if (sigmoid < 0) + sigmoid = 0; + out_data[j] = 2 * sigmoid - 1; + } + + res.write(out_data); + } +} + +// ************************************************* +// Leaky RELU Activation +// ************************************************* +template +void leaky_relu(ac_channel &data, typename data_T::value_type alpha, ac_channel &res) { +LeakyReLUActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + LeakyReLUPackLoop: + for (int j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = alpha * in_data[j]; + } + res.write(out_data); + } +} + +// ************************************************* +// Thresholded RELU Activation +// ************************************************* + +template +void thresholded_relu(ac_channel &data, typename data_T::value_type theta, ac_channel &res) { +ThresholdedReLUActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; 
i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + ThresholdedReLUPackLoop: + for (int j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + if (in_data[j] > theta) + out_data[j] = in_data[j]; + else + out_data[j] = 0; + } + + res.write(out_data); + } +} + +// ************************************************* +// Softplus Activation +// ************************************************* + +#ifndef USE_AC_MATH + +template void softplus(ac_channel &data, ac_channel &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t softplus_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t softplus_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_softplus_table(softplus_table); + initialized = true; + } + +SoftplusActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + SoftplusPackLoop: + for (int j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + int data_round = in_data[j].to_double() * (int)CONFIG_T::table_size / 16; + int index = data_round + 8 * (int)CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = (int)CONFIG_T::table_size - 1; + out_data[j] = softplus_table[index]; + } + res.write(out_data); + } +} + +#else + +template void softplus(ac_channel &data, ac_channel &res) { +SoftplusActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + data_T in_data = data.read(); + res_T out_data; + SoftplusPackLoop: + for (int j = 0; j < res_T::size; j++) { + ac_softplus_pwl_wrapper(in_data[j], out_data[j]); + } + res.write(out_data); + } +} + +#endif + +// ************************************************* +// Softsign Activation +// 
************************************************* + +#ifndef USE_AC_MATH + +template void softsign(ac_channel &data, ac_channel &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t softsign_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t softsign_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_softsign_table(softsign_table); + initialized = true; + } + +SoftsignActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + SoftsignPackLoop: + for (int j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + int data_round = in_data[j].to_double() * (int)CONFIG_T::table_size / 16; + int index = data_round + 8 * (int)CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = (int)CONFIG_T::table_size - 1; + out_data[j] = softsign_table[index]; + } + res.write(out_data); + } +} + +#else + +template void softsign(ac_channel &data, ac_channel &res) { +SoftsignActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + data_T in_data = data.read(); + res_T out_data; + SoftsignPackLoop: + for (int j = 0; j < res_T::size; j++) { + ac_math::ac_softsign_pwl(in_data[j], out_data[j]); + } + res.write(out_data); + } +} + +#endif + +// ************************************************* +// ELU Activation +// ************************************************* + +#ifndef USE_AC_MATH + +template +void elu(ac_channel &data, typename data_T::value_type alpha, ac_channel &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t elu_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t elu_table[CONFIG_T::table_size]; +#endif + + if (!initialized) { 
+ init_elu_table(elu_table); + initialized = true; + } + +EluActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + EluPackLoop: + for (int j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + + typename data_T::value_type datareg = in_data[j]; + if (datareg >= 0) { + out_data[j] = datareg; + } else { + int index = (int)datareg.to_double() * (int)CONFIG_T::table_size / -8; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = alpha * elu_table[index]; + } + } + res.write(out_data); + } +} + +#else +template +void elu(ac_channel &data, typename data_T::value_type alpha, ac_channel &res) { +EluActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + data_T in_data = data.read(); + res_T out_data; + EluPackLoop: + for (int j = 0; j < res_T::size; j++) { + ac_math::ac_elu_pwl(in_data[j], out_data[j], alpha); + } + res.write(out_data); + } +} + +#endif + +// ************************************************* +// SELU Activation +// ************************************************* + +#ifndef USE_AC_MATH + +template void selu(ac_channel &data, ac_channel &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t selu_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t selu_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_selu_table(selu_table); + initialized = true; + } + +SeluActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + SeluPackLoop: + for (int j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + + typename data_T::value_type datareg = in_data[j]; + if (datareg >= 0) { + out_data[j] = (typename 
data_T::value_type)1.0507009873554804934193349852946 * datareg; + } else { + int index = (int)datareg.to_double() * (int)CONFIG_T::table_size / -8; + if (index > CONFIG_T::table_size - 1) + index = (int)CONFIG_T::table_size - 1; + out_data[j] = selu_table[index]; + } + } + res.write(out_data); + } +} + +#else + +template void selu(ac_channel &data, ac_channel &res) { +SeluActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + data_T in_data = data.read(); + res_T out_data; + SeluPackLoop: + for (int j = 0; j < res_T::size; j++) { + ac_math::ac_selu_pwl(in_data[j], out_data[j]); + } + res.write(out_data); + } +} + +#endif + +// ************************************************* +// PReLU Activation +// ************************************************* +template +void prelu(ac_channel &data, typename data_T::value_type alpha[CONFIG_T::n_in], ac_channel &res) { +PReLUActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + PReLUPackLoop: + for (int j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = alpha[i * res_T::size + j] * in_data[j]; + } + res.write(out_data); + } +} + +// ************************************************* +// Binary TanH Activation +// ************************************************* +template void binary_tanh(ac_channel &data, ac_channel &res) { +PReLUActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + PReLUPackLoop: + for (int j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + if (in_data[j] > 0) + out_data[j] = (typename res_T::value_type)1; + else + out_data[j] = (typename res_T::value_type) - 1; + } + res.write(out_data); + } +} + +// 
************************************************* +// Ternary TanH Activation +// ************************************************* +template void ternary_tanh(ac_channel &data, ac_channel &res) { +PReLUActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + PReLUPackLoop: + for (int j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + if (in_data[j] > 1) + out_data[j] = (typename res_T::value_type)1; + else if (in_data[j] <= -1) + out_data[j] = (typename res_T::value_type) - 1; + else + out_data[j] = (typename res_T::value_type)0; + } + res.write(out_data); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_array.h b/hls4ml/templates/catapult/nnet_utils/nnet_array.h new file mode 100755 index 0000000000..cd3b73cf73 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_array.h @@ -0,0 +1,52 @@ +#ifndef NNET_ARRAY_H_ +#define NNET_ARRAY_H_ + +#include + +namespace nnet { + +struct transpose_config { + static const unsigned height = 10; + static const unsigned width = 10; + static const unsigned depth = 10; + static constexpr unsigned perm[3] = {2, 0, 1}; +}; + +template +void transpose_2d(data_T data[CONFIG_T::height * CONFIG_T::width], res_T data_t[CONFIG_T::height * CONFIG_T::width]) { + //#pragma HLS PIPELINE + + for (int i = 0; i < CONFIG_T::height; i++) { + for (int j = 0; j < CONFIG_T::width; j++) { + data_t[j * CONFIG_T::height + i] = data[i * CONFIG_T::width + j]; + } + } +} + +template +void transpose_3d(data_T data[CONFIG_T::depth * CONFIG_T::height * CONFIG_T::width], + res_T data_t[CONFIG_T::depth * CONFIG_T::height * CONFIG_T::width]) { + unsigned dims[3] = {CONFIG_T::depth, CONFIG_T::height, CONFIG_T::width}; + unsigned dims_t[3]; + dims_t[0] = dims[CONFIG_T::perm[0]]; + dims_t[1] = dims[CONFIG_T::perm[1]]; + dims_t[2] = dims[CONFIG_T::perm[2]]; + + int idx[3] = 
{0}, idx_t[3] = {0}; + for (idx[0] = 0; idx[0] < dims[0]; idx[0]++) { + for (idx[1] = 0; idx[1] < dims[1]; idx[1]++) { + for (idx[2] = 0; idx[2] < dims[2]; idx[2]++) { + idx_t[0] = idx[CONFIG_T::perm[0]]; + idx_t[1] = idx[CONFIG_T::perm[1]]; + idx_t[2] = idx[CONFIG_T::perm[2]]; + + data_t[idx_t[0] * dims_t[1] * dims_t[2] + idx_t[1] * dims_t[2] + idx_t[2]] = + data[idx[0] * dims[1] * dims[2] + idx[1] * dims[2] + idx[2]]; + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_batchnorm.h b/hls4ml/templates/catapult/nnet_utils/nnet_batchnorm.h new file mode 100644 index 0000000000..1db18043ec --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_batchnorm.h @@ -0,0 +1,127 @@ +#ifndef NNET_BATCHNORM_H_ +#define NNET_BATCHNORM_H_ + +#include "ac_channel.h" +#include "nnet_common.h" +#include "nnet_dense.h" +#include + +namespace nnet { + +struct batchnorm_config { + // Internal data type definitions + typedef float bias_t; + typedef float scale_t; + + // Layer Sizes + static const unsigned n_in = 10; + static const int n_filt = -1; + static const unsigned n_scale_bias = 10; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + static const unsigned n_zeros = 0; + // partitioning arrays cyclically to go with roll factors? 
+ template using product = nnet::product::mult; +}; + +template +void normalize(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in], + typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias], + typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]) { + data_T cache; + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + //#pragma HLS function_instantiate variable=scale,bias + + // For parallel inputs: + // - completely partition arrays -- target fabric + // - if we have an unroll factor, limit number of multipliers + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; + (void)ce_reuse_factor; + + // #pragma HLS ARRAY_PARTITION variable=weights complete // remove this line for now, it breaks compression sometimes + //#pragma HLS ARRAY_PARTITION variable=scale complete + //#pragma HLS ARRAY_PARTITION variable=bias complete + + int multiplier_limit = ceil(float(CONFIG_T::n_in) / float(CONFIG_T::reuse_factor)); + CONFIG_T::template product::limit(multiplier_limit); + + // Calcuate result +Result: + for (int ires = 0; ires < CONFIG_T::n_in; ires++) { + if (CONFIG_T::n_filt == -1) { + res[ires] = CONFIG_T::template product::product(data[ires], scale[ires]) + + bias[ires]; + } else { + int norm_index = ires % CONFIG_T::n_filt; + res[ires] = + CONFIG_T::template product::product(data[ires], scale[norm_index]) + + bias[norm_index]; + } + } +} + +// **************************************************** +// Merged Batch Normalization and Quantized Tanh +// **************************************************** +struct batchnorm_quantized_tanh_config { + // Layer Sizes + static const unsigned n_in = 10; + static const int n_filt = -1; + static const unsigned n_scale_bias = 10; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + static const unsigned n_zeros = 0; +}; + +template +void 
normalize_binary_tanh(data_T data[CONFIG_T::n_in], ac_int<1, false> res[CONFIG_T::n_in], + data_T threshold[CONFIG_T::n_in]) { + //#pragma HLS PIPELINE + //#pragma HLS ARRAY_PARTITION variable=res complete + + data_T datareg; + ac_int<1, false> cache; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + int norm_index = CONFIG_T::n_filt == -1 ? ii : ii % CONFIG_T::n_filt; + if (datareg >= threshold[norm_index]) + cache = 1; + else + cache = 0; + + res[ii] = cache; + } +} + +template +void normalize_ternary_tanh(data_T data[CONFIG_T::n_in], ac_int<2, true> res[CONFIG_T::n_in], + data_T threshold_hi[CONFIG_T::n_in], data_T threshold_lo[CONFIG_T::n_in]) { + //#pragma HLS PIPELINE + //#pragma HLS ARRAY_PARTITION variable=res complete + + data_T datareg; + ac_int<2, true> cache; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + int norm_index = CONFIG_T::n_filt == -1 ? ii : ii % CONFIG_T::n_filt; + if (datareg > threshold_hi[norm_index]) + cache = 1; + else if (datareg <= threshold_lo[norm_index]) + cache = -1; + else + cache = 0; + + res[ii] = cache; + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_batchnorm_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_batchnorm_stream.h new file mode 100644 index 0000000000..48085f82dc --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_batchnorm_stream.h @@ -0,0 +1,113 @@ + +#ifndef NNET_BATCHNORM_STREAM_H_ +#define NNET_BATCHNORM_STREAM_H_ + +#include "ac_channel.h" +#include "nnet_common.h" +#include "nnet_mult.h" +#include "nnet_types.h" + +namespace nnet { + +// **************************************************** +// Streaming Batch Normalization +// **************************************************** + +template +void normalize(ac_channel &data, ac_channel &res, typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias], + typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]) { + //#pragma HLS ARRAY_PARTITION variable=scale 
complete + //#pragma HLS ARRAY_PARTITION variable=bias complete + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in, CONFIG_T::reuse_factor); + constexpr unsigned ii = CONFIG_T::n_in / multiplier_limit; + (void)ii; + CONFIG_T::template product::limit(multiplier_limit); + +BatchNormLoop: + for (unsigned int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + //#pragma HLS PIPELINE II=ii + + data_T in_data = data.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + BatchNormpack: + for (unsigned int j = 0; j < data_T::size; j++) { + // #pragma HLS UNROLL + int norm_index; + if (CONFIG_T::n_filt == -1) { + norm_index = i * data_T::size + j; + } else { + norm_index = j % CONFIG_T::n_filt; + } + out_data[j] = CONFIG_T::template product::product( + in_data[j], scale[norm_index]) + + bias[norm_index]; + } + + res.write(out_data); + } +} + +// **************************************************** +// Merged Batch Normalization and Quantized Tanh +// **************************************************** +template +void normalize_binary_tanh(ac_channel &data, ac_channel, CONFIG_T::n_in>> &res, + typename data_T::value_type threshold[CONFIG_T::n_in]) { + //#pragma HLS ARRAY_PARTITION variable=threshold complete + +BinaryNormLoop: + for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + nnet::array, CONFIG_T::n_scale_bias> out_data; + //#pragma HLS DATA_PACK variable=out_data + + BatchNormPack: + for (int j = 0; j < data_T::size; j++) { + out_data[j] = (in_data[j] > threshold[i * data_T::size + j]) ? 
1 : 0; + } + + res.write(out_data); + } +} + +template +void normalize_ternary_tanh(ac_channel &data, ac_channel, CONFIG_T::n_in>> &res, + typename data_T::value_type threshold_hi[CONFIG_T::n_in], + typename data_T::value_type threshold_lo[CONFIG_T::n_in]) { + //#pragma HLS ARRAY_PARTITION variable=threshold_hi complete + //#pragma HLS ARRAY_PARTITION variable=threshold_lo complete + +TernaryNormLoop: + for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + nnet::array, CONFIG_T::n_scale_bias> out_data; + //#pragma HLS DATA_PACK variable=out_data + + BatchNormPack: + for (int j = 0; j < data_T::size; j++) { + + int norm_index = i * data_T::size + j; + + if (in_data[j] > threshold_hi[norm_index]) { + out_data[j] = 1; + } else if (in_data[j] <= threshold_lo[norm_index]) { + out_data[j] = -1; + } else { + out_data[j] = 0; + } + } + + res.write(out_data); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_code_gen.h b/hls4ml/templates/catapult/nnet_utils/nnet_code_gen.h new file mode 100755 index 0000000000..e4db43682e --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_code_gen.h @@ -0,0 +1,32 @@ +#ifndef NNET_INSTR_GEN_H_ +#define NNET_INSTR_GEN_H_ + +#include "nnet_helpers.h" +#include + +namespace nnet { + +template class FillConv1DBuffer { + public: + static void fill_buffer(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_width * CONFIG_T::n_chan], + const unsigned partition) { + // To be implemented in subclasses + } +}; + +template class FillConv2DBuffer { + public: + static void + fill_buffer(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], + const unsigned partition) { + // To be implemented in subclasses + } +}; + +// hls4ml insert code + +} // namespace nnet + 
+#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_common.h b/hls4ml/templates/catapult/nnet_utils/nnet_common.h new file mode 100755 index 0000000000..b9b27209fa --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_common.h @@ -0,0 +1,66 @@ + +#ifndef NNET_COMMON_H_ +#define NNET_COMMON_H_ + +#include "ac_fixed.h" + +// This is a substitute for "ceil(n/(float)d)". +#define DIV_ROUNDUP(n, d) ((n + d - 1) / d) +#define MIN(n, d) (n > d ? d : n) +#define MAX(n, d) (n > d ? n : d) + +namespace nnet { + +// Common type definitions +enum io_type { io_parallel = 0, io_stream }; +enum strategy { latency, resource }; + +/* --- + * Balanced tree reduce implementation. + * For use in scenarios where Vivado cannot expression balance + * Reduces an array of inputs to a single value using the template binary operator 'Op', + * for example summing all elements with Op_add, or finding the maximum with Op_max + * Use only when the input array is fully unrolled. Or, slice out a fully unrolled section + * before applying and accumulate the result over the rolled dimension. + * --- */ +template T reduce(const T *x, Op op) { + static constexpr int leftN = pow2(floorlog2(N - 1)) > 0 ? pow2(floorlog2(N - 1)) : 0; + static constexpr int rightN = N - leftN > 0 ? N - leftN : 0; + + if (N == 1) { + return x[0]; + } else if (N == 2) { + return op(x[0], x[1]); + } else { + return op(reduce(x, op), reduce(x + leftN, op)); + } +} + +template class Op_add { + public: + T operator()(T a, T b) { return a + b; } +}; + +template class Op_and { + public: + T operator()(T a, T b) { return a && b; } +}; + +template class Op_or { + public: + T operator()(T a, T b) { return a || b; } +}; + +template class Op_max { + public: + T operator()(T a, T b) { return a >= b ? a : b; } +}; + +template class Op_min { + public: + T operator()(T a, T b) { return a <= b ? 
a : b; } +}; + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_conv1d.h b/hls4ml/templates/catapult/nnet_utils/nnet_conv1d.h new file mode 100755 index 0000000000..98e075d4ab --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_conv1d.h @@ -0,0 +1,62 @@ + +#ifndef NNET_CONV1D_H_ +#define NNET_CONV1D_H_ + +#include "nnet_common.h" +#include "nnet_conv1d_latency.h" +#include "nnet_conv1d_resource.h" +#include + +namespace nnet { + +struct conv1d_config { + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; + + // Convolutional parameters + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const unsigned in_width = 10; + static const unsigned n_chan = 0; + static const unsigned filt_width = 1; + static const unsigned kernel_size = filt_width; + static const unsigned n_filt = 1; + static const unsigned stride_width = 1; + static const unsigned dilation = 1; + static const unsigned out_width = 10; //(N_IN + PAD_LEFT * PAD_RIGHT - (DILATION * (FILT_WIDTH - 1) + 1)) / STRIDE + 1 + + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + static const unsigned n_zeros = 0; // not used yet +}; + +template +void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + if (CONFIG_T::strategy == nnet::latency) { + conv_1d_latency_cl(data, res, weights, biases); + } else { + conv_1d_resource_cl(data, res, weights, biases); + } +} + +template +void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t 
biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_width == 1); + + if (CONFIG_T::strategy == nnet::latency) { + pointwise_conv_1d_latency_cl(data, res, weights, biases); + } else { + pointwise_conv_1d_resource_cl(data, res, weights, biases); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/catapult/nnet_utils/nnet_conv1d_latency.h new file mode 100755 index 0000000000..0323b1ac4b --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_conv1d_latency.h @@ -0,0 +1,198 @@ +#ifndef NNET_CONV1D_LATENCY_H_ +#define NNET_CONV1D_LATENCY_H_ + +#include "nnet_common.h" +#include + +namespace nnet { + +// Computes multiplier limit +// This function should not be synthesized into firmware +template +int compute_multiplier_limit( + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt]) { + int n_mult = 0; + for (int ii = 0; ii < CONFIG_T::out_width; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + for (int jj = 0; jj < CONFIG_T::filt_width; jj++) { + + int index_weight = jj * CONFIG_T::n_chan * CONFIG_T::n_filt + cc * CONFIG_T::n_filt + ff; + + if ((ii * CONFIG_T::stride_width + jj) < CONFIG_T::pad_left || + (ii * CONFIG_T::stride_width + jj) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { + // padded -- do nothing + continue; + } else { + // need to tune this cut? 
+ if (weights[index_weight] > 1e-20 || weights[index_weight] < -1e-20) { + n_mult++; + } // end if nonzero weight + } // end not padding + } // end loop accross filter + } // end channel loop + } // end filter loop + } // end output loop + + return ceil(float(n_mult) / float(CONFIG_T::reuse_factor)); + +} // end compute_n_mult + +template +void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + + typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan * CONFIG_T::filt_width]; + typename CONFIG_T::accum_t acc[CONFIG_T::out_width][CONFIG_T::n_filt]; + + //#pragma HLS ARRAY_PARTITION variable=mult complete dim=0 + //#pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + //#pragma HLS function_instantiate variable=weights,biases + + // Parallel mode + //#pragma HLS PIPELINE + //#pragma HLS ARRAY_PARTITION variable=biases complete dim=0 + + // Limit multipliers to control parallelization + const int multiplier_limit = compute_multiplier_limit(weights); +//#pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + +// Convolve, saving all multiplication results to accumulate later +ConvOut: + for (int ii = 0; ii < CONFIG_T::out_width; ii++) { + ConvFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + ConvChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + ConvMult: + for (int jj = 0; jj < CONFIG_T::filt_width; jj++) { + + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan * CONFIG_T::filt_width + + ff * CONFIG_T::n_chan * CONFIG_T::filt_width + cc * CONFIG_T::filt_width + jj; + int index_weight = jj * CONFIG_T::n_chan * CONFIG_T::n_filt + cc * CONFIG_T::n_filt + ff; + int 
index_data = (ii * CONFIG_T::stride_width + jj - CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; + + if ((ii * CONFIG_T::stride_width + jj) < CONFIG_T::pad_left || + (ii * CONFIG_T::stride_width + jj) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { + mult[index_mult] = 0; + } else { + mult[index_mult] = data[index_data] * weights[index_weight]; + } + } + } // end channel loop + } // end filter loop + } // end output loop + + // Initialize accumulator with input biases + for (int ii = 0; ii < CONFIG_T::out_width; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + acc[ii][ff] = biases[ff]; + } + } + +// Accumulate multiplication result +AccumOut: + for (int ii = 0; ii < CONFIG_T::out_width; ii++) { + AccumFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Do "dot product" sum within filter and sum over channels + AccumChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + AccumDot: + for (int jj = 0; jj < CONFIG_T::filt_width; jj++) { + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan * CONFIG_T::filt_width + + ff * CONFIG_T::n_chan * CONFIG_T::filt_width + cc * CONFIG_T::filt_width + jj; + acc[ii][ff] += mult[index_mult]; + } // end dot product loop + } // end channel loop + } // end filter loop + } // end output loop + + // Cast to "res_t" type + for (int ii = 0; ii < CONFIG_T::out_width; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + res[ii * CONFIG_T::n_filt + ff] = (res_T)(acc[ii][ff]); + } + } +} + +template +void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_width == 1); + + typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan]; + typename CONFIG_T::accum_t acc[CONFIG_T::out_width][CONFIG_T::n_filt]; + + //#pragma HLS ARRAY_PARTITION 
variable=mult complete dim=0 + //#pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + //#pragma HLS function_instantiate variable=weights,biases + + // Parallel mode + //#pragma HLS PIPELINE + //#pragma HLS ARRAY_PARTITION variable=biases complete dim=0 + + // Limit multipliers to control parallelization + const int multiplier_limit = compute_multiplier_limit(weights); +//#pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + +// Convolve, saving all multiplication results to accumulate later +ConvOut: + for (int ii = 0; ii < CONFIG_T::out_width; ii++) { + ConvFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + ConvChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + int index_weight = cc * CONFIG_T::n_filt + ff; + int index_data = (ii * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; + + if ((ii * CONFIG_T::stride_width) < CONFIG_T::pad_left || + (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { + mult[index_mult] = 0; + } else { + mult[index_mult] = data[index_data] * weights[index_weight]; + } + } // end channel loop + } // end filter loop + } // end output loop + + // Initialize accumulator with input biases + for (int ii = 0; ii < CONFIG_T::out_width; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + acc[ii][ff] = biases[ff]; + } + } + +// Accumulate multiplication result +AccumOut: + for (int ii = 0; ii < CONFIG_T::out_width; ii++) { + AccumFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Do "dot product" sum within filter and sum over channels + AccumChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + acc[ii][ff] += mult[index_mult]; + } // end channel loop + } // end filter 
loop + } // end output loop + + // Cast to "res_t" type + for (int ii = 0; ii < CONFIG_T::out_width; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + res[ii * CONFIG_T::n_filt + ff] = (res_T)(acc[ii][ff]); + } + } +} + +} // namespace nnet +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_conv1d_resource.h b/hls4ml/templates/catapult/nnet_utils/nnet_conv1d_resource.h new file mode 100644 index 0000000000..143a1271ba --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_conv1d_resource.h @@ -0,0 +1,241 @@ +#ifndef NNET_CONV1D_RESOURCE_H_ +#define NNET_CONV1D_RESOURCE_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" + +namespace nnet { + +template +void im2col_1d(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + data_T data_col[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::out_width]) { + // int index = 0; + for (int channel = CONFIG_T::n_chan; channel--; data += CONFIG_T::in_width) { + //#pragma HLS PIPELINE II=1 rewind + for (int kernel_col = 0; kernel_col < CONFIG_T::filt_width; kernel_col++) { + int input_col = -CONFIG_T::pad_left + kernel_col * CONFIG_T::dilation; + for (int output_col = CONFIG_T::out_width; output_col; output_col--) { + if (input_col >= 0 && input_col < CONFIG_T::in_width) { + *(data_col++) = data[input_col]; + // data_col[index] = data[input_col]; + } else { + *(data_col++) = 0; + // data_col[index] = 0; + } + // index++; + input_col += CONFIG_T::stride_width; + } + } + } +} + +template +void conv_1d_full(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + data_T data_conv[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::out_width]; + data_T data_col[CONFIG_T::filt_width * CONFIG_T::n_chan]; + res_T res_col[CONFIG_T::n_filt]; + + ////#pragma HLS ARRAY_PARTITION variable=data_conv complete 
+ //#pragma HLS ARRAY_PARTITION variable=data_col complete + //#pragma HLS ARRAY_PARTITION variable=res_col complete + + im2col_1d(data, data_conv); + + for (int i = 0; i < CONFIG_T::out_width; i++) { + for (int j = 0; j < CONFIG_T::filt_width * CONFIG_T::n_chan; j++) { + data_col[j] = data_conv[j * CONFIG_T::out_width + i]; + } + dense_resource(data_col, res_col, weights, biases); + for (int j = 0; j < CONFIG_T::n_filt; j++) { + // res[i * CONFIG_T::n_filt + j] = res_col[j]; + res[j * CONFIG_T::out_width + i] = res_col[j]; // Transposed order + } + } +} + +template +void im2col_1d_cf_idx(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + data_T data_col[CONFIG_T::filt_width * CONFIG_T::n_chan], const int col) { +ChannelLoop: + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + //#pragma HLS PIPELINE II=1 rewind + KernelLoop: + for (int kernel_col = 0; kernel_col < CONFIG_T::filt_width; kernel_col++) { + int input_col = -CONFIG_T::pad_left + kernel_col * CONFIG_T::dilation + col * CONFIG_T::stride_width; + if (input_col >= 0 && input_col < CONFIG_T::in_width) { + //*(data_col++) = data[input_col]; + data_col[channel * CONFIG_T::filt_width + kernel_col] = data[channel * CONFIG_T::in_width + input_col]; + } else { + //*(data_col++) = 0; + data_col[channel * CONFIG_T::filt_width + kernel_col] = 0; + } + } + } +} + +template +void im2col_1d_cf(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + data_T data_col[CONFIG_T::n_chan * CONFIG_T::filt_width], const int col) { + int index = 0; +ChannelLoop: + for (int channel = CONFIG_T::n_chan; channel--; data += CONFIG_T::in_width) { + KernelLoop: + for (int kernel_col = 0; kernel_col < CONFIG_T::filt_width; kernel_col++) { + int input_col = -CONFIG_T::pad_left + kernel_col * CONFIG_T::dilation + col * CONFIG_T::stride_width; + if (input_col >= 0 && input_col < CONFIG_T::in_width) { + //*(data_col++) = data[input_col]; + data_col[index] = data[input_col]; + } else { + //*(data_col++) = 0; + data_col[index] 
= 0;
+            }
+            index++;
+        }
+    }
+}
+
+// conv_1d_resource_cf: channels-first 1-D convolution, built as im2col + dense per output column.
+// NOTE(review): the bare `template` lines in this chunk lost their <...> parameter lists when the
+// patch was pasted — restore from the source tree before applying.
+template
+void conv_1d_resource_cf(data_T data[CONFIG_T::n_chan * CONFIG_T::in_width],
+                         res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+                         typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
+                         typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+    // nin/nout/block_factor only feed the commented-out ARRAY_RESHAPE pragma below (dead code today).
+    const int nin = CONFIG_T::n_chan * CONFIG_T::filt_width;
+    const int nout = CONFIG_T::n_filt;
+    const int rufactor = CONFIG_T::reuse_factor;
+    const int block_factor = DIV_ROUNDUP(nin * nout, rufactor);
+
+    ////#pragma HLS function_instantiate variable=weights,biases
+    ////#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the designation HLS seems to choose
+    /// correctly
+    ////#pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor
+    ////#pragma HLS ARRAY_PARTITION variable=biases complete
+
+    data_T data_col[CONFIG_T::filt_width * CONFIG_T::n_chan];
+    res_T res_col[CONFIG_T::n_filt];
+
+    //#pragma HLS ARRAY_PARTITION variable=data_col complete
+    //#pragma HLS ARRAY_PARTITION variable=res_col complete
+
+ColLoop:
+    for (int i = 0; i < CONFIG_T::out_width; i++) {
+        //#pragma HLS PIPELINE
+        im2col_1d_cf(data, data_col, i);
+        dense_resource(data_col, res_col, weights, biases);
+        for (int j = 0; j < CONFIG_T::n_filt; j++) {
+            // res[i * CONFIG_T::n_filt + j] = res_col[j];
+            res[j * CONFIG_T::out_width + i] = res_col[j]; // Transposed order
+        }
+    }
+}
+
+// im2col_1d_cl: gather one output column's receptive field, channels-last layout.
+// The flattened-index bounds test below is equivalent to checking the input column range,
+// because 0 <= channel < n_chan (negative columns flatten to index < 0, columns >= in_width
+// flatten to index >= in_width * n_chan).
+template
+void im2col_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
+                  data_T data_col[CONFIG_T::filt_width * CONFIG_T::n_chan], const int col) {
+    int index = 0;
+KernelLoop:
+    for (int kernel_col = 0; kernel_col < CONFIG_T::filt_width; kernel_col++) {
+
+    ChannelLoop:
+        for (int channel = 0; channel < CONFIG_T::n_chan; channel++) {
+            int index_data = (col * CONFIG_T::stride_width + kernel_col - CONFIG_T::pad_left) * CONFIG_T::n_chan + channel;
+
+            if (index_data >= 0 && index_data < CONFIG_T::in_width * CONFIG_T::n_chan) {
+                data_col[index] = data[index_data];
+            } else {
+                data_col[index] = 0; // zero-pad outside the input
+            }
+            index++;
+        }
+    }
+}
+
+// im2col_1d_pointwise_cl: 1x1-kernel specialization — one input column per output column.
+template
+void im2col_1d_pointwise_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], data_T data_col[CONFIG_T::n_chan],
+                            const int col) {
+    int index = 0;
+ChannelLoop:
+    for (int channel = 0; channel < CONFIG_T::n_chan; channel++) {
+
+        int index_data = (col * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + channel;
+
+        if (index_data >= 0 && index_data < CONFIG_T::in_width * CONFIG_T::n_chan) {
+            data_col[index] = data[index_data];
+        } else {
+            data_col[index] = 0;
+        }
+        index++;
+    }
+}
+
+// conv_1d_resource_cl: channels-last 1-D convolution, im2col + dense per output column.
+// Unlike the _cf variant above, the result is written in channels-last (non-transposed) order.
+template
+void conv_1d_resource_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
+                         res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+                         typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
+                         typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+    const int nin = CONFIG_T::n_chan * CONFIG_T::filt_width;
+    const int nout = CONFIG_T::n_filt;
+    const int rufactor = CONFIG_T::reuse_factor;
+    const int block_factor = DIV_ROUNDUP(nin * nout, rufactor);
+
+    ////#pragma HLS function_instantiate variable=weights,biases
+    ////#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the designation HLS seems to choose
+    /// correctly
+    ////#pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor
+    ////#pragma HLS ARRAY_PARTITION variable=biases complete
+
+    data_T data_col[CONFIG_T::filt_width * CONFIG_T::n_chan];
+    res_T res_col[CONFIG_T::n_filt];
+
+    //#pragma HLS ARRAY_PARTITION variable=data_col complete
+    //#pragma HLS ARRAY_PARTITION variable=res_col complete
+
+ColLoop:
+    for (int i = 0; i < CONFIG_T::out_width; i++) {
+        //#pragma HLS PIPELINE
+        im2col_1d_cl(data, data_col, i);
+        dense_resource(data_col, res_col, weights, biases);
+        for (int j = 0; j < CONFIG_T::n_filt; j++) {
+            res[i * CONFIG_T::n_filt + j] = res_col[j];
+        }
+    }
+}
+
+template
+void pointwise_conv_1d_resource_cl(data_T
data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_width == 1); + + const int nin = CONFIG_T::n_chan; + const int nout = CONFIG_T::n_filt; + const int rufactor = CONFIG_T::reuse_factor; + const int block_factor = DIV_ROUNDUP(nin * nout, rufactor); + + ////#pragma HLS function_instantiate variable=weights,biases + ////#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose + /// correctly + ////#pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + ////#pragma HLS ARRAY_PARTITION variable=biases complete + + data_T data_col[CONFIG_T::n_chan]; + res_T res_col[CONFIG_T::n_filt]; + + //#pragma HLS ARRAY_PARTITION variable=data_col complete + //#pragma HLS ARRAY_PARTITION variable=res_col complete + +ColLoop: + for (int i = 0; i < CONFIG_T::out_width; i++) { + //#pragma HLS PIPELINE + im2col_1d_pointwise_cl(data, data_col, i); + dense_resource(data_col, res_col, weights, biases); + for (int j = 0; j < CONFIG_T::n_filt; j++) { + res[i * CONFIG_T::n_filt + j] = res_col[j]; + } + } +} + +} // namespace nnet +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_conv1d_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_conv1d_stream.h new file mode 100644 index 0000000000..48f6244ce1 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_conv1d_stream.h @@ -0,0 +1,94 @@ +#ifndef NNET_CONV1D_STREAM_H_ +#define NNET_CONV1D_STREAM_H_ + +#include "ac_channel.h" +#include "nnet_common.h" +#include "nnet_conv_stream.h" + +namespace nnet { + +template +void compute_scaled_indices_1d(const unsigned w_idx, ac_int *pixel_idx) { + unsigned wp_idx = w_idx * (data_T::size / CONFIG_T::n_chan); + +ComputeIndex: + for (unsigned p = 0; p < data_T::size / CONFIG_T::n_chan; p++) { + // #pragma HLS UNROLL + 
unsigned sw_idx = + CONFIG_T::template scale_index::scale_index( + wp_idx + p); + pixel_idx[p] = CONFIG_T::pixels[sw_idx]; + } +} + +template +void conv_1d_encoded_cl(ac_channel &data, ac_channel &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + ac_channel data_window[CONFIG_T::filt_width * CONFIG_T::n_chan]; + // const int win_depth = CONFIG_T::out_width; + // for (unsigned i_out = 0; i_out < CONFIG_T::filt_width * CONFIG_T::n_chan; i_out++) { + // #pragma HLS STREAM variable=data_window[i_out] depth=win_depth + // } + + //#pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels complete + + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + unsigned outputs_ready = 0; + + ac_int pixel_idx[data_T::size / CONFIG_T::n_chan]; + //#pragma HLS ARRAY_PARTITION variable=pixel_idx complete + + constexpr int ce_reuse_factor = + CONFIG_T::reuse_factor * (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1); + (void)ce_reuse_factor; +ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_chan); i_iw++) { + //#pragma HLS LOOP_FLATTEN + if (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + compute_scaled_indices_1d(i_iw, pixel_idx); + compute_output_encoded(data.read(), data_window, res, res_pack, outputs_ready, weights, + biases, pixel_idx); + } +} + +template +void conv_1d_buffer_cl(ac_channel &data, ac_channel &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor * (CONFIG_T::strategy == nnet::latency); + 
(void)ce_reuse_factor; +ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + //#pragma HLS LOOP_FLATTEN + if (CONFIG_T::strategy == nnet::latency) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + compute_output_buffer_1d(data.read(), res, weights, biases); + } +} + +template +void conv_1d_cl(ac_channel &data, ac_channel &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + //#pragma HLS inline region + switch (CONFIG_T::implementation) { + case conv_implementation::linebuffer: + conv_1d_buffer_cl(data, res, weights, biases); + break; + case conv_implementation::encoded: + conv_1d_encoded_cl(data, res, weights, biases); + break; + } +} + +} // namespace nnet +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_conv2d.h b/hls4ml/templates/catapult/nnet_utils/nnet_conv2d.h new file mode 100755 index 0000000000..01476a0449 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_conv2d.h @@ -0,0 +1,84 @@ + +#ifndef NNET_CONV2D_H_ +#define NNET_CONV2D_H_ + +#include "nnet_common.h" +#include "nnet_conv2d_latency.h" +#include "nnet_conv2d_resource.h" +#include + +namespace nnet { + +struct conv2d_config { + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; + + // Convolutional parameters + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const unsigned in_height = 10; + static const unsigned in_width = 10; + static const unsigned n_chan = 1; + static const unsigned filt_height = 1; + static const unsigned filt_width = 1; + static const unsigned kernel_size = filt_height * filt_width; + static const unsigned n_filt = 1; + static const unsigned stride_height = 1; + static const unsigned stride_width = 1; + static const unsigned out_height = 10; + 
static const unsigned out_width = 10;
+    static const unsigned dilation_height = 1;
+    static const unsigned dilation_width = 1;
+
+    static const unsigned reuse_factor = 1;
+    static const bool store_weights_in_bram = false;
+    static const unsigned n_zeros = 0; // not used yet
+};
+
+// conv_2d_cf: strategy dispatcher (channels-first) — latency vs. resource implementation.
+// NOTE(review): bare `template` lines in this chunk lost their <...> parameter lists in transfer;
+// restore from the source tree before applying.
+template
+void conv_2d_cf(
+    data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan],
+    res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt],
+    typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
+    typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+    if (CONFIG_T::strategy == nnet::latency) {
+        conv_2d_latency_cf(data, res, weights, biases);
+    } else {
+        conv_2d_resource_cf(data, res, weights, biases);
+    }
+}
+
+// conv_2d_cl: strategy dispatcher (channels-last).
+template
+void conv_2d_cl(
+    data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan],
+    res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt],
+    typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
+    typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+    if (CONFIG_T::strategy == nnet::latency) {
+        conv_2d_latency_cl(data, res, weights, biases);
+    } else {
+        conv_2d_resource_cl(data, res, weights, biases);
+    }
+}
+
+// pointwise_conv_2d_cl: 1x1-kernel dispatcher (channels-last); guarded by filt_width == 1.
+template
+void pointwise_conv_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan],
+                          res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt],
+                          typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
+                          typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+    assert(CONFIG_T::filt_width == 1);
+
+    if (CONFIG_T::strategy == nnet::latency) {
+        pointwise_conv_2d_latency_cl(data, res, weights, biases);
+    } else {
+        pointwise_conv_2d_resource_cl(data, res, weights, biases);
+    }
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_conv2d_latency.h b/hls4ml/templates/catapult/nnet_utils/nnet_conv2d_latency.h
new file mode 100644
index 0000000000..29dd8ca633
--- /dev/null
+++ b/hls4ml/templates/catapult/nnet_utils/nnet_conv2d_latency.h
@@ -0,0 +1,392 @@
+#ifndef NNET_CONV2D_LATENCY_H_
+#define NNET_CONV2D_LATENCY_H_
+
+#include "nnet_common.h"
+#include
+
+namespace nnet {
+
+// Computes multiplier limit
+// This function should not be synthesized into firmware
+template
+int compute_multiplier_limit_conv2d(typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width *
+                                                                        CONFIG_T::n_chan * CONFIG_T::n_filt]) {
+    // Counts every multiplication whose input pixel is not padding and whose weight is
+    // non-negligible, then divides by reuse_factor (rounded up).
+    int n_mult = 0;
+
+    for (int oh = 0; oh < CONFIG_T::out_height; oh++) {
+        for (int ow = 0; ow < CONFIG_T::out_width; ow++) {
+            for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
+                for (int cc = 0; cc < CONFIG_T::n_chan; cc++) {
+                    for (int fh = 0; fh < CONFIG_T::filt_height; fh++) {
+                        for (int fw = 0; fw < CONFIG_T::filt_width; fw++) {
+
+                            int index_weight = fh * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt +
+                                               fw * CONFIG_T::n_chan * CONFIG_T::n_filt + cc * CONFIG_T::n_filt + ff;
+
+                            if ((oh * CONFIG_T::stride_height + fh) < CONFIG_T::pad_top ||
+                                (oh * CONFIG_T::stride_height + fh) >= (CONFIG_T::pad_top + CONFIG_T::in_height) ||
+                                (ow * CONFIG_T::stride_width + fw) < CONFIG_T::pad_left ||
+                                (ow * CONFIG_T::stride_width + fw) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) {
+                                // padded - do nothing
+                                continue;
+                            } else {
+                                // NOTE(review): with fixed-point weight types this effectively counts any
+                                // non-zero weight — confirm the 1e-20 threshold is intentional.
+                                if (weights[index_weight] > 1e-20 || weights[index_weight] < -1e-20) {
+                                    n_mult++;
+                                }
+                            }
+
+                        } // end mult loop
+                    } // end channel loop
+                } // end filter width loop
+            } // end filter height loop
+        } // end output width loop
+    } // end output height loop
+
+    // return ceil(float(n_mult) / float(CONFIG_T::reuse_factor));
+    return (n_mult + CONFIG_T::reuse_factor - 1) / CONFIG_T::reuse_factor;
+
+} // end compute_n_mult
+
+template
+void conv_2d_latency_cf(
+    data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan],
+    res_T
res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + + typename CONFIG_T::accum_t mult[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan * + CONFIG_T::filt_height * CONFIG_T::filt_width]; + typename CONFIG_T::accum_t acc[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]; + + //#pragma HLS ARRAY_PARTITION variable=mult complete dim=0 + //#pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + //#pragma HLS function_instantiate variable=weights,biases + + // Parallel mode + //#pragma HLS PIPELINE + //#pragma HLS ARRAY_PARTITION variable=biases complete dim=0 + + // Limit multipliers to control parallelization + const int multiplier_limit = compute_multiplier_limit_conv2d(weights); +//#pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + +// Convolve, saving all multiplication results to accumulate later +ConvOutHeight: + for (int oh = 0; oh < CONFIG_T::out_height; oh++) { + ConvOutWidth: + for (int ow = 0; ow < CONFIG_T::out_width; ow++) { + ConvFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + ConvChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + ConvFiltHeight: + for (int fh = 0; fh < CONFIG_T::filt_height; fh++) { + ConvFiltWidth: + for (int fw = 0; fw < CONFIG_T::filt_width; fw++) { + + int index_mult = + oh * CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan * CONFIG_T::filt_height * + CONFIG_T::filt_width + + ow * CONFIG_T::n_filt * CONFIG_T::n_chan * CONFIG_T::filt_height * CONFIG_T::filt_width + + ff * CONFIG_T::n_chan * CONFIG_T::filt_height * CONFIG_T::filt_width + + cc * CONFIG_T::filt_height * CONFIG_T::filt_width + fh * CONFIG_T::filt_width + fw; + + int index_weight = fh * 
CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt + + fw * CONFIG_T::n_chan * CONFIG_T::n_filt + cc * CONFIG_T::n_filt + ff; + + if ((oh * CONFIG_T::stride_height + fh) < CONFIG_T::pad_top || + (oh * CONFIG_T::stride_height + fh) >= (CONFIG_T::pad_top + CONFIG_T::in_height) || + (ow * CONFIG_T::stride_width + fw) < CONFIG_T::pad_left || + (ow * CONFIG_T::stride_width + fw) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { + mult[index_mult] = 0; + } else { + int index_data = + cc * CONFIG_T::in_height * CONFIG_T::in_width + + (oh * CONFIG_T::stride_height + fh - CONFIG_T::pad_top) * CONFIG_T::in_width + + (ow * CONFIG_T::stride_width + fw - CONFIG_T::pad_left); + mult[index_mult] = data[index_data] * weights[index_weight]; + } + + } // end mult loop + } // end channel loop + } // end filter width loop + } // end filter height loop + } // end output width loop + } // end output height loop + + // Initialize accumulator with input biases + for (int oh = 0; oh < CONFIG_T::out_height; oh++) { + for (int ow = 0; ow < CONFIG_T::out_width; ow++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + acc[oh * CONFIG_T::out_width * CONFIG_T::n_filt + ow * CONFIG_T::n_filt + ff] = biases[ff]; + } + } + } + +// Accumulate multiplication result +AccumOutHeight: + for (int oh = 0; oh < CONFIG_T::out_height; oh++) { + AccumOutWidth: + for (int ow = 0; ow < CONFIG_T::out_width; ow++) { + AccumFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Do "dot product" sum within filter and sum over channels + AccumChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + AccumDotHeight: + for (int fh = 0; fh < CONFIG_T::filt_height; fh++) { + AccumDotWidth: + for (int fw = 0; fw < CONFIG_T::filt_width; fw++) { + + int index_mult = + oh * CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan * CONFIG_T::filt_height * + CONFIG_T::filt_width + + ow * CONFIG_T::n_filt * CONFIG_T::n_chan * CONFIG_T::filt_height * CONFIG_T::filt_width + + ff * CONFIG_T::n_chan * 
CONFIG_T::filt_height * CONFIG_T::filt_width + + cc * CONFIG_T::filt_height * CONFIG_T::filt_width + fh * CONFIG_T::filt_width + fw; + int index_acc = oh * CONFIG_T::out_width * CONFIG_T::n_filt + ow * CONFIG_T::n_filt + ff; + + acc[index_acc] += mult[index_mult]; + + } // end dot product filter width loop + } // end dot product filter height loop + } // end n channel loop + } // end n filter loop + } // end output width loop + } // end output height loop + + // Cast to "res_t" type + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + for (int oh = 0; oh < CONFIG_T::out_height; oh++) { + for (int ow = 0; ow < CONFIG_T::out_width; ow++) { + int res_index = ff * CONFIG_T::out_height * CONFIG_T::out_width + oh * CONFIG_T::out_width + ow; + int acc_index = oh * CONFIG_T::out_width * CONFIG_T::n_filt + ow * CONFIG_T::n_filt + ff; + res[res_index] = acc[acc_index]; + } + } + } + +} // end conv2d + +template +void conv_2d_latency_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + + typename CONFIG_T::accum_t mult[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan * + CONFIG_T::filt_height * CONFIG_T::filt_width]; + typename CONFIG_T::accum_t acc[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]; + + //#pragma HLS ARRAY_PARTITION variable=mult complete dim=0 + //#pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + //#pragma HLS function_instantiate variable=weights,biases + + // Parallel mode + //#pragma HLS PIPELINE + //#pragma HLS ARRAY_PARTITION variable=biases complete dim=0 + + // Limit multipliers to control parallelization + const int 
multiplier_limit = compute_multiplier_limit_conv2d(weights); +//#pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + +// Convolve, saving all multiplication results to accumulate later +ConvOutHeight: + for (int oh = 0; oh < CONFIG_T::out_height; oh++) { + ConvOutWidth: + for (int ow = 0; ow < CONFIG_T::out_width; ow++) { + ConvFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + ConvChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + ConvFiltHeight: + for (int fh = 0; fh < CONFIG_T::filt_height; fh++) { + ConvFiltWidth: + for (int fw = 0; fw < CONFIG_T::filt_width; fw++) { + + int index_mult = + oh * CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan * CONFIG_T::filt_height * + CONFIG_T::filt_width + + ow * CONFIG_T::n_filt * CONFIG_T::n_chan * CONFIG_T::filt_height * CONFIG_T::filt_width + + ff * CONFIG_T::n_chan * CONFIG_T::filt_height * CONFIG_T::filt_width + + cc * CONFIG_T::filt_height * CONFIG_T::filt_width + fh * CONFIG_T::filt_width + fw; + + int index_weight = fh * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt + + fw * CONFIG_T::n_chan * CONFIG_T::n_filt + cc * CONFIG_T::n_filt + ff; + + if ((oh * CONFIG_T::stride_height + fh) < CONFIG_T::pad_top || + (oh * CONFIG_T::stride_height + fh) >= (CONFIG_T::pad_top + CONFIG_T::in_height) || + (ow * CONFIG_T::stride_width + fw) < CONFIG_T::pad_left || + (ow * CONFIG_T::stride_width + fw) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { + mult[index_mult] = 0; + } else { + int index_data = (oh * CONFIG_T::stride_height + fh - CONFIG_T::pad_top) * + CONFIG_T::in_width * CONFIG_T::n_chan + + (ow * CONFIG_T::stride_width + fw - CONFIG_T::pad_left) * CONFIG_T::n_chan + + cc; + mult[index_mult] = data[index_data] * weights[index_weight]; + } + + } // end mult loop + } // end channel loop + } // end filter width loop + } // end filter height loop + } // end output width loop + } // end output height loop + + // Initialize accumulator with input biases + for (int oh = 0; 
oh < CONFIG_T::out_height; oh++) { + for (int ow = 0; ow < CONFIG_T::out_width; ow++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + acc[oh * CONFIG_T::out_width * CONFIG_T::n_filt + ow * CONFIG_T::n_filt + ff] = biases[ff]; + } + } + } + +// Accumulate multiplication result +AccumOutHeight: + for (int oh = 0; oh < CONFIG_T::out_height; oh++) { + AccumOutWidth: + for (int ow = 0; ow < CONFIG_T::out_width; ow++) { + AccumFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Do "dot product" sum within filter and sum over channels + AccumChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + AccumDotHeight: + for (int fh = 0; fh < CONFIG_T::filt_height; fh++) { + AccumDotWidth: + for (int fw = 0; fw < CONFIG_T::filt_width; fw++) { + + int index_mult = + oh * CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan * CONFIG_T::filt_height * + CONFIG_T::filt_width + + ow * CONFIG_T::n_filt * CONFIG_T::n_chan * CONFIG_T::filt_height * CONFIG_T::filt_width + + ff * CONFIG_T::n_chan * CONFIG_T::filt_height * CONFIG_T::filt_width + + cc * CONFIG_T::filt_height * CONFIG_T::filt_width + fh * CONFIG_T::filt_width + fw; + int index_acc = oh * CONFIG_T::out_width * CONFIG_T::n_filt + ow * CONFIG_T::n_filt + ff; + + acc[index_acc] += mult[index_mult]; + + } // end dot product filter width loop + } // end dot product filter height loop + } // end n channel loop + } // end n filter loop + } // end output width loop + } // end output height loop + + // Cast to "res_t" type + for (int oh = 0; oh < CONFIG_T::out_height; oh++) { + for (int ow = 0; ow < CONFIG_T::out_width; ow++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + int index = oh * CONFIG_T::out_width * CONFIG_T::n_filt + ow * CONFIG_T::n_filt + ff; + res[index] = (res_T)(acc[index]); + } + } + } + +} // end conv2d + +template +void pointwise_conv_2d_latency_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * 
CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + + typename CONFIG_T::accum_t mult[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan]; + typename CONFIG_T::accum_t acc[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]; + + //#pragma HLS ARRAY_PARTITION variable=mult complete dim=0 + //#pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + //#pragma HLS function_instantiate variable=weights,biases + + // Parallel mode + //#pragma HLS PIPELINE + //#pragma HLS ARRAY_PARTITION variable=biases complete dim=0 + + // Limit multipliers to control parallelization + const int multiplier_limit = compute_multiplier_limit_conv2d(weights); +//#pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + +// Convolve, saving all multiplication results to accumulate later +ConvOutHeight: + for (int oh = 0; oh < CONFIG_T::out_height; oh++) { + ConvOutWidth: + for (int ow = 0; ow < CONFIG_T::out_width; ow++) { + ConvFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + ConvChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + + int index_mult = oh * CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan + + ow * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + + int index_weight = cc * CONFIG_T::n_filt + ff; + + if ((oh * CONFIG_T::stride_height) < CONFIG_T::pad_top || + (oh * CONFIG_T::stride_height) >= (CONFIG_T::pad_top + CONFIG_T::in_height) || + (ow * CONFIG_T::stride_width) < CONFIG_T::pad_left || + (ow * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { + mult[index_mult] = 0; + } else { + int index_data = + (oh * CONFIG_T::stride_height - CONFIG_T::pad_top) * CONFIG_T::in_width * CONFIG_T::n_chan + + (ow * CONFIG_T::stride_width - CONFIG_T::pad_left) * 
CONFIG_T::n_chan + cc; + mult[index_mult] = data[index_data] * weights[index_weight]; + } + } + } + } + } + + // Initialize accumulator with input biases + for (int oh = 0; oh < CONFIG_T::out_height; oh++) { + for (int ow = 0; ow < CONFIG_T::out_width; ow++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + acc[oh * CONFIG_T::out_width * CONFIG_T::n_filt + ow * CONFIG_T::n_filt + ff] = biases[ff]; + } + } + } + +// Accumulate multiplication result +AccumOutHeight: + for (int oh = 0; oh < CONFIG_T::out_height; oh++) { + AccumOutWidth: + for (int ow = 0; ow < CONFIG_T::out_width; ow++) { + AccumFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Do "dot product" sum within filter and sum over channels + AccumChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + + int index_mult = oh * CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan + + ow * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + int index_acc = oh * CONFIG_T::out_width * CONFIG_T::n_filt + ow * CONFIG_T::n_filt + ff; + + acc[index_acc] += mult[index_mult]; + } + } + } + } + + // Cast to "res_t" type + for (int oh = 0; oh < CONFIG_T::out_height; oh++) { + for (int ow = 0; ow < CONFIG_T::out_width; ow++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + int index = oh * CONFIG_T::out_width * CONFIG_T::n_filt + ow * CONFIG_T::n_filt + ff; + res[index] = (res_T)(acc[index]); + } + } + } + +} // end conv2d + +} // namespace nnet +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_conv2d_resource.h b/hls4ml/templates/catapult/nnet_utils/nnet_conv2d_resource.h new file mode 100644 index 0000000000..c5e386b5e9 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_conv2d_resource.h @@ -0,0 +1,275 @@ +#ifndef NNET_CONV2D_RESOURCE_H_ +#define NNET_CONV2D_RESOURCE_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" + +namespace nnet { + +template +void im2col_2d(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + data_T 
data_col[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::out_height *
+                        CONFIG_T::out_width]) {
+    // Output spatial extent derived from padding/dilation/stride (standard conv output formula).
+    const int output_h = (CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom -
+                          (CONFIG_T::dilation_height * (CONFIG_T::filt_height - 1) + 1)) /
+                             CONFIG_T::stride_height +
+                         1;
+    const int output_w = (CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right -
+                          (CONFIG_T::dilation_width * (CONFIG_T::filt_width - 1) + 1)) /
+                             CONFIG_T::stride_width +
+                         1;
+    const int channel_size = CONFIG_T::in_height * CONFIG_T::in_width;
+
+    for (int channel = CONFIG_T::n_chan; channel--; data += channel_size) {
+        for (int kernel_row = 0; kernel_row < CONFIG_T::filt_height; kernel_row++) {
+            for (int kernel_col = 0; kernel_col < CONFIG_T::filt_width; kernel_col++) {
+                int input_row = -CONFIG_T::pad_top + kernel_row * CONFIG_T::dilation_height;
+                for (int output_rows = output_h; output_rows; output_rows--) {
+                    // FIX: was `input_row > CONFIG_T::in_height`, which let input_row == in_height
+                    // read one row past this channel's block (im2col_2d_cl uses >=).
+                    if (input_row < 0 || input_row >= CONFIG_T::in_height) {
+                        for (int output_cols = output_w; output_cols; output_cols--) {
+                            *(data_col++) = 0;
+                        }
+                    } else {
+                        int input_col = -CONFIG_T::pad_left + kernel_col * CONFIG_T::dilation_width;
+                        for (int output_col = output_w; output_col; output_col--) {
+                            if (input_col >= 0 && input_col < CONFIG_T::in_width) {
+                                *(data_col++) = data[input_row * CONFIG_T::in_width + input_col];
+                            } else {
+                                *(data_col++) = 0;
+                            }
+                            input_col += CONFIG_T::stride_width;
+                        }
+                    }
+                    input_row += CONFIG_T::stride_height;
+                }
+            }
+        }
+    }
+}
+
+// conv_2d_full: whole-image im2col into data_conv, then one dense call per output pixel.
+template <class data_T, class res_T, typename CONFIG_T> // NOTE(review): <...> reconstructed; lost in patch transfer — verify
+void conv_2d_full(
+    data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan],
+    res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt],
+    typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
+    typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+    data_T data_conv[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::out_height *
+                     CONFIG_T::out_width];
+    data_T data_col[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan];
+    res_T res_col[CONFIG_T::n_filt];
+
+    ////#pragma HLS ARRAY_PARTITION variable=data_conv complete
+    //#pragma HLS ARRAY_PARTITION variable=data_col complete
+    //#pragma HLS ARRAY_PARTITION variable=res_col complete
+
+    im2col_2d(data, data_conv);
+
+    for (int i = 0; i < CONFIG_T::out_height * CONFIG_T::out_width; i++) {
+        for (int j = 0; j < CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan; j++) {
+            // FIX: was `data[...]` — must gather from the im2col buffer, as conv_1d_full does;
+            // indexing `data` here reads wrong elements and can run past its bounds.
+            data_col[j] = data_conv[j * CONFIG_T::out_height * CONFIG_T::out_width + i];
+        }
+        dense(data_col, res_col, weights, biases); // NOTE(review): dense<...> template args also lost in transfer — restore
+        for (int j = 0; j < CONFIG_T::n_filt; j++) {
+            // res[i * CONFIG_T::n_filt + j] = res_col[j];
+            res[j * CONFIG_T::out_height * CONFIG_T::out_width + i] = res_col[j]; // Transposed order
+        }
+    }
+}
+
+// im2col_2d_cf: gather one output pixel's receptive field, channels-first layout.
+template <class data_T, typename CONFIG_T> // NOTE(review): <...> reconstructed; lost in patch transfer — verify
+void im2col_2d_cf(data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width],
+                  data_T data_col[CONFIG_T::n_chan * CONFIG_T::filt_height * CONFIG_T::filt_width], const int row,
+                  const int col) {
+    const int channel_size = CONFIG_T::in_height * CONFIG_T::in_width;
+    int index = 0;
+    for (int channel = CONFIG_T::n_chan; channel--; data += channel_size) {
+        for (int kernel_row = 0; kernel_row < CONFIG_T::filt_height; kernel_row++) {
+            int input_row = -CONFIG_T::pad_top + kernel_row * CONFIG_T::dilation_height + row * CONFIG_T::stride_height;
+            for (int kernel_col = 0; kernel_col < CONFIG_T::filt_width; kernel_col++) {
+                // FIX: was `input_row > CONFIG_T::in_height` — same off-by-one as im2col_2d above.
+                if (input_row < 0 || input_row >= CONFIG_T::in_height) {
+                    data_col[index++] = 0;
+                } else {
+                    int input_col =
+                        -CONFIG_T::pad_left + kernel_col * CONFIG_T::dilation_width + col * CONFIG_T::stride_width;
+                    if (input_col >= 0 && input_col < CONFIG_T::in_width) {
+                        //*(data_col++) = data[input_row * CONFIG_T::in_width + input_col];
+                        data_col[index++] = data[input_row * CONFIG_T::in_width + input_col];
+                    } else {
+                        //*(data_col++) = 0;
+                        data_col[index++] = 0;
+                    }
+                    input_col +=
CONFIG_T::stride_width; + } + } + input_row += CONFIG_T::stride_height; + } + } +} + +template +void conv_2d_resource_cf( + data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + const int nin = CONFIG_T::n_chan * CONFIG_T::filt_width; + const int nout = CONFIG_T::n_filt; + const int rufactor = CONFIG_T::reuse_factor; + const int block_factor = DIV_ROUNDUP(nin * nout, rufactor); + + ////#pragma HLS function_instantiate variable=weights,biases + ////#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose + /// correctly + ////#pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + ////#pragma HLS ARRAY_PARTITION variable=biases complete + + data_T data_col[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan]; + res_T res_col[CONFIG_T::n_filt]; + + //#pragma HLS ARRAY_PARTITION variable=data_col complete + //#pragma HLS ARRAY_PARTITION variable=res_col complete + +HeightLoop: + for (int i = 0; i < CONFIG_T::out_height; i++) { + WidthLoop: + for (int j = 0; j < CONFIG_T::out_width; j++) { + //#pragma HLS PIPELINE + im2col_2d_cf(data, data_col, i, j); + dense(data_col, res_col, weights, biases); + FiltLoop: + for (int k = 0; k < CONFIG_T::n_filt; k++) { + // res[i * CONFIG_T::out_width * CONFIG_T::n_filt + j * CONFIG_T::n_filt + k] = res_col[k]; + res[k * CONFIG_T::out_height * CONFIG_T::out_width + i * CONFIG_T::out_width + j] = + res_col[k]; // Transposed order + } + } + } +} + +template +void im2col_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + data_T data_col[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], const int row, + const int col) { + int index = 0; + for 
(int kernel_row = 0; kernel_row < CONFIG_T::filt_height; kernel_row++) { + int input_row = -CONFIG_T::pad_top + kernel_row * CONFIG_T::dilation_height + row * CONFIG_T::stride_height; + for (int kernel_col = 0; kernel_col < CONFIG_T::filt_width; kernel_col++) { + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + if (input_row < 0 || input_row >= CONFIG_T::in_height) { + data_col[index++] = 0; + } else { + int input_col = + -CONFIG_T::pad_left + kernel_col * CONFIG_T::dilation_width + col * CONFIG_T::stride_width; + if (input_col >= 0 && input_col < CONFIG_T::in_width) { + data_col[index++] = + data[input_row * CONFIG_T::in_width * CONFIG_T::n_chan + input_col * CONFIG_T::n_chan + channel]; + } else { + data_col[index++] = 0; + } + } + } + } + } +} + +template +void im2col_2d_pointwise_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + data_T data_col[CONFIG_T::n_chan], const int row, const int col) { + int index = 0; + int input_row = -CONFIG_T::pad_top + row * CONFIG_T::stride_height; + +ChannelLoop: + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + if (input_row < 0 || input_row >= CONFIG_T::in_height) { + data_col[index++] = 0; + } else { + int input_col = -CONFIG_T::pad_left + col * CONFIG_T::stride_width; + if (input_col >= 0 && input_col < CONFIG_T::in_width) { + data_col[index++] = + data[input_row * CONFIG_T::in_width * CONFIG_T::n_chan + input_col * CONFIG_T::n_chan + channel]; + } else { + data_col[index++] = 0; + } + } + } +} + +template +void conv_2d_resource_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + const int nin = CONFIG_T::n_chan * CONFIG_T::filt_width; + const int nout = CONFIG_T::n_filt; + const int 
rufactor = CONFIG_T::reuse_factor; + const int block_factor = DIV_ROUNDUP(nin * nout, rufactor); + + ////#pragma HLS function_instantiate variable=weights,biases + ////#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose + /// correctly + ////#pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + ////#pragma HLS ARRAY_PARTITION variable=biases complete + + data_T data_col[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan]; + res_T res_col[CONFIG_T::n_filt]; + + //#pragma HLS ARRAY_PARTITION variable=data_col complete + //#pragma HLS ARRAY_PARTITION variable=res_col complete + +HeightLoop: + for (int i = 0; i < CONFIG_T::out_height; i++) { + WidthLoop: + for (int j = 0; j < CONFIG_T::out_width; j++) { + //#pragma HLS PIPELINE + im2col_2d_cl(data, data_col, i, j); + dense(data_col, res_col, weights, biases); + FiltLoop: + for (int k = 0; k < CONFIG_T::n_filt; k++) { + res[i * CONFIG_T::out_width * CONFIG_T::n_filt + j * CONFIG_T::n_filt + k] = res_col[k]; + } + } + } +} + +template +void pointwise_conv_2d_resource_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_height == 1 && CONFIG_T::filt_width == 1); + + const int nin = CONFIG_T::n_chan; + const int nout = CONFIG_T::n_filt; + const int rufactor = CONFIG_T::reuse_factor; + const int block_factor = DIV_ROUNDUP(nin * nout, rufactor); + + ////#pragma HLS function_instantiate variable=weights,biases + ////#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose + /// correctly + ////#pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + ////#pragma HLS ARRAY_PARTITION variable=biases complete + + data_T 
data_col[CONFIG_T::n_chan]; + res_T res_col[CONFIG_T::n_filt]; + + //#pragma HLS ARRAY_PARTITION variable=data_col complete + //#pragma HLS ARRAY_PARTITION variable=res_col complete + +HeightLoop: + for (int i = 0; i < CONFIG_T::out_height; i++) { + WidthLoop: + for (int j = 0; j < CONFIG_T::out_width; j++) { + //#pragma HLS PIPELINE + im2col_2d_pointwise_cl(data, data_col, i, j); + dense(data_col, res_col, weights, biases); + FiltLoop: + for (int k = 0; k < CONFIG_T::n_filt; k++) { + res[i * CONFIG_T::out_width * CONFIG_T::n_filt + j * CONFIG_T::n_filt + k] = res_col[k]; + } + } + } +} + +} // namespace nnet +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_conv2d_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_conv2d_stream.h new file mode 100644 index 0000000000..7e76be12a9 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_conv2d_stream.h @@ -0,0 +1,117 @@ +#ifndef NNET_CONV2D_STREAM_H_ +#define NNET_CONV2D_STREAM_H_ + +#include "ac_channel.h" +#include "ap_shift_reg.h" +#include "nnet_common.h" +#include "nnet_conv_stream.h" + +namespace nnet { + +template +void compute_scaled_indices_2d(const unsigned h_idx, const unsigned w_idx, + ac_int *pixel_idx) { + const unsigned sh_idx = CONFIG_T::template scale_index_height::scale_index(h_idx); + unsigned wp_idx = w_idx * (data_T::size / CONFIG_T::n_chan); + +ComputeIndex: + for (unsigned p = 0; p < data_T::size / CONFIG_T::n_chan; p++) { + // #pragma HLS UNROLL + + unsigned sw_idx = CONFIG_T::template scale_index_width::scale_index(wp_idx + p); + pixel_idx[p] = CONFIG_T::pixels[sh_idx * CONFIG_T::min_width + sw_idx]; + } +} + +template +void conv_2d_encoded_cl( + ac_channel &data, ac_channel &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); 
+ assert(CONFIG_T::filt_height == CONFIG_T::filt_width); + + ac_channel data_window[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan]; + const int win_depth = CONFIG_T::filt_height * CONFIG_T::out_width; + for (unsigned i_out = 0; i_out < CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan; i_out++) { + //#pragma HLS STREAM variable=data_window[i_out] depth=win_depth + } + + //#pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels complete + + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + unsigned outputs_ready = 0; + + ac_int pixel_idx[data_T::size / CONFIG_T::n_chan]; + //#pragma HLS ARRAY_PARTITION variable=pixel_idx complete + + constexpr int ce_reuse_factor = + CONFIG_T::reuse_factor * (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1); + (void)ce_reuse_factor; +ReadInputHeight: + for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_chan); i_iw++) { + //#pragma HLS LOOP_FLATTEN + if (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + compute_scaled_indices_2d(i_ih, i_iw, pixel_idx); + compute_output_encoded(data.read(), data_window, res, res_pack, outputs_ready, weights, + biases, pixel_idx); + } + } +} + +// Line Buffer +template +void conv_2d_buffer_cl( + ac_channel &data, ac_channel &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + static ap_shift_reg line_buffer[MAX(CONFIG_T::filt_height - 1, 1)] + [CONFIG_T::n_chan]; + //#pragma HLS ARRAY_PARTITION variable = line_buffer complete dim = 2 + + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor 
* (CONFIG_T::strategy == nnet::latency); + (void)ce_reuse_factor; +ReadInputHeight: + for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + //#pragma HLS LOOP_FLATTEN + if (CONFIG_T::strategy == nnet::latency) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + if (CONFIG_T::filt_height > 1) { + compute_output_buffer_2d(data.read(), line_buffer, res, weights, biases); + } else { + compute_output_buffer_1d(data.read(), res, weights, biases); + } + } + } +} + +template +void conv_2d_cl( + ac_channel &data, ac_channel &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + //#pragma HLS inline region + switch (CONFIG_T::implementation) { + case conv_implementation::linebuffer: + conv_2d_buffer_cl(data, res, weights, biases); + break; + case conv_implementation::encoded: + conv_2d_encoded_cl(data, res, weights, biases); + break; + } +} + +} // namespace nnet +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_conv_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_conv_stream.h new file mode 100644 index 0000000000..4d92cbf69f --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_conv_stream.h @@ -0,0 +1,398 @@ +#ifndef NNET_CONV_STREAM_H_ +#define NNET_CONV_STREAM_H_ + +#include "ac_channel.h" +#include "ap_shift_reg.h" +#include "nnet_common.h" +#include "nnet_dense.h" + +namespace nnet { + +enum class conv_implementation { linebuffer = 0, encoded = 1 }; + +// ************************************************* +// Encoded Implementation (Vlad's) +// ************************************************* +template unsigned scale_index_K_gte_S(const unsigned idx) { + //#pragma HLS INLINE + + if (idx < K - S) { + return idx; + } + + constexpr unsigned nW = ((W - K) / S) * S + K; // Nearest W without unused pixels on the right + 
constexpr unsigned sW = (DIV_ROUNDUP(K, S) - 1) * S + K; // Scaled W that behaves like original W + if (idx >= nW) { + return sW; + } + + const unsigned r = nW - idx; + if (r <= K - S) { + return sW - r; + } + + return K - S + (idx - (K - S)) % S; +} + +template unsigned scale_index_K_lt_S(const unsigned idx) { + //#pragma HLS INLINE + + if (idx < S - K) { + return idx; + } + + constexpr unsigned nW = ((W - K) / S) * S + K; // Nearest W without unused pixels on the right + constexpr unsigned sW = (DIV_ROUNDUP(S, K) - 1) * S + K; // Scaled W that behaves like original W + if (idx >= nW) { + return sW; + } + + const unsigned r = nW - idx; + if (r <= S - K) { + return sW - r; + } + + return S - K + (idx - (S - K)) % S; +} + +template class scale_index_regular { + public: + static unsigned scale_index(const unsigned idx) { + // #pragma HLS INLINE + + if (K >= S) { + return scale_index_K_gte_S(idx); + } else { + return scale_index_K_lt_S(idx); + } + } +}; + +template class scale_index_unscaled { + public: + static unsigned scale_index(const unsigned idx) { + // #pragma HLS INLINE + return idx; + } +}; + +template +void mult_buffer(ac_channel data_window[CONFIG_T::kernel_size * CONFIG_T::n_chan], + res_T &res_pack, ac_channel &res_stream, unsigned &outputs_ready, + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + //#pragma HLS INLINE + + typename data_T::value_type data[CONFIG_T::kernel_size * CONFIG_T::n_chan]; + //#pragma HLS ARRAY_PARTITION variable=data complete + typename res_T::value_type res[CONFIG_T::n_filt]; + //#pragma HLS ARRAY_PARTITION variable=res complete + +InitData: + for (unsigned int id = 0; id < CONFIG_T::kernel_size * CONFIG_T::n_chan; id++) { + // #pragma HLS UNROLL + data[id] = data_window[id].read(); + } + + //#pragma HLS INLINE region + if (CONFIG_T::strategy == nnet::latency) { + dense_latency( + data, res, weights, biases); + } else { + 
dense_resource( + data, res, weights, biases); + } + +CastLoop: + for (unsigned jj = 0; jj < CONFIG_T::n_filt; jj++) { + // #pragma HLS UNROLL + if (res_T::size / CONFIG_T::n_filt == 1) { + res_pack[jj] = res[jj]; + } else { + res_pack[outputs_ready * CONFIG_T::n_filt + jj] = res[jj]; + } + } + + if (res_T::size / CONFIG_T::n_filt == 1) { + res_stream.write(res_pack); + } else { + if (outputs_ready == (res_T::size / CONFIG_T::n_filt) - 1) { + res_stream.write(res_pack); + outputs_ready = 0; + } else { + outputs_ready++; + } + } +} + +template +void compute_output_encoded(const data_T &in_elem, + ac_channel data_window[CONFIG_T::kernel_size * CONFIG_T::n_chan], + ac_channel &res, res_T &res_pack, unsigned &outputs_ready, + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt], + ac_int *pixel_idx) { + //#pragma HLS INLINE + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; + (void)ce_reuse_factor; +MultLoop: + for (unsigned p = 0; p < data_T::size / CONFIG_T::n_chan; p++) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + CopyDataFilt: + for (unsigned f = 0; f < CONFIG_T::kernel_size; f++) { + // #pragma HLS UNROLL + CopyDataChan: + for (unsigned c = 0; c < CONFIG_T::n_chan; c++) { + // #pragma HLS UNROLL + if (pixel_idx[p][f]) + data_window[f * CONFIG_T::n_chan + c].write(in_elem[p * CONFIG_T::n_chan + c]); + } + } + if (pixel_idx[p][CONFIG_T::kernel_size - 1]) { + mult_buffer(data_window, res_pack, res, outputs_ready, weights, biases); + } + } +} + +// ************************************************* +// Line Buffer Implementation (Phil's) +// ************************************************* +template +void kernel_shift_1d(const data_T &in_elem, + typename data_T::value_type kernel_window[CONFIG_T::filt_width * CONFIG_T::n_chan]) { + //#pragma HLS inline + //#pragma HLS PIPELINE II = 1 + + // Shift kernel_window by one step to the left (manual shift 
operation) + static const int filt_width = CONFIG_T::filt_width - 1; +KernelShiftWidth: + for (int i_iw = 0; i_iw < filt_width; i_iw++) { + // #pragma HLS PIPELINE II = 1 + KernelShiftChannel: + for (unsigned i_ic = 0; i_ic < CONFIG_T::n_chan; i_ic++) { + // #pragma HLS UNROLL + // Shift every element in kernel_window to the left + kernel_window[i_iw * CONFIG_T::n_chan + i_ic] = kernel_window[(i_iw + 1) * CONFIG_T::n_chan + i_ic]; + } + } + + // Insert shift_buffer column into right-most column of kernel + static const int lastheight = (CONFIG_T::filt_width - 1) * CONFIG_T::n_chan; +KernelPushChannel: + for (unsigned int i_ic = 0; i_ic < CONFIG_T::n_chan; i_ic++) { + // #pragma HLS UNROLL + kernel_window[lastheight + i_ic] = in_elem[i_ic]; + } +} + +template +void kernel_shift_2d( + typename data_T::value_type shift_buffer[CONFIG_T::filt_height][CONFIG_T::n_chan], + typename data_T::value_type kernel_window[CONFIG_T::filt_width * CONFIG_T::filt_height * CONFIG_T::n_chan]) { + //#pragma HLS inline + + // Shift kernel_window by one step to the left (manual shift operation) + static const int filt_width = CONFIG_T::filt_width - 1; +KernelShiftWidth: + for (int i_iw = 0; i_iw < filt_width; i_iw++) { + //#pragma HLS PIPELINE II = 1 + KernelShiftHeight: + for (unsigned i_ih = 0; i_ih < CONFIG_T::filt_height; i_ih++) { + KernelShiftChannel: + for (unsigned i_ic = 0; i_ic < CONFIG_T::n_chan; i_ic++) { + // Shift every element in kernel_window to the left + kernel_window[i_ih * CONFIG_T::filt_width * CONFIG_T::n_chan + i_iw * CONFIG_T::n_chan + i_ic] = + kernel_window[i_ih * CONFIG_T::filt_width * CONFIG_T::n_chan + (i_iw + 1) * CONFIG_T::n_chan + i_ic]; + } + } + } + + // Insert shift_buffer column into right-most column of kernel + static const int lastheight = (CONFIG_T::filt_width - 1) * CONFIG_T::n_chan; +KernelPushHeight: + for (unsigned int i_ih = 0; i_ih < CONFIG_T::filt_height; i_ih++) { + KernelPushChannel: + for (unsigned int i_ic = 0; i_ic < CONFIG_T::n_chan; 
i_ic++) { + kernel_window[lastheight + i_ih * CONFIG_T::filt_width * CONFIG_T::n_chan + i_ic] = shift_buffer[i_ih][i_ic]; + } + } +} + +template +void shift_line_buffer( + const data_T &in_elem, + ap_shift_reg line_buffer[MAX(CONFIG_T::filt_height - 1, 1)] + [CONFIG_T::n_chan], + typename data_T::value_type kernel_window[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan]) { + + //#pragma HLS PIPELINE + + // Temporary buffer for popped (shifted) elements + typename data_T::value_type shift_buffer[CONFIG_T::filt_height][CONFIG_T::n_chan]; + //#pragma HLS ARRAY_PARTITION variable = shift_buffer complete dim = 0 + +UpdateBuffer: + for (unsigned int i_ic = 0; i_ic < CONFIG_T::n_chan; i_ic++) { + // #pragma HLS UNROLL + + // Insert pixel(s) at end of shift buffer + shift_buffer[CONFIG_T::filt_height - 1][i_ic] = in_elem[i_ic]; + } + +LineBufferDataIn: + for (unsigned int i_ic = 0; i_ic < CONFIG_T::n_chan; i_ic++) { + // Shift the shift buffer into the line buffer + LineBufferShift: + for (unsigned i_ih = 1; i_ih < CONFIG_T::filt_height; i_ih++) { + // #pragma HLS UNROLL + typename data_T::value_type pop_elem = line_buffer[i_ih - 1][i_ic].shift( + shift_buffer[CONFIG_T::filt_height - i_ih][i_ic]); // Shift the line buffer, return the popped pixel + shift_buffer[CONFIG_T::filt_height - i_ih - 1][i_ic] = + pop_elem; // Popped element placed back into shift_buffer, one row up. 
+ } + } + kernel_shift_2d(shift_buffer, kernel_window); +} + +template +void compute_output_buffer_2d( + const data_T &in_elem, + ap_shift_reg line_buffer[MAX(CONFIG_T::filt_height - 1, 1)] + [CONFIG_T::n_chan], + ac_channel &res_stream, + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + //#pragma HLS INLINE + + // Thresholds + const static int lShiftX = CONFIG_T::filt_width - 1; + const static int lShiftY = CONFIG_T::filt_height - 1; + + // Counters + static int pX = 0; // Pixel X + static int pY = 0; // Pixel Y + + static int sX = 0; // Stride X + static int sY = 0; // Stride Y + + static typename data_T::value_type kernel_data[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan]; + //#pragma HLS ARRAY_PARTITION variable=kernel_data complete + + typename res_T::value_type res_out[CONFIG_T::n_filt]; + //#pragma HLS ARRAY_PARTITION variable=res_out complete dim = 0 + + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + + // Add pixel to buffer + nnet::shift_line_buffer(in_elem, line_buffer, kernel_data); + + // Check to see if we have a full kernel + if ((sX - lShiftX) == 0 && (sY - lShiftY) == 0 && pY > lShiftY - 1 && pX > lShiftX - 1) { + + // Dense multiply + //#pragma HLS INLINE region + if (CONFIG_T::strategy == nnet::latency) { + dense_latency( + kernel_data, res_out, weights, biases); + } else { + dense_resource( + kernel_data, res_out, weights, biases); + } + + // Pack output + CastLoop: + for (unsigned i_ic = 0; i_ic < CONFIG_T::n_filt; i_ic++) { + // #pragma HLS UNROLL + res_pack[i_ic] = res_out[i_ic]; + } + + // Write output to stream when output ready + res_stream.write(res_pack); + } + + // Counter Housekeeping + if (pX + 1 == CONFIG_T::in_width) // Includes padding, end of line (padded) + { + pX = 0; + sX = 0; + if (pY + 1 == CONFIG_T::in_height) { // Reached bottom of image + pY = 0; + sY = 0; + } else { + pY = pY + 1; + 
// Update stride (threshold) ? subtract stride : increment stride + sY = ((sY - lShiftY) == 0) ? sY - CONFIG_T::stride_height + 1 : sY + 1; + } + } else { + pX = pX + 1; + // Update stride (threshold) ? subtract stride : increment stride + sX = ((sX - lShiftX) == 0) ? sX - CONFIG_T::stride_width + 1 : sX + 1; + } +} + +// Conv 1D compute output +template +void compute_output_buffer_1d( + const data_T &in_elem, ac_channel &res_stream, + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + //#pragma HLS INLINE + + // Thresholds + const static int lShiftX = CONFIG_T::filt_width - 1; + + // Counters + static int pX = 0; // pixel counter + static int sX = 0; // stride counter + + static typename data_T::value_type kernel_data[CONFIG_T::filt_width * CONFIG_T::n_chan]; + //#pragma HLS ARRAY_PARTITION variable=kernel_data complete + + typename res_T::value_type res_out[CONFIG_T::n_filt]; + //#pragma HLS ARRAY_PARTITION variable=res_out complete dim = 0 + + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + + // Add pixel to buffer + nnet::kernel_shift_1d(in_elem, kernel_data); + + // Check to see if we have a full kernel + if ((sX - lShiftX) == 0 && pX > lShiftX - 1) { + + // Dense multiply + //#pragma HLS INLINE region + if (CONFIG_T::strategy == nnet::latency) { + dense_latency( + kernel_data, res_out, weights, biases); + } else { + dense_resource( + kernel_data, res_out, weights, biases); + } + + // Pack output + CastLoop: + for (unsigned i_ic = 0; i_ic < CONFIG_T::n_filt; i_ic++) { + // #pragma HLS UNROLL + res_pack[i_ic] = res_out[i_ic]; + } + + // Write output to stream when output ready + res_stream.write(res_pack); + } + + // Counter Housekeeping + if (pX + 1 == CONFIG_T::in_width) // Includes padding, end of line (padded) + { + pX = 0; + sX = 0; + } else { + pX = pX + 1; + // Update stride (threshold) ? 
subtract stride : increment stride + sX = ((sX - lShiftX) == 0) ? sX - CONFIG_T::stride_width + 1 : sX + 1; + } +} + +} // namespace nnet +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_dense.h b/hls4ml/templates/catapult/nnet_utils/nnet_dense.h new file mode 100644 index 0000000000..64b927cc64 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_dense.h @@ -0,0 +1,49 @@ +#ifndef NNET_DENSE_H_ +#define NNET_DENSE_H_ + +#include "ac_channel.h" +#include "nnet_common.h" +#include "nnet_dense_latency.h" +#include "nnet_dense_resource.h" +#include "nnet_helpers.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +struct dense_config { + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; + + // Layer Sizes + static const unsigned n_in = 10; + static const unsigned n_out = 10; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned strategy = latency; + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + static const unsigned n_zeros = 0; + // partitioning arrays cyclically to go with roll factors? 
+ // Product function to use + template using product = nnet::product::mult; +}; + +template +void dense(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + //#pragma HLS inline + if (CONFIG_T::strategy == nnet::latency) { + dense_latency(data, res, weights, biases); + } else { + dense_resource(data, res, weights, biases); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_dense_compressed.h b/hls4ml/templates/catapult/nnet_utils/nnet_dense_compressed.h new file mode 100644 index 0000000000..f3f27b6db8 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_dense_compressed.h @@ -0,0 +1,106 @@ +// +// hls4ml: Vivado HLS code for neural-net building blocks +// +// Copyright (C) 2018 Giuseppe Di Guglielmo +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . 
+// + +#ifndef NNET_COMPRESSED_LAYER_H_ +#define NNET_COMPRESSED_LAYER_H_ + +#include "ac_channel.h" +#include "nnet_common.h" +#include "nnet_dense.h" +#include + +namespace nnet { + +template +void fill_mult(typename CONFIG_T::index_t index, typename CONFIG_T::accum_t mult[CONFIG_T::n_out], + typename CONFIG_T::accum_t weight) { + for (unsigned k = 0; k < CONFIG_T::n_out; k++) { + // #pragma HLS UNROLL + if (k == index) + mult[k] += weight; + } +} + +template +void dense_compressed(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_nonzeros], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_nonzeros, CONFIG_T::reuse_factor); + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + //#pragma HLS ARRAY_PARTITION variable=acc complete + //#pragma HLS ARRAY_PARTITION variable=biases complete + //#pragma HLS ARRAY_RESHAPE variable=weights block factor=multiplier_limit + // if (CONFIG_T::store_weights_in_bram){ + ////#pragma HLS RESOURCE variable=weights core=ROM_1P_BRAM + //#pragma HLS data_pack variable=weights struct_level + //} + +InitAccum: + for (unsigned i = 0; i < CONFIG_T::n_out; i++) { + acc[i] = (typename CONFIG_T::accum_t)(biases[i]); + } + + // Do the compressed matrix-multiply + const int rufactor = CONFIG_T::reuse_factor; +ReuseLoop: + for (unsigned ir = 0; ir < rufactor; ir++) { + //#pragma HLS PIPELINE II=1 rewind + + typename CONFIG_T::accum_t mult[CONFIG_T::n_out]; + //#pragma HLS ARRAY_PARTITION variable=mult complete + + ResetMult: + for (int imult = 0; imult < CONFIG_T::n_out; imult++) { + // #pragma HLS UNROLL + mult[imult] = 0; + } + + CompressedMultLoop: + for (unsigned im = 0; im < multiplier_limit; im++) { + // #pragma HLS UNROLL + unsigned w = im * rufactor + ir; + auto row = weights[w].row_index; + auto col = weights[w].col_index; + auto weight_cache = weights[w].weight; + data_T data_cache = data[row]; + // mult[col] += 
weight_cache * data_cache; + typename CONFIG_T::accum_t prod = + CONFIG_T::template product::product(data_cache, weight_cache); + fill_mult(col, mult, prod); + } + + for (int im = 0; im < CONFIG_T::n_out; im++) { + acc[im] += mult[im]; + } + } + +// Cast to "res_t" type +ResultLoop: + for (unsigned i = 0; i < CONFIG_T::n_out; i++) { + // #pragma HLS UNROLL + // res[i] = (res_T) (acc[i]); + res[i] = cast(acc[i]); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_dense_latency.h b/hls4ml/templates/catapult/nnet_utils/nnet_dense_latency.h new file mode 100644 index 0000000000..40e5cd2b9d --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_dense_latency.h @@ -0,0 +1,92 @@ + +#ifndef NNET_DENSE_LATENCY_H_ +#define NNET_DENSE_LATENCY_H_ + +#include "ac_channel.h" +#include "nnet_common.h" +#include "nnet_helpers.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +template +void dense_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; + // Partial unroll config + constexpr int prod1_unroll = + (ce_reuse_factor < CONFIG_T::n_in) ? 
CONFIG_T::n_in : (int)(CONFIG_T::n_in * CONFIG_T::n_out) / ce_reuse_factor; + constexpr int prod2_unroll = (int)CONFIG_T::n_out / ce_reuse_factor; + + (void)ce_reuse_factor; // to silence compiler warnings + (void)prod1_unroll; + (void)prod2_unroll; + + // For Catapult, add an extra scope so that we can apply the pipeline pragma as if it applied to the function + do { + data_T cache; + typename CONFIG_T::accum_t mult[CONFIG_T::n_in * CONFIG_T::n_out]; + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + //#pragma HLS function_instantiate variable=weights,biases + + // For parallel inputs: + // - completely partition arrays -- target fabric + // - if we have an unroll factor, limit number of multipliers + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // //#pragma HLS ARRAY_PARTITION variable=weights complete // remove this line for now, it breaks compression + // sometimes + //#pragma HLS ARRAY_PARTITION variable=biases complete + //#pragma HLS ARRAY_PARTITION variable=mult complete + //#pragma HLS ARRAY_PARTITION variable=acc complete + + // int multiplier_limit = ceil(float(CONFIG_T::n_in*CONFIG_T::n_out) / float(CONFIG_T::reuse_factor)) - + // floor(float(CONFIG_T::n_zeros) / float(CONFIG_T::reuse_factor)); + constexpr int multiplier_limit = + ((CONFIG_T::n_in * CONFIG_T::n_out) / CONFIG_T::reuse_factor) - CONFIG_T::n_zeros / CONFIG_T::reuse_factor; + CONFIG_T::template product::limit(multiplier_limit); + + // Do the matrix-multiply + Product1: + for (unsigned int ii = 0; ii < CONFIG_T::n_in; ii++) { + cache = data[ii]; + Product2: + for (unsigned int jj = 0; jj < CONFIG_T::n_out; jj++) { + int index = ii * CONFIG_T::n_out + jj; + mult[index] = + CONFIG_T::template product::product(cache, weights[index]); + } + } + + // Initialize accumulator with input biases + ResetAccum: + for (unsigned int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { + acc[iacc] = 
(typename CONFIG_T::accum_t)biases[iacc]; + } + + // Accumulate multiplication result + Accum1: + for (unsigned int ii = 0; ii < CONFIG_T::n_in; ii++) { + Accum2: + for (unsigned int jj = 0; jj < CONFIG_T::n_out; jj++) { + int index = ii * CONFIG_T::n_out + jj; + acc[jj] += mult[index]; + } + } + + // Cast to "res_t" type + Result: + for (unsigned int ires = 0; ires < CONFIG_T::n_out; ires++) { + // res[ires] = (res_T) (acc[ires]); + res[ires] = cast(acc[ires]); + } + } while (false); // one iteration loop +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_dense_resource.h b/hls4ml/templates/catapult/nnet_utils/nnet_dense_resource.h new file mode 100644 index 0000000000..5bcd1a54b7 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_dense_resource.h @@ -0,0 +1,262 @@ + +#ifndef NNET_DENSE_RESOURCE_H_ +#define NNET_DENSE_RESOURCE_H_ + +#include "ac_channel.h" +#include "nnet_common.h" +#include "nnet_mult.h" +#include +#include + +namespace nnet { + +template +void dense_resource_rf_leq_nin(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + const int rufactor = CONFIG_T::reuse_factor; + const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor); + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, multfactor); + const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); + const int multscale = multiplier_limit / CONFIG_T::n_out; + const int nin = CONFIG_T::n_in; + const int nout = CONFIG_T::n_out; + + assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); + assert((multiplier_limit == block_factor) && "This function is correct only for RF <= N_IN"); + + //#pragma HLS function_instantiate variable=weights,biases + ////#pragma HLS RESOURCE variable=weights 
core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly + //#pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + //#pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + //#pragma HLS ARRAY_PARTITION variable=acc complete + +InitAccum: + for (int iacc = 0; iacc < nout; iacc++) { + //#pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + +ReuseLoop: + for (int ir = 0; ir < rufactor; ir++) { + //#pragma HLS PIPELINE II=1 rewind + + int w_index = ir; + int in_index = ir; + int out_index = 0; + int acc_step = 0; + + MultLoop: + for (int im = 0; im < block_factor; im++) { + //#pragma HLS UNROLL + + acc[out_index] += static_cast( + CONFIG_T::template product::product(data[in_index], weights[w_index])); + + // Increment w_index + w_index += rufactor; + // Increment in_index + in_index += rufactor; + if (in_index >= nin) { + in_index = ir; + } + // Increment out_index + if (acc_step + 1 >= multscale) { + acc_step = 0; + out_index++; + } else { + acc_step++; + } + } + } + +// Cast to "res_t" type +Result: + for (unsigned int ires = 0; ires < CONFIG_T::n_out; ires++) { + //#pragma HLS UNROLL + res[ires] = cast(acc[ires]); + } +} + +template +void dense_resource_rf_gt_nin_rem0(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + const int rufactor = MIN(CONFIG_T::reuse_factor, CONFIG_T::n_in * CONFIG_T::n_out); + const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor); + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, multfactor); + const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); + const int nin = CONFIG_T::n_in; + const int nout = CONFIG_T::n_out; + + assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is 
not allowed"); + assert((rufactor > nin && rufactor % nin == 0) && "This function is correct only for RF > N_IN && RF % N_IN == 0"); + + //#pragma HLS function_instantiate variable=weights,biases + ////#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly + //#pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + //#pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + //#pragma HLS ARRAY_PARTITION variable=acc complete + +InitAccum: + for (int iacc = 0; iacc < nout; iacc++) { + //#pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + + unsigned int w_index; + int in_index = 0; + int out_index; + int outstep = 0; + const int outscale = rufactor / nin; + + int outidx[rufactor]; +IndexLoop: + for (int ir = 0; ir < rufactor; ir++) { + outidx[ir] = outstep; + if ((ir + 1) % nin == 0) { + outstep++; + } + } + +ReuseLoop: + for (unsigned int ir = 0; ir < rufactor; ir++) { + //#pragma HLS PIPELINE II=1 rewind + + w_index = ir; + out_index = outidx[ir] /*outstep*/; + + MultLoop: + for (unsigned int im = 0; im < block_factor; im++) { + //#pragma HLS UNROLL + acc[out_index] += static_cast( + CONFIG_T::template product::product(data[in_index], weights[w_index])); + + w_index += rufactor; + if (w_index >= CONFIG_T::n_in * CONFIG_T::n_out) + break; // check out of bounds + out_index += outscale; + } + + in_index++; + if (in_index >= nin) { + in_index = 0; + // outstep++; // This causes a huge increase in scheduling and RTL generation times, hence the above workaround. 
+ } + } + +// Cast to "res_t" type +Result: + for (unsigned int ires = 0; ires < CONFIG_T::n_out; ires++) { + //#pragma HLS UNROLL + res[ires] = cast(acc[ires]); + } +} + +template +void dense_resource_rf_gt_nin(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + const int rufactor = CONFIG_T::reuse_factor; + const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor); + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, multfactor); + const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); + const int nin = CONFIG_T::n_in; + const int nout = CONFIG_T::n_out; + + assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); + assert((rufactor > nin) && "This function is correct only for RF > N_IN"); + + //#pragma HLS function_instantiate variable=weights,biases + ////#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly + //#pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + //#pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + //#pragma HLS ARRAY_PARTITION variable=acc complete + +InitAccum: + for (int iacc = 0; iacc < nout; iacc++) { + //#pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + +ReuseLoop: + for (int ir = 0; ir < rufactor; ir++) { + //#pragma HLS PIPELINE II=1 rewind + typename CONFIG_T::accum_t tmpmult[block_factor]; + //#pragma HLS ARRAY_PARTITION variable=tmpmult complete + + MultLoop: + for (int im = 0; im < block_factor; im++) { + //#pragma HLS UNROLL + unsigned int w_index = ir + rufactor * im; + int in_index = w_index % nin; + if (w_index >= CONFIG_T::n_in * CONFIG_T::n_out) + continue; // check out of bounds + tmpmult[im] = + 
CONFIG_T::template product::product(data[in_index], weights[w_index]); + } + + typename CONFIG_T::accum_t mult[multiplier_limit]; + //#pragma HLS ARRAY_PARTITION variable=mult complete + + ResetMult: + for (int imult = 0; imult < multiplier_limit; imult++) { + //#pragma HLS UNROLL + mult[imult] = 0; + } + + AccumLoop1: + for (int im = 0; im < block_factor; im++) { + //#pragma HLS UNROLL + int w_index = ir + rufactor * im; + int out_index = w_index / multfactor; + if (out_index >= multiplier_limit) + continue; // check out of bounds + mult[out_index] += tmpmult[im]; + } + + AccumLoop2: + for (int im = 0; im < multiplier_limit; im++) { + //#pragma HLS UNROLL + // int out_index = im/multscale; // This is the general case + // acc[out_index] += mult[im]; + acc[im] += mult[im]; // If RF > N_IN then multiplier_limit == n_out + } + } + +// Cast to "res_t" type +Result: + for (unsigned int ires = 0; ires < CONFIG_T::n_out; ires++) { + //#pragma HLS UNROLL + res[ires] = cast(acc[ires]); + } +} + +template +void dense_resource(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + //#pragma HLS INLINE region + + if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { + dense_resource_rf_leq_nin(data, res, weights, biases); + } else if (CONFIG_T::reuse_factor % CONFIG_T::n_in == 0) { + dense_resource_rf_gt_nin_rem0(data, res, weights, biases); + } else { + dense_resource_rf_gt_nin(data, res, weights, biases); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_dense_stream.h new file mode 100644 index 0000000000..665d2f43f3 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_dense_stream.h @@ -0,0 +1,72 @@ +#ifndef NNET_DENSE_STREAM_H_ +#define NNET_DENSE_STREAM_H_ + +#include "ac_channel.h" +#include "nnet_common.h" +#include "nnet_types.h" 
+#include +#include + +namespace nnet { + +template +void dense_wrapper(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + //#pragma HLS INLINE region + if (CONFIG_T::strategy == nnet::latency) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + dense_latency(data, res, weights, biases); + } else { + dense_resource(data, res, weights, biases); + } +} + +template +void dense(ac_channel &data_stream, ac_channel &res_stream, + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + typename data_T::value_type data[CONFIG_T::n_in]; + //#pragma HLS ARRAY_PARTITION variable=data complete + + typename res_T::value_type res[CONFIG_T::n_out]; + //#pragma HLS ARRAY_PARTITION variable=res complete + + if ((CONFIG_T::n_in / data_T::size) > 1) { + } +DataPrepare: + for (unsigned int i_in = 0; i_in < CONFIG_T::n_in / data_T::size; i_in++) { + if (CONFIG_T::n_in / data_T::size > 1) { + //#pragma HLS PIPELINE + } + data_T data_pack = data_stream.read(); + DataPack: + for (unsigned int i_pack = 0; i_pack < data_T::size; i_pack++) { + //#pragma HLS UNROLL + data[i_in * data_T::size + i_pack] = data_pack[i_pack]; + } + } + + dense_wrapper(data, res, weights, biases); + + if ((CONFIG_T::n_out / res_T::size) > 1) { + } +ResWrite: + for (unsigned i_out = 0; i_out < CONFIG_T::n_out / res_T::size; i_out++) { + if (CONFIG_T::n_out / res_T::size > 1) { + //#pragma HLS PIPELINE + } + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + ResPack: + for (unsigned int i_pack = 0; i_pack < res_T::size; i_pack++) { + //#pragma HLS UNROLL + res_pack[i_pack] = res[i_out * res_T::size + i_pack]; + } + res_stream.write(res_pack); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_embed.h b/hls4ml/templates/catapult/nnet_utils/nnet_embed.h new 
file mode 100644 index 0000000000..4cdf507f9d --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_embed.h @@ -0,0 +1,47 @@ +#ifndef NNET_EMBED_H_ +#define NNET_EMBED_H_ + +#include "nnet_common.h" +#include "nnet_helpers.h" + +namespace nnet { + +struct embed_config { + // Internal data type definitions + typedef float embeddings_t; + + // Layer Sizes + static const unsigned n_in = 10; + static const unsigned n_out = 16; + static const unsigned vocab_size = 50; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; +}; + +template +void embedding(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::embeddings_t embeddings[CONFIG_T::vocab_size * CONFIG_T::n_out]) { + + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + // This can save a few cycles, but it will create a large multiplexer due to + // non-constant access pattern, so let's leave it out + ////#pragma HLS ARRAY_PARTITION variable=embeddings complete + + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; + (void)ce_reuse_factor; +InputSequence: + for (int j = 0; j < CONFIG_T::n_in; j++) { + // #pragma HLS UNROLL + DenseEmbedding: + for (int i = 0; i < CONFIG_T::n_out; i++) { + // #pragma HLS UNROLL + res[j * CONFIG_T::n_out + i] = embeddings[data[j] * CONFIG_T::n_out + i]; + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_embed_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_embed_stream.h new file mode 100644 index 0000000000..1378100879 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_embed_stream.h @@ -0,0 +1,34 @@ +#ifndef NNET_EMBED_STREAM_H_ +#define NNET_EMBED_STREAM_H_ + +#include "ac_channel.h" +#include "nnet_common.h" +#include "nnet_helpers.h" + +namespace nnet { + +template +void embedding(ac_channel &data, ac_channel &res, + typename CONFIG_T::embeddings_t embeddings[CONFIG_T::vocab_size * CONFIG_T::n_out]) { 
+ data_T in_data = data.read(); + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; + (void)ce_reuse_factor; +InputSequence: + for (int j = 0; j < data_T::size; j++) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + + DenseEmbedding: + for (int i = 0; i < CONFIG_T::n_out; i++) { + // #pragma HLS UNROLL + res_pack[i] = embeddings[in_data[j] * CONFIG_T::n_out + i]; + } + res.write(res_pack); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_garnet.h b/hls4ml/templates/catapult/nnet_utils/nnet_garnet.h new file mode 100644 index 0000000000..7451110fba --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_garnet.h @@ -0,0 +1,816 @@ + +#ifndef NNET_GARNET_H_ +#define NNET_GARNET_H_ + +#include "ac_channel.h" +#include "hls_math.h" +#include "nnet_common.h" + +namespace nnet { +namespace garnet_utils { + +template +inline typename std::enable_if::value>::type +initialize_edge_weights_table(typename CONFIG_T::edge_weight_t edge_weights_table[]) { + typedef ac_int index_t; + + unsigned const table_size = (1 << CONFIG_T::distance_width); + + index_t index; + typename CONFIG_T::distance_t distance; + + // edge_weight_t is ap_ufixed with 0 iwidth -> let index 0 be a saturated version of 1 + edge_weights_table[0] = ac_fixed(1.); + + for (unsigned iw = 1; iw < table_size; ++iw) { + index = iw; + distance.range(CONFIG_T::distance_width - 1, 0) = index.range(CONFIG_T::distance_width - 1, 0); + edge_weights_table[iw] = hls::exp(-distance * distance); + } +} + +template +inline typename std::enable_if::value>::type +initialize_edge_weights_table(typename CONFIG_T::edge_weight_t edge_weights_table[]) { + unsigned const table_size = (1 << CONFIG_T::distance_width); + double const step = 64. 
/ table_size; + + typename CONFIG_T::distance_t v = -32.; + for (unsigned iw = 0; iw < table_size; ++iw) { +#ifdef __SYNTHESIS__ + // hack for now to get through the flow + edge_weights_table[iw] = (-v * v); +#else + edge_weights_table[iw] = std::exp(-v * v); +#endif + v += step; + } +} + +template +inline typename std::enable_if::value, typename CONFIG_T::edge_weight_t>::type +get_edge_weight(typename CONFIG_T::distance_t distance, typename CONFIG_T::edge_weight_t edge_weights_table[]) { + typedef ac_int index_t; + + index_t index(distance.range(CONFIG_T::distance_width - 1, 0)); + + return edge_weights_table[index]; +} + +template +inline + typename std::enable_if::value, typename CONFIG_T::edge_weight_t>::type + get_edge_weight(typename CONFIG_T::distance_t distance, typename CONFIG_T::edge_weight_t edge_weights_table[]) { + unsigned const table_size = (1 << CONFIG_T::distance_width); + double const step = 64. / table_size; + + int index = (distance + 32.) / step; + if (index < 0) + index = 0; + else if (index >= table_size) + index = table_size - 1; + + return edge_weights_table[index]; +} + +template typename CONFIG_T::edge_weight_t compute_edge_weight(typename CONFIG_T::distance_t distance) { + if (CONFIG_T::is_stack) { + //#pragma HLS INLINE OFF + } +#ifdef __SYNTHESIS__ + typename CONFIG_T::edge_weight_t edge_weights_table[1 << CONFIG_T::distance_width]; + // unsigned const reshape_factor = CONFIG_T::n_aggregators * CONFIG_T::n_in_features * (CONFIG_T::n_vertices / + // CONFIG_T::reuse_factor); + // //#pragma HLS ARRAY_RESHAPE variable=edge_weights_table cyclic factor=reshape_factor dim=1 + bool initialized = false; +#else + static typename CONFIG_T::edge_weight_t edge_weights_table[1 << CONFIG_T::distance_width]; + static bool initialized = false; +#endif + if (not initialized) { + initialize_edge_weights_table(edge_weights_table); + initialized = true; + } + + return get_edge_weight(distance, edge_weights_table); +} + +template +inline typename 
std::enable_if::value, dividend_T>::type normalize_log2(dividend_T dividend, + exponent_T exponent) { + //#pragma HLS INLINE + return dividend >> exponent; +} + +template +inline typename std::enable_if::value, dividend_T>::type normalize_log2(dividend_T dividend, + exponent_T exponent) { + //#pragma HLS INLINE + return dividend / std::pow(2., exponent); +} + +template struct Means { + typedef E edge_weight_t; + + edge_weight_t edge_weight_mean[CONFIG_T::n_aggregators]; + typename CONFIG_T::aggr_t weighted_feature_mean[CONFIG_T::n_aggregators * CONFIG_T::n_in_features]; + + Means() { + //#pragma HLS INLINE + //#pragma HLS ARRAY_PARTITION variable=edge_weight_mean complete + //#pragma HLS ARRAY_PARTITION variable=weighted_feature_mean complete + + Aggregators: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + edge_weight_mean[ia] = 0.; + + InFeatures: + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + weighted_feature_mean[iax] = 0.; + } + } + } + + void set_weight(unsigned, edge_weight_t const &) { + //#pragma HLS INLINE + } + + void add_means_normalized(Means const &local) { + //#pragma HLS INLINE + // Always called within a pipelined region - no UNROLL needed + + unsigned const log2_unroll_factor = CONFIG_T::n_vertices_width - CONFIG_T::log2_reuse_factor; + + Aggregators: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + edge_weight_mean[ia] += normalize_log2(local.edge_weight_mean[ia], log2_unroll_factor); + + InFeatures: + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + weighted_feature_mean[iax] += normalize_log2(local.weighted_feature_mean[iax], log2_unroll_factor); + } + } + } + + template + typename std::enable_if::type set_means_normalized(nvtx_T const nvtx, arrays_T const &accum) { + //#pragma HLS INLINE + + // accum comes divided by unroll factor + typename T::norm_t nvtx_norm = 
(T::n_vertices / T::reuse_factor) / nvtx; + + Aggregators: + for (unsigned ia = 0; ia < T::n_aggregators; ++ia) { + edge_weight_mean[ia] = accum.edge_weight_mean[ia] * nvtx_norm; + + InFeatures: + for (unsigned ix = 0; ix < T::n_in_features; ++ix) { + unsigned const iax = ia * T::n_in_features + ix; + + weighted_feature_mean[iax] = accum.weighted_feature_mean[iax] * nvtx_norm; + } + } + } + + template + typename std::enable_if::type set_means_normalized(nvtx_T const nvtx, arrays_T const &accum) { + //#pragma HLS INLINE + + Aggregators: + for (unsigned ia = 0; ia < T::n_aggregators; ++ia) { + + edge_weight_mean[ia] = normalize_log2(accum.edge_weight_mean[ia], T::log2_reuse_factor); + + InFeatures: + for (unsigned ix = 0; ix < T::n_in_features; ++ix) { + unsigned const iax = ia * T::n_in_features + ix; + + weighted_feature_mean[iax] = normalize_log2(accum.weighted_feature_mean[iax], T::log2_reuse_factor); + } + } + } +}; + +template struct WeightsAndMeans : public Means { + typedef E edge_weight_t; + + edge_weight_t edge_weights[CONFIG_T::n_vertices * CONFIG_T::n_aggregators]; + + WeightsAndMeans() : Means() { + //#pragma HLS INLINE + unsigned const reshape_factor = CONFIG_T::n_aggregators * (CONFIG_T::n_vertices / CONFIG_T::reuse_factor); + //#pragma HLS ARRAY_PARTITION variable=edge_weights cyclic factor=reshape_factor + } + + void set_weight(unsigned iva, edge_weight_t const &weight) { + //#pragma HLS INLINE + edge_weights[iva] = weight; + } +}; + +template struct OutputBiasNormalizer; + +template +struct OutputBiasNormalizer::type> { + typedef typename CONFIG_T::output_transform_biases_t biases_t; + + biases_t const (&output_biases)[CONFIG_T::n_out_features]; + + OutputBiasNormalizer(nvtx_T const) : output_biases{CONFIG_T::output_transform_biases} { + //#pragma HLS INLINE + } +}; + +template +struct OutputBiasNormalizer::type> { + typedef typename CONFIG_T::output_transform_biases_t biases_t; + + biases_t output_biases[CONFIG_T::n_out_features]; + + 
OutputBiasNormalizer(nvtx_T const nvtx) { + //#pragma HLS ARRAY_PARTITION variable=output_biases complete + + // Cannot add a loop label here due to a Vivado HLS bug, apparently + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + typename CONFIG_T::aggr_t bias = CONFIG_T::output_transform_biases[io]; + bias *= nvtx; + output_biases[io] = normalize_log2(bias, CONFIG_T::n_vertices_width); + } + } +}; + +template struct InputDataGetter { + typedef data_T data_t; + + data_T const *dataref; + + InputDataGetter(data_T const *d) : dataref{d} { + //#pragma HLS INLINE + } + data_T const &get(unsigned iv, unsigned ix) const { + //#pragma HLS INLINE + unsigned const ivx = iv * CONFIG_T::n_in_features + ix; + return dataref[ivx]; + } +}; + +template struct SingleVertexDataGetter { + typedef data_T data_t; + + data_T const (&dataref)[CONFIG_T::n_in_features]; + + SingleVertexDataGetter(data_T const (&d)[CONFIG_T::n_in_features]) : dataref{d} { + //#pragma HLS INLINE + } + data_T const &get(unsigned, unsigned ix) const { + //#pragma HLS INLINE + return dataref[ix]; + } +}; + +template struct OutputResSetter { + typedef res_T res_t; + + res_T *resref; + + OutputResSetter(res_T *r) : resref{r} { + //#pragma HLS INLINE + } + void set(unsigned iv, unsigned io, res_T const &acc) { + //#pragma HLS INLINE + unsigned const ivo = iv * CONFIG_T::n_out_features + io; + resref[ivo] = acc; + } +}; + +template struct SingleVertexResSetter { + typedef res_T res_t; + + res_T (&resref)[CONFIG_T::n_out_features]; + + SingleVertexResSetter(res_T (&r)[CONFIG_T::n_out_features]) : resref{r} { + //#pragma HLS INLINE + } + void set(unsigned, unsigned io, res_T const &acc) { + //#pragma HLS INLINE + resref[io] = acc; + } +}; + +template +inline void compute_weights_aggregates(data_getter_T const &data_getter, unsigned iv, arrays_local_T &arrays_local, + arrays_T &arrays) { + //#pragma HLS INLINE + +Aggregators: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + typename 
CONFIG_T::distance_t distance = CONFIG_T::aggregator_distance_biases[ia]; + + InFeatures1: + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + + typename CONFIG_T::distance_t incr = data_getter.get(iv, ix) * CONFIG_T::aggregator_distance_weights[iax]; + + distance += incr; + } + + typename CONFIG_T::edge_weight_t edge_weight = + garnet_utils::compute_edge_weight(distance); + + arrays_local.edge_weight_mean[ia] += edge_weight; + + InFeatures2: + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + + typename data_getter_T::data_t incr = data_getter.get(iv, ix) * edge_weight; + + arrays_local.weighted_feature_mean[iax] += incr; + } + + unsigned const iva = iv * CONFIG_T::n_aggregators + ia; + arrays.set_weight(iva, edge_weight); + } +} + +template +inline typename CONFIG_T::aggr_t compute_output_base_core(arrays_T const &arrays, unsigned io, unsigned ia) { + //#pragma HLS INLINE + + unsigned const ioa = io * CONFIG_T::n_aggregators + ia; + typename CONFIG_T::aggr_t aggr = arrays.edge_weight_mean[ia] * CONFIG_T::input_transform_biases[ioa]; + +InFeatures: + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const ioax = ioa * CONFIG_T::n_in_features + ix; + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + + aggr += arrays.weighted_feature_mean[iax] * CONFIG_T::input_transform_weights[ioax]; + } + + return aggr; +} + +template +inline void compute_output_base(arrays_T const &arrays, + typename CONFIG_T::aggr_t output_base[CONFIG_T::n_out_features * CONFIG_T::n_aggregators]) { + //#pragma HLS INLINE + +OutFeatures: + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + Aggregators: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + unsigned const ioa = io * CONFIG_T::n_aggregators + ia; + + output_base[ioa] = compute_output_base_core(arrays, io, ia); + } + } +} + +template +inline 
void +compute_vertex_output(arrays_T const &arrays, unsigned iv, + typename CONFIG_T::aggr_t const output_base[CONFIG_T::n_out_features * CONFIG_T::n_aggregators], + res_setter_T &res_setter) { + //#pragma HLS INLINE + + typename arrays_T::edge_weight_t edge_weights[CONFIG_T::n_aggregators]; + //#pragma HLS ARRAY_PARTITION variable=edge_weights complete + +Aggregators1: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + unsigned const iva = iv * CONFIG_T::n_aggregators + ia; + + edge_weights[ia] = arrays.edge_weights[iva]; + } + +OutFeatures: + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + typename res_setter_T::res_t acc = CONFIG_T::output_transform_biases[io]; + + Aggregators2: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + unsigned const ioa = io * CONFIG_T::n_aggregators + ia; + + typename res_setter_T::res_t incr = edge_weights[ia] * output_base[ioa]; + acc += incr; + } + + res_setter.set(iv, io, acc); + } +} + +template +void aggregate(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx, arrays_T &arrays) { + InputDataGetter data_getter(data); + + unsigned const unroll_factor = CONFIG_T::n_vertices >> CONFIG_T::log2_reuse_factor; + + Means means_accum; + +VerticesOuter: + for (unsigned ivv = 0; ivv < CONFIG_T::reuse_factor; ++ivv) { + //#pragma HLS PIPELINE + + if (ivv * unroll_factor >= nvtx) + break; + + Means means_local; + + VerticesInner: + for (unsigned ir = 0; ir < unroll_factor; ++ir) { + unsigned iv = ivv * unroll_factor + ir; + + if (iv == nvtx) + break; + + compute_weights_aggregates(data_getter, iv, means_local, arrays); + } + + means_accum.add_means_normalized(means_local); + } + + arrays.set_means_normalized(nvtx, means_accum); +} + +template +void distribute(nvtx_T const nvtx, arrays_T const &arrays, res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { + OutputResSetter res_setter(res); + + typename CONFIG_T::aggr_t output_base[CONFIG_T::n_out_features * 
CONFIG_T::n_aggregators]; + //#pragma HLS ARRAY_PARTITION variable=output_base complete + + compute_output_base(arrays, output_base); + + unsigned const unroll_factor = CONFIG_T::n_vertices >> CONFIG_T::log2_reuse_factor; + +VerticesOuter: + for (unsigned ivv = 0; ivv < CONFIG_T::reuse_factor; ++ivv) { + //#pragma HLS PIPELINE + + if (ivv * unroll_factor >= nvtx) + break; + + VerticesInner: + for (unsigned ir = 0; ir < unroll_factor; ++ir) { + unsigned iv = ivv * unroll_factor + ir; + + if (iv == nvtx) + break; + + compute_vertex_output(arrays, iv, output_base, res_setter); + } + } +} + +template +void set_output(output_biases_T const &output_transform_biases, arrays_T const &arrays, + res_T res[CONFIG_T::n_out_features]) { + //#pragma HLS PIPELINE + +OutFeatures: + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + res_T acc = output_transform_biases.output_biases[io]; + + Aggregators: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + typename CONFIG_T::aggr_t aggr = compute_output_base_core(arrays, io, ia); + + acc += arrays.edge_weight_mean[ia] * aggr; + } + + res[io] = acc; + } +} + +template +void distribute_aggregate(nvtx_T const nvtx, prev_arrays_T const &prev_arrays, current_arrays_T ¤t_arrays) { + typedef typename prev_layer_t::output_t data_T; + + typename prev_layer_t::aggr_t prev_output_base[prev_layer_t::n_out_features * prev_layer_t::n_aggregators]; + //#pragma HLS ARRAY_PARTITION variable=prev_output_base complete + + compute_output_base(prev_arrays, prev_output_base); + + unsigned const unroll_factor = current_layer_t::n_vertices >> current_layer_t::log2_reuse_factor; + + Means means_accum; + +VerticesOuter: + for (unsigned ivv = 0; ivv < current_layer_t::reuse_factor; ++ivv) { + //#pragma HLS PIPELINE + + if (ivv * unroll_factor >= nvtx) + break; + + Means means_local; + + VerticesInner: + for (unsigned ir = 0; ir < unroll_factor; ++ir) { + unsigned iv = ivv * unroll_factor + ir; + + if (iv == nvtx) + break; + + data_T 
data[prev_layer_t::n_out_features]; + //#pragma HLS ARRAY_PARTITION variable=data complete + + SingleVertexResSetter res_setter(data); + + compute_vertex_output(prev_arrays, iv, prev_output_base, res_setter); + + SingleVertexDataGetter data_getter(data); + + compute_weights_aggregates(data_getter, iv, means_local, current_arrays); + } + + means_accum.add_means_normalized(means_local); + } + + current_arrays.set_means_normalized(nvtx, means_accum); +} + +template +inline typename std::enable_if::value>::type +sublayer(nvtx_T const nvtx, prev_arrays_T const &prev_arrays, last_arrays_T &last_arrays) { + //#pragma HLS INLINE + + distribute_aggregate(nvtx, prev_arrays, last_arrays); +} + +template +inline typename std::enable_if::value>::type +sublayer(nvtx_T const nvtx, prev_arrays_T const &prev_arrays, last_arrays_T &last_arrays) { + //#pragma HLS INLINE + + WeightsAndMeans current_arrays; + + distribute_aggregate(nvtx, prev_arrays, current_arrays); + + sublayer(nvtx, current_arrays, last_arrays); +} +} // namespace garnet_utils + +struct garnet_config { + // Layer specs + static const unsigned n_vertices_width = 8; + static const unsigned n_vertices = (1 << n_vertices_width); + static const unsigned n_in_features = 4; + static const unsigned n_propagate = 4; + static const unsigned n_aggregators = 4; + static const unsigned n_out_features = 4; + static const unsigned distance_width = 12; + + // Internal data type definitions + typedef float input_transform_weights_t; + typedef float input_transform_biases_t; + typedef float output_transform_weights_t; + typedef float output_transform_biases_t; + typedef float aggregator_distance_weights_t; + typedef float aggregator_distance_biases_t; + + typedef float norm_t; + typedef float distance_t; + typedef float edge_weight_t; + typedef float edge_weight_aggr_t; + typedef float aggr_t; + typedef float output_t; + + /* static const input_transform_weights_t (&input_transform_weights)[n_out_features * n_aggregators * 
n_in_features]; */ + /* static const input_transform_biases_t (&input_transform_biases)[n_out_features * n_aggregators]; */ + /* static const aggregator_distance_weights_t (&aggregator_distance_weights)[n_aggregators * n_in_features]; */ + /* static const aggregator_distance_biases_t (&aggregator_distance_biases)[n_aggregators]; */ + /* static const output_transform_biases_t (&output_transform_biases)[n_out_features]; */ + + enum OutputCollapse { no_collapse, collapse_mean, collapse_max }; + + static const unsigned output_collapse = no_collapse; + + static const bool mean_by_nvert = false; + static const bool is_stack = false; + + // Optimization specs + static const unsigned reuse_factor = 64; + static const unsigned log2_reuse_factor = 6; +}; + +// vertices -> vertices +template +typename std::enable_if::type +garnet(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { + //#pragma HLS DATAFLOW + + garnet_utils::WeightsAndMeans arrays; + + garnet_utils::aggregate(data, nvtx[0], arrays); + + garnet_utils::distribute(nvtx[0], arrays, res); +} + +// vertices -> out features +template +typename std::enable_if::type +garnet(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_out_features]) { + //#pragma HLS DATAFLOW + + garnet_utils::Means arrays; + + garnet_utils::aggregate(data, nvtx[0], arrays); + + garnet_utils::OutputBiasNormalizer normalize_bias(nvtx[0]); + + garnet_utils::set_output(normalize_bias, arrays, res); +} + +// vertices -> vertices +template +typename std::enable_if::type +garnet_stack(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { + //#pragma HLS DATAFLOW + + typedef typename CONFIG_T::template sublayer_t<0> first_layer_t; + unsigned const ilast = CONFIG_T::n_sublayers - 1; + typedef 
typename CONFIG_T::template sublayer_t last_layer_t; + + garnet_utils::WeightsAndMeans arrays_first; + garnet_utils::Means arrays_last; + + garnet_utils::aggregate(data, nvtx[0], arrays_first); + + garnet_utils::sublayer(nvtx[0], arrays_first, + arrays_last); + + garnet_utils::distribute(nvtx[0], arrays_last, res); +} + +// vertices -> out features +template +typename std::enable_if::type +garnet_stack(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_out_features]) { + //#pragma HLS DATAFLOW + + typedef typename CONFIG_T::template sublayer_t<0> first_layer_t; + unsigned const ilast = CONFIG_T::n_sublayers - 1; + typedef typename CONFIG_T::template sublayer_t last_layer_t; + + garnet_utils::WeightsAndMeans arrays_first; + garnet_utils::Means arrays_last; + + garnet_utils::aggregate(data, nvtx[0], arrays_first); + + garnet_utils::sublayer(nvtx[0], arrays_first, + arrays_last); + + garnet_utils::OutputBiasNormalizer normalize_bias(nvtx[0]); + + garnet_utils::set_output(normalize_bias, arrays_last, res); +} + +/* Reference (dumb) implementation returning (Vertices, Features) */ +template +typename std::enable_if::type +garnet_ref(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { + typename CONFIG_T::edge_weight_t edge_weights[CONFIG_T::n_vertices * CONFIG_T::n_aggregators]; + typename CONFIG_T::aggr_t propagated_features[CONFIG_T::n_vertices * CONFIG_T::n_propagate]; + + for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { + if (iv == nvtx[0]) + break; + + for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { + unsigned const ivp = iv * CONFIG_T::n_propagate + ip; + + propagated_features[ivp] = CONFIG_T::input_transform_biases[ip]; + + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const ivx = iv * CONFIG_T::n_in_features + ix; + unsigned const ipx = ip * 
CONFIG_T::n_in_features + ix; + + propagated_features[ivp] += data[ivx] * CONFIG_T::input_transform_weights[ipx]; + } + } + + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + unsigned const iva = iv * CONFIG_T::n_aggregators + ia; + + typename CONFIG_T::aggr_t distance = CONFIG_T::aggregator_distance_biases[ia]; + + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const ivx = iv * CONFIG_T::n_in_features + ix; + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + + distance += data[ivx] * CONFIG_T::aggregator_distance_weights[iax]; + } + + edge_weights[iva] = garnet_utils::compute_edge_weight(distance); + } + } + + typename CONFIG_T::aggr_t aggregated_features[CONFIG_T::n_aggregators * CONFIG_T::n_propagate]; + + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { + unsigned const iap = ia * CONFIG_T::n_propagate + ip; + + aggregated_features[iap] = 0.; + + for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { + if (iv == nvtx[0]) + break; + + unsigned const iva = iv * CONFIG_T::n_aggregators + ia; + unsigned const ivp = iv * CONFIG_T::n_propagate + ip; + + aggregated_features[iap] += edge_weights[iva] * propagated_features[ivp]; + } + } + } + + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { + unsigned const iap = ia * CONFIG_T::n_propagate + ip; + + if (CONFIG_T::mean_by_nvert) + aggregated_features[iap] /= nvtx[0]; + else { + // Not using right shift in case aggr_t is float or double + aggregated_features[iap] /= CONFIG_T::n_vertices; + } + } + } + + for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { + if (iv == nvtx[0]) + break; + + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + unsigned const ivo = iv * CONFIG_T::n_out_features + io; + + typename CONFIG_T::aggr_t acc = CONFIG_T::output_transform_biases[io]; + + for (unsigned ia = 0; ia < 
CONFIG_T::n_aggregators; ++ia) { + unsigned const iva = iv * CONFIG_T::n_aggregators + ia; + unsigned const ioa = io * CONFIG_T::n_aggregators + ia; + + typename CONFIG_T::aggr_t aggr = 0.; + + for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { + unsigned const iap = ia * CONFIG_T::n_propagate + ip; + unsigned const ioap = ioa * CONFIG_T::n_propagate + ip; + + aggr += CONFIG_T::output_transform_weights[ioap] * aggregated_features[iap]; + } + + acc += edge_weights[iva] * aggr; + } + + res[ivo] = acc; + } + } +} + +/* Reference (dumb) implementation returning (Features) - output averaged over vertices already */ +template +typename std::enable_if::type +garnet_ref(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_out_features]) { + typename CONFIG_T::aggr_t vertex_res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]; + + garnet_ref(data, nvtx, vertex_res); + + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + typename CONFIG_T::aggr_t acc = 0.; + + for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { + if (iv == nvtx[0]) + break; + + unsigned const ivo = iv * CONFIG_T::n_out_features + io; + + acc += vertex_res[ivo]; + } + + if (CONFIG_T::mean_by_nvert) + acc /= nvtx[0]; + else { + // Not using right shift in case aggr_t is float or double + acc /= CONFIG_T::n_vertices; + } + + res[io] = acc; + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_helpers.h b/hls4ml/templates/catapult/nnet_utils/nnet_helpers.h new file mode 100644 index 0000000000..ed701e5c59 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_helpers.h @@ -0,0 +1,461 @@ + +#ifndef NNET_HELPERS_H +#define NNET_HELPERS_H + +#include "ac_channel.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern const char *get_weights_dir(); + +namespace nnet { + +#ifndef __SYNTHESIS__ + +#ifndef WEIGHTS_DIR +#define WEIGHTS_DIR 
get_weights_dir() +#endif + +template void load_weights_from_txt(T *w, const char *fname) { + + std::string full_path = std::string(WEIGHTS_DIR) + "/" + std::string(fname); + std::ifstream infile(full_path.c_str(), std::ios::binary); + + if (infile.fail()) { + std::cerr << "ERROR: file " << std::string(fname) << " does not exist" << std::endl; + exit(1); + } + + std::string line; + if (std::getline(infile, line)) { + std::istringstream iss(line); + std::string token; + + size_t i = 0; + while (std::getline(iss, token, ',')) { + // CATAPULT_PORT + // std::istringstream(token) >> w[i]; + double tmp; + std::istringstream(token) >> tmp; + w[i] = tmp; + i++; + } + + if (SIZE != i) { + std::cerr << "ERROR: Expected " << SIZE << " values"; + std::cerr << " but read only " << i << " values" << std::endl; + } + } +} + +template void load_compressed_weights_from_txt(T *w, const char *fname) { + + std::string full_path = std::string(WEIGHTS_DIR) + "/" + std::string(fname); + std::ifstream infile(full_path.c_str(), std::ios::binary); + + if (infile.fail()) { + std::cerr << "ERROR: file " << std::string(fname) << " does not exist" << std::endl; + exit(1); + } + + std::string line; + if (std::getline(infile, line)) { + std::istringstream iss(line); + std::string token; + std::string extra_chars = "} "; + + size_t i = 0; + while (std::getline(iss, token, '{')) { + if (token.length() == 0) { + continue; + } + for (char c : extra_chars) { + token.erase(std::remove(token.begin(), token.end(), c), token.end()); + } + if (token.back() == ',') { + token.erase(token.end() - 1); + } + + std::replace(token.begin(), token.end(), ',', ' '); + std::istringstream structss(token); + + if (!(structss >> w[i].row_index >> w[i].col_index >> w[i].weight)) { + std::cerr << "ERROR: Unable to parse file " << std::string(fname); + exit(1); + } + i++; + } + + if (SIZE != i) { + std::cerr << "ERROR: Expected " << SIZE << " values"; + std::cerr << " but read only " << i << " values" << std::endl; + } + } 
+} + +template void load_exponent_weights_from_txt(T *w, const char *fname) { + + std::string full_path = std::string(WEIGHTS_DIR) + "/" + std::string(fname); + std::ifstream infile(full_path.c_str(), std::ios::binary); + + if (infile.fail()) { + std::cerr << "ERROR: file " << std::string(fname) << " does not exist" << std::endl; + exit(1); + } + + std::string line; + if (std::getline(infile, line)) { + std::istringstream iss(line); + std::string token; + std::string extra_chars = "} "; + + size_t i = 0; + while (std::getline(iss, token, '{')) { + if (token.length() == 0) { + continue; + } + for (char c : extra_chars) { + token.erase(std::remove(token.begin(), token.end(), c), token.end()); + } + if (token.back() == ',') { + token.erase(token.end() - 1); + } + + std::replace(token.begin(), token.end(), ',', ' '); + std::istringstream structss(token); + + double sign; + double weight; + if (!(structss >> sign >> weight)) { + std::cerr << "ERROR: Unable to parse file " << std::string(fname); + exit(1); + } + w[i].sign = sign; + w[i].weight = weight; + i++; + } + + if (SIZE != i) { + std::cerr << "ERROR: Expected " << SIZE << " values"; + std::cerr << " but read only " << i << " values" << std::endl; + } + } +} + +template +void convert_single_data(ac_fixed &src, double &dst) { + dst = src.to_double(); +} +template +void convert_single_data(ac_fixed &src, float &dst) { + dst = src.to_double(); +} +template void convert_single_data(srcType &src, dstType &dst) { dst = dstType(src); } +template void convert_data(srcType *src, dstType *dst) { + for (size_t i = 0; i < SIZE; i++) { + convert_single_data(src[i], dst[i]); + } +} + +template void convert_data(srcType *src, ac_channel &dst) { + for (size_t i = 0; i < SIZE / dstType::size; i++) { + dstType ctype; + for (size_t j = 0; j < dstType::size; j++) { + ctype[j] = typename dstType::value_type(src[i * dstType::size + j]); + } + dst.write(ctype); + } +} + +template void convert_data(ac_channel &src, dstType *dst) { + for 
(size_t i = 0; i < SIZE / srcType::size; i++) { + srcType ctype = src.read(); + for (size_t j = 0; j < srcType::size; j++) { + dst[i * srcType::size + j] = dstType(ctype[j].to_double()); // this may only work for ac_fixed + } + } +} + +extern bool trace_enabled; +extern std::map *trace_outputs; +extern size_t trace_type_size; + +template void save_output_array(data_T *data, save_T *ptr, size_t layer_size) { + for (int i = 0; i < layer_size; i++) { + ptr[i] = static_cast(data[i].to_double()); + } +} + +template void save_output_array(ac_channel &data, save_T *ptr, size_t layer_size) { + for (size_t i = 0; i < layer_size / data_T::size; i++) { + data_T ctype = data.read(); + for (size_t j = 0; j < data_T::size; j++) { + ptr[i * data_T::size + j] = save_T(ctype[j]); + } + data.write(ctype); + } +} + +template void save_output_array(ac_channel &data, float *ptr, size_t layer_size) { + for (size_t i = 0; i < layer_size / data_T::size; i++) { + data_T ctype = data.read(); + for (size_t j = 0; j < data_T::size; j++) { + ptr[i * data_T::size + j] = ctype[j].to_double(); + } + data.write(ctype); + } +} + +template void save_output_array(ac_channel &data, double *ptr, size_t layer_size) { + for (size_t i = 0; i < layer_size / data_T::size; i++) { + data_T ctype = data.read(); + for (size_t j = 0; j < data_T::size; j++) { + ptr[i * data_T::size + j] = ctype[j].to_double(); + } + data.write(ctype); + } +} + +// We don't want to include save_T in this function because it will be inserted into myproject.cpp +// so a workaround with element size is used +template void save_layer_output(data_T *data, const char *layer_name, size_t layer_size) { + if (!trace_enabled) + return; + + if (trace_outputs) { + if (trace_outputs->count(layer_name) > 0) { + if (trace_type_size == 4) { + save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); + } else if (trace_type_size == 8) { + save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); + } else { + 
std::cout << "Unknown trace type!" << std::endl; + } + } else { + std::cout << "Layer name: " << layer_name << " not found in debug storage!" << std::endl; + } + } else { + std::ostringstream filename; + filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data + std::fstream out; + out.open(filename.str(), std::ios::app); + assert(out.is_open()); + for (int i = 0; i < layer_size; i++) { + out << data[i] << " "; // We don't care about precision in text files + } + out << std::endl; + out.close(); + } +} + +template void save_layer_output(ac_channel &data, const char *layer_name, size_t layer_size) { + if (!trace_enabled) + return; + + if (trace_outputs) { + if (trace_outputs->count(layer_name) > 0) { + if (trace_type_size == 4) { + save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); + } else if (trace_type_size == 8) { + save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); + } else { + std::cout << "Unknown trace type!" << std::endl; + } + } else { + std::cout << "Layer name: " << layer_name << " not found in debug storage!" 
<< std::endl; + } + } else { + std::ostringstream filename; + filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data + std::fstream out; + out.open(filename.str(), std::ios::app); + assert(out.is_open()); + for (size_t i = 0; i < layer_size / data_T::size; i++) { + data_T ctype = data.read(); + for (size_t j = 0; j < data_T::size; j++) { + out << ctype[j].to_double(); + out << " "; // We don't care about precision in text files + } + data.write(ctype); + } + out << std::endl; + out.close(); + } +} + +#endif + +template void copy_data(std::vector src, dst_T dst[SIZE]) { + typename std::vector::const_iterator in_begin = src.cbegin() + OFFSET; + typename std::vector::const_iterator in_end = in_begin + SIZE; + std::copy(in_begin, in_end, dst); +} + +template +void copy_data(std::vector src, ac_channel &dst) { + typename std::vector::const_iterator in_begin = src.cbegin() + OFFSET; + typename std::vector::const_iterator in_end = in_begin + SIZE; + + size_t i_pack = 0; + dst_T dst_pack; + for (typename std::vector::const_iterator i = in_begin; i != in_end; ++i) { + dst_pack[i_pack++] = typename dst_T::value_type(*i); + if (i_pack == dst_T::size) { + i_pack = 0; + dst.write(dst_pack); + } + } +} + +template void copy_data_axi(std::vector src, dst_T dst[SIZE]) { + for (auto i = 0; i < SIZE; i++) + if (i == SIZE - 1) { + dst[i].data = src[i]; + dst[i].last = 1; + } else { + dst[i].data = src[i]; + dst[i].last = 0; + } +} + +template void print_result(res_T result[SIZE], std::ostream &out, bool keep = false) { + for (unsigned i = 0; i < SIZE; i++) { + out << result[i] << " "; + } + out << std::endl; +} + +template void print_result(ac_channel &result, std::ostream &out, bool keep = false) { + if (!keep) { + while (result.available(1)) { + res_T res_pack = result.read(); + for (unsigned int j = 0; j < res_T::size; j++) { + out << res_pack[j] << " "; + } + } + out << std::endl; + } else { + if (result.debug_size() >= 
SIZE / res_T::size) { + for (unsigned int i = 0; i < SIZE / res_T::size; i++) { + res_T res_pack = result[i]; // peek + for (unsigned int j = 0; j < res_T::size; j++) { + out << res_pack[j] << " "; + } + } + out << std::endl; + } + } +} + +template void fill_zero(data_T data[SIZE]) { std::fill_n(data, SIZE, 0.); } + +template void fill_zero(ac_channel &data) { + for (unsigned int i = 0; i < SIZE / data_T::size; i++) { + data_T data_pack; + for (unsigned int j = 0; j < data_T::size; j++) { + data_pack[j] = 0.; + } + data.write(data_pack); + } +} + +// Fix for CAT-36531 +template void fill_random(data_T data[SIZE]) { + // std::cout << "Fill_Random SIZE:"<< SIZE << std::endl; + data_T MAX_VALUE; + for (unsigned int i = 0; i < SIZE; i++) { + // Generate a random value (for example, between 0 and 1) + data_T random_value = (data_T)rand() / MAX_VALUE.template set_val(); + data[i] = random_value; + } +} + +template void fill_random(ac_channel &data) { + typedef typename data_T::value_type base_T; + base_T MAX_VALUE; + for (unsigned int i = 0; i < SIZE / data_T::size; i++) { + data_T data_pack; + for (unsigned int j = 0; j < data_T::size; j++) { + // Generate a random value (for example, between 0 and 1) + base_T random_value = (base_T)rand() / MAX_VALUE.template set_val(); + data_pack[j] = random_value; + } + data.write(data_pack); + } + // std::cout << "Fill_Random AC_CHANNEL" << std::endl; +} + +template int read_file_1D(const char *filename, dataType data[nrows]) { + FILE *fp; + fp = fopen(filename, "r"); + if (fp == 0) { + return -1; + } + // Read data from file + float newval; + for (int ii = 0; ii < nrows; ii++) { + if (fscanf(fp, "%f\n", &newval) != 0) { + data[ii] = newval; + } else { + return -2; + } + } + fclose(fp); + return 0; +} + +template +int read_file_2D(const char *filename, dataType data[nrows][ncols]) { + FILE *fp; + fp = fopen(filename, "r"); + if (fp == 0) { + return -1; + } + // Read data from file + float newval; + for (int ii = 0; ii < nrows; 
ii++) { + for (int jj = 0; jj < ncols; jj++) { + if (fscanf(fp, "%f\n", &newval) != 0) { + data[ii][jj] = newval; + } else { + return -2; + } + } + } + fclose(fp); + return 0; +} + +template void change_type(ac_channel &in, ac_channel &out) { + in_T datareg; + ac_channel input_trunc; + for (int ii = 0; ii < N_IN; ii++) { + out << (out_T)in.read(); + } +} + +template void hls_stream_debug(ac_channel &data, ac_channel &res) { + data_T datareg; + for (int ii = 0; ii < N_IN; ii++) { + datareg = data.read(); + std::cout << "[" << ii << "]: " << datareg << std::endl; + res << datareg; + } +} + +constexpr int ceillog2(int x) { return (x <= 2) ? 1 : 1 + ceillog2((x + 1) / 2); } + +constexpr int floorlog2(int x) { return (x < 2) ? 0 : 1 + floorlog2(x / 2); } + +constexpr int pow2(int x) { return x == 0 ? 1 : 2 * pow2(x - 1); } + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_image.h b/hls4ml/templates/catapult/nnet_utils/nnet_image.h new file mode 100755 index 0000000000..26947fae01 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_image.h @@ -0,0 +1,41 @@ +#ifndef NNET_IMAGE_H_ +#define NNET_IMAGE_H_ + +#include "ac_channel.h" +#include "nnet_common.h" +#include + +namespace nnet { + +struct resize_config { + static const unsigned height = 10; + static const unsigned width = 10; + static const unsigned n_chan = 10; + static const unsigned new_height = 10; + static const unsigned new_width = 10; +}; + +template +void resize_nearest(data_T image[CONFIG_T::height * CONFIG_T::width * CONFIG_T::n_chan], + data_T resized[CONFIG_T::new_height * CONFIG_T::new_width * CONFIG_T::n_chan]) { + int y_ratio = (int)((CONFIG_T::height << 16) / CONFIG_T::new_height) + 1; + int x_ratio = (int)((CONFIG_T::width << 16) / CONFIG_T::new_width) + 1; + int x2, y2; + + //#pragma HLS PIPELINE + + for (int i = 0; i < CONFIG_T::new_height; i++) { + for (int j = 0; j < CONFIG_T::new_width; j++) { + x2 = ((j * x_ratio) >> 16); + y2 = ((i * y_ratio) >> 
16); + for (int k = 0; k < CONFIG_T::n_chan; k++) { + resized[(i * CONFIG_T::new_width * CONFIG_T::n_chan) + j * CONFIG_T::n_chan + k] = + image[(y2 * CONFIG_T::width * CONFIG_T::n_chan) + x2 * CONFIG_T::n_chan + k]; + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_image_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_image_stream.h new file mode 100644 index 0000000000..1757f7bfb8 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_image_stream.h @@ -0,0 +1,66 @@ +#ifndef NNET_IMAGE_STREAM_H_ +#define NNET_IMAGE_STREAM_H_ + +#include "ac_channel.h" +#include "nnet_common.h" + +namespace nnet { + +template void resize_nearest(ac_channel &image, ac_channel &resized) { + assert(CONFIG_T::new_height % CONFIG_T::height == 0); + assert(CONFIG_T::new_width % CONFIG_T::width == 0); + constexpr unsigned ratio_height = CONFIG_T::new_height / CONFIG_T::height; + constexpr unsigned ratio_width = CONFIG_T::new_width / CONFIG_T::width; + +ImageHeight: + for (unsigned h = 0; h < CONFIG_T::height; h++) { + //#pragma HLS PIPELINE + + data_T data_in_row[CONFIG_T::width]; + + ImageWidth: + for (unsigned i = 0; i < CONFIG_T::width; i++) { + //#pragma HLS UNROLL + + data_T in_data = image.read(); + + ImageChan: + for (unsigned j = 0; j < CONFIG_T::n_chan; j++) { + //#pragma HLS UNROLL + + data_in_row[i][j] = in_data[j]; + } + } + + ResizeHeight: + for (unsigned i = 0; i < ratio_height; i++) { + //#pragma HLS UNROLL + + ImageWidth2: + for (unsigned l = 0; l < CONFIG_T::width; l++) { + //#pragma HLS UNROLL + + ResizeWidth: + for (unsigned j = 0; j < ratio_width; j++) { + //#pragma HLS UNROLL + + data_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + ResizeChan: + for (unsigned k = 0; k < CONFIG_T::n_chan; k++) { + //#pragma HLS UNROLL + + out_data[k] = data_in_row[l][k]; + } + + resized.write(out_data); + } + } + } + } +} + +} // namespace nnet + +#endif diff --git 
a/hls4ml/templates/catapult/nnet_utils/nnet_math.h b/hls4ml/templates/catapult/nnet_utils/nnet_math.h new file mode 100644 index 0000000000..c25f7187b6 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_math.h @@ -0,0 +1,178 @@ +#ifndef NNET_MATH_H_ +#define NNET_MATH_H_ + +#include "hls_math.h" + +namespace nnet { + +// This header defines the functions that return type different from the input +// For example, hls::sin(x) returns ac_fixed +// By ensuring we return the same type we can avoid casting issues in expressions + +template T sin(T x) { return (T)hls::sin(x); }; + +template T cos(T x) { return (T)hls::cos(x); }; + +template T asin(T x) { return (T)hls::asin(x); }; + +template T acos(T x) { return (T)hls::acos(x); }; + +template T atan(T x) { return (T)hls::atan(x); }; + +template T atan2(T x, T y) { return (T)hls::atan2(x, y); }; + +template void init_sincos_table(T table[1 << (W - I - 3)][2]) { + unsigned int NTE = 1 << (W - I - 3); // No of table entries + double step = M_PI / (4 * NTE); // Interval between angles + double y = 0; + // double scaled_angle = 0; + + for (unsigned int i = 0; i < NTE; i++) { + table[i][0] = std::cos(y); + table[i][1] = std::sin(y); + y += step; + // scaled_angle = y/(2*M_PI); + // printf("cos(%f) = %23.22f, sin(%f) = %23.22f index = %d, scaled angle = %13.12f \n", y, cos(y), y, sin(y), i, + // scaled_angle); + } +} + +template void sincos_lut(const T &input, T output[2]) { + + #pragma HLS INLINE + + // This implementation is based on ac_sincos_lut.h from AC math library + + static bool flag = true; + if (flag && T::width - T::iwidth > 12) { +#if !defined(__SYNTHESIS__) && defined(SINCOS_LUT_DEBUG) + std::cout << "FILE : " << __FILE__ << ", LINE : " << __LINE__ << std::endl; + std::cout << "Warning: The output of sincos_lut will not be accurate" << std::endl; +#endif + flag = false; + } + // Datatype for lookup table entries + typedef ac_fixed luttype; + // Datatype for posinput which is used to handle negative 
inputs + typedef ac_fixed posinputtype; + + typedef ac_int<9, false> lutindextype; // 9 bits required for indexing into 512 entry table + typedef ac_int<3, false> octanttype; // 3 bits required for octant value range of 0 thru 7 + T outputtemp[2]; + lutindextype luTdex = 0; + posinputtype posinput = input; + + // Initialize the lookup table +#ifdef __SYNTHESIS__ + bool initialized = false; + luttype sincos[512][2]; +#else + static bool initialized = false; + static luttype sincos[512][2]; +#endif + if (!initialized) { + init_sincos_table(sincos); + initialized = true; + } + + // Leaving this commented out makes the table to to BRAM + //#pragma HLS ARRAY_PARTITION variable=sincos complete dim=0 + + typedef ac_int lutindextype1; + // Extracting (MSB-3:LSB) bits of scaled input to determine the lookup table index + lutindextype1 luTdex1 = posinput.range(AP_MAX(T::width - T::iwidth - 3, 1), 0); // Extracting the lookup table index + + if (T::width - T::iwidth >= 4 && T::width - T::iwidth <= 12) { + luTdex(8, 12 - (T::width - T::iwidth)) = luTdex1; // stride + } + // Approximation for the scaled inputs whose number of bits are greater than 12 + else if (T::width - T::iwidth > 12) { + // Lookup table index for the scaled inputs whose number of bits are greater than 12 + luTdex = luTdex1 / (1 << (AP_MAX(T::width - T::iwidth - 12, 0))); + if ((luTdex1 % (1 << (AP_MAX(T::width - T::iwidth - 12, 0)))) > (1 << (AP_MAX(T::width - T::iwidth - 13, 0)))) { + luTdex = luTdex + 1; + } + typedef ac_fixed + datatype; + datatype x = (datatype)luTdex1; + x = x >> AP_MAX(T::width - T::iwidth - 12, 0); + if (x > 511.5) { + luTdex = 511; + } + if (luTdex1 <= 1 << (AP_MAX(T::width - T::iwidth - 13, 0)) && luTdex1 != 0) { + luTdex = 1; + } + } + + if (T::width - T::iwidth >= 3) { + // Getting the octant 0-7 by extracting the first 3 bits from MSB side of scaled input where + // octant 0 corresponds to [0-PI/4), + // octant 1 corresponds to [PI/4-2PI/4), + // octant 2 corresponds to 
[2PI/4-3PI/4) and so on + // octanttype octant = posinput.template slc<3>(T::width-T::iwidth-3); + octanttype octant = posinput(T::width - T::iwidth - 1, T::width - T::iwidth - 3); + luTdex = (octant[0] == 1) ? (lutindextype)(512 - luTdex) : (lutindextype)(luTdex); + // imaginary part is sine + outputtemp[1] = ((octant == 0) | (octant == 3)) ? (T)sincos[luTdex][1] + : ((octant == 2) | (octant == 1)) ? (T)sincos[luTdex][0] + : ((octant == 7) | (octant == 4)) ? (T)-sincos[luTdex][1] + : (T)-sincos[luTdex][0]; + // real part is cosine + outputtemp[0] = ((octant == 6) | (octant == 1)) ? (T)sincos[luTdex][1] + : ((octant == 3) | (octant == 4)) ? (T)-sincos[luTdex][0] + : ((octant == 2) | (octant == 5)) ? (T)-sincos[luTdex][1] + : (T)sincos[luTdex][0]; + // Below two are the cases when the output corresponds to + or - (0 or 1) for which there is no entry in the lookup + // table + output[1] = ((posinput == 0.125) | (posinput == 0.375)) ? T(0.7071067811865475244008) + : ((posinput == 0.625) | (posinput == 0.875)) ? T(-0.7071067811865475244008) + : outputtemp[1]; + output[0] = ((posinput == 0.125) | (posinput == 0.875)) ? T(0.7071067811865475244008) + : ((posinput == 0.375) | (posinput == 0.625)) ? T(-0.7071067811865475244008) + : outputtemp[0]; + } + + if (T::width - T::iwidth <= 2) { + output[1] = (posinput == 0) ? (T)0 + : (posinput == 0.25) ? (T)1 + : (posinput == 0.5) ? (T)0 + : (posinput == 0.75) ? (T)-1 + : outputtemp[1]; + output[0] = (posinput == 0) ? (T)1 + : (posinput == 0.25) ? (T)0 + : (posinput == 0.5) ? (T)-1 + : (posinput == 0.75) ? 
(T)0 + : outputtemp[0]; + } + +#if !defined(__SYNTHESIS__) && defined(SINCOS_LUT_DEBUG) + std::cout << "FILE : " << __FILE__ << ", LINE : " << __LINE__ << std::endl; + std::cout << "============AP_FIXED SINCOS======================" << std::endl; + std::cout << "positive input is = " << posinput << std::endl; + std::cout << "lut index is = " << luTdex << std::endl; + std::cout << "sin value is = " << output[1] << std::endl; + std::cout << "cos value is = " << output[0] << std::endl; + std::cout << "=================================================" << std::endl; +#endif +} + +template T sin_lut(const T input) { + #pragma HLS INLINE + T sincos_res[2]; + T scaled_input = input * ac_fixed<16, 0, false>(0.15915494309); // 1/(2*pi) + sincos_lut(scaled_input, sincos_res); + return sincos_res[1]; +} + +template T cos_lut(const T input) { + #pragma HLS INLINE + T sincos_res[2]; + T scaled_input = input * ac_fixed<16, 0, false>(0.15915494309); // 1/(2*pi) + sincos_lut(scaled_input, sincos_res); + return sincos_res[0]; +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_merge.h b/hls4ml/templates/catapult/nnet_utils/nnet_merge.h new file mode 100644 index 0000000000..00c2cf5e12 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_merge.h @@ -0,0 +1,232 @@ + +#ifndef NNET_MERGE_H_ +#define NNET_MERGE_H_ + +#include "ac_channel.h" +#include "nnet_common.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +struct merge_config { + static const unsigned n_elem = 10; +}; + +struct dot_config { + static const unsigned n_in = 10; + static const unsigned n_out = 1; + static const unsigned reuse_factor = 1; + typedef float accum_t; + // Product function to use + template using product = nnet::product::mult; +}; + +struct concat_config { + static const unsigned n_elem1_0 = 10; + static const unsigned n_elem1_1 = 10; + static const unsigned n_elem1_2 = 10; + static const unsigned n_elem2_0 = 10; + static const unsigned n_elem2_1 = 
10; + static const unsigned n_elem2_2 = 10; + + static const int axis = -1; +}; + +template +void add(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii] = data1[ii] + data2[ii]; + } +} + +template +void subtract(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii] = data1[ii] - data2[ii]; + } +} + +template +void multiply(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii] = data1[ii] * data2[ii]; + } +} + +template +void average(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii] = (data1[ii] + data2[ii]) / (res_T)2; + } +} + +template +void maximum(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii] = (data1[ii] > data2[ii]) ? data1[ii] : data2[ii]; + } +} + +template +void minimum(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii] = (data1[ii] < data2[ii]) ? 
data1[ii] : data2[ii]; + } +} + +template +void dot1d(input1_T data1[CONFIG_T::n_in], input2_T data2[CONFIG_T::n_in], res_T res[CONFIG_T::n_out]) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; + (void)ce_reuse_factor; + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in, CONFIG_T::reuse_factor); + CONFIG_T::template product::limit(multiplier_limit); + + typename CONFIG_T::accum_t mult[CONFIG_T::n_in]; + //#pragma HLS ARRAY_PARTITION variable=mult complete + typename CONFIG_T::accum_t acc = 0; + +Product: + for (int i_mult = 0; i_mult < CONFIG_T::n_in; i_mult++) { + // #pragma HLS UNROLL + mult[i_mult] = CONFIG_T::template product::product(data1[i_mult], data2[i_mult]); + } + +Accum: + for (int i_acc = 0; i_acc < CONFIG_T::n_in; i_acc++) { + // #pragma HLS UNROLL + acc += mult[i_acc]; + } + + res[0] = cast(acc); +} + +template +void concatenate1d(input1_T data1[CONFIG_T::n_elem1_0], input2_T data2[CONFIG_T::n_elem2_0], + res_T res[CONFIG_T::n_elem1_0 + CONFIG_T::n_elem2_0]) { + for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { + res[ii] = data1[ii]; + } + for (int ii = 0; ii < CONFIG_T::n_elem2_0; ii++) { + res[CONFIG_T::n_elem1_0 + ii] = data2[ii]; + } +} + +template +void concatenate2d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { + for (int ii = 0; ii < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1; ii++) { + res[ii] = data1[ii]; + } + for (int ii = 0; ii < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1; ii++) { + res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + ii] = data2[ii]; + } +} + +template +void concatenate2d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * 
CONFIG_T::n_elem2_1]) { + for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { + for (int jj = 0; jj < CONFIG_T::n_elem1_1; jj++) { + res[ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + jj] = data1[ii * CONFIG_T::n_elem1_1 + jj]; + } + for (int jj = 0; jj < CONFIG_T::n_elem2_1; jj++) { + res[ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + CONFIG_T::n_elem1_1 + jj] = + data2[ii * CONFIG_T::n_elem2_1 + jj]; + } + } +} + +template +void concatenate2d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { + if (CONFIG_T::axis == 2 || CONFIG_T::axis == -1) { + concatenate2d_1(data1, data2, res); + } else { + concatenate2d_0(data1, data2, res); + } +} + +template +void concatenate3d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + for (int ii = 0; ii < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2; ii++) { + res[ii] = data1[ii]; + } + for (int ii = 0; ii < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2; ii++) { + res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + ii] = data2[ii]; + } +} + +template +void concatenate3d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { + for (int jj = 0; jj < CONFIG_T::n_elem1_1; jj++) { + for (int kk = 0; kk < CONFIG_T::n_elem1_2; kk++) { + int 
res_idx = + ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + jj * CONFIG_T::n_elem1_2 + kk; + int data_idx = ii * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + jj * CONFIG_T::n_elem1_2 + kk; + res[res_idx] = data1[data_idx]; + } + } + for (int jj = 0; jj < CONFIG_T::n_elem2_1; jj++) { + for (int kk = 0; kk < CONFIG_T::n_elem2_2; kk++) { + int res_idx = ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + + (jj + CONFIG_T::n_elem1_1) * CONFIG_T::n_elem1_2 + kk; + int data_idx = ii * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + jj * CONFIG_T::n_elem2_2 + kk; + res[res_idx] = data2[data_idx]; + } + } + } +} + +template +void concatenate3d_2(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { + for (int jj = 0; jj < CONFIG_T::n_elem1_1; jj++) { + for (int kk = 0; kk < CONFIG_T::n_elem1_2; kk++) { + int res_idx = ii * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + + jj * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + kk; + int data_idx = ii * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + jj * CONFIG_T::n_elem1_2 + kk; + res[res_idx] = data1[data_idx]; + } + for (int kk = 0; kk < CONFIG_T::n_elem1_2; kk++) { + int res_idx = ii * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + + jj * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + kk + CONFIG_T::n_elem1_2; + int data_idx = ii * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + jj * CONFIG_T::n_elem2_2 + kk; + res[res_idx] = data2[data_idx]; + } + } + } +} + +template +void concatenate3d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * 
CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + if (CONFIG_T::axis == 3 || CONFIG_T::axis == -1) { + concatenate3d_2(data1, data2, res); + } else if (CONFIG_T::axis == 2 || CONFIG_T::axis == -2) { + concatenate3d_1(data1, data2, res); + } else { + concatenate3d_0(data1, data2, res); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_merge_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_merge_stream.h new file mode 100644 index 0000000000..ef0d542fc0 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_merge_stream.h @@ -0,0 +1,380 @@ + +#ifndef NNET_MERGE_STREAM_H_ +#define NNET_MERGE_STREAM_H_ + +#include "ac_channel.h" +#include "nnet_common.h" +#include + +namespace nnet { + +template +void add(ac_channel &data1, ac_channel &data2, ac_channel &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +AddLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + //#pragma HLS PIPELINE + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + AddPack: + for (int j = 0; j < res_T::size; j++) { + // #pragma HLS UNROLL + out_data[j] = in_data1[j] + in_data2[j]; + } + + res.write(out_data); + } +} + +template +void subtract(ac_channel &data1, ac_channel &data2, ac_channel &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +SubtractLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + //#pragma HLS PIPELINE + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + SubtractPack: + for (int j = 0; j < res_T::size; j++) { + // #pragma HLS UNROLL + out_data[j] = in_data1[j] - in_data2[j]; + } + + res.write(out_data); + } +} + +template 
+void multiply(ac_channel &data1, ac_channel &data2, ac_channel &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; + (void)ce_reuse_factor; +MultiplyLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + MultiplyPack: + for (int j = 0; j < res_T::size; j++) { + // #pragma HLS UNROLL + out_data[j] = in_data1[j] * in_data2[j]; + } + + res.write(out_data); + } +} + +template +void average(ac_channel &data1, ac_channel &data2, ac_channel &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; + (void)ce_reuse_factor; +AverageLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + AveragePack: + for (int j = 0; j < res_T::size; j++) { + // #pragma HLS UNROLL + out_data[j] = (in_data1[j] + in_data2[j]) / (typename res_T::value_type)2; + } + + res.write(out_data); + } +} + +template +void maximum(ac_channel &data1, ac_channel &data2, ac_channel &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; + (void)ce_reuse_factor; +MaximumLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + MaximumPack: + for (int j = 0; j < res_T::size; j++) { + // #pragma HLS UNROLL + out_data[j] = (in_data1[j] > 
in_data2[j]) ? in_data1[j] : in_data2[j]; + } + + res.write(out_data); + } +} + +template +void minimum(ac_channel &data1, ac_channel &data2, ac_channel &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; + (void)ce_reuse_factor; +MinimumLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + MinimumPack: + for (int j = 0; j < res_T::size; j++) { + // #pragma HLS UNROLL + out_data[j] = (in_data1[j] < in_data2[j]) ? in_data1[j] : in_data2[j]; + } + + res.write(out_data); + } +} + +template +void concatenate3d_0(ac_channel &data1, ac_channel &data2, ac_channel &res) { +ConcatLoopHeight1: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + ConcatLoopWidth1: + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + //#pragma HLS PIPELINE II=1 + + input1_T in_data1 = data1.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + ConcatPackInput1: + for (int k = 0; k < input1_T::size; k++) { + // #pragma HLS UNROLL + out_data[k] = in_data1[k]; + } + + res.write(out_data); + } + } +ConcatLoopHeight2: + for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { + ConcatLoopWidth2: + for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { + //#pragma HLS PIPELINE II=1 + + input2_T in_data2 = data2.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + ConcatPackInput2: + for (int k = 0; k < input2_T::size; k++) { + // #pragma HLS UNROLL + out_data[k] = in_data2[k]; + } + + res.write(out_data); + } + } +} + +template +void concatenate3d_1(ac_channel &data1, ac_channel &data2, ac_channel &res) { +ConcatLoopHeight: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + ConcatLoopWidth1: + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + //#pragma HLS PIPELINE II=1 + + 
input1_T in_data1 = data1.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + ConcatPackInput1: + for (int k = 0; k < input1_T::size; k++) { + // #pragma HLS UNROLL + out_data[k] = in_data1[k]; + } + + res.write(out_data); + } + ConcatLoopWidth2: + for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { + //#pragma HLS PIPELINE II=1 + + input2_T in_data2 = data2.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + ConcatPackInput2: + for (int k = 0; k < input2_T::size; k++) { + // #pragma HLS UNROLL + out_data[k] = in_data2[k]; + } + + res.write(out_data); + } + } +} + +template +void concatenate3d_2(ac_channel &data1, ac_channel &data2, ac_channel &res) { +ConcatLoopHeight: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + ConcatLoopWidth: + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + //#pragma HLS PIPELINE II=1 + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + ConcatPackInput1: + for (int k = 0; k < input1_T::size; k++) { + // #pragma HLS UNROLL + out_data[k] = in_data1[k]; + } + + ConcatPackInput2: + for (int k = 0; k < input2_T::size; k++) { + // #pragma HLS UNROLL + out_data[input1_T::size + k] = in_data2[k]; + } + + res.write(out_data); + } + } +} + +template +void concatenate3d(ac_channel &data1, ac_channel &data2, ac_channel &res) { + if (CONFIG_T::axis == 3 || CONFIG_T::axis == -1) { + concatenate3d_2(data1, data2, res); + } else if (CONFIG_T::axis == 2 || CONFIG_T::axis == -2) { + concatenate3d_1(data1, data2, res); + } else { + concatenate3d_0(data1, data2, res); + } +} + +template +void concatenate2d_0(ac_channel &data1, ac_channel &data2, ac_channel &res) { +ConcatLoopHeight1: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + // pragma HLS PIPELINE II=1 + + input1_T in_data1 = data1.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + ConcatPackInput1: + for (int k = 0; k < input1_T::size; k++) { + 
// #pragma HLS UNROLL + out_data[k] = in_data1[k]; + } + + res.write(out_data); + } +ConcatLoopHeight2: + for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { + //#pragma HLS PIPELINE II=1 + + input2_T in_data2 = data2.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + ConcatPackInput2: + for (int k = 0; k < input2_T::size; k++) { + // #pragma HLS UNROLL + out_data[k] = in_data2[k]; + } + + res.write(out_data); + } +} + +template +void concatenate2d_1(ac_channel &data1, ac_channel &data2, ac_channel &res) { +ConcatLoopHeight: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + //#pragma HLS PIPELINE II=1 + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + ConcatPackInput1: + for (int k = 0; k < input1_T::size; k++) { + // #pragma HLS UNROLL + out_data[k] = in_data1[k]; + } + + ConcatPackInput2: + for (int k = 0; k < input2_T::size; k++) { + // #pragma HLS UNROLL + out_data[input1_T::size + k] = in_data2[k]; + } + + res.write(out_data); + } +} + +template +void concatenate2d(ac_channel &data1, ac_channel &data2, ac_channel &res) { + if (CONFIG_T::axis == 2 || CONFIG_T::axis == -1) { + concatenate2d_1(data1, data2, res); + } else { + concatenate2d_0(data1, data2, res); + } +} + +template +void concatenate1d(ac_channel &data1, ac_channel &data2, ac_channel &res) { + res_T out_data; +//#pragma HLS DATA_PACK variable=out_data +ConcatLoop1: + for (int i = 0; i < CONFIG_T::n_elem1_0 / input1_T::size; i++) { + //#pragma HLS PIPELINE + input1_T in_data1 = data1.read(); + ConcatPack1: + for (int j = 0; j < res_T::size; j++) { + // #pragma HLS UNROLL + out_data[j] = in_data1[j]; + } + res.write(out_data); + } +ConcatLoop2: + for (int i = 0; i < CONFIG_T::n_elem2_0 / input2_T::size; i++) { + //#pragma HLS PIPELINE + input2_T in_data2 = data2.read(); + ConcatPack2: + for (int j = 0; j < res_T::size; j++) { + // #pragma HLS UNROLL + out_data[j] = in_data2[j]; + } + 
res.write(out_data); + } +} +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_mult.h b/hls4ml/templates/catapult/nnet_utils/nnet_mult.h new file mode 100755 index 0000000000..7379eec489 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_mult.h @@ -0,0 +1,127 @@ +#ifndef NNET_MULT_H_ +#define NNET_MULT_H_ + +#include "ac_channel.h" +#include "nnet_common.h" +#include "nnet_helpers.h" +#include +#include + +namespace nnet { + +namespace product { + +/* --- + * different methods to perform the product of input and weight, depending on the + * types of each. + * --- */ + +class Product { + public: + static void limit(unsigned multiplier_limit) {} // Nothing to do here +}; + +template class both_binary : public Product { + public: + static x_T product(x_T a, w_T w) { + // specialisation for 1-bit weights and incoming data + //#pragma HLS INLINE + return a == w; + } +}; + +template class weight_binary : public Product { + public: + static auto product(x_T a, w_T w) -> decltype(-a) { + // Specialisation for 1-bit weights, arbitrary data + //#pragma HLS INLINE + if (w == 0) + return -a; + else + return a; + } +}; + +template class data_binary : public Product { + public: + static auto product(x_T a, w_T w) -> decltype(-w) { + // Specialisation for 1-bit data, arbitrary weight + //#pragma HLS INLINE + if (a == 0) + return -w; + else + return w; + } +}; + +template class weight_ternary : public Product { + public: + static auto product(x_T a, w_T w) -> decltype(-a) { + // Specialisation for 2-bit weights, arbitrary data + //#pragma HLS INLINE + if (w == 0) + return 0; + else if (w == -1) + return -a; + else + return a; // if(w == 1) + } +}; + +template class mult : public Product { + public: + static auto product(x_T a, w_T w) -> decltype(a * w) { + // 'Normal' product + //#pragma HLS INLINE + return a * w; + } + static void limit(unsigned multiplier_limit) { + //#pragma HLS INLINE + //#pragma HLS ALLOCATION instances=mul 
limit=multiplier_limit operation + } +}; + +template class weight_exponential : public Product { + public: + // Construct the return type from the multiplication equivalent to the largest shifts + // ap_int is the type if the multiplicand equivalent to the largest lshift << + // ap_fixed is the type of the multiplicand equivalent to the largest rshift >> + using r_T = decltype(x_T(0) * (ac_int(1) + + ac_fixed(1))); + static r_T product(x_T a, w_T w) { + // Shift product for exponential weights + //#pragma HLS INLINE + // shift by the exponent. Negative weights shift right + r_T y = static_cast(a) << w.weight; + // negate or not depending on weight sign + return w.sign == 1 ? y : static_cast(-y); + } +}; + +} // namespace product + +template +inline typename std::enable_if>::value && + std::is_same>::value, + ac_int>::type +cast(typename CONFIG_T::accum_t x) { + return (ac_int)(x - CONFIG_T::n_in / 2) * 2; +} + +template +inline typename std::enable_if>::value && + !std::is_same>::value, + res_T>::type +cast(typename CONFIG_T::accum_t x) { + return (res_T)x; +} + +template +inline typename std::enable_if<(!std::is_same>::value), res_T>::type +cast(typename CONFIG_T::accum_t x) { + return (res_T)x; +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_padding.h b/hls4ml/templates/catapult/nnet_utils/nnet_padding.h new file mode 100755 index 0000000000..47986523fb --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_padding.h @@ -0,0 +1,145 @@ +#ifndef NNET_PADDING_H_ +#define NNET_PADDING_H_ + +#include + +namespace nnet { + +struct padding1d_config { + static const unsigned n_chan = 10; + static const unsigned in_width = 10; + static const unsigned out_width = 10; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; +}; + +template +void zeropad1d_cf(data_T data[CONFIG_T::n_chan * CONFIG_T::in_width], data_T res[CONFIG_T::n_chan * CONFIG_T::out_width]) { + //#pragma HLS PIPELINE + + for (int j = 
0; j < CONFIG_T::n_chan; j++) { + for (int i = 0; i < CONFIG_T::pad_left; i++) { + *(res++) = 0; + } + + for (int i = 0; i < CONFIG_T::in_width; i++) { + *(res++) = (res_T) * (data++); + } + + for (int i = 0; i < CONFIG_T::pad_right; i++) { + *(res++) = 0; + } + } +} + +template +void zeropad1d_cl(data_T data[CONFIG_T::n_chan * CONFIG_T::in_width], res_T res[CONFIG_T::n_chan * CONFIG_T::out_width]) { + //#pragma HLS PIPELINE + + for (int i = 0; i < CONFIG_T::pad_left; i++) { + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(res++) = 0; + } + } + + for (int i = 0; i < CONFIG_T::in_width; i++) { + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(res++) = (res_T) * (data++); + } + } + + for (int i = 0; i < CONFIG_T::pad_right; i++) { + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(res++) = 0; + } + } +} + +struct padding2d_config { + static const unsigned n_chan = 10; + static const unsigned in_height = 10; + static const unsigned in_width = 10; + static const unsigned out_height = 10; + static const unsigned out_width = 10; + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; +}; + +template +void zeropad2d_cf(data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width], + data_T res[CONFIG_T::n_chan * CONFIG_T::out_height * CONFIG_T::out_width]) { + //#pragma HLS PIPELINE + + for (int k = 0; k < CONFIG_T::n_chan; k++) { + + for (int i = 0; i < CONFIG_T::pad_top; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + *(res++) = 0; + } + } + + for (int i = 0; i < CONFIG_T::in_height; i++) { + for (int j = 0; j < CONFIG_T::pad_left; j++) { + *(res++) = 0; + } + for (int j = 0; j < CONFIG_T::in_width; j++) { + *(res++) = (res_T) * (data++); + } + for (int j = 0; j < CONFIG_T::pad_right; j++) { + *(res++) = 0; + } + } + + for (int i = 0; i < CONFIG_T::pad_bottom; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + *(res++) = 0; + } + } + } +} + 
+template +void zeropad2d_cl(data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width], + res_T res[CONFIG_T::n_chan * CONFIG_T::out_height * CONFIG_T::out_width]) { + //#pragma HLS PIPELINE + + for (int i = 0; i < CONFIG_T::pad_top; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + } + + for (int i = 0; i < CONFIG_T::in_height; i++) { + for (int j = 0; j < CONFIG_T::pad_left; j++) { + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + for (int j = 0; j < CONFIG_T::in_width; j++) { + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = (res_T) * (data++); + } + } + for (int j = 0; j < CONFIG_T::pad_right; j++) { + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + } + + for (int i = 0; i < CONFIG_T::pad_bottom; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_padding_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_padding_stream.h new file mode 100644 index 0000000000..9c11683746 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_padding_stream.h @@ -0,0 +1,95 @@ +#ifndef NNET_PADDING_STREAM_H_ +#define NNET_PADDING_STREAM_H_ + +#include + +namespace nnet { + +template void fill_zero(ac_channel &res) { + //#pragma HLS INLINE + res_T res_part; + for (unsigned int c = 0; c < CONFIG_T::n_chan; c++) { + //#pragma HLS UNROLL + res_part[c] = 0; + } + res.write(res_part); +} + +template void fill_data(ac_channel &data, ac_channel &res) { + //#pragma HLS INLINE + data_T data_part = data.read(); + res_T res_part; + for (unsigned int c = 0; c < CONFIG_T::n_chan; c++) { + //#pragma HLS UNROLL + res_part[c] = data_part[c]; + } + res.write(res_part); +} + +template void zeropad1d_cl(ac_channel &data, ac_channel &res) { +PadLeft: + for (int i = 0; i < 
CONFIG_T::pad_left; i++) { + fill_zero(res); + } + +CopyMain: + for (int i = 0; i < CONFIG_T::in_width; i++) { + fill_data(data, res); + } + +PadRight: + for (int i = 0; i < CONFIG_T::pad_right; i++) { + fill_zero(res); + } +} + +// Description: +// apply zero padding to input feature data "data" based on +// padding parameters in CONFIG_T +// +// CONFIG_T::pad_top +// CONFIG_T::pad_left "data" CONFIG_T::pad_right +// CONFIG_T::pad_bottom +// +// Template Params: +// data_T - typically nnet::array< ac_fixed<>, 3*1> (see myproject.cpp -> firmware/defines.h) +// res_T - typically nnet::array< ac_fixed<>, 3*1> + +template void zeropad2d_cl(ac_channel &data, ac_channel &res) { + +PadTop: + for (unsigned i = 0; i < CONFIG_T::pad_top; i++) { + PadTopWidth: + for (unsigned j = 0; j < CONFIG_T::out_width; j++) { + fill_zero(res); + } + } + +PadMain: + for (unsigned i = 0; i < CONFIG_T::in_height; i++) { + PadLeft: + for (unsigned j = 0; j < CONFIG_T::pad_left; j++) { + fill_zero(res); + } + CopyMain: + for (unsigned j = 0; j < CONFIG_T::in_width; j++) { + fill_data(data, res); + } + PadRight: + for (unsigned j = 0; j < CONFIG_T::pad_right; j++) { + fill_zero(res); + } + } + +PadBottom: + for (unsigned i = 0; i < CONFIG_T::pad_bottom; i++) { + PadBottomWidth: + for (unsigned j = 0; j < CONFIG_T::out_width; j++) { + fill_zero(res); + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_pooling.h b/hls4ml/templates/catapult/nnet_utils/nnet_pooling.h new file mode 100644 index 0000000000..82e281023b --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_pooling.h @@ -0,0 +1,362 @@ +#ifndef NNET_POOLING_H_ +#define NNET_POOLING_H_ + +#include "nnet_helpers.h" +#include + +namespace nnet { + +// Return the maximum value from an array +template T max(T x[N]) { + T y = x[0]; + for (int i = 1; i < N; i++) { + y = x[i] > y ? 
x[i] : y; + } + return y; +} + +template ac_int avg(ac_int (&x)[N]) { + // Use a wider accumulator than the input to avoid overflow + ac_int tmp = 0; + for (int i = 0; i < N; i++) { + tmp += x[i]; + } + tmp /= N; + // Now cast back to original type + ac_int y = tmp; + return tmp; +} + +template ac_fixed avg(ac_fixed (&x)[N]) { + // Use a wider accumulator than the input to avoid overflow + ac_fixed tmp = 0; + for (int i = 0; i < N; i++) { + tmp += x[i]; + } + tmp /= N; + // Now cast back to original type + ac_fixed y = tmp; + return y; +} + +// Return the mean value of an array +template T avg(T (&x)[N]) { + T y = 0; + for (int i = 0; i < N; i++) { + y += x[i]; + } + y /= N; + return y; +} + +// Enumeration for pooling operation (max, avg, l2norm pooling) +enum Pool_Op { Max, Average }; // L2Norm }; +template T pool_op(T (&x)[N]) { + switch (op) { + case Max: + return max(x); + case Average: + return avg(x); + // case L2Norm: return l2norm(x); + } +} + +template T pad_val() { + /*--- + *- In Tensorflow, pooling ignores the value in the padded cells + *- For Avg pooling, return 0 (the divisior is modified to the + *- area overlapping the unpadded image. + *- For max pooling, return the most negative value for the type. 
+ *- TODO this is not really generic, it assumes fixed point or integer T + ---*/ + switch (op) { + case Max: { + T x = 0; + x[x.width - 1] = 1; + return x; + break; + } + case Average: + return 0; + } +} + +struct pooling1d_config { + // IO size + static const unsigned n_in = 10; + static const unsigned pool_width = 2; + static const unsigned stride_width = 2; + static const unsigned n_out = (n_in - pool_width) / stride_width + 1; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const bool count_pad = false; + // Pooling function + static const Pool_Op pool_op = Max; +}; + +template constexpr int pool_op_limit_1d() { + return CONFIG_T::n_in * CONFIG_T::n_filt / CONFIG_T::reuse_factor; +} + +template +void pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T res[CONFIG_T::n_out * CONFIG_T::n_filt]) { + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; + (void)ce_reuse_factor; + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit_1d(); + #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit + // Add any necessary padding + unsigned padded_width = CONFIG_T::n_in + CONFIG_T::pad_left + CONFIG_T::pad_right; + if (CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) { + padded_width -= padded_width - (padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width); + } + + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Loop over input image x in steps of stride + for (int ii = 0; ii < padded_width; ii += CONFIG_T::stride_width) { + data_T pool[CONFIG_T::pool_width]; + #pragma HLS ARRAY_PARTITION variable=pool complete dim=0 + // Keep track of number of pixels in image vs padding region + unsigned img_overlap = 0; + // Loop over pool window x + for (int jj = 0; jj < CONFIG_T::stride_width; jj++) { + if (ii + jj < CONFIG_T::pad_left || ii + jj >= (padded_width - CONFIG_T::pad_right)) { + // Add 
padding + pool[jj] = pad_val(); + if (CONFIG_T::count_pad) { + img_overlap++; + } + } else { + pool[jj] = data[(ii + jj - CONFIG_T::pad_left) * CONFIG_T::n_filt + ff]; + img_overlap++; + } + } + // do the pooling + // TODO in the case of average pooling, need to reduce width to area of pool window + // not overlapping padding region + res[(ii / CONFIG_T::stride_width) * CONFIG_T::n_filt + ff] = + pool_op(pool); + // If the pool op is Average, the zero-padding needs to be removed from the results + if (CONFIG_T::pool_op == Average) { + data_T rescale = static_cast(CONFIG_T::pool_width) / img_overlap; + res[(ii / CONFIG_T::stride_width) * CONFIG_T::n_filt + ff] *= rescale; + } + } + } +} + +template +void global_pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T res[CONFIG_T::n_filt]) { + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; + (void)ce_reuse_factor; + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit_1d(); + #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit + + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + data_T pool[CONFIG_T::n_in]; + #pragma HLS ARRAY_PARTITION variable=pool complete dim=0 + for (int jj = 0; jj < CONFIG_T::n_in; jj++) { + pool[jj] = data[jj * CONFIG_T::n_filt + ff]; + } + // do the pooling + res[ff] = pool_op(pool); + } +} + +struct pooling2d_config { + // IO size + static const unsigned in_height = 10; + static const unsigned in_width = 10; + static const unsigned n_filt = 4; + static const unsigned stride_height = 2; + static const unsigned stride_width = 2; + static const unsigned pool_height = 2; + static const unsigned pool_width = 2; + static const unsigned out_height = (in_height - pool_height) / stride_height + 1; + static const unsigned out_width = (in_width - 
pool_width) / stride_width + 1; + // Padding + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const bool count_pad = false; + // Pooling function + static const Pool_Op pool_op = Max; + // Reuse factor + static const unsigned reuse_factor = 1; + + // Internal data type definitions + typedef float accum_t; +}; + +template constexpr int pool_op_limit() { + return (CONFIG_T::out_height * CONFIG_T::out_width) * CONFIG_T::n_filt / CONFIG_T::reuse_factor; +} + +template +void pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]) { + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; + (void)ce_reuse_factor; + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit(); + #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit + // Add any necessary padding + unsigned padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; + unsigned padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; + if (CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) { + padded_height -= padded_height - (padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height); + padded_width -= padded_width - (padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width); + } + + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Loop over input image y in steps of stride + for (int ii = 0; ii < padded_height; ii += CONFIG_T::stride_height) { + // Loop over input image x in steps of stride + for (int jj = 0; jj < padded_width; jj += CONFIG_T::stride_width) { + data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; + #pragma HLS ARRAY_PARTITION variable=pool complete 
dim=0 + // Keep track of number of pixels in image vs padding region + unsigned img_overlap = 0; + // Loop over pool window y + for (int kk = 0; kk < CONFIG_T::stride_height; kk++) { + // Loop over pool window x + for (int ll = 0; ll < CONFIG_T::stride_width; ll++) { + if (ii + kk < CONFIG_T::pad_top || ii + kk >= (padded_height - CONFIG_T::pad_bottom) || + jj + ll < CONFIG_T::pad_left || jj + ll >= (padded_width - CONFIG_T::pad_right)) { + // Add padding + pool[kk * CONFIG_T::stride_width + ll] = pad_val(); + if (CONFIG_T::count_pad) { + img_overlap++; + } + } else { + pool[kk * CONFIG_T::stride_width + ll] = + data[(ii + kk - CONFIG_T::pad_top) * CONFIG_T::in_width * CONFIG_T::n_filt + + (jj + ll - CONFIG_T::pad_left) * CONFIG_T::n_filt + ff]; + img_overlap++; + } + } + } + // do the pooling + // TODO in the case of average pooling, need to reduce height * width to area of pool window + // not overlapping padding region + res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width * CONFIG_T::n_filt + + (jj / CONFIG_T::stride_width) * CONFIG_T::n_filt + ff] = + pool_op(pool); + // If the pool op is Average, the zero-padding needs to be removed from the results + if (CONFIG_T::pool_op == Average) { + data_T rescale = + static_cast(CONFIG_T::pool_height) * static_cast(CONFIG_T::pool_width) / img_overlap; + res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width * CONFIG_T::n_filt + + (jj / CONFIG_T::stride_width) * CONFIG_T::n_filt + ff] *= rescale; + } + } + } + } +} + +template +void pooling2d_cf(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]) { + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; + (void)ce_reuse_factor; + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit(); + #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit + // Add any 
necessary padding + unsigned padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; + unsigned padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; + if (CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) { + padded_height -= padded_height - (padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height); + padded_width -= padded_width - (padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width); + } + + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Loop over input image y in steps of stride + for (int ii = 0; ii < padded_height; ii += CONFIG_T::stride_height) { + // Loop over input image x in steps of stride + for (int jj = 0; jj < padded_width; jj += CONFIG_T::stride_width) { + data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; + #pragma HLS ARRAY_PARTITION variable=pool complete dim=0 + // Keep track of number of pixels in image vs padding region + unsigned img_overlap = 0; + // Loop over pool window y + for (int kk = 0; kk < CONFIG_T::stride_height; kk++) { + // Loop over pool window x + for (int ll = 0; ll < CONFIG_T::stride_width; ll++) { + if (ii + kk < CONFIG_T::pad_top || ii + kk >= (padded_height - CONFIG_T::pad_bottom) || + jj + ll < CONFIG_T::pad_left || jj + ll >= (padded_width - CONFIG_T::pad_right)) { + // Add padding + pool[kk * CONFIG_T::stride_width + ll] = pad_val(); + if (CONFIG_T::count_pad) { + img_overlap++; + } + } else { + pool[kk * CONFIG_T::stride_width + ll] = + data[(ii + kk - CONFIG_T::pad_top) * CONFIG_T::in_width + + ff * CONFIG_T::in_width * CONFIG_T::in_height + ll + jj - CONFIG_T::pad_left]; + img_overlap++; + } + } + } + // do the pooling + // TODO in the case of average pooling, need to reduce height * width to area of pool window + // not overlapping padding region + res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width + (jj / CONFIG_T::stride_width) + + ff * CONFIG_T::out_height * 
CONFIG_T::out_width] = + pool_op(pool); + // If the pool op is Average, the zero-padding needs to be removed from the results + if (CONFIG_T::pool_op == Average) { + data_T rescale = + static_cast(CONFIG_T::pool_height) * static_cast(CONFIG_T::pool_width) / img_overlap; + res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width + (jj / CONFIG_T::stride_width) + + ff * CONFIG_T::out_height * CONFIG_T::out_width] *= rescale; + } + } + } + } +} + +template +void global_pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], + res_T res[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0); + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + assert(CONFIG_T::pool_height == CONFIG_T::stride_height); + + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; + (void)ce_reuse_factor; + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + const int limit = pool_op_limit(); + #pragma HLS ALLOCATION instances=pool_op limit=limit function + +FiltLoop: + for (int filt = 0; filt < CONFIG_T::n_filt; filt++) { + data_T pool[CONFIG_T::in_height * CONFIG_T::in_width]; + + InputLoop: + for (int i = 0; i < CONFIG_T::in_height * CONFIG_T::in_width; i++) { + pool[i] = data[i * CONFIG_T::n_filt + filt]; + } + + res[filt] = static_cast(pool_op(pool)); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_pooling_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_pooling_stream.h new file mode 100644 index 0000000000..051a27a54b --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_pooling_stream.h @@ -0,0 +1,601 @@ +#ifndef NNET_POOLING_STREAM_H_ +#define NNET_POOLING_STREAM_H_ + +// #include "utils/x_hls_utils.h" +#include "ac_channel.h" +#include "ap_shift_reg.h" +#include "nnet_common.h" +#include "nnet_conv_stream.h" +#include "nnet_pooling.h" + +namespace nnet { + +// 
************************************************* +// Max/average pooling +// ************************************************* + +template T reduce_pool(T x[N]) { + //#pragma HLS INLINE + if (CONFIG_T::pool_op == Max) { + Op_max op_max; + return reduce>(x, op_max); + } else { + Op_add op_add; + T sum = reduce>(x, op_add); + return sum / N; + } +} + +template void init_pool_table(unsigned table[TABLE_SIZE]) { + for (unsigned ii = 0; ii < TABLE_SIZE; ii++) { + table[ii] = ii % POOL_SIZE; + } +} + +template +void compute_pool_encoded_2d( + const unsigned h_idx, const unsigned w_idx, const data_T &in_elem, + ac_channel data_window[CONFIG_T::pool_height * CONFIG_T::pool_width * CONFIG_T::n_filt], + ac_channel &res, res_T &res_pack, unsigned &outputs_ready) { + // Nearest H without unused pixels on the right + constexpr unsigned nH = + ((CONFIG_T::in_height - CONFIG_T::pool_height) / CONFIG_T::stride_height) * CONFIG_T::stride_height + + CONFIG_T::pool_height; + // Scaled H that behaves like original H + constexpr unsigned sH = + (DIV_ROUNDUP(CONFIG_T::pool_height, CONFIG_T::stride_height) - 1) * CONFIG_T::stride_height + CONFIG_T::pool_height; + // Nearest W without unused pixels on the right + constexpr unsigned nW = ((CONFIG_T::in_width - CONFIG_T::pool_width) / CONFIG_T::stride_width) * CONFIG_T::stride_width + + CONFIG_T::pool_width; + // Scaled W that behaves like original W + constexpr unsigned sW = + (DIV_ROUNDUP(CONFIG_T::pool_width, CONFIG_T::stride_width) - 1) * CONFIG_T::stride_width + CONFIG_T::pool_width; + +#ifdef __SYNTHESIS__ + bool initialized = false; + unsigned pool_table_height[CONFIG_T::in_height]; + unsigned pool_table_width[CONFIG_T::in_width]; +#else + static bool initialized = false; + static unsigned pool_table_height[CONFIG_T::in_height]; + static unsigned pool_table_width[CONFIG_T::in_width]; +#endif + if (!initialized) { + init_pool_table(pool_table_height); + init_pool_table(pool_table_width); + initialized = true; + } + + //#pragma HLS 
INLINE + + if (data_T::size / CONFIG_T::n_filt > 1) { + //#pragma HLS ARRAY_PARTITION variable=pool_table_height complete + //#pragma HLS ARRAY_PARTITION variable=pool_table_width complete + } + + typename CONFIG_T::accum_t pool_window[CONFIG_T::pool_height * CONFIG_T::pool_width]; + //#pragma HLS ARRAY_PARTITION variable=pool_window complete + + const unsigned sh_idx = pool_table_height[h_idx] * CONFIG_T::pool_width; + const unsigned wp_idx = w_idx * (data_T::size / CONFIG_T::n_filt); +PixelLoop: + for (unsigned p = 0; p < data_T::size / CONFIG_T::n_filt; p++) { + //#pragma HLS PIPELINE + + ac_int filt_mask = 0; + if ((h_idx < nH) && (wp_idx + p < nW)) { + filt_mask = sh_idx + pool_table_width[wp_idx + p] + 1; + } + CopyDataFilt: + for (unsigned c = 0; c < CONFIG_T::n_filt; c++) { + if (filt_mask > 0) + data_window[c * CONFIG_T::pool_height * CONFIG_T::pool_width + filt_mask.to_uint() - 1].write( + in_elem[p * CONFIG_T::n_filt + c]); + } + + if (filt_mask == CONFIG_T::pool_height * CONFIG_T::pool_width) { + FiltLoop: + for (unsigned c = 0; c < CONFIG_T::n_filt; c++) { + PoolLoop: + for (unsigned f = 0; f < CONFIG_T::pool_height * CONFIG_T::pool_width; f++) { + pool_window[f] = data_window[c * CONFIG_T::pool_height * CONFIG_T::pool_width + f].read(); + } + if (res_T::size / CONFIG_T::n_filt == + 1) { // Saves resources if we don't pack output, compiler will remove the else branch + res_pack[c] = + reduce_pool( + pool_window); + } else { + res_pack[outputs_ready * CONFIG_T::n_filt + c] = + reduce_pool( + pool_window); + } + } + if (res_T::size / CONFIG_T::n_filt == + 1) { // Saves resources if we don't pack output, compiler will remove the else branch + res.write(res_pack); + } else { + if (outputs_ready == (res_T::size / CONFIG_T::n_filt) - 1) { + res.write(res_pack); + outputs_ready = 0; + } else { + outputs_ready++; + } + } + } + } +} + +template +void pooling2d_encoded_cl(ac_channel &data, ac_channel &res) { + assert(CONFIG_T::pad_top == 0 && 
CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_height == CONFIG_T::stride_height && CONFIG_T::pool_width == CONFIG_T::stride_width); + + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + unsigned outputs_ready = 0; + + static ac_channel + data_window[CONFIG_T::pool_height * CONFIG_T::pool_width * CONFIG_T::n_filt]; + // constexpr int win_depth = CONFIG_T::pool_height * CONFIG_T::out_width; + // for (unsigned i_out = 0; i_out < CONFIG_T::pool_height * CONFIG_T::pool_width * CONFIG_T::n_filt; i_out++) { + // #pragma HLS STREAM variable=data_window[i_out] depth=win_depth + // } + + constexpr int pack_factor = (data_T::size / CONFIG_T::n_filt) * (res_T::size / CONFIG_T::n_filt == 1); + (void)pack_factor; +ReadInputHeight: + for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (pack_factor); i_iw++) { + //#pragma HLS LOOP_FLATTEN + if (res_T::size / CONFIG_T::n_filt == 1) { + //#pragma HLS PIPELINE II=pack_factor + } + compute_pool_encoded_2d(i_ih, i_iw, data.read(), data_window, res, res_pack, + outputs_ready); + } + } +} + +// ************************************************* +// Line Buffer Implementation (Phil's) +// ************************************************* +template +void compute_pool_buffer_2d(const data_T &in_elem, + ap_shift_reg + line_buffer[MAX(CONFIG_T::pool_height - 1, 1)][CONFIG_T::n_filt], + ac_channel &res) { + //#pragma HLS INLINE + const static int lShiftX = CONFIG_T::pool_width - 1; + const static int lShiftY = CONFIG_T::pool_height - 1; + static int pX = 0; // pixel X + static int pY = 0; // pixel Y + static int sX = 0; // stride X + static int sY = 0; // stride Y + + typename data_T::value_type pool_window[CONFIG_T::pool_height * CONFIG_T::pool_width]; + //#pragma HLS ARRAY_PARTITION variable=pool_window complete + + static typename data_T::value_type kernel_data[CONFIG_T::pool_height * 
CONFIG_T::pool_width * CONFIG_T::n_filt]; + //#pragma HLS ARRAY_PARTITION variable = kernel_data complete dim = 0 + + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + + // Add pixel into line buffer, return pooling kernels + nnet::shift_line_buffer(in_elem, line_buffer, kernel_data); + + // Can compute pooling output + if ((sX - lShiftX) == 0 && (sY - lShiftY) == 0 && pY > lShiftY - 1 && pX > lShiftX - 1) { + FiltLoop: + for (unsigned i_ic = 0; i_ic < CONFIG_T::n_filt; i_ic++) { + //#pragma HLS PIPELINE + + // Retrieve data for current channel + PoolLoop: + for (unsigned i_ihw = 0; i_ihw < CONFIG_T::pool_height * CONFIG_T::pool_width; i_ihw++) { + pool_window[i_ihw] = kernel_data[i_ihw * CONFIG_T::n_filt + i_ic]; + } + + // Compute Pooling + res_pack[i_ic] = + reduce_pool( + pool_window); + } + + // Write to output + res.write(res_pack); + } + + // Counter Housekeeping + if (pX + 1 == CONFIG_T::in_width) // Includes padding, end of line (padded) + { + pX = 0; + sX = 0; + if (pY + 1 == CONFIG_T::in_height) { // Reached bottom of image + pY = 0; + sY = 0; + } else { // Next line + pY = pY + 1; + // Update stride (threshold) ? subtract stride : increment stride + sY = ((sY - lShiftY) == 0) ? sY - CONFIG_T::stride_height + 1 : sY + 1; + } + } else { + pX = pX + 1; + // Update stride (threshold) ? subtract stride : increment stride + sX = ((sX - lShiftX) == 0) ? 
sX - CONFIG_T::stride_width + 1 : sX + 1; + } +} + +template +void pooling2d_buffer_cl(ac_channel &data, ac_channel &res) { + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_height == CONFIG_T::stride_height && CONFIG_T::pool_width == CONFIG_T::stride_width); + + static ap_shift_reg line_buffer[MAX(CONFIG_T::pool_height - 1, 1)] + [CONFIG_T::n_filt]; + //#pragma HLS ARRAY_PARTITION variable = line_buffer complete dim = 2 + +ReadInputHeight: + for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + //#pragma HLS LOOP_FLATTEN + //#pragma HLS PIPELINE + + compute_pool_buffer_2d(data.read(), line_buffer, res); + } + } +} + +template void pooling2d_cl(ac_channel &data, ac_channel &res) { + //#pragma HLS inline region + switch (CONFIG_T::implementation) { + case conv_implementation::linebuffer: + pooling2d_buffer_cl(data, res); + break; + case conv_implementation::encoded: + pooling2d_encoded_cl(data, res); + break; + } +} + +// ************************************************* +// Pooling 1D +// ************************************************* + +template +void compute_pool_encoded_1d(const unsigned w_idx, const data_T &in_elem, + ac_channel data_window[CONFIG_T::pool_width * CONFIG_T::n_filt], + ac_channel &res, res_T &res_pack, unsigned &outputs_ready) { + // Nearest W without unused pixels on the right + constexpr unsigned nW = + ((CONFIG_T::n_in - CONFIG_T::pool_width) / CONFIG_T::stride_width) * CONFIG_T::stride_width + CONFIG_T::pool_width; + // Scaled W that behaves like original W + constexpr unsigned sW = + (DIV_ROUNDUP(CONFIG_T::pool_width, CONFIG_T::stride_width) - 1) * CONFIG_T::stride_width + CONFIG_T::pool_width; + +#ifdef __SYNTHESIS__ + bool initialized = false; + unsigned pool_table_width[CONFIG_T::n_in]; +#else + static bool initialized = false; + static unsigned 
pool_table_width[CONFIG_T::n_in]; +#endif + if (!initialized) { + init_pool_table(pool_table_width); + initialized = true; + } + + //#pragma HLS INLINE + + if (data_T::size / CONFIG_T::n_filt > 1) { + //#pragma HLS ARRAY_PARTITION variable=pool_table_width complete + } + + typename CONFIG_T::accum_t pool_window[CONFIG_T::pool_width]; + //#pragma HLS ARRAY_PARTITION variable=pool_window complete + + const unsigned wp_idx = w_idx * (data_T::size / CONFIG_T::n_filt); + +PixelLoop: + for (unsigned p = 0; p < data_T::size / CONFIG_T::n_filt; p++) { + //#pragma HLS PIPELINE + + ac_int filt_mask = 0; + if (wp_idx + p < nW) { + filt_mask = pool_table_width[wp_idx + p] + 1; + } + + CopyDataFilt: + for (unsigned c = 0; c < CONFIG_T::n_filt; c++) { + if (filt_mask > 0) + data_window[c * CONFIG_T::pool_width + filt_mask.to_uint() - 1].write(in_elem[p * CONFIG_T::n_filt + c]); + } + + if (filt_mask == CONFIG_T::pool_width) { + FiltLoop: + for (unsigned c = 0; c < CONFIG_T::n_filt; c++) { + PoolLoop: + for (unsigned f = 0; f < CONFIG_T::pool_width; f++) { + pool_window[f] = data_window[c * CONFIG_T::pool_width + f].read(); + } + if (res_T::size / CONFIG_T::n_filt == + 1) { // Saves resources if we don't pack output, compiler will remove the else branch + res_pack[c] = reduce_pool(pool_window); + } else { + res_pack[outputs_ready * CONFIG_T::n_filt + c] = + reduce_pool(pool_window); + } + } + if (res_T::size / CONFIG_T::n_filt == + 1) { // Saves resources if we don't pack output, compiler will remove the else branch + res.write(res_pack); + } else { + if (outputs_ready == (res_T::size / CONFIG_T::n_filt) - 1) { + res.write(res_pack); + outputs_ready = 0; + } else { + outputs_ready++; + } + } + } + } +} + +template +void pooling1d_encoded_cl(ac_channel &data, ac_channel &res) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + unsigned 
outputs_ready = 0; + + ac_channel data_window[CONFIG_T::pool_width * CONFIG_T::n_filt]; + // constexpr int win_depth = CONFIG_T::n_out; + // for (unsigned i_out = 0; i_out < CONFIG_T::pool_width * CONFIG_T::n_filt; i_out++) { + // #pragma HLS STREAM variable=data_window[i_out] depth=win_depth + // } + + constexpr int pack_factor = data_T::size / CONFIG_T::n_filt; + +ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::n_in / (pack_factor); i_iw++) { + //#pragma HLS LOOP_FLATTEN + if (res_T::size / CONFIG_T::n_filt == 1) { + //#pragma HLS PIPELINE II=pack_factor + } + compute_pool_encoded_1d(i_iw, data.read(), data_window, res, res_pack, outputs_ready); + } +} + +// ************************************************* +// Line Buffer Implementation (Phil's) 1D +// ************************************************* +template +void compute_pool_buffer_1d(const data_T &in_elem, ac_channel &res) { + //#pragma HLS INLINE + const static int lShiftX = CONFIG_T::pool_width - 1; + // Counters + static int pX = 0; + static int sX = 0; + + typename data_T::value_type pool_window[CONFIG_T::pool_width]; + //#pragma HLS ARRAY_PARTITION variable=pool_window complete + + static typename data_T::value_type kernel_data[CONFIG_T::pool_width * CONFIG_T::n_filt]; + //#pragma HLS ARRAY_PARTITION variable = kernel_data complete dim = 0 + + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + + // Add pixel into line buffer, return pooling kernels + // 1D case line buffer not necessary. 
Put directly into the kernel_data buffer + nnet::kernel_shift_1d(in_elem, kernel_data); + + // Can compute pooling output + if ((sX - lShiftX) == 0 && pX > lShiftX - 1) { + FiltLoop: + for (unsigned i_ic = 0; i_ic < CONFIG_T::n_filt; i_ic++) { + //#pragma HLS PIPELINE + + // Retrieve data for current channel + PoolLoop: + for (unsigned i_iw = 0; i_iw < CONFIG_T::pool_width; i_iw++) { + pool_window[i_iw] = kernel_data[i_iw * CONFIG_T::n_filt + i_ic]; + } + + // Compute Pooling + res_pack[i_ic] = reduce_pool(pool_window); + } + + // Write to output + res.write(res_pack); + } + + // Counter Housekeeping + if (pX + 1 == CONFIG_T::n_in) // Includes padding, end of line (padded) + { + pX = 0; + sX = 0; + } else { + pX = pX + 1; + // Update stride (threshold) ? subtract stride : increment stride + sX = ((sX - lShiftX) == 0) ? sX - CONFIG_T::stride_width + 1 : sX + 1; + } +} + +template +void pooling1d_buffer_cl(ac_channel &data, ac_channel &res) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + +ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::n_in; i_iw++) { + //#pragma HLS LOOP_FLATTEN + //#pragma HLS PIPELINE + compute_pool_buffer_1d(data.read(), res); + } +} + +template void pooling1d_cl(ac_channel &data, ac_channel &res) { + //#pragma HLS inline region + switch (CONFIG_T::implementation) { + case conv_implementation::linebuffer: + pooling1d_buffer_cl(data, res); + break; + case conv_implementation::encoded: + pooling1d_encoded_cl(data, res); + break; + } +} + +// ************************************************* +// Global max/average pooling +// ************************************************* + +template T reduce_global_pool(T x, T y[N]) { + //#pragma HLS INLINE + if (CONFIG_T::pool_op == Max) { + Op_max op_max; + T y_max = reduce>(y, op_max); + return (x > y_max) ? 
x : y_max; + } else { + Op_add op_add; + T y_sum = reduce>(y, op_add); + return x + y_sum; + } +} + +template +void compute_global_pool(const data_T &in_elem, typename CONFIG_T::accum_t data_window[CONFIG_T::n_filt]) { +PoolFilt: + for (unsigned c = 0; c < CONFIG_T::n_filt; c++) { + + typename CONFIG_T::accum_t data_pack[data_T::size / CONFIG_T::n_filt]; + //#pragma HLS ARRAY_PARTITION variable=data_pack complete dim=0 + + PixelLoop: + for (unsigned p = 0; p < data_T::size / CONFIG_T::n_filt; p++) { + data_pack[p] = in_elem[p * CONFIG_T::n_filt + c]; + } + data_window[c] = reduce_global_pool( + data_window[c], data_pack); + } +} + +template +void global_pooling2d_cl(ac_channel &data, ac_channel &res) { + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_height == CONFIG_T::stride_height && CONFIG_T::pool_width == CONFIG_T::stride_width); + + typename CONFIG_T::accum_t data_window[CONFIG_T::n_filt]; + //#pragma HLS ARRAY_PARTITION variable=data_window complete + + typename CONFIG_T::accum_t init = 0; + if (CONFIG_T::pool_op == Max) { + // init = hls::numeric_limits::min(); + init.template set_val(); + } + +PoolInitLoop: + for (unsigned i_init = 0; i_init < CONFIG_T::n_filt; i_init++) { + data_window[i_init] = init; + } + +ReadInputHeight: + for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_filt); i_iw++) { + //#pragma HLS LOOP_FLATTEN + compute_global_pool(data.read(), data_window); + } + } + + if (CONFIG_T::pool_op == Max) { + MaxPoolRes: + for (unsigned i_res = 0; i_res < CONFIG_T::n_filt / res_T::size; i_res++) { + //#pragma HLS PIPELINE + + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + MaxPoolPack: + for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { + res_pack[i_pack] = data_window[i_pack]; + } + res.write(res_pack); + } + } else { + 
AvgPoolRes: + for (unsigned i_res = 0; i_res < CONFIG_T::n_filt / res_T::size; i_res++) { + //#pragma HLS PIPELINE + + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + AvgPoolPack: + for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { + res_pack[i_pack] = data_window[i_pack] / (CONFIG_T::in_height * CONFIG_T::in_width); + } + res.write(res_pack); + } + } +} + +template +void global_pooling1d_cl(ac_channel &data, ac_channel &res) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + + typename CONFIG_T::accum_t data_window[CONFIG_T::n_filt]; + //#pragma HLS ARRAY_PARTITION variable=data_window complete + + typename CONFIG_T::accum_t init = 0; + if (CONFIG_T::pool_op == Max) { + // init = hls::numeric_limits::min(); + init.template set_val(); + } + +PoolInitLoop: + for (unsigned i_init = 0; i_init < CONFIG_T::n_filt; i_init++) { + data_window[i_init] = init; + } + +ReadInput: + for (unsigned i_iw = 0; i_iw < CONFIG_T::n_in / (data_T::size / CONFIG_T::n_filt); i_iw++) { + //#pragma HLS LOOP_FLATTEN + compute_global_pool(data.read(), data_window); + } + + if (CONFIG_T::pool_op == Max) { + MaxPoolRes: + for (unsigned i_res = 0; i_res < CONFIG_T::n_filt / res_T::size; i_res++) { + //#pragma HLS PIPELINE + + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + MaxPoolPack: + for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { + res_pack[i_pack] = data_window[i_pack]; + } + res.write(res_pack); + } + } else { + AvgPoolRes: + for (unsigned i_res = 0; i_res < CONFIG_T::n_filt / res_T::size; i_res++) { + //#pragma HLS PIPELINE + + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + AvgPoolPack: + for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { + res_pack[i_pack] = data_window[i_pack] / CONFIG_T::n_in; + } + res.write(res_pack); + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_recr_activations.h 
b/hls4ml/templates/catapult/nnet_utils/nnet_recr_activations.h new file mode 100755 index 0000000000..fd2019f3d5 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_recr_activations.h @@ -0,0 +1,56 @@ +#ifndef NNET_RECR_ACTIVATION_H_ +#define NNET_RECR_ACTIVATION_H_ + +#include "ac_channel.h" +#include "nnet_activation.h" +#include "nnet_common.h" +#include "nnet_helpers.h" +#include + +namespace nnet { + +namespace activation { + +template class Activation { + public: + // ************************************************* + // Blank Activation + // ************************************************* + static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {} // Nothing to do here +}; + +template class relu : public Activation { + public: + // ************************************************* + // Relu Activation + // ************************************************* + static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + nnet::relu(data, res); + } +}; + +template class sigmoid : public Activation { + public: + // ************************************************* + // Sigmoid Activation + // ************************************************* + static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + nnet::sigmoid(data, res); + } +}; + +template class tanh : public Activation { + public: + // ************************************************* + // TanH Activation + // ************************************************* + static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + nnet::tanh(data, res); + } +}; + +} // namespace activation + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_recurrent.h b/hls4ml/templates/catapult/nnet_utils/nnet_recurrent.h new file mode 100755 index 0000000000..f08d4d1050 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_recurrent.h @@ -0,0 +1,572 @@ + +#ifndef NNET_RECURSIVE_H_ 
+#define NNET_RECURSIVE_H_ + +#include "ac_channel.h" +#include "nnet_activation.h" +#include "nnet_common.h" +#include "nnet_dense.h" +#include "nnet_recr_activations.h" + +namespace nnet { + +struct lstm_config { + // Internal data type definitions + typedef float weight_t; + typedef float bias_t; + + // Layer Sizes + static const unsigned n_in = 2; + static const unsigned n_parts = 20; + static const unsigned n_out = 2; + static const unsigned n_state = 2; + static const unsigned n_4state = 8; + static const unsigned table_size = 1024; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + static const unsigned n_zeros = 0; + static const bool store_weights_in_bram = false; + static const bool use_static = true; + + template using activation_recr = nnet::activation::relu; + template using activation = nnet::activation::relu; +}; +// Long Short term Memory NN (LSTM) +// Resources: +// https://github.com/nicodjimenez/lstm/blob/master/lstm.py +// https://github.com/llSourcell/LSTM_Networks/blob/master/LSTM%20Demo.ipynb +// https://en.wikipedia.org/wiki/Long_short-term_memory +// Notes: +// - LSTM naming conventions adopted from the above links +// - s_newstate = activation(U*input + W*state) +// - h_output = activation(U*input + W*state)*activation(s_newstate) +// - If softmax is needed on output, perform *outside* this operations +// Originall had a version allows for the state in each layer to be saved, moved this to above (this requires are LARGE +// dense network at the end) +template +void lstm(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state], + res_T s_newstate[CONFIG_T::n_state], typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { + // 
Initialize the state variable -- will maintain state between function calls + + typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 4]; + typename CONFIG_T::accum_t tmpres_state[CONFIG_T::n_state * 4]; + typename CONFIG_T::accum_t tmpres_ifo[CONFIG_T::n_state * 3]; // activated i,f,o matrices (keras notation) + typename CONFIG_T::accum_t tmpres_c[CONFIG_T::n_state]; // activated c-matrix (keras notation) + typename CONFIG_T::accum_t inputacc_ifo[CONFIG_T::n_state * 3]; // i,f,o matrices (keras notation) + typename CONFIG_T::accum_t inputacc_c[CONFIG_T::n_state]; // c-matrix (keras notation) + typename CONFIG_T::accum_t s_actstate[CONFIG_T::n_state]; + + //#pragma HLS ARRAY_PARTITION variable=h_newstate complete + //#pragma HLS ARRAY_PARTITION variable=s_newstate complete + //#pragma HLS ARRAY_PARTITION variable=tmpres complete + //#pragma HLS ARRAY_PARTITION variable=tmpres_state complete + //#pragma HLS ARRAY_PARTITION variable=tmpres_ifo complete + //#pragma HLS ARRAY_PARTITION variable=tmpres_c complete + //#pragma HLS ARRAY_PARTITION variable=inputacc_ifo complete + //#pragma HLS ARRAY_PARTITION variable=inputacc_c complete + //#pragma HLS ARRAY_PARTITION variable=s_actstate complete + + nnet::dense(data, tmpres, param, param_b); + nnet::dense(h_newstate, tmpres_state, param_r, param_br); + + for (int iacc = 0; iacc < (3 * CONFIG_T::n_state); iacc++) { + //#pragma HLS UNROLL + int index = iacc; + if (iacc > 2 * CONFIG_T::n_state - 1) + index = iacc + CONFIG_T::n_state; + inputacc_ifo[iacc] = tmpres[index] + tmpres_state[index]; + } + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + //#pragma HLS UNROLL + int index = iacc + CONFIG_T::n_state * 2; + inputacc_c[iacc] = tmpres[index] + tmpres_state[index]; + } + + CONFIG_T::template activation_recr::activation( + inputacc_ifo, tmpres_ifo); + + // Now for the confusion matrix + CONFIG_T::template activation::activation( + inputacc_c, tmpres_c); + + // Operation: s=g*i+sold*f (update state with buffer to 
avoid timing issues) + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + //#pragma HLS UNROLL + s_newstate[iacc] = tmpres_c[iacc] * tmpres_ifo[iacc] + s_newstate[iacc] * tmpres_ifo[iacc + (CONFIG_T::n_state)]; + } + // Operation: h=act(s)*o + CONFIG_T::template activation::activation( + s_newstate, s_actstate); + + for (int iacc = 0; iacc < CONFIG_T::n_state; iacc++) { + //#pragma HLS UNROLL + h_newstate[iacc] = tmpres_ifo[iacc + 2 * (CONFIG_T::n_state)] * s_actstate[iacc]; + } +} + +template +void lstm_static(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state], + res_T s_newstate[CONFIG_T::n_state], + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { + static res_T h_state[CONFIG_T::n_state]; + static res_T s_state[CONFIG_T::n_state]; + // Initialize the state variable -- will maintain state between function calls + typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 4]; + typename CONFIG_T::accum_t tmpres_state[CONFIG_T::n_state * 4]; + typename CONFIG_T::accum_t tmpres_ifo[CONFIG_T::n_state * 3]; // activated i,f,o matrices (keras notation) + typename CONFIG_T::accum_t tmpres_c[CONFIG_T::n_state]; // activated c-matrix (keras notation) + typename CONFIG_T::accum_t inputacc_ifo[CONFIG_T::n_state * 3]; // i,f,o matrices (keras notation) + typename CONFIG_T::accum_t inputacc_c[CONFIG_T::n_state]; // c-matrix (keras notation) + typename CONFIG_T::accum_t s_actstate[CONFIG_T::n_state]; + + //#pragma HLS ARRAY_PARTITION variable=h_newstate complete + //#pragma HLS ARRAY_PARTITION variable=s_newstate complete + //#pragma HLS ARRAY_PARTITION variable=h_state complete + //#pragma HLS ARRAY_PARTITION variable=s_state complete + //#pragma HLS ARRAY_PARTITION variable=tmpres complete + //#pragma HLS 
ARRAY_PARTITION variable=tmpres_state complete + //#pragma HLS ARRAY_PARTITION variable=tmpres_ifo complete + //#pragma HLS ARRAY_PARTITION variable=tmpres_c complete + //#pragma HLS ARRAY_PARTITION variable=inputacc_ifo complete + //#pragma HLS ARRAY_PARTITION variable=inputacc_c complete + //#pragma HLS ARRAY_PARTITION variable=s_actstate complete + + if (reset_state) { + for (int i_state = 0; i_state < (CONFIG_T::n_state); i_state++) { + //#pragma HLS UNROLL + s_state[i_state] = 0; + h_state[i_state] = 0; + } + } + + nnet::dense(data, tmpres, param, param_b); + nnet::dense(h_state, tmpres_state, param_r, + param_br); + + for (int iacc = 0; iacc < (3 * CONFIG_T::n_state); iacc++) { + //#pragma HLS UNROLL + int index = iacc; + if (iacc > 2 * CONFIG_T::n_state - 1) + index = iacc + CONFIG_T::n_state; + inputacc_ifo[iacc] = tmpres[index] + tmpres_state[index]; + } + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + //#pragma HLS UNROLL + int index = iacc + CONFIG_T::n_state * 2; + inputacc_c[iacc] = tmpres[index] + tmpres_state[index]; + } + + CONFIG_T::template activation_recr::activation( + inputacc_ifo, tmpres_ifo); + + // Now for the confusion matrix + CONFIG_T::template activation::activation( + inputacc_c, tmpres_c); + + // Operation: s=g*i+sold*f (update state with buffer to avoid timing issues) + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + //#pragma HLS UNROLL + s_state[iacc] = tmpres_c[iacc] * tmpres_ifo[iacc] + s_state[iacc] * tmpres_ifo[iacc + (CONFIG_T::n_state)]; + s_newstate[iacc] = s_state[iacc]; + } + // Operation: h=act(s)*o + CONFIG_T::template activation::activation( + s_state, s_actstate); + + for (int iacc = 0; iacc < CONFIG_T::n_state; iacc++) { + //#pragma HLS UNROLL + h_state[iacc] = tmpres_ifo[iacc + 2 * (CONFIG_T::n_state)] * s_actstate[iacc]; + h_newstate[iacc] = h_state[iacc]; + } +} + +template +void lstm_stack(data_T data[CONFIG_T::n_sequence * CONFIG_T::n_in], res_T res[CONFIG_T::n_sequence_out * 
CONFIG_T::n_state], + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { + + res_T h_newstate[CONFIG_T::n_state]; + res_T s_newstate[CONFIG_T::n_state]; + data_T data_in[CONFIG_T::n_in]; + bool reset_state = true; + + //#pragma HLS ARRAY_PARTITION variable=h_newstate complete + //#pragma HLS ARRAY_PARTITION variable=s_newstate complete + + for (int ii = 0; ii < CONFIG_T::n_state; ii++) { + //#pragma HLS UNROLL + h_newstate[ii] = 0; + s_newstate[ii] = 0; + } + for (int iloop = 0; iloop < CONFIG_T::n_sequence; iloop++) { + for (int j = 0; j < CONFIG_T::n_in; j++) { + //#pragma HLS UNROLL + data_in[j] = data[j + iloop * CONFIG_T::n_in]; + } + if (CONFIG_T::use_static) + nnet::lstm_static(reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, + param_br); + else + nnet::lstm(reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, + param_br); + if (CONFIG_T::n_sequence_out > 1) + for (int i = CONFIG_T::n_state * iloop, j = 0; i < (CONFIG_T::n_state * (iloop + 1)); i++, j++) { + //#pragma HLS UNROLL + res[i] = h_newstate[j]; + } + reset_state = false; + } + if (CONFIG_T::n_sequence_out == 1) + for (int i = 0; i < (CONFIG_T::n_state); i++) { + //#pragma HLS UNROLL + res[i] = h_newstate[i]; + } +} + +template +void lstm_stack(ac_channel &data_stream, ac_channel &res_stream, + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { + + typename res_T::value_type h_newstate[CONFIG_T::n_state]; + typename res_T::value_type s_newstate[CONFIG_T::n_state]; + //#pragma HLS ARRAY_PARTITION 
variable=h_newstate complete + //#pragma HLS ARRAY_PARTITION variable=s_newstate complete + + for (int ii = 0; ii < CONFIG_T::n_state; ii++) { + //#pragma HLS UNROLL + h_newstate[ii] = 0; + s_newstate[ii] = 0; + } + + typename data_T::value_type data_in[CONFIG_T::n_in]; + bool reset_state = true; + +DataPropagation: + for (int i_in = 0; i_in < CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size; i_in++) { + if (CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size > 1) { + // //#pragma HLS PIPELINE + } + data_T data_pack = data_stream.read(); + DataPack: + for (int i_pack = 0; i_pack < data_T::size; i_pack++) { + //#pragma HLS UNROLL + data_in[i_pack] = data_pack[i_pack]; + } + if (CONFIG_T::use_static) + nnet::lstm_static( + reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, param_br); + else + nnet::lstm( + reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, param_br); + if (CONFIG_T::n_sequence_out > 1) { + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + ResPack_sequences: + for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + //#pragma HLS UNROLL + res_pack[i_pack] = h_newstate[i_pack]; + } + res_stream.write(res_pack); + } + reset_state = false; + } + + if (CONFIG_T::n_sequence_out == 1) { + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + ResPack: + for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + //#pragma HLS UNROLL + res_pack[i_pack] = h_newstate[i_pack]; + } + res_stream.write(res_pack); + } +} + +// Struct for the GRU template + +struct gru_config { + // Internal data type definitions + typedef float weight_t; + typedef float bias_t; + typedef float accum_t; + + // Layer Sizes + static const unsigned n_in = 2; + static const unsigned n_out = 2; + static const unsigned n_state = 2; + static const unsigned n_sequence = 2; + static const unsigned n_4state = 8; + static const unsigned table_size = 1024; + + // Resource reuse info + static const unsigned io_type = io_parallel; + 
static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + static const bool use_static = true; + static const unsigned n_zeros = 0; + + template using activation_recr = nnet::activation::relu; + template using activation = nnet::activation::relu; +}; + +template +void gru(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state], + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], // TODO - Check the layout of the param + // weights - refer page in copy!! + typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) { + // Initialize the state variable -- will maintain state between function calls + typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 3]; + typename CONFIG_T::accum_t tmpres_state_zr[CONFIG_T::n_state * 3]; + typename CONFIG_T::accum_t tmpres_state_h[CONFIG_T::n_state]; + typename CONFIG_T::accum_t tmpres_zr[CONFIG_T::n_state * 2]; // activated i,f,o matrices (keras notation) + typename CONFIG_T::accum_t tmpres_h[CONFIG_T::n_state]; // activated c-matrix (keras notation) + typename CONFIG_T::accum_t inputacc_zr[CONFIG_T::n_state * 2]; // i,f,o matrices (keras notation) + typename CONFIG_T::accum_t inputacc_h[CONFIG_T::n_state]; // c-matrix (keras notation) + + //#pragma HLS ARRAY_PARTITION variable=h_newstate complete + //#pragma HLS ARRAY_PARTITION variable=tmpres complete + //#pragma HLS ARRAY_PARTITION variable=tmpres_state_zr complete + //#pragma HLS ARRAY_PARTITION variable=tmpres_state_h complete + //#pragma HLS ARRAY_PARTITION variable=tmpres_zr complete + //#pragma HLS ARRAY_PARTITION variable=tmpres_h complete + //#pragma HLS ARRAY_PARTITION variable=inputacc_zr complete + //#pragma HLS ARRAY_PARTITION variable=inputacc_h complete + + nnet::dense(data, tmpres, param, param_b); + nnet::dense(h_newstate, 
tmpres_state_zr, param_zr, + param_br); + + // Adding the individual vectors from the multiplication of tmpres = Wx*x(t); tmpres_state_zr = Wh*h(t-1); tmpres + // initialized with biases -- DONE + for (int iacc = 0; iacc < (2 * CONFIG_T::n_state); iacc++) { + //#pragma HLS UNROLL + int index = iacc; + inputacc_zr[iacc] = tmpres[index] + tmpres_state_zr[index]; + } + + // Activation function Sub layer -- START + CONFIG_T::template activation_recr::activation(inputacc_zr, tmpres_zr); + + // Activation function Sub layer -- END + + // Hadamrd product of r(t) = inputacc_zr[2*n_state:n_state] and h(t-1) = h_newstate + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + //#pragma HLS UNROLL + tmpres_state_h[iacc] = tmpres_zr[iacc + (CONFIG_T::n_state)] * tmpres_state_zr[iacc + (2 * CONFIG_T::n_state)]; + } + + // Assuming reset_after is false + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + //#pragma HLS UNROLL + int index = iacc + CONFIG_T::n_state * 2; + inputacc_h[iacc] = tmpres[index] + tmpres_state_h[iacc]; + } + + // Now run the activation on this guy + CONFIG_T::template activation::activation(inputacc_h, tmpres_h); + + // Mix the stat with the previous state + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + //#pragma HLS UNROLL + h_newstate[iacc] = (res_T)(tmpres_h[iacc] * (1 - tmpres_zr[iacc]) + h_newstate[iacc] * tmpres_zr[iacc]); + } +} + +template +void gru_static(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state], + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) { + // Initialize the state variable -- will maintain state between function calls + + static res_T h_state[CONFIG_T::n_state]; + typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 3]; + typename 
CONFIG_T::accum_t tmpres_state_zr[CONFIG_T::n_state * 3]; + typename CONFIG_T::accum_t tmpres_state_h[CONFIG_T::n_state]; + typename CONFIG_T::accum_t tmpres_zr[CONFIG_T::n_state * 2]; // activated i,f,o matrices (keras notation) + typename CONFIG_T::accum_t tmpres_h[CONFIG_T::n_state]; // activated c-matrix (keras notation) + typename CONFIG_T::accum_t inputacc_zr[CONFIG_T::n_state * 2]; // i,f,o matrices (keras notation) + typename CONFIG_T::accum_t inputacc_h[CONFIG_T::n_state]; // c-matrix (keras notation) + + //#pragma HLS ARRAY_PARTITION variable=h_state complete + //#pragma HLS ARRAY_PARTITION variable=h_newstate complete + //#pragma HLS ARRAY_PARTITION variable=tmpres complete + //#pragma HLS ARRAY_PARTITION variable=tmpres_state_zr complete + //#pragma HLS ARRAY_PARTITION variable=tmpres_state_h complete + //#pragma HLS ARRAY_PARTITION variable=tmpres_zr complete + //#pragma HLS ARRAY_PARTITION variable=tmpres_h complete + //#pragma HLS ARRAY_PARTITION variable=inputacc_zr complete + //#pragma HLS ARRAY_PARTITION variable=inputacc_h complete + + if (reset_state) { + for (int i_h_state = 0; i_h_state < (CONFIG_T::n_state); i_h_state++) { + //#pragma HLS UNROLL + h_state[i_h_state] = 0; + } + } + + nnet::dense(data, tmpres, param, param_b); + nnet::dense(h_state, tmpres_state_zr, param_zr, + param_br); + + // Adding the individual vectors from the multiplication of tmpres = Wx*x(t); tmpres_state_zr = Wh*h(t-1); tmpres + // initialized with biases -- DONE + for (int iacc = 0; iacc < (2 * CONFIG_T::n_state); iacc++) { + //#pragma HLS UNROLL + int index = iacc; + inputacc_zr[iacc] = tmpres[index] + tmpres_state_zr[index]; + } + + // Activation function Sub layer -- START + CONFIG_T::template activation_recr::activation(inputacc_zr, tmpres_zr); + + // Activation function Sub layer -- END + + // Hadamrd product of r(t) = inputacc_zr[2*n_state:n_state] and h(t-1) = h_newstate + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + //#pragma HLS UNROLL + 
tmpres_state_h[iacc] = tmpres_zr[iacc + (CONFIG_T::n_state)] * tmpres_state_zr[iacc + (2 * CONFIG_T::n_state)]; + } + + // Assuming reset_after is false + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + //#pragma HLS UNROLL + int index = iacc + CONFIG_T::n_state * 2; + inputacc_h[iacc] = tmpres[index] + tmpres_state_h[iacc]; + } + + // Now run the activation on this guy + CONFIG_T::template activation::activation(inputacc_h, tmpres_h); + + // Mix the stat with the previous state + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + //#pragma HLS UNROLL + h_state[iacc] = (res_T)(tmpres_h[iacc] * (1 - tmpres_zr[iacc]) + h_state[iacc] * tmpres_zr[iacc]); + h_newstate[iacc] = h_state[iacc]; + } +} + +template +void gru_stack(data_T data[CONFIG_T::n_sequence * CONFIG_T::n_in], res_T res[CONFIG_T::n_sequence_out * CONFIG_T::n_state], + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) { + + res_T h_state[CONFIG_T::n_state]; + data_T data_in[CONFIG_T::n_in]; + bool reset_state = true; + + //#pragma HLS ARRAY_PARTITION variable=h_state complete + //#pragma HLS ARRAY_PARTITION variable=data_in complete + + for (int ii = 0; ii < CONFIG_T::n_state; ii++) { + //#pragma HLS UNROLL + h_state[ii] = 0; + } + for (int iloop = 0; iloop < CONFIG_T::n_sequence; iloop++) { + for (int j = 0; j < CONFIG_T::n_in; j++) { + //#pragma HLS UNROLL + data_in[j] = data[j + iloop * CONFIG_T::n_in]; + } + if (CONFIG_T::use_static) + nnet::gru_static(reset_state, data_in, h_state, param, param_zr, param_b, param_br); + else + nnet::gru(reset_state, data_in, h_state, param, param_zr, param_b, param_br); + if (CONFIG_T::n_sequence_out > 1) + for (int i = CONFIG_T::n_state * iloop, j = 0; i < (CONFIG_T::n_state * (iloop + 1)); i++, j++) { + //#pragma HLS 
UNROLL + res[i] = h_state[j]; + } + reset_state = false; + } + if (CONFIG_T::n_sequence_out == 1) + for (int i = 0; i < (CONFIG_T::n_state); i++) { + //#pragma HLS UNROLL + res[i] = h_state[i]; + } +} + +template +void gru_stack(ac_channel &data_stream, ac_channel &res_stream, + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) { + + typename res_T::value_type h_newstate[CONFIG_T::n_state]; + //#pragma HLS ARRAY_PARTITION variable=h_newstate complete + for (int ii = 0; ii < CONFIG_T::n_state; ii++) { + //#pragma HLS UNROLL + h_newstate[ii] = 0; + } + + typename data_T::value_type data_in[CONFIG_T::n_in]; + bool reset_state = true; + +DataPropagation: + for (int i_in = 0; i_in < CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size; i_in++) { + if (CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size > 1) { + // //#pragma HLS PIPELINE + } + data_T data_pack = data_stream.read(); + DataPack: + for (int i_pack = 0; i_pack < data_T::size; i_pack++) { + //#pragma HLS UNROLL + data_in[i_pack] = data_pack[i_pack]; + } + if (CONFIG_T::use_static) + nnet::gru_static( + reset_state, data_in, h_newstate, param, param_zr, param_b, param_br); + else + nnet::gru(reset_state, data_in, h_newstate, + param, param_zr, param_b, param_br); + if (CONFIG_T::n_sequence_out > 1) { + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + ResPack_sequences: + for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + //#pragma HLS UNROLL + res_pack[i_pack] = h_newstate[i_pack]; + } + res_stream.write(res_pack); + } + reset_state = false; + } + + if (CONFIG_T::n_sequence_out == 1) { + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + ResPack: + for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + //#pragma HLS UNROLL + res_pack[i_pack] = 
h_newstate[i_pack]; + } + res_stream.write(res_pack); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_sepconv1d_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_sepconv1d_stream.h new file mode 100644 index 0000000000..eb5ef9f7db --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_sepconv1d_stream.h @@ -0,0 +1,127 @@ +#ifndef NNET_SEPARABLE_CONV1D_STREAM_H_ +#define NNET_SEPARABLE_CONV1D_STREAM_H_ + +#include "ac_channel.h" +#include "nnet_common.h" +#include "nnet_conv1d_stream.h" +#include "nnet_sepconv_stream.h" + +namespace nnet { + +template +void depthwise_conv_1d_encoded_cl(ac_channel &data, ac_channel &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + ac_channel data_window[CONFIG_T::filt_width * CONFIG_T::n_chan]; + // const int win_depth = CONFIG_T::out_width; + // for (unsigned i_out = 0; i_out < CONFIG_T::filt_width * CONFIG_T::n_chan; i_out++) { + // #pragma HLS STREAM variable=data_window[i_out] depth=win_depth + // } + + //#pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels complete + + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + unsigned outputs_ready = 0; + + ac_int pixel_idx[data_T::size / CONFIG_T::n_chan]; + //#pragma HLS ARRAY_PARTITION variable=pixel_idx complete + + constexpr int ce_reuse_factor = + CONFIG_T::reuse_factor * (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1); + (void)ce_reuse_factor; +ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_chan); i_iw++) { + //#pragma HLS LOOP_FLATTEN + if (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + compute_scaled_indices_1d(i_iw, pixel_idx); + compute_depthwise_output_encoded(data.read(), 
data_window, res, res_pack, outputs_ready, + weights, biases, pixel_idx); + } +} + +template +void depthwise_conv_1d_buffer_cl(ac_channel &data, ac_channel &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor * (CONFIG_T::strategy == nnet::latency); + (void)ce_reuse_factor; +ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + //#pragma HLS LOOP_FLATTEN + if (CONFIG_T::strategy == nnet::latency) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + compute_depthwise_output_buffer_1d(data.read(), res, weights, biases); + } +} + +template +void depthwise_conv_1d_cl(ac_channel &data, ac_channel &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + #pragma HLS inline recursive + switch (CONFIG_T::implementation) { + case conv_implementation::linebuffer: + depthwise_conv_1d_buffer_cl(data, res, weights, biases); + break; + case conv_implementation::encoded: + depthwise_conv_1d_encoded_cl(data, res, weights, biases); + break; + } +} + +template +void pointwise_conv_1d_cl(ac_channel &data, ac_channel &res, + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::filt_width == 1); + + //#pragma HLS ARRAY_PARTITION variable=weights complete + //#pragma HLS ARRAY_PARTITION variable=biases complete + + constexpr int ce_reuse_factor = + CONFIG_T::reuse_factor * (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1); + (void)ce_reuse_factor; +ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_chan); i_iw++) { + if 
(CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + if (i_iw % CONFIG_T::stride_width == 0) { + pointwise_mult_buffer(data.read(), res, weights, biases); + } else { + data.read(); + } + } +} + +template +void separable_conv_1d_cl(ac_channel &data, ac_channel &res, + typename CONFIG_T::depthwise_config::weight_t + depthwise_weights[CONFIG_T::depthwise_config::filt_width * CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::weight_t + pointwise_weights[CONFIG_T::pointwise_config::n_chan * CONFIG_T::pointwise_config::n_filt], + typename CONFIG_T::depthwise_config::bias_t depthwise_biases[CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::bias_t pointwise_biases[CONFIG_T::pointwise_config::n_filt]) { + //#pragma HLS DATAFLOW + + ac_channel depthwise_res; + unsigned res_depth = CONFIG_T::depthwise_config::out_width; + //#pragma HLS STREAM variable=depthwise_res depth=res_depth + + depthwise_conv_1d_cl(data, depthwise_res, depthwise_weights, + depthwise_biases); + pointwise_conv_1d_cl(depthwise_res, res, pointwise_weights, + pointwise_biases); +} + +} // namespace nnet +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_sepconv2d.h b/hls4ml/templates/catapult/nnet_utils/nnet_sepconv2d.h new file mode 100644 index 0000000000..d98dd8c315 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_sepconv2d.h @@ -0,0 +1,82 @@ +#ifndef NNET_SEPARABLE_CONV2D_H_ +#define NNET_SEPARABLE_CONV2D_H_ + +#include "nnet_common.h" +#include + +namespace nnet { + +template +void depthwise_conv_2d_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_chan], + typename CONFIG_T::weight_t depthwise_weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t depthwise_biases[CONFIG_T::n_chan]) { + const int 
in_height = CONFIG_T::in_height; + const int in_width = CONFIG_T::in_width; + const int n_chan = CONFIG_T::n_chan; + const int filt_height = CONFIG_T::filt_height; + const int filt_width = CONFIG_T::filt_width; + const int out_height = CONFIG_T::out_height; + const int out_width = CONFIG_T::out_width; + + // constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; (void)ce_reuse_factor; + + // do { + + //#pragma HLS ARRAY_PARTITION variable=res complete dim=0 + //#pragma HLS ARRAY_PARTITION variable=depthwise_biases complete dim=0 + //#pragma HLS ARRAY_PARTITION variable=depthwise_weights complete dim=0 + for (int h = 0; h < in_height - filt_height + 1; h++) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor rewind + for (int w = 0; w < in_width - filt_width + 1; w++) { + //#pragma HLS UNROLL + for (int c = 0; c < n_chan; c++) { + //#pragma HLS UNROLL + res_T sum = depthwise_biases[c]; + + // Apply the filter + for (int i = 0; i < filt_height; i++) { + //#pragma HLS UNROLL + for (int j = 0; j < filt_width; j++) { + //#pragma HLS UNROLL + int data_idx = (h + i) * in_width * n_chan + (w + j) * n_chan + c; + int weight_idx = i * filt_width * n_chan + j * n_chan + c; + sum += data[data_idx] * depthwise_weights[weight_idx]; + } + } + + int res_idx = (h * out_width * n_chan) + w * n_chan + c; + res[res_idx] = sum; + } + } + } + // } while (false); +} + +template +void separable_conv_2d_cl(data_T data[CONFIG_T::depthwise_config::in_height * CONFIG_T::depthwise_config::in_width * + CONFIG_T::depthwise_config::n_chan], + res_T res[CONFIG_T::pointwise_config::out_height * CONFIG_T::pointwise_config::out_width * + CONFIG_T::pointwise_config::n_filt], + typename CONFIG_T::depthwise_config::weight_t + depthwise_weights[CONFIG_T::depthwise_config::filt_height * + CONFIG_T::depthwise_config::filt_width * CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::weight_t + pointwise_weights[CONFIG_T::pointwise_config::n_chan * 
CONFIG_T::pointwise_config::n_filt], + typename CONFIG_T::depthwise_config::bias_t depthwise_biases[CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::bias_t pointwise_biases[CONFIG_T::pointwise_config::n_filt]) { + + //#pragma HLS INLINE region + + dw_res_T depthwise_results[CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width * + CONFIG_T::depthwise_config::n_chan]; + depthwise_conv_2d_cl(data, depthwise_results, depthwise_weights, + depthwise_biases); + pointwise_conv_2d_cl(depthwise_results, res, pointwise_weights, + pointwise_biases); +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_sepconv2d_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_sepconv2d_stream.h new file mode 100644 index 0000000000..a4f7d4faa9 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_sepconv2d_stream.h @@ -0,0 +1,152 @@ +#ifndef NNET_SEPARABLE_CONV2D_STREAM_H_ +#define NNET_SEPARABLE_CONV2D_STREAM_H_ + +#include "nnet_common.h" +#include "nnet_conv2d_stream.h" +#include "nnet_sepconv_stream.h" +#include "nnet_types.h" +#include + +namespace nnet { + +template +void depthwise_conv_2d_encoded_cl( + ac_channel &data, ac_channel &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::filt_height == CONFIG_T::filt_width); + + static ac_channel + data_window[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan]; + // const int win_depth = CONFIG_T::filt_height * CONFIG_T::out_width; + // for (unsigned i_out = 0; i_out < CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan; i_out++) { + // #pragma HLS STREAM variable=data_window[i_out] depth=win_depth + // } + + // #pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels 
complete + + res_T res_pack; + // PRAGMA_DATA_PACK(res_pack) + unsigned outputs_ready = 0; + + ac_int pixel_idx[data_T::size / CONFIG_T::n_chan]; + // #pragma HLS ARRAY_PARTITION variable=pixel_idx complete + + constexpr int ce_reuse_factor = + CONFIG_T::reuse_factor * (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1); + (void)ce_reuse_factor; +ReadInputHeight: + for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_chan); i_iw++) { + // #pragma HLS LOOP_FLATTEN + // if (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1) { + // #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + // } + compute_scaled_indices_2d(i_ih, i_iw, pixel_idx); + compute_depthwise_output_encoded(data.read(), data_window, res, res_pack, outputs_ready, + weights, biases, pixel_idx); + } + } +} + +// Line Buffer Implementation (Phil's) +template +void depthwise_conv_2d_buffer_cl( + ac_channel &data, ac_channel &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + static ap_shift_reg line_buffer[CONFIG_T::filt_height - 1] + [CONFIG_T::n_chan]; + //#pragma HLS ARRAY_PARTITION variable = line_buffer complete dim = 2 + + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor * (CONFIG_T::strategy == nnet::latency); + (void)ce_reuse_factor; +ReadInputHeight: + for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + //#pragma HLS LOOP_FLATTEN + // if (CONFIG_T::strategy == nnet::latency) { + // #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + // } + if (CONFIG_T::filt_height > 1) { + 
compute_depthwise_output_buffer_2d(data.read(), line_buffer, res, weights, biases); + } else { + compute_depthwise_output_buffer_1d(data.read(), res, weights, biases); + } + } + } +} + +template +void depthwise_conv_2d_cl( + ac_channel &data, ac_channel &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + // #pragma HLS inline recursive + switch (CONFIG_T::implementation) { + case conv_implementation::linebuffer: + depthwise_conv_2d_buffer_cl(data, res, weights, biases); + break; + case conv_implementation::encoded: + depthwise_conv_2d_encoded_cl(data, res, weights, biases); + break; + } +} + +template +void pointwise_conv_2d_cl(ac_channel &data, ac_channel &res, + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::filt_height == 1 && CONFIG_T::filt_width == 1); + + // #pragma HLS ARRAY_PARTITION variable=weights complete + // #pragma HLS ARRAY_PARTITION variable=biases complete + + constexpr int ce_reuse_factor = + CONFIG_T::reuse_factor * (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1); + (void)ce_reuse_factor; +ReadInputHeight: + for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_chan); i_iw++) { + if (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1) { + // #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + if (i_ih % CONFIG_T::stride_height == 0 && i_iw % CONFIG_T::stride_width == 0) { + pointwise_mult_buffer(data.read(), res, weights, biases); + } else { + data.read(); + } + } + } +} + +template +void separable_conv_2d_cl(ac_channel &data, ac_channel &res, + typename 
CONFIG_T::depthwise_config::weight_t + depthwise_weights[CONFIG_T::depthwise_config::filt_height * + CONFIG_T::depthwise_config::filt_width * CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::weight_t + pointwise_weights[CONFIG_T::pointwise_config::n_chan * CONFIG_T::pointwise_config::n_filt], + typename CONFIG_T::depthwise_config::bias_t depthwise_biases[CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::bias_t pointwise_biases[CONFIG_T::pointwise_config::n_filt]) { + // #pragma HLS DATAFLOW + + static ac_channel depthwise_res; + unsigned res_depth = CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width; + // #pragma HLS STREAM variable=depthwise_res depth=res_depth + + depthwise_conv_2d_cl(data, depthwise_res, depthwise_weights, + depthwise_biases); + pointwise_conv_2d_cl(depthwise_res, res, pointwise_weights, + pointwise_biases); +} + +} // namespace nnet +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_sepconv_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_sepconv_stream.h new file mode 100644 index 0000000000..753d260a77 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_sepconv_stream.h @@ -0,0 +1,315 @@ +#ifndef NNET_SEPARABLE_CONV_STREAM_H_ +#define NNET_SEPARABLE_CONV_STREAM_H_ + +#include "nnet_common.h" +#include "nnet_conv_stream.h" +#include +#include + +namespace nnet { + +template +void depthwise_product(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T res[CONFIG_T::n_chan], + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + // #pragma HLS INLINE + + typename CONFIG_T::accum_t mult[CONFIG_T::kernel_size * CONFIG_T::n_chan]; + typename CONFIG_T::accum_t acc[CONFIG_T::n_chan]; + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + // #pragma HLS function_instantiate variable=weights + + //#pragma HLS 
PIPELINE II=CONFIG_T::reuse_factor + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; + (void)ce_reuse_factor; + + // Add dummy loop to which the pipeline pragma can be applied + do { + + //#pragma HLS ARRAY_PARTITION variable=mult complete + + //#pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit + + // Do the matrix-multiply + Product: + for (int ii = 0; ii < CONFIG_T::kernel_size * CONFIG_T::n_chan; ii++) { + // #pragma HLS UNROLL + mult[ii] = CONFIG_T::mult_config::template product::product( + data[ii], weights[ii]); + } + + // Initialize accumulator with input biases + ResetAccum: + for (int iacc = 0; iacc < CONFIG_T::n_chan; iacc++) { + //#pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + + // Accumulate multiplication result + Accum1: + for (int ii = 0; ii < CONFIG_T::kernel_size; ii++) { + Accum2: + for (int jj = 0; jj < CONFIG_T::n_chan; jj++) { + int index = ii * CONFIG_T::n_chan + jj; + acc[jj] += mult[index]; + } + } + + // Cast to "res_t" type + Result: + for (int ires = 0; ires < CONFIG_T::n_chan; ires++) { + //#pragma HLS UNROLL + res[ires] = cast(acc[ires]); + } + } while (0); +} + +template +void depthwise_mult_buffer(ac_channel data_window[CONFIG_T::kernel_size * CONFIG_T::n_chan], + res_T &res_pack, ac_channel &res_stream, unsigned &outputs_ready, + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + //#pragma HLS INLINE + + typename data_T::value_type data[CONFIG_T::kernel_size * CONFIG_T::n_chan]; + //#pragma HLS ARRAY_PARTITION variable=data complete + typename res_T::value_type res[CONFIG_T::n_chan]; + //#pragma HLS ARRAY_PARTITION variable=res complete + +InitData: + for (int id = 0; id < CONFIG_T::kernel_size * CONFIG_T::n_chan; id++) { + //#pragma HLS UNROLL + data[id] = data_window[id].read(); + } + + //#pragma HLS INLINE recursive + if (CONFIG_T::strategy == nnet::latency) { + 
depthwise_product(data, res, weights, biases); + } else { + assert("Resource strategy for DepthwiseConv2D is not supported." && false); + } + +CastLoop: + for (unsigned jj = 0; jj < CONFIG_T::n_chan; jj++) { + //#pragma HLS UNROLL + if (res_T::size / CONFIG_T::n_chan == 1) { + res_pack[jj] = res[jj]; + } else { + res_pack[outputs_ready * CONFIG_T::n_chan + jj] = res[jj]; + } + } + + if (res_T::size / CONFIG_T::n_chan == 1) { + res_stream.write(res_pack); + } else { + if (outputs_ready == (res_T::size / CONFIG_T::n_chan) - 1) { + res_stream.write(res_pack); + outputs_ready = 0; + } else { + outputs_ready++; + } + } +} + +template +void compute_depthwise_output_encoded( + const data_T &in_elem, ac_channel data_window[CONFIG_T::kernel_size * CONFIG_T::n_chan], + ac_channel &res, res_T &res_pack, unsigned &outputs_ready, + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan], ac_int *pixel_idx) { + //#pragma HLS INLINE + + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; + (void)ce_reuse_factor; +MultLoop: + for (unsigned p = 0; p < data_T::size / CONFIG_T::n_chan; p++) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + CopyDataFilt: + for (unsigned f = 0; f < CONFIG_T::kernel_size; f++) { + //#pragma HLS UNROLL + CopyDataChan: + for (unsigned c = 0; c < CONFIG_T::n_chan; c++) { + //#pragma HLS UNROLL + if (pixel_idx[p][f]) + data_window[f * CONFIG_T::n_chan + c].write(in_elem[p * CONFIG_T::n_chan + c]); + } + } + if (pixel_idx[p][CONFIG_T::kernel_size - 1]) { + depthwise_mult_buffer(data_window, res_pack, res, outputs_ready, weights, biases); + } + } +} + +template +void pointwise_mult_buffer(const data_T &data_pack, ac_channel &res_stream, + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + //#pragma HLS INLINE + + typename data_T::value_type data[CONFIG_T::n_chan]; + //#pragma HLS ARRAY_PARTITION 
variable=data complete + + typename res_T::value_type res[CONFIG_T::n_filt]; + //#pragma HLS ARRAY_PARTITION variable=res complete + + res_T res_pack; + // PRAGMA_DATA_PACK(res_pack) + +InitData: + for (int id = 0; id < CONFIG_T::n_chan; id++) { + //#pragma HLS UNROLL + data[id] = data_pack[id]; + } + + //#pragma HLS INLINE recursive + if (CONFIG_T::strategy == nnet::latency) { + dense_latency( + data, res, weights, biases); + } else { + dense_resource( + data, res, weights, biases); + } + +CastLoop: + for (unsigned jj = 0; jj < CONFIG_T::n_filt; jj++) { + //#pragma HLS UNROLL + res_pack[jj] = res[jj]; + } + + res_stream.write(res_pack); +} + +// Line Buffer Implementation (Phil's) +template +void compute_depthwise_output_buffer_1d(const data_T &in_elem, ac_channel &res_stream, + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + //#pragma HLS INLINE + + // Thresholds + const static int lShiftX = CONFIG_T::filt_width - 1; + + // Counters + static int pX = 0; + static int sX = 0; + + static typename data_T::value_type kernel_data[CONFIG_T::filt_width * CONFIG_T::n_chan]; + //#pragma HLS ARRAY_PARTITION variable=kernel_data complete + + typename res_T::value_type res_out[CONFIG_T::n_chan]; + //#pragma HLS ARRAY_PARTITION variable=res_out complete dim = 0 + + res_T res_pack; + // PRAGMA_DATA_PACK(res_pack) + + // Add pixel to buffer + nnet::kernel_shift_1d(in_elem, kernel_data); + + // Check to see if we have a full kernel + if ((sX - lShiftX) == 0 && pX > lShiftX - 1) { + // Dense multiply + //#pragma HLS INLINE recursive + if (CONFIG_T::strategy == nnet::latency) { + depthwise_product(kernel_data, res_out, + weights, biases); + } else { + assert("Resource strategy for DepthwiseConv1D is not supported." 
&& false); + } + + // Pack output + CastLoop: + for (unsigned i_ic = 0; i_ic < CONFIG_T::n_filt; i_ic++) { + //#pragma HLS UNROLL + res_pack[i_ic] = res_out[i_ic]; + } + + // Write output to stream when output ready + res_stream.write(res_pack); + } + + // Pointer Housekeeping + if (pX + 1 == CONFIG_T::in_width) // Includes padding, end of line (padded) + { + pX = 0; + sX = 0; + } else { + pX = pX + 1; + sX = ((sX - lShiftX) == 0) ? sX - CONFIG_T::stride_width + 1 : sX + 1; + } +} + +template +void compute_depthwise_output_buffer_2d(const data_T &in_elem, + ap_shift_reg + line_buffer[MAX(CONFIG_T::filt_height - 1, 1)][CONFIG_T::n_chan], + ac_channel &res_stream, + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + //#pragma HLS INLINE + + // Thresholds + const static int lShiftX = CONFIG_T::filt_width - 1; + const static int lShiftY = CONFIG_T::filt_height - 1; + + // counters + static int pX = 0; // pixel X + static int pY = 0; // pixel Y + + static int sX = 0; // stride X + static int sY = 0; // stride Y + + static typename data_T::value_type kernel_data[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan]; + //#pragma HLS ARRAY_PARTITION variable=kernel_data complete + + typename res_T::value_type res_out[CONFIG_T::n_chan]; + //#pragma HLS ARRAY_PARTITION variable=res_out complete dim = 0 + + res_T res_pack; + // PRAGMA_DATA_PACK(res_pack) + + // Add pixel to buffer + nnet::shift_line_buffer(in_elem, line_buffer, kernel_data); + + // Check to see if we have a full kernel + if ((sX - lShiftX) == 0 && (sY - lShiftY) == 0 && pY > lShiftY - 1 && pX > lShiftX - 1) { + // Dense multiply + //#pragma HLS INLINE recursive + if (CONFIG_T::strategy == nnet::latency) { + depthwise_product(kernel_data, res_out, + weights, biases); + } else { + assert("Resource strategy for DepthwiseConv2D is not supported." 
&& false); + } + + // Pack output + CastLoop: + for (unsigned i_ic = 0; i_ic < CONFIG_T::n_filt; i_ic++) { + //#pragma HLS UNROLL + res_pack[i_ic] = res_out[i_ic]; + } + + // Write output to stream when output ready + res_stream.write(res_pack); + } + + // Pointer Housekeeping + if (pX + 1 == CONFIG_T::in_width) // Includes padding, end of line (padded) + { + pX = 0; + sX = 0; + if (pY + 1 == CONFIG_T::in_height) { // Reached bottom of image + pY = 0; + sY = 0; + } else { + pY = pY + 1; + sY = ((sY - lShiftY) == 0) ? sY - CONFIG_T::stride_height + 1 : sY + 1; + } + } else { + pX = pX + 1; + sX = ((sX - lShiftX) == 0) ? sX - CONFIG_T::stride_width + 1 : sX + 1; + } +} + +} // namespace nnet +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_stream.h new file mode 100644 index 0000000000..c76bfba5a6 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_stream.h @@ -0,0 +1,156 @@ + +#ifndef NNET_STREAM_H +#define NNET_STREAM_H + +#include "ac_channel.h" + +namespace nnet { + +struct broadcast_config { + static const unsigned in_height = 1; + static const unsigned in_width = 1; + static const unsigned in_chan = 3; + static const unsigned out_height = 2; + static const unsigned out_width = 2; + static const unsigned out_chan = 3; +}; + +template +void clone_stream(ac_channel &data, ac_channel &res1, ac_channel &res2) { +// CloneLoop: for (int i = 0; i < N / data_T::size; i++) { +//#pragma HLS PIPELINE +#ifndef __SYNTHESIS__ + while (data.available(1)) +#endif + { + data_T in_data = data.read(); + res_T out_data; + // res_T out_data2; + //#pragma HLS DATA_PACK variable=out_data1 + //#pragma HLS DATA_PACK variable=out_data2 + + ClonePack: + for (int j = 0; j < data_T::size; j++) { + //#pragma HLS UNROLL + out_data[j] = in_data[j]; + // out_data2[j] = in_data[j]; + } + + res1.write(out_data); + res2.write(out_data); + } +} + +template void repack_stream(ac_channel &data, ac_channel &res) { + if 
(data_T::size == res_T::size) { + for (int i = 0; i < N / data_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + for (int j = 0; j < data_T::size; j++) { + //#pragma HLS UNROLL + out_data[j] = in_data[j]; + } + + res.write(out_data); + } + } else if (data_T::size > res_T::size) { + constexpr unsigned pack_diff = data_T::size / res_T::size; + for (int i = 0; i < N / data_T::size; i++) { + if (N / data_T::size > 1) { + //#pragma HLS PIPELINE + } + + data_T in_data = data.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + for (int j = 0; j < pack_diff; j++) { + //#pragma HLS PIPELINE + + res_T out_data; + for (int k = 0; k < res_T::size; k++) { + //#pragma HLS UNROLL + out_data[k] = in_data[j * res_T::size + k]; + } + res.write(out_data); + } + } + } else { // data_T::size < res_T::size + res_T out_data; + constexpr unsigned pack_diff = res_T::size / data_T::size; + unsigned pack_cnt = 0; + for (int i = 0; i < N / data_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + for (int j = 0; j < data_T::size; j++) { + //#pragma HLS UNROLL + out_data[pack_cnt * data_T::size + j] = in_data[j]; + } + + if (pack_cnt == pack_diff - 1) { + res.write(out_data); + pack_cnt = 0; + } else { + pack_cnt++; + } + } + } +} + +template +void broadcast_stream_1x1xC(ac_channel &data, ac_channel &res) { + assert(CONFIG_T::in_height == 1 && CONFIG_T::in_width == 1 && CONFIG_T::in_chan == CONFIG_T::out_chan); + int n_dupl = (CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::out_chan) / + (CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::in_chan); +BroadcastLoop: + for (int i = 0; i < CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::in_chan / data_T::size; i++) { + //#pragma HLS PIPELINE + data_T in_data = data.read(); + for (int j = 0; j < n_dupl; j++) { + //#pragma HLS PIPELINE + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + 
for (int k = 0; k < res_T::size; k++) { + //#pragma HLS UNROLL + out_data[k] = in_data[k]; + } + res.write(out_data); + } + } +} + +template +void broadcast_stream_HxWx1(ac_channel &data, ac_channel &res) { + assert(CONFIG_T::in_chan == 1 && CONFIG_T::in_height == CONFIG_T::out_height && + CONFIG_T::in_width == CONFIG_T::out_width); +BroadcastLoop: + for (int i = 0; i < CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::in_chan / data_T::size; i++) { + //#pragma HLS PIPELINE + data_T in_data = data.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + for (int k = 0; k < res_T::size; k++) { + //#pragma HLS UNROLL + out_data[k] = in_data[0]; + } + res.write(out_data); + } +} + +template +void broadcast_stream(ac_channel &data, ac_channel &res) { + if (CONFIG_T::in_height == 1 && CONFIG_T::in_width == 1 && CONFIG_T::in_chan == CONFIG_T::out_chan) { + broadcast_stream_1x1xC(data, res); + } else if (CONFIG_T::in_chan == 1 && CONFIG_T::in_height == CONFIG_T::out_height && + CONFIG_T::in_width == CONFIG_T::out_width) { + broadcast_stream_HxWx1(data, res); + } +} +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_types.h b/hls4ml/templates/catapult/nnet_utils/nnet_types.h new file mode 100644 index 0000000000..d761891fdc --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_types.h @@ -0,0 +1,64 @@ +#ifndef NNET_TYPES_H_ +#define NNET_TYPES_H_ + +#include +#include +#include + +namespace nnet { + +// Fixed-size array +template struct array { + typedef T value_type; + static const unsigned size = N; + + T data[N]; + + T &operator[](size_t pos) { return data[pos]; } + + const T &operator[](size_t pos) const { return data[pos]; } + + array &operator=(const array &other) { + if (&other == this) + return *this; + + assert(N == other.size && "Array sizes must match."); + + for (unsigned i = 0; i < N; i++) { + //#pragma HLS UNROLL + data[i] = other[i]; + } + return *this; + } +}; + +// Generic lookup-table 
implementation, for use in approximations of math functions +template class lookup_table { + public: + lookup_table(T from, T to) : range_start(from), range_end(to), base_div(ac_int<16, false>(N) / T(to - from)) { + T step = (range_end - range_start) / ac_int<16, false>(N); + for (size_t i = 0; i < N; i++) { + T num = range_start + ac_int<16, false>(i) * step; + T sample = func(num); + samples[i] = sample; + } + } + + T operator()(T n) const { + int index = (n - range_start) * base_div; + if (index < 0) + index = 0; + else if (index > N - 1) + index = N - 1; + return samples[index]; + } + + private: + T samples[N]; + const T range_start, range_end; + ac_fixed<20, 16, true> base_div; +}; + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado_accelerator/build_lib.sh b/hls4ml/templates/vivado_accelerator/build_lib.sh old mode 100644 new mode 100755 diff --git a/hls4ml/writer/__init__.py b/hls4ml/writer/__init__.py index 759a7115b1..c49b23f58c 100644 --- a/hls4ml/writer/__init__.py +++ b/hls4ml/writer/__init__.py @@ -1,3 +1,4 @@ +from hls4ml.writer.catapult_writer import CatapultWriter from hls4ml.writer.quartus_writer import QuartusWriter from hls4ml.writer.symbolic_writer import SymbolicExpressionWriter from hls4ml.writer.vitis_writer import VitisWriter @@ -11,4 +12,5 @@ register_writer('Vitis', VitisWriter) register_writer('VitisAccelerator', VitisAcceleratorWriter) register_writer('Quartus', QuartusWriter) +register_writer('Catapult', CatapultWriter) register_writer('SymbolicExpression', SymbolicExpressionWriter) diff --git a/hls4ml/writer/catapult_writer.py b/hls4ml/writer/catapult_writer.py new file mode 100755 index 0000000000..48d44e4a59 --- /dev/null +++ b/hls4ml/writer/catapult_writer.py @@ -0,0 +1,929 @@ +import glob +import os +import tarfile +from collections import OrderedDict +from shutil import copyfile, copytree, rmtree + +import numpy as np +import yaml + +from hls4ml.backends import get_backend +from hls4ml.writer.writers import 
Writer + +config_filename = 'hls4ml_config.yml' + + +class CatapultWriter(Writer): + def print_array_to_cpp(self, var, odir, write_txt_file=True): + """Write a weights array to C++ header files. + + Args: + var (WeightVariable): Weight to write + odir (str): Output directory + write_txt_file (bool, optional): Write txt files in addition to .h files. Defaults to True. + """ + + h_file = open(f"{odir}/firmware/weights/{var.name}.h", "w") + if write_txt_file: + txt_file = open(f"{odir}/firmware/weights/{var.name}.txt", "w") + + # meta data + h_file.write(f"//Numpy array shape {var.shape}\n") + h_file.write(f"//Min {np.min(var.min):.12f}\n") + h_file.write(f"//Max {np.max(var.max):.12f}\n") + h_file.write(f"//Number of zeros {var.nzeros}\n") + h_file.write("\n") + + h_file.write(f"#ifndef {var.name.upper()}_H_\n") + h_file.write(f"#define {var.name.upper()}_H_\n") + h_file.write("\n") + + if write_txt_file: + h_file.write("#ifndef __SYNTHESIS__\n") + h_file.write("// global extern pointer only - actual array allocated in myproject_test.cpp\n") + h_file.write("extern " + var.definition_cpp() + ";\n") + h_file.write("#else\n") + + h_file.write(var.definition_cpp() + " = {") + + # fill c++ array. + # not including internal brackets for multidimensional case + sep = '' + for x in var: + h_file.write(sep + x) + if write_txt_file: + txt_file.write(sep + x) + sep = ", " + h_file.write("};\n") + if write_txt_file: + h_file.write("#endif\n") + txt_file.close() + h_file.write("\n#endif\n") + h_file.close() + + def write_output_dir(self, model): + """Write the base output directory + + Args: + model (ModelGraph): the hls4ml model. + """ + if not os.path.isdir(f"{model.config.get_output_dir()}/firmware/weights"): + os.makedirs(f"{model.config.get_output_dir()}/firmware/weights") + + @staticmethod + def _make_array_pragma(variable, model): + """ + Layers in hls_model.py can specify output array partitioning through the `pragma` attribute. 
+ If `pragma` is a string: options are 'partition', 'reshape', or 'stream'. + If `pragma` is a tuple: (mode, type, factor) where mode is 'partition' or 'reshape', type is + 'complete', 'cyclic', or 'block', and factor is an integer only used when the type is not 'complete'. + """ + + config = variable.pragma + if type(config) is tuple: + mode = config[0] + if mode in ['partition', 'reshape']: + typ = config[1] + if typ != 'complete': + factor = config[2] + elif mode == 'stream': + depth = config[1] + else: + mode = config + typ = 'complete' + factor = 0 + + if mode in ['partition', 'reshape']: + if typ == 'complete': + template = '// #pragma HLS ARRAY_{mode} variable={name} {type} dim={dim}' + else: + template = '// #pragma HLS ARRAY_{mode} variable={name} {type} factor={factor} dim={dim}' + + return template.format(mode=mode.upper(), name=variable.name, type=typ, factor=factor, dim=0) + + elif mode == 'stream': + fifo = model.config.get_config_value("FIFO") + if fifo is not None: + retstr = f'#pragma hls_resource {variable.name}:cns variables="{variable.name}"' + retstr += f' map_to_module="{fifo}" // depth="{depth}"' + return retstr + else: + return '' + else: + return '' + + @staticmethod + def _make_array_fifo_pragma(variable, model): + config = variable.pragma + factor = '' + if type(config) is tuple: + mode = config[0] + if mode in ['partition', 'reshape']: + typ = config[1] + if typ != 'complete': + factor = config[2] + elif mode == 'stream': + depth = config[1] + else: + mode = config + typ = 'complete' + factor = 0 + + if mode == 'stream': + fifo = model.config.get_config_value("FIFO") + if fifo is not None: + return f'// #pragma hls_fifo_depth {depth} {factor}' + else: + return '' + else: + return '' + + def write_project_cpp(self, model): + """Write the main architecture source file (myproject.cpp) + + Args: + model (ModelGraph): the hls4ml model. 
+ """ + + filedir = os.path.dirname(os.path.abspath(__file__)) + + fout = open(f'{model.config.get_output_dir()}/firmware/layer_summary.txt', 'w') + outstr = "" + outstr = outstr + "{}".format("Layer Name").ljust(25) + outstr = outstr + " {}".format("Layer Class").ljust(20) + outstr = outstr + " {}".format("Input Type").ljust(40) + outstr = outstr + " {}".format("Input Shape").ljust(15) + outstr = outstr + " {}".format("Output Type").ljust(40) + outstr = outstr + " {}".format("Output Shape").ljust(15) + # outstr = outstr + " {}".format("Weight Type").ljust(24) + # outstr = outstr + " {}".format("Bias Type").ljust(24) + outstr = outstr + " {}".format("Filter Shape").ljust(15) + outstr = outstr + " {}".format("Stride").ljust(10) + outstr = outstr + " {}".format("IOType").ljust(15) + outstr = outstr + " {}".format("Reuse").ljust(10) + + fout.write(outstr + "\n") + input_shape = "" + input_datatype = "" + for layer in model.get_layers(): + datatype = layer.get_output_variable().type.precision.definition_cpp() + " " + shape = "" + # layer.get_output_variable().type.precision.width + # layer.get_output_variable().type.precision.integer + # layer.get_output_variable().type.precision.sign + for _k, v in layer.get_output_variable().get_shape(): + shape = shape + "[" + str(v) + "]" + + if layer.attributes.layer.class_name != 'Input': + my_class_name = layer.class_name + if layer.attributes.layer.class_name == 'Activation': + my_class_name = layer.get_attr('activation') + + # filter_datatype = "" + # print(layer.weights.__dir__()) + # layer_precision = layer.get_layer_precision() + # for wname, weights in layer.weights.items(): + # print(wname) + # print(weights.type.name) + # print(weights.type.precision.definition_cpp()) + # #print(weights.type.precision.__dir__()) + # print(weights.type.precision.width) + # if 'ACFixed' in weights.type.precision.__class__: + # print(weights.type.precision.integer) + # print(weights.type.precision.signed) + # print(weights.data_length) + + 
filter = "" + filt_width = layer.get_attr('filt_width') + filt_height = layer.get_attr('filt_height') + if filt_width is not None: + filter = "[" + str(filt_width) + "]" + if filt_height is not None: + filter = filter + "[" + str(filt_height) + "]" + + stride = "" + stride_width = layer.get_attr('stride_width') + if stride_width is not None: + stride = str(stride_width) + + outstr = "" + outstr = outstr + f"{layer.name}".ljust(25) + outstr = outstr + f" {my_class_name}".ljust(20) + outstr = outstr + f" {input_datatype}".ljust(40) + outstr = outstr + f" {input_shape}".ljust(15) + outstr = outstr + f" {datatype}".ljust(40) + outstr = outstr + f" {shape}".ljust(15) + # outstr = outstr + " {}".format("weight type").ljust(24) + # outstr = outstr + " {}".format("bias type").ljust(24) + outstr = outstr + f" {filter}".ljust(15) + outstr = outstr + f" {stride}".ljust(10) + outstr = outstr + " {}".format(layer.model.config.get_config_value('IOType')).ljust(15) + outstr = outstr + f" {str(layer.model.config.get_reuse_factor(layer))}".ljust(10) + fout.write(outstr + "\n") + + input_shape = shape + input_datatype = datatype + + fout.close() + + f = open(os.path.join(filedir, '../templates/catapult/firmware/myproject.cpp')) + fout = open(f'{model.config.get_output_dir()}/firmware/{model.config.get_project_name()}.cpp', 'w') + + model_inputs = model.get_input_variables() + model_outputs = model.get_output_variables() + model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] + + indent = ' ' + + for line in f.readlines(): + # Add headers to weights and biases + if 'myproject' in line: + newline = line.replace('myproject', model.config.get_project_name()) + elif '// hls-fpga-machine-learning insert header' in line: + inputs_str = ', '.join([i.definition_cpp(as_reference=True) for i in model_inputs]) + outputs_str = ', '.join([o.definition_cpp(as_reference=True) for o in model_outputs]) + brams_str = ', \n'.join([indent + 
b.definition_cpp(as_reference=False) for b in model_brams]) + + newline = '' + newline += indent + inputs_str + ',\n' + newline += indent + outputs_str + if len(model_brams) > 0: + newline += ',\n' + brams_str + newline += '\n' + + elif '// hls-fpga-machine-learning insert load weights' in line: + newline = line + for layer in model.get_layers(): + for w in layer.get_weights(): + if w.weight_class == 'CompressedWeightVariable': + newline += indent + ' nnet::load_compressed_weights_from_txt<{}, {}>({}, "{}.txt");\n'.format( + w.type.name, w.nonzeros, w.name, w.name + ) + elif w.weight_class == 'ExponentWeightVariable': + newline += indent + ' nnet::load_exponent_weights_from_txt<{}, {}>({}, "{}.txt");\n'.format( + w.type.name, w.data_length, w.name, w.name + ) + else: + newline += indent + ' nnet::load_weights_from_txt<{}, {}>({}, "{}.txt");\n'.format( + w.type.name, w.data_length, w.name, w.name + ) + + # Add Interface Synthesis resource pragmas + elif '// hls-fpga-machine-learning insert IFSynPragmas' in line: + newline = line + all_inputs = [i.name for i in model_inputs] + all_outputs = [o.name for o in model_outputs] + all_brams = [b.name for b in model_brams] + io_type = model.config.get_config_value("IOType") + + if io_type == 'io_serial' or io_type == 'io_stream': + # Eventually this will be amba.ccs_axi4stream_in and amba.ccs_axi4stream_out + for dut_input in all_inputs: + newline += f'#pragma hls_resource {dut_input}:rsc variables="{dut_input}"' + newline += ' map_to_module="ccs_ioport.ccs_in_wait"\n' + for dut_output in all_outputs: + newline += f'#pragma hls_resource {dut_output}:rsc variables="{dut_output}"' + newline += ' map_to_module="ccs_ioport.ccs_out_wait"\n' + + # Add input/output type + elif '// hls-fpga-machine-learning insert IO' in line: + newline = line + all_inputs = [i.name for i in model_inputs] + all_outputs = [o.name for o in model_outputs] + all_brams = [b.name for b in model_brams] + io_type = model.config.get_config_value("IOType") + 
+ if io_type == 'io_parallel': + for i in model_inputs: + newline += indent + self._make_array_pragma(i, model) + '\n' + for o in model_outputs: + newline += indent + self._make_array_pragma(o, model) + '\n' + # TODO discussed adding a handle for setting the interface mode for individual input and output arrays + # Probably the handle doesn't need to be exposed to the user but should be just set in hls_model.py + newline += indent + '// #pragma HLS INTERFACE ap_vld port={},{} \n'.format( + ','.join(all_inputs), ','.join(all_outputs) + ) + if model.config.model_strategy.lower() == 'dataflow': + newline += indent + '// #pragma HLS DATAFLOW \n' + else: + newline += indent + '// #pragma HLS PIPELINE \n' + if io_type == 'io_stream': + newline += indent + '// #pragma HLS INTERFACE axis port={},{} \n'.format( + ','.join(all_inputs), ','.join(all_outputs) + ) + if all_brams: + newline += indent + '// #pragma HLS INTERFACE bram port={} \n'.format(','.join(all_brams)) + newline += indent + '// #pragma HLS DATAFLOW \n' + + elif '// hls-fpga-machine-learning insert layers' in line: + io_type = model.config.get_config_value("IOType") + newline = line + '\n' + for layer in model.get_layers(): + vars = layer.get_variables() + for var in vars: + if var not in model_inputs and var not in model_outputs: + def_cpp = var.definition_cpp() + if def_cpp is not None: + if var.pragma: + newline += ' ' + self._make_array_fifo_pragma(var, model) + '\n' + if io_type == 'io_serial' or io_type == 'io_stream': + newline += ' static ' + def_cpp + '; \n' + else: + newline += ' ' + def_cpp + '; \n' + if var.pragma: + newline += ' ' + self._make_array_pragma(var, model) + '\n' + func = layer.get_attr('function_cpp', None) + if func: + if not isinstance(func, (list, set)): + func = [func] + if len(func) == 1: + newline += ' ' + func[0] + ' // ' + layer.name + '\n' + else: + newline += ' // ' + layer.name + '\n' + for line in func: + newline += ' ' + line + '\n' + if model.config.trace_output and 
layer.get_attr('trace', False): + newline += '#ifndef __SYNTHESIS__\n' + for var in vars: + newline += ' nnet::save_layer_output<{}>({}, "{}", {});\n'.format( + var.type.name, var.name, layer.name, var.size_cpp() + ) + newline += '#endif\n' + newline += '\n' + + # Just copy line + else: + newline = line + + fout.write(newline) + + f.close() + fout.close() + + def write_project_header(self, model): + """Write the main architecture header file (myproject.h) + + Args: + model (ModelGraph): the hls4ml model. + """ + + filedir = os.path.dirname(os.path.abspath(__file__)) + f = open(os.path.join(filedir, '../templates/catapult/firmware/myproject.h')) + fout = open(f'{model.config.get_output_dir()}/firmware/{model.config.get_project_name()}.h', 'w') + + model_inputs = model.get_input_variables() + model_outputs = model.get_output_variables() + model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] + + indent = ' ' + + for line in f.readlines(): + if 'MYPROJECT' in line: + newline = line.replace('MYPROJECT', format(model.config.get_project_name().upper())) + elif 'myproject' in line: + newline = line.replace('myproject', model.config.get_project_name()) + elif '// hls-fpga-machine-learning insert header' in line: + inputs_str = ', '.join([i.definition_cpp(as_reference=True) for i in model_inputs]) + outputs_str = ', '.join([o.definition_cpp(as_reference=True) for o in model_outputs]) + brams_str = ', \n'.join([indent + b.definition_cpp(as_reference=False) for b in model_brams]) + + newline = '' + newline += indent + inputs_str + ',\n' + newline += indent + outputs_str + if len(model_brams) > 0: + newline += ',\n' + brams_str + newline += '\n' + else: + newline = line + fout.write(newline) + + f.close() + fout.close() + + def write_defines(self, model): + """Write the C++ type definitions file (defines.h) + + Args: + model (ModelGraph): the hls4ml model. 
+ """ + filedir = os.path.dirname(os.path.abspath(__file__)) + f = open(os.path.join(filedir, '../templates/catapult/firmware/defines.h')) + fout = open(f'{model.config.get_output_dir()}/firmware/defines.h', 'w') + + for line in f.readlines(): + # Insert numbers + if '// hls-fpga-machine-learning insert numbers' in line: + newline = line + + defines_list = [] + for layer in model.get_layers(): + defines = '' + for k, v in layer.get_output_variable().get_shape(): + defines += f'#define {k} {v}\n' + + defines_list.append(defines) + + newline += ''.join(defines_list) + + elif '// hls-fpga-machine-learning insert layer-precision' in line: + newline = line + all_precision = OrderedDict() + for layer in model.get_layers(): + layer_precision = layer.get_layer_precision() + for type_name, type_var in layer_precision.items(): + # Ensure that layer's types doesn't override existing types + # This can happen in case of InplaceVariable types + if type_name not in all_precision: + all_precision[type_name] = type_var + for used_type in all_precision.values(): + newline += used_type.definition_cpp() + + else: + newline = line + fout.write(newline) + f.close() + fout.close() + + def write_parameters(self, model): + """Write the C++ layer config file (parameters.h) + + Args: + model (ModelGraph): the hls4ml model. 
+ """ + filedir = os.path.dirname(os.path.abspath(__file__)) + f = open(os.path.join(filedir, '../templates/catapult/firmware/parameters.h')) + fout = open(f'{model.config.get_output_dir()}/firmware/parameters.h', 'w') + + for line in f.readlines(): + if '// hls-fpga-machine-learning insert includes' in line: + newline = line + for include in sorted(set(sum((layer.get_attr('include_header', []) for layer in model.get_layers()), []))): + newline += '#include "%s"\n' % include + + elif '// hls-fpga-machine-learning insert weights' in line: + newline = line + for layer in model.get_layers(): + for w in layer.get_weights(): + if w.storage.lower() != 'bram': + newline += f'#include "weights/{w.name}.h"\n' + + elif "// hls-fpga-machine-learning insert layer-config" in line: + newline = line + for layer in model.get_layers(): + config = layer.get_attr('config_cpp', None) + if config: + newline += '// ' + layer.name + '\n' + newline += config + '\n' + else: + newline = line + fout.write(newline) + f.close() + fout.close() + + def write_weights(self, model): + """Write the weights into header files + + Args: + model (ModelGraph): the hls4ml model. + """ + for layer in model.get_layers(): + for weights in layer.get_weights(): + self.print_array_to_cpp(weights, model.config.get_output_dir()) + + def __make_dat_file(self, original_path, project_path): + """ + Convert other input/output data types into a dat file, which is + a text file with the falttened matrix printed out. Note that ' ' is + assumed to be the delimiter. 
+ """ + + # Take in data from current supported data files + if original_path[-3:] == "npy": + data = np.load(original_path) + else: + raise Exception("Unsupported input/output data files.") + + # Faltten data, just keep first dimension + data = data.reshape(data.shape[0], -1) + + def print_data(f): + for i in range(data.shape[0]): + for j in range(data.shape[1]): + f.write(str(data[i][j]) + " ") + f.write("\n") + + # Print out in dat file + with open(project_path, "w") as f: + print_data(f) + + def write_test_bench(self, model): + """Write the testbench files (myproject_test.cpp and input/output .dat files) + + Args: + model (ModelGraph): the hls4ml model. + """ + + filedir = os.path.dirname(os.path.abspath(__file__)) + + if not os.path.exists(f'{model.config.get_output_dir()}/tb_data/'): + os.mkdir(f'{model.config.get_output_dir()}/tb_data/') + + input_data = model.config.get_config_value('InputData') + output_predictions = model.config.get_config_value('OutputPredictions') + + if input_data: + if input_data[-3:] == "dat": + copyfile(input_data, f'{model.config.get_output_dir()}/tb_data/tb_input_features.dat') + else: + self.__make_dat_file(input_data, f'{model.config.get_output_dir()}/tb_data/tb_input_features.dat') + + if output_predictions: + if output_predictions[-3:] == "dat": + copyfile(output_predictions, f'{model.config.get_output_dir()}/tb_data/tb_output_predictions.dat') + else: + self.__make_dat_file( + output_predictions, f'{model.config.get_output_dir()}/tb_data/tb_output_predictions.dat' + ) + + f = open(os.path.join(filedir, '../templates/catapult/myproject_test.cpp')) + fout = open(f'{model.config.get_output_dir()}/{model.config.get_project_name()}_test.cpp', 'w') + + model_inputs = model.get_input_variables() + model_outputs = model.get_output_variables() + model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] + + for line in f.readlines(): + indent = ' ' * (len(line) - len(line.lstrip(' '))) + + # Insert 
numbers + if 'myproject' in line: + newline = line.replace('myproject', model.config.get_project_name()) + elif '// hls-fpga-machine-learning insert bram' in line: + newline = line + for bram in model_brams: + newline += f'#include \"firmware/weights/{bram.name}.h\"\n' + + elif '// hls-fpga-machine-learning insert declare weights' in line: + newline = line + for layer in model.get_layers(): + for w in layer.get_weights(): + newline += w.definition_cpp() + ";\n" + + elif '// hls-fpga-machine-learning insert load weights' in line: + newline = line + for layer in model.get_layers(): + for w in layer.get_weights(): + if w.weight_class == 'CompressedWeightVariable': + newline += indent + ' nnet::load_compressed_weights_from_txt<{}, {}>({}, "{}.txt");\n'.format( + w.type.name, w.nonzeros, w.name, w.name + ) + elif w.weight_class == 'ExponentWeightVariable': + newline += indent + ' nnet::load_exponent_weights_from_txt<{}, {}>({}, "{}.txt");\n'.format( + w.type.name, w.data_length, w.name, w.name + ) + else: + newline += indent + ' nnet::load_weights_from_txt<{}, {}>({}, "{}.txt");\n'.format( + w.type.name, w.data_length, w.name, w.name + ) + + elif '// hls-fpga-machine-learning insert data' in line: + newline = line + offset = 0 + for inp in model_inputs: + newline += ' ' + inp.definition_cpp() + ';\n' + newline += ' nnet::copy_data(in, {});\n'.format( + inp.type.name, offset, inp.size_cpp(), inp.name + ) + offset += inp.size() + for out in model_outputs: + newline += ' ' + out.definition_cpp() + ';\n' + elif '// hls-fpga-machine-learning insert random' in line: + newline = line + for inp in model_inputs: + newline += ' ' + inp.definition_cpp() + ';\n' + newline += f' nnet::fill_random<{inp.type.name}, {inp.size_cpp()}>({inp.name});\n' + for out in model_outputs: + newline += ' ' + out.definition_cpp() + ';\n' + elif '// hls-fpga-machine-learning insert zero' in line: + newline = line + for inp in model_inputs: + newline += ' ' + inp.definition_cpp() + ';\n' + newline += 
f' nnet::fill_zero<{inp.type.name}, {inp.size_cpp()}>({inp.name});\n' + for out in model_outputs: + newline += ' ' + out.definition_cpp() + ';\n' + elif '// hls-fpga-machine-learning insert top-level-function' in line: + newline = line + + input_vars = ','.join([i.name for i in model_inputs]) + output_vars = ','.join([o.name for o in model_outputs]) + bram_vars = ','.join([b.name for b in model_brams]) + + # Concatenate the input, output, and bram variables. Filter out empty/null values + all_vars = ','.join(filter(None, [input_vars, output_vars, bram_vars])) + + top_level = indent + f'{model.config.get_project_name()}({all_vars});\n' + + newline += top_level + elif '// hls-fpga-machine-learning insert predictions' in line: + newline = line + for out in model_outputs: + newline += indent + f'for(int i = 0; i < {out.size_cpp()}; i++) {{\n' + newline += indent + ' std::cout << pr[i] << " ";\n' + newline += indent + '}\n' + newline += indent + 'std::cout << std::endl;\n' + elif '// hls-fpga-machine-learning insert tb-output' in line: + newline = line + for out in model_outputs: + newline += indent + 'nnet::print_result<{}, {}>({}, fout);\n'.format( + out.type.name, out.size_cpp(), out.name + ) # TODO enable this + elif ( + '// hls-fpga-machine-learning insert output' in line + or '// hls-fpga-machine-learning insert quantized' in line + ): + newline = line + for out in model_outputs: + newline += indent + 'nnet::print_result<{}, {}>({}, std::cout, true);\n'.format( + out.type.name, out.size_cpp(), out.name + ) + else: + newline = line + fout.write(newline) + f.close() + fout.close() + + def write_bridge(self, model): + """Write the Python-C++ bridge (myproject_bridge.cpp) + + Args: + model (ModelGraph): the hls4ml model. 
+ """ + + filedir = os.path.dirname(os.path.abspath(__file__)) + f = open(os.path.join(filedir, '../templates/catapult/myproject_bridge.cpp')) + fout = open(f'{model.config.get_output_dir()}/{model.config.get_project_name()}_bridge.cpp', 'w') + + model_inputs = model.get_input_variables() + model_outputs = model.get_output_variables() + model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] + + indent = ' ' + + for line in f.readlines(): + if 'MYPROJECT' in line: + newline = line.replace('MYPROJECT', format(model.config.get_project_name().upper())) + elif 'myproject' in line: + newline = line.replace('myproject', format(model.config.get_project_name())) + elif '// hls-fpga-machine-learning insert bram' in line: + newline = line + for bram in model_brams: + newline += f'#include \"firmware/weights/{bram.name}.h\"\n' + elif '// hls-fpga-machine-learning insert declare weights' in line: + newline = line + for layer in model.get_layers(): + for w in layer.get_weights(): + newline += w.definition_cpp() + ";\n" + elif '// hls-fpga-machine-learning insert header' in line: + dtype = line.split('#', 1)[1].strip() + inputs_str = ', '.join([f'{dtype} {i.name}[{i.size_cpp()}]' for i in model_inputs]) + outputs_str = ', '.join([f'{dtype} {o.name}[{o.size_cpp()}]' for o in model_outputs]) + + newline = '' + newline += indent + inputs_str + ',\n' + newline += indent + outputs_str + '\n' + elif '// hls-fpga-machine-learning insert wrapper' in line: + dtype = line.split('#', 1)[1].strip() + newline = '' + for i in model_inputs: + newline += indent + '{var};\n'.format(var=i.definition_cpp(name_suffix='_ap')) + newline += indent + 'nnet::convert_data<{}, {}, {}>({}, {}_ap);\n'.format( + dtype, i.type.name, i.size_cpp(), i.name, i.name + ) + newline += '\n' + + for o in model_outputs: + newline += indent + '{var};\n'.format(var=o.definition_cpp(name_suffix='_ap')) + + newline += '\n' + + input_vars = ','.join([i.name + '_ap' for i in 
model_inputs]) + bram_vars = ','.join([b.name for b in model_brams]) + output_vars = ','.join([o.name + '_ap' for o in model_outputs]) + + # Concatenate the input, output, and bram variables. Filter out empty/null values + all_vars = ','.join(filter(None, [input_vars, output_vars, bram_vars])) + + top_level = indent + f'{model.config.get_project_name()}({all_vars});\n' + newline += top_level + + newline += '\n' + + for o in model_outputs: + newline += indent + 'nnet::convert_data<{}, {}, {}>({}_ap, {});\n'.format( + o.type.name, dtype, o.size_cpp(), o.name, o.name + ) + elif '// hls-fpga-machine-learning insert trace_outputs' in line: + newline = '' + for layer in model.get_layers(): + func = layer.get_attr('function_cpp', None) + if func and model.config.trace_output and layer.get_attr('trace', False): + vars = layer.get_variables() + for var in vars: + newline += ( + indent + + 'nnet::trace_outputs->insert(std::pair(' + + f'"{layer.name}", (void *) malloc({var.size_cpp()} * element_size)));\n' + ) + + else: + newline = line + fout.write(newline) + + f.close() + fout.close() + + def write_build_script(self, model): + """Write the TCL/Shell build scripts (build_prj.tcl, build_lib.sh) + + Args: + model (ModelGraph): the hls4ml model. 
+ """ + + filedir = os.path.dirname(os.path.abspath(__file__)) + + # build_prj.tcl + srcpath = os.path.join(filedir, '../templates/catapult/build_prj.tcl') + dstpath = f'{model.config.get_output_dir()}/build_prj.tcl' + # copyfile(srcpath, dstpath) + f = open(srcpath) + fout = open(dstpath, 'w') + for line in f.readlines(): + indent = line[: len(line) - len(line.lstrip())] + line = line.replace('myproject', model.config.get_project_name()) + line = line.replace('CATAPULT_DIR', model.config.get_project_dir()) + if '#hls-fpga-machine-learning insert techlibs' in line: + if model.config.get_config_value('Technology') is None: + if model.config.get_config_value('Part') is not None: + line = indent + 'setup_xilinx_part {{{}}}\n'.format(model.config.get_config_value('Part')) + elif model.config.get_config_value('ASICLibs') is not None: + line = indent + 'setup_asic_libs {{{}}}\n'.format(model.config.get_config_value('ASICLibs')) + else: + if model.config.get_config_value('Technology') == 'asic': + line = indent + 'setup_asic_libs {{{}}}\n'.format(model.config.get_config_value('ASICLibs')) + else: + line = indent + 'setup_xilinx_part {{{}}}\n'.format(model.config.get_config_value('Part')) + elif '#hls-fpga-machine-learning insert invoke_args' in line: + tb_in_file = model.config.get_config_value('InputData') + tb_out_file = model.config.get_config_value('OutputPredictions') + invoke_args = '$sfd/firmware/weights' + if tb_in_file is not None: + invoke_args = invoke_args + f' $sfd/tb_data/{tb_in_file}' + if tb_out_file is not None: + invoke_args = invoke_args + f' $sfd/tb_data/{tb_out_file}' + line = indent + f'flow package option set /SCVerify/INVOKE_ARGS "{invoke_args}"\n' + elif 'set hls_clock_period 5' in line: + line = indent + 'set hls_clock_period {}\n'.format(model.config.get_config_value('ClockPeriod')) + fout.write(line) + f.close() + fout.close() + + # build_lib.sh + f = open(os.path.join(filedir, '../templates/catapult/build_lib.sh')) + fout = 
open(f'{model.config.get_output_dir()}/build_lib.sh', 'w') + + for line in f.readlines(): + line = line.replace('myproject', model.config.get_project_name()) + line = line.replace('mystamp', model.config.get_config_value('Stamp')) + + fout.write(line) + f.close() + fout.close() + + def write_nnet_utils(self, model): + """Copy the nnet_utils, AP types headers and any custom source to the project output directory + + Args: + model (ModelGraph): the hls4ml model. + """ + + # nnet_utils + filedir = os.path.dirname(os.path.abspath(__file__)) + + srcpath = os.path.join(filedir, '../templates/catapult/nnet_utils/') + dstpath = f'{model.config.get_output_dir()}/firmware/nnet_utils/' + + if not os.path.exists(dstpath): + os.mkdir(dstpath) + + headers = [os.path.basename(h) for h in glob.glob(srcpath + '*.h')] + + if model.config.get_config_value('DontCopyNNET') is not None: + h = 'nnet_code_gen.h' + copyfile(srcpath + h, dstpath + h) + return + + for h in headers: + copyfile(srcpath + h, dstpath + h) + + print("Copying NNET files to local firmware directory") + + filedir = os.path.dirname(os.path.abspath(__file__)) + for pkg in ('ac_types', 'ac_math', 'ac_simutils'): + dstpath = f'{model.config.get_output_dir()}/firmware/{pkg}/' + + # backward compatibility, look in root dir + srcpath = os.path.join(filedir, '../../' + pkg + '/') + if not os.path.exists(srcpath): + # look next in Catapult-specific templates + srcpath = os.path.join(filedir, '../templates/catapult/' + pkg + '/') + + if os.path.exists(srcpath): + if os.path.exists(dstpath): + rmtree(dstpath) + print("... copying AC " + pkg + " headers from " + srcpath) + copytree(srcpath, dstpath) + else: + print("... 
skipping copy of " + pkg + " headers - assumed to located in Catapult install tree") + + # custom source + filedir = os.path.dirname(os.path.abspath(__file__)) + + custom_source = get_backend('Catapult').get_custom_source() + for dst, srcpath in custom_source.items(): + dstpath = f'{model.config.get_output_dir()}/firmware/{dst}' + copyfile(srcpath, dstpath) + + def write_generated_code(self, model): + """Write the generated code (nnet_code_gen.h) + + Args: + model (ModelGraph): the hls4ml model. + """ + path = f'{model.config.get_output_dir()}/firmware/nnet_utils/nnet_code_gen.h' + f = open(path) + contents = f.readlines() + f.close() + f = open(path, 'w') + + for line in contents: + if '// hls4ml insert code' in line: + newline = line + for layer in model.get_layers(): + for generated_code in layer.code.values(): + newline += str(generated_code) + else: + newline = line + f.write(newline) + f.close() + + def write_yml(self, model): + """Write the config to the YAML file + + Args: + model (ModelGraph): the hls4ml model. + """ + + def keras_model_representer(dumper, keras_model): + model_path = model.config.get_output_dir() + '/keras_model.h5' + keras_model.save(model_path) + return dumper.represent_scalar('!keras_model', model_path) + + try: + from tensorflow.keras import Model as KerasModel + + yaml.add_multi_representer(KerasModel, keras_model_representer) + except Exception: + pass + + with open(model.config.get_output_dir() + '/' + config_filename, 'w') as file: + yaml.dump(model.config.config, file) + + def write_tar(self, model): + """Write the generated project as a .tar.gz archive + + Args: + model (ModelGraph): the hls4ml model. 
+ """ + + if not os.path.exists(model.config.get_output_dir() + '.tar.gz'): + with tarfile.open(model.config.get_output_dir() + '.tar.gz', mode='w:gz') as archive: + archive.add(model.config.get_output_dir(), recursive=True) + else: + print("Project .tar.gz archive already exists") + + def write_hls(self, model): + print('Writing HLS project') + self.write_output_dir(model) + self.write_project_cpp(model) + self.write_project_header(model) + self.write_weights(model) + self.write_defines(model) + self.write_parameters(model) + self.write_test_bench(model) + self.write_bridge(model) + self.write_build_script(model) + self.write_nnet_utils(model) + self.write_generated_code(model) + self.write_yml(model) + self.write_tar(model) + print('Done') diff --git a/test/pytest/ci-template.yml b/test/pytest/ci-template.yml index 5477da933a..50e9f799f6 100644 --- a/test/pytest/ci-template.yml +++ b/test/pytest/ci-template.yml @@ -5,7 +5,8 @@ - k8s-default before_script: - source ~/.bashrc - - if [ $EXAMPLEMODEL == 1 ]; then git submodule init; git submodule update; fi + - git submodule update --init --recursive hls4ml/templates/catapult/ + - if [ $EXAMPLEMODEL == 1 ]; then git submodule update --init example-models; fi - conda activate hls4ml-testing - pip install .[testing,sr,optimization] script: diff --git a/test/pytest/test_activations.py b/test/pytest/test_activations.py index caaaed636a..5ab9481e1a 100644 --- a/test/pytest/test_activations.py +++ b/test/pytest/test_activations.py @@ -12,7 +12,7 @@ # Variable 'name' is simply used as an identifier for the activation -@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Catapult', 'Quartus']) @pytest.mark.parametrize('shape, io_type', [((8,), 'io_parallel'), ((8,), 'io_stream'), ((8, 8, 3), 'io_stream')]) @pytest.mark.parametrize( 'activation, name', diff --git a/test/pytest/test_batchnorm.py b/test/pytest/test_batchnorm.py index c0ef0705ae..727d2ee574 
100644 --- a/test/pytest/test_batchnorm.py +++ b/test/pytest/test_batchnorm.py @@ -29,7 +29,7 @@ def model(request): @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) -@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'Catapult']) @pytest.mark.parametrize('model', [True, False], indirect=True) def test_batchnorm(model, data, backend, io_type): default_precision = 'ac_fixed<32, 1, true>' if backend == 'Quartus' else 'ac_fixed<32, 1>' diff --git a/test/pytest/test_batchnorm_pytorch.py b/test/pytest/test_batchnorm_pytorch.py index a7a0c80247..93cda2729c 100644 --- a/test/pytest/test_batchnorm_pytorch.py +++ b/test/pytest/test_batchnorm_pytorch.py @@ -21,7 +21,7 @@ def data(): @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) -@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'Catapult']) def test_batchnorm(data, backend, io_type): model = nn.Sequential( nn.BatchNorm1d(in_shape), diff --git a/test/pytest/test_clone_flatten.py b/test/pytest/test_clone_flatten.py index 12f30985bf..5f631d027f 100644 --- a/test/pytest/test_clone_flatten.py +++ b/test/pytest/test_clone_flatten.py @@ -28,7 +28,7 @@ def keras_model(): @pytest.fixture @pytest.mark.parametrize('io_type', ['io_stream']) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Quartus', 'Catapult']) def hls_model(keras_model, backend, io_type): hls_config = hls4ml.utils.config_from_keras_model( keras_model, diff --git a/test/pytest/test_cnn_mnist.py b/test/pytest/test_cnn_mnist.py index ab3365f228..27b966f51d 100644 --- a/test/pytest/test_cnn_mnist.py +++ b/test/pytest/test_cnn_mnist.py @@ -61,7 +61,7 @@ def keras_model(mnist_data): ('Vitis', 'io_parallel', 'resource'), ('Vitis', 'io_parallel', 'latency'), ('Vitis', 'io_stream', 'latency'), - ('Vitis', 
'io_stream', 'resource'), + ('Vitis', 'io_stream', 'latency'), ], ) def test_mnist_cnn(keras_model, mnist_data, backend, io_type, strategy): diff --git a/test/pytest/test_conv1d.py b/test/pytest/test_conv1d.py index 79beb01a2c..48357a42a1 100644 --- a/test/pytest/test_conv1d.py +++ b/test/pytest/test_conv1d.py @@ -41,6 +41,8 @@ def keras_model(): ('Vitis', 'io_parallel', 'latency'), ('Vitis', 'io_stream', 'latency'), ('Vitis', 'io_stream', 'resource'), + ('Catapult', 'io_stream', 'latency'), + ('Catapult', 'io_stream', 'resource'), ], ) def hls_model(keras_model, backend, io_type, strategy): @@ -91,6 +93,8 @@ def hls_model(keras_model, backend, io_type, strategy): ('Vitis', 'io_parallel', 'latency'), ('Vitis', 'io_stream', 'latency'), ('Vitis', 'io_stream', 'resource'), + ('Catapult', 'io_stream', 'latency'), + ('Catapult', 'io_stream', 'resource'), ], ) def test_accuracy(data, keras_model, hls_model): diff --git a/test/pytest/test_embed.py b/test/pytest/test_embed.py index fd8e39cdb9..a27fc45b93 100644 --- a/test/pytest/test_embed.py +++ b/test/pytest/test_embed.py @@ -25,7 +25,7 @@ def keras_model(): @pytest.fixture -@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'Catapult']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def hls_model(keras_model, backend, io_type): hls_config = hls4ml.utils.config_from_keras_model(keras_model, default_precision='ap_fixed<16,6>', granularity='name') @@ -39,7 +39,7 @@ def hls_model(keras_model, backend, io_type): return hls_model -@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'Catapult']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_embedding_accuracy(data, keras_model, hls_model): X = data diff --git a/test/pytest/test_globalpooling.py b/test/pytest/test_globalpooling.py index c402a53cdf..b99f0d8212 
100644 --- a/test/pytest/test_globalpooling.py +++ b/test/pytest/test_globalpooling.py @@ -32,7 +32,7 @@ def keras_model_1d(request): return model, model_type, keepdims -@pytest.mark.parametrize('backend', ['Quartus', 'Vitis', 'Vivado']) +@pytest.mark.parametrize('backend', ['Quartus', 'Vitis', 'Vivado', 'Catapult']) @pytest.mark.parametrize( 'keras_model_1d', [ @@ -87,7 +87,7 @@ def keras_model_2d(request): return model, model_type, keepdims -@pytest.mark.parametrize('backend', ['Quartus', 'Vitis', 'Vivado']) +@pytest.mark.parametrize('backend', ['Quartus', 'Vitis', 'Vivado', 'Catapult']) @pytest.mark.parametrize( 'keras_model_2d', [ diff --git a/test/pytest/test_keras_h5_loader.py b/test/pytest/test_keras_h5_loader.py index b53bb3a668..0c42adee31 100644 --- a/test/pytest/test_keras_h5_loader.py +++ b/test/pytest/test_keras_h5_loader.py @@ -9,7 +9,7 @@ test_root_path = Path(__file__).parent -@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'Catapult']) def test_keras_h5_loader(backend): input_shape = (10,) model = tf.keras.models.Sequential( diff --git a/test/pytest/test_keras_nested_model.py b/test/pytest/test_keras_nested_model.py index 8c4670ad51..66fa81e2f9 100755 --- a/test/pytest/test_keras_nested_model.py +++ b/test/pytest/test_keras_nested_model.py @@ -127,7 +127,7 @@ def randX_20_15(): return randX(20, 15) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Quartus', 'Catapult']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_nested_model(randX_20_15, backend, io_type): n_in = 15 @@ -150,7 +150,7 @@ def test_nested_model(randX_20_15, backend, io_type): np.testing.assert_allclose(y_keras.ravel(), y_hls4ml.ravel(), rtol=1e-2, atol=0.02) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Quartus', 'Catapult']) 
@pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_sub_nested_model(randX_20_15, backend, io_type): n_in = 15 diff --git a/test/pytest/test_pointwiseconv.py b/test/pytest/test_pointwiseconv.py index b7fee0a4ab..060b9877de 100644 --- a/test/pytest/test_pointwiseconv.py +++ b/test/pytest/test_pointwiseconv.py @@ -31,6 +31,8 @@ ('Vivado', 'io_stream', 'resource'), ('Vitis', 'io_stream', 'latency'), ('Vitis', 'io_stream', 'resource'), + ('Catapult', 'io_stream', 'latency'), + ('Catapult', 'io_stream', 'resource'), ], ) def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy): @@ -87,6 +89,8 @@ def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy): ('Vivado', 'io_parallel', 'latency'), ('Vivado', 'io_stream', 'latency'), ('Vivado', 'io_stream', 'resource'), + ('Catapult', 'io_stream', 'latency'), + ('Catapult', 'io_stream', 'resource'), ], ) def test_pointwiseconv2d(chans, padds, strides, backend, io_type, strategy): diff --git a/test/pytest/test_pooling.py b/test/pytest/test_pooling.py index 1f958696d8..d7de80a5a7 100644 --- a/test/pytest/test_pooling.py +++ b/test/pytest/test_pooling.py @@ -32,7 +32,7 @@ def keras_model_1d(request): return model, model_type, pads -@pytest.mark.parametrize('backend', ['Quartus', 'Vitis', 'Vivado']) +@pytest.mark.parametrize('backend', ['Quartus', 'Vitis', 'Vivado', 'Catapult']) @pytest.mark.parametrize( 'keras_model_1d', [ @@ -87,7 +87,7 @@ def keras_model_2d(request): return model, model_type, pads -@pytest.mark.parametrize('backend', ['Quartus', 'Vitis', 'Vivado']) +@pytest.mark.parametrize('backend', ['Quartus', 'Vitis', 'Vivado', 'Catapult']) @pytest.mark.parametrize( 'keras_model_2d', [ diff --git a/test/pytest/test_repack_stream.py b/test/pytest/test_repack_stream.py index 12d44a66b7..04cc9867a9 100644 --- a/test/pytest/test_repack_stream.py +++ b/test/pytest/test_repack_stream.py @@ -9,7 +9,7 @@ test_root_path = Path(__file__).parent 
-@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'Catapult']) def test_repack_precision(backend: str): inp = keras.Input(shape=(3, 3), name='inp') out = keras.layers.Reshape((3, 3), name='reshape')(inp) @@ -41,7 +41,7 @@ def test_repack_precision(backend: str): assert repack_precision.signed is True, 'Precision mismatch' -@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'Catapult']) @pytest.mark.parametrize('strategy', ['Latency', 'Resource']) def test_repack(backend: str, strategy: str): inp1 = keras.Input(shape=(4,), name='inp1') diff --git a/test/pytest/test_reshape.py b/test/pytest/test_reshape.py index 3c421c1474..ac277bb491 100755 --- a/test/pytest/test_reshape.py +++ b/test/pytest/test_reshape.py @@ -21,7 +21,7 @@ def randX_20_10(): return randX(20, 10) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Quartus', 'Catapult']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_reshape_parallel(randX_20_10, backend, io_type): model = tf.keras.models.Sequential( diff --git a/test/pytest/test_sepconv1d.py b/test/pytest/test_sepconv1d.py index a75d854283..64b72db48a 100644 --- a/test/pytest/test_sepconv1d.py +++ b/test/pytest/test_sepconv1d.py @@ -25,7 +25,7 @@ @pytest.mark.parametrize('kernels', kernel_options) @pytest.mark.parametrize('bias', bias_options) @pytest.mark.parametrize('io_type', io_type_options) -@pytest.mark.parametrize('backend', ['Vivado', 'Vitis']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Catapult']) def test_sepconv1d(conv1d, chans, padds, strides, kernels, bias, io_type, backend): model = tf.keras.models.Sequential() input_shape = (28, 3) diff --git a/test/pytest/test_sepconv2d.py b/test/pytest/test_sepconv2d.py index 1ce85c5016..2fa2d94afe 100644 --- 
a/test/pytest/test_sepconv2d.py +++ b/test/pytest/test_sepconv2d.py @@ -25,7 +25,7 @@ @pytest.mark.parametrize("kernels", kernel_options) @pytest.mark.parametrize("bias", bias_options) @pytest.mark.parametrize("io_type", io_type_options) -@pytest.mark.parametrize('backend', ['Vivado', 'Vitis']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Catapult']) def test_sepconv2d(conv2d, chans, padds, strides, kernels, bias, io_type, backend): model = tf.keras.models.Sequential() input_shape = (28, 28, 3) diff --git a/test/pytest/test_softmax.py b/test/pytest/test_softmax.py index 3cab00745c..19c9042465 100644 --- a/test/pytest/test_softmax.py +++ b/test/pytest/test_softmax.py @@ -19,7 +19,7 @@ def generate_data(input_shape): return np.clip(d, -32, 31) -@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'Catapult']) @pytest.mark.parametrize('strategy', ['stable', 'latency', 'argmax']) @pytest.mark.parametrize( 'input_bits,input_shape,table_bits,io_type', @@ -65,7 +65,7 @@ def test_softmax(backend, strategy, generate_data, input_bits, input_shape, tabl assert acc_hls4ml >= 0.98 -@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'Catapult']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_softmax_skipped(backend, io_type): X = np.random.rand(100, 10) diff --git a/test/pytest/test_softsign.py b/test/pytest/test_softsign.py index a23e89e7da..217865fe46 100644 --- a/test/pytest/test_softsign.py +++ b/test/pytest/test_softsign.py @@ -10,7 +10,7 @@ test_root_path = Path(__file__).parent -@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'Catapult']) @pytest.mark.parametrize('input_shape, io_type', [((8,), 'io_parallel'), ((8,), 'io_stream'), ((8, 8, 3), 'io_stream')]) def 
test_softsign(backend, input_shape, io_type): X = np.random.rand(1000, *input_shape) diff --git a/test/pytest/test_upsampling.py b/test/pytest/test_upsampling.py index 8ec5cabda9..9051d582bd 100644 --- a/test/pytest/test_upsampling.py +++ b/test/pytest/test_upsampling.py @@ -46,7 +46,7 @@ def keras_model_2d(): @pytest.mark.parametrize('io_type', ['io_stream', 'io_parallel']) -@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'Catapult']) @pytest.mark.parametrize('model_type', ['1d', '2d']) def test_upsampling(keras_model_1d, keras_model_2d, data_1d, data_2d, model_type, io_type, backend): if model_type == '1d': diff --git a/test/pytest/test_zeropadding.py b/test/pytest/test_zeropadding.py index 962a3334a6..95f7d79a7d 100644 --- a/test/pytest/test_zeropadding.py +++ b/test/pytest/test_zeropadding.py @@ -50,7 +50,7 @@ def keras_model_2d(): @pytest.mark.parametrize('io_type', ['io_stream', 'io_parallel']) -@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'Catapult']) @pytest.mark.parametrize('model_type', ['1d', '2d']) def test_zeropadding(keras_model_1d, keras_model_2d, data_1d, data_2d, model_type, io_type, backend): if model_type == '1d': From 2a71a8391efb4533374bb4d1eb5019c265558f98 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 15 Apr 2024 20:06:25 +0000 Subject: [PATCH 023/103] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/psf/black: 24.3.0 → 24.4.0](https://github.com/psf/black/compare/24.3.0...24.4.0) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a817208398..275b349422 100644 --- a/.pre-commit-config.yaml +++ 
b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ exclude: (^hls4ml\/templates\/(vivado|quartus)\/(ap_types|ac_types)\/|^test/pyte repos: - repo: https://github.com/psf/black - rev: 24.3.0 + rev: 24.4.0 hooks: - id: black language_version: python3 From 6ac964c74b45ac3c1d6da7753f1297bb4094a537 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Mon, 15 Apr 2024 20:36:48 -0700 Subject: [PATCH 024/103] fix unwanted tested file change in #956 --- test/pytest/test_cnn_mnist.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/pytest/test_cnn_mnist.py b/test/pytest/test_cnn_mnist.py index 27b966f51d..ab3365f228 100644 --- a/test/pytest/test_cnn_mnist.py +++ b/test/pytest/test_cnn_mnist.py @@ -61,7 +61,7 @@ def keras_model(mnist_data): ('Vitis', 'io_parallel', 'resource'), ('Vitis', 'io_parallel', 'latency'), ('Vitis', 'io_stream', 'latency'), - ('Vitis', 'io_stream', 'latency'), + ('Vitis', 'io_stream', 'resource'), ], ) def test_mnist_cnn(keras_model, mnist_data, backend, io_type, strategy): From ec95e010e2c30728f074f0210e912a2a7b94447b Mon Sep 17 00:00:00 2001 From: Benjamin Ramhorst Date: Wed, 10 Apr 2024 20:43:21 +0200 Subject: [PATCH 025/103] Fix SR backend synth missing variables --- hls4ml/writer/symbolic_writer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hls4ml/writer/symbolic_writer.py b/hls4ml/writer/symbolic_writer.py index 8ab5c53806..b442d3cd39 100644 --- a/hls4ml/writer/symbolic_writer.py +++ b/hls4ml/writer/symbolic_writer.py @@ -68,6 +68,10 @@ def write_build_script(self, model): f.write('set part "{}"\n'.format(model.config.get_config_value('Part'))) f.write('variable clock_period\n') f.write('set clock_period {}\n'.format(model.config.get_config_value('ClockPeriod'))) + f.write('variable clock_uncertainty\n') + f.write('set clock_uncertainty {}\n'.format(model.config.get_config_value('ClockUncertainty', '0%'))) + f.write('variable version\n') + f.write('set version "{}"\n'.format(model.config.get_config_value('Version', '1.0.0'))) 
f.close() # build_prj.tcl From 5de1bf5cc7954ea624aad683349dc3e28c7109a7 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Tue, 16 Apr 2024 18:38:36 +0200 Subject: [PATCH 026/103] Test for SR backend config --- hls4ml/backends/symbolic/symbolic_backend.py | 14 ++-- test/pytest/test_sr.py | 67 ++++++++++++++++++++ 2 files changed, 77 insertions(+), 4 deletions(-) diff --git a/hls4ml/backends/symbolic/symbolic_backend.py b/hls4ml/backends/symbolic/symbolic_backend.py index 29956f147b..bad75c2417 100644 --- a/hls4ml/backends/symbolic/symbolic_backend.py +++ b/hls4ml/backends/symbolic/symbolic_backend.py @@ -42,7 +42,7 @@ def create_initial_config( self, part='xcvu9p-flga2577-2-e', clock_period=5, - clock_uncertainty='12.5%', + clock_uncertainty=None, io_type='io_parallel', compiler='vivado_hls', hls_include_path=None, @@ -50,11 +50,17 @@ def create_initial_config( ): config = {} - config['Part'] = part if part is not None else 'xcvu9p-flga2577-2-e' - config['ClockPeriod'] = clock_period + config['Part'] = part if part is not None else 'xcvu13p-flga2577-2-e' + config['ClockPeriod'] = clock_period if clock_period is not None else 5 config['ClockUncertainty'] = clock_uncertainty - config['IOType'] = io_type + config['IOType'] = io_type if io_type is not None else 'io_parallel' config['Compiler'] = compiler if compiler is not None else 'vivado_hls' + if config['ClockUncertainty'] is None: + if config['Compiler'] == 'vivado_hls': + config['ClockUncertainty'] = '12.5%' + else: + config['ClockUncertainty'] = '27%' + if not all([hls_include_path, hls_libs_path]): # Try to infer the include path from Vivado path bin_path = os.popen(f'command -v {compiler}').read().strip() diff --git a/test/pytest/test_sr.py b/test/pytest/test_sr.py index e4c922cc34..272450b658 100644 --- a/test/pytest/test_sr.py +++ b/test/pytest/test_sr.py @@ -69,3 +69,70 @@ def test_pysr_luts(data): eq = str(model.sympy()) assert 'cos_lut' in eq + + +@pytest.mark.parametrize('part', ['some_part', None]) 
+@pytest.mark.parametrize('clock_period', [8, None]) +@pytest.mark.parametrize('clock_unc', ['15%', None]) +@pytest.mark.parametrize('compiler', ['vivado_hls', 'vitis_hls']) +def test_sr_backend_config(part, clock_period, clock_unc, compiler): + + expr = 'x0**2 + 2.5382*cos_lut(x3) - 0.5' + + if clock_unc is not None: + unc_str = clock_unc.replace('%', '') + else: + unc_str = clock_unc + + compiler_str = compiler.replace('_hls', '') + + test_dir = f'hls4mlprj_sr_backend_config_part_{part}_period_{clock_period}_unc_{unc_str}_{compiler_str}' + output_dir = test_root_path / test_dir + + hls_model = hls4ml.converters.convert_from_symbolic_expression( + expr, + n_symbols=5, + precision='ap_fixed<18,6>', + output_dir=str(output_dir), + part=part, + clock_period=clock_period, + clock_uncertainty=clock_unc, + compiler=compiler, + hls_include_path='', + hls_libs_path='', + ) + hls_model.write() + + # Check if config was properly parsed into the ModelGraph + + read_part = hls_model.config.get_config_value('Part') + expected_part = part if part is not None else 'xcvu13p-flga2577-2-e' + assert read_part == expected_part + + read_clock_period = hls_model.config.get_config_value('ClockPeriod') + expected_period = clock_period if clock_period is not None else 5 + assert read_clock_period == expected_period + + read_clock_unc = hls_model.config.get_config_value('ClockUncertainty') + expected_unc = clock_unc + if expected_unc is None: + if compiler == 'vivado_hls': + expected_unc = '12.5%' + else: + expected_unc = '27%' + assert read_clock_unc == expected_unc + + # Check if Writer properly wrote tcl scripts + part_ok = period_ok = unc_ok = False + + prj_tcl_path = output_dir / 'project.tcl' + with open(prj_tcl_path) as f: + for line in f.readlines(): + if 'set part' in line and expected_part in line: + part_ok = True + if f'set clock_period {expected_period}' in line: + period_ok = True + if f'set clock_uncertainty {expected_unc}' in line: + unc_ok = True + + assert part_ok and 
period_ok and unc_ok From a6fec3646f97b39e72a812fc47a01bb12cba9a0a Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Wed, 6 Mar 2024 20:39:47 +0100 Subject: [PATCH 027/103] Upsampling support for PyTorch models --- hls4ml/converters/pytorch/reshape.py | 44 +++++++++++ hls4ml/model/layers.py | 32 ++++++-- test/pytest/test_upsampling_pytorch.py | 100 +++++++++++++++++++++++++ 3 files changed, 170 insertions(+), 6 deletions(-) create mode 100644 test/pytest/test_upsampling_pytorch.py diff --git a/hls4ml/converters/pytorch/reshape.py b/hls4ml/converters/pytorch/reshape.py index 5e5cde5261..37191135a1 100644 --- a/hls4ml/converters/pytorch/reshape.py +++ b/hls4ml/converters/pytorch/reshape.py @@ -1,6 +1,7 @@ import numpy as np from hls4ml.converters.pytorch_to_hls import pytorch_handler +from hls4ml.converters.utils import parse_data_format reshape_layers = ['View'] @@ -106,3 +107,46 @@ def parse_flatten_layer(operation, layer_name, input_names, input_shapes, node, output_shape = layer['target_shape'] return layer, output_shape + + +@pytorch_handler('Upsample', 'UpsamplingNearest2d', 'UpsamplingBilinear2d') +def handle_upsample(operation, layer_name, input_names, input_shapes, node, class_object, data_reader, config): + + assert operation in ['Upsample', 'UpsamplingNearest2d', 'UpsamplingBilinear2d'] + layer = {} + layer['name'] = layer_name + layer['inputs'] = input_names + layer['class_name'] = 'Resize' + layer['data_format'] = 'channels_first' + + input_shape = parse_data_format(input_shapes[0], 'channels_first') + if len(input_shape) == 2: + layer['in_height'] = 1 + layer['in_width'], layer['n_chan'] = input_shape + + layer['out_height'] = 1 + layer['out_width'] = int(layer['in_width'] * class_object.scale_factor) + + output_shape = [input_shapes[0][0], layer['n_chan'], layer['out_width']] + elif len(input_shape) == 3: + layer['in_height'], layer['in_width'], layer['n_chan'] = input_shape + + scale_factor = class_object.scale_factor + if isinstance(scale_factor, 
tuple): + scale_height = scale_factor[0] + scale_width = scale_factor[1] + else: + scale_height = scale_factor + scale_width = scale_factor + + layer['out_height'] = int(layer['in_height'] * scale_height) + layer['out_width'] = int(layer['in_width'] * scale_width) + + output_shape = [layer['n_chan'], layer['out_height'], layer['out_width']] + else: + raise Exception(f'Parsing "Upsample" with {len(input_shape)}-dimensional tensors is not yet supported.') + + layer['algorithm'] = class_object.mode + layer['align_corners'] = bool(class_object.align_corners) + + return layer, output_shape diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index de191baa40..0d9cc0622c 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -912,14 +912,34 @@ def initialize(self): class Resize(Layer): + _expected_attributes = [ + Attribute('in_height'), + Attribute('in_width'), + Attribute('out_height'), + Attribute('out_width'), + Attribute('n_chan'), + ChoiceAttribute('algorithm', ['nearest', 'bilinear'], default='nearest'), + Attribute('align_corners', value_type=bool, default=False), + ] + def initialize(self): inp = self.get_input_variable() - if len(inp.shape) == 2: # 1D -> width + chan - shape = [self.get_attr('out_width'), self.get_attr('n_chan')] - dims = [f'OUT_WIDTH_{self.index}', f'N_CHAN_{self.index}'] - elif len(inp.shape) == 3: # 2D -> height + width + chan - shape = [self.get_attr('out_height'), self.get_attr('out_width'), self.get_attr('n_chan')] - dims = [f'OUT_HEIGHT_{self.index}', f'OUT_WIDTH_{self.index}', f'N_CHAN_{self.index}'] + + if self.get_attr('data_format') == 'channels_last': + if len(inp.shape) == 2: # 1D -> width + chan + shape = [self.get_attr('out_width'), self.get_attr('n_chan')] + dims = [f'OUT_WIDTH_{self.index}', f'N_CHAN_{self.index}'] + elif len(inp.shape) == 3: # 2D -> height + width + chan + shape = [self.get_attr('out_height'), self.get_attr('out_width'), self.get_attr('n_chan')] + dims = [f'OUT_HEIGHT_{self.index}', 
f'OUT_WIDTH_{self.index}', f'N_CHAN_{self.index}'] + else: + if len(inp.shape) == 2: # 1D -> width + chan + shape = [self.get_attr('n_chan'), self.get_attr('out_width')] + dims = [f'N_CHAN_{self.index}', f'OUT_WIDTH_{self.index}'] + elif len(inp.shape) == 3: # 2D -> height + width + chan + shape = [self.get_attr('n_chan'), self.get_attr('out_height'), self.get_attr('out_width')] + dims = [f'N_CHAN_{self.index}', f'OUT_HEIGHT_{self.index}', f'OUT_WIDTH_{self.index}'] + self.add_output_variable(shape, dims, precision=inp.type.precision) diff --git a/test/pytest/test_upsampling_pytorch.py b/test/pytest/test_upsampling_pytorch.py new file mode 100644 index 0000000000..4a6c69ede4 --- /dev/null +++ b/test/pytest/test_upsampling_pytorch.py @@ -0,0 +1,100 @@ +from pathlib import Path + +import numpy as np +import pytest +import torch +import torch.nn as nn + +import hls4ml + +test_root_path = Path(__file__).parent + +in_height = 6 +in_width = 8 +in_feat = 4 + +size = 2 +atol = 5e-3 + + +@pytest.fixture(scope='module') +def data_1d(): + X = np.random.rand(100, in_feat, in_width) + return X + + +@pytest.fixture(scope='module') +def data_2d(): + X = np.random.rand(100, in_feat, in_height, in_width) + return X + + +class Upsample1DModel(nn.Module): + def __init__(self): + super().__init__() + self.upsample = nn.Upsample(scale_factor=2) + + def forward(self, x): + return self.upsample(x) + + +class Upsample2DModel(nn.Module): + def __init__(self): + super().__init__() + # this scale_factor tests proper output shape calculation with fractional scaling and parsing per-axis scales + self.upsample = nn.UpsamplingNearest2d(scale_factor=(1, 2.4)) # Would also work with Upsample(mode='nearest') + + def forward(self, x): + return self.upsample(x) + + +@pytest.mark.parametrize('io_type', ['io_stream', 'io_parallel']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +def test_pytorch_upsampling1d(data_1d, io_type, backend): + model = Upsample1DModel() + + config = 
hls4ml.utils.config_from_pytorch_model( + model, + default_precision='ap_fixed<16,6>', + inputs_channel_last=True, # We don't test channels_last conversion here + transpose_outputs=False, + ) + odir = str(test_root_path / f'hls4mlprj_pytorch_upsampling_1d_{backend}_{io_type}') + hls_model = hls4ml.converters.convert_from_pytorch_model( + model, (None, in_feat, in_width), hls_config=config, io_type=io_type, output_dir=odir, backend=backend + ) + hls_model.compile() + + data_1d_t = np.ascontiguousarray(data_1d.transpose([0, 2, 1])) + + pytorch_prediction = model(torch.Tensor(data_1d)).detach().numpy() + hls_prediction = hls_model.predict(data_1d_t) + + pred_shape = list(pytorch_prediction.shape) + pred_shape.append(pred_shape.pop(1)) # Transpose shape to channels_last + hls_prediction = hls_prediction.reshape(pred_shape).transpose([0, 2, 1]) # Transpose back + + np.testing.assert_allclose(hls_prediction, pytorch_prediction, rtol=1e-2, atol=0.01) + + +@pytest.mark.parametrize('io_type', ['io_parallel']) # Fractional scaling doesn't work with io_stream +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +def test_pytorch_upsampling2d(data_2d, io_type, backend): + model = Upsample2DModel() + + config = hls4ml.utils.config_from_pytorch_model( + model, + default_precision='ap_fixed<16,6>', + inputs_channel_last=False, # With conversion to channels_last + transpose_outputs=True, + ) + odir = str(test_root_path / f'hls4mlprj_pytorch_upsampling_2d_{backend}_{io_type}') + hls_model = hls4ml.converters.convert_from_pytorch_model( + model, (None, in_feat, in_height, in_width), hls_config=config, io_type=io_type, output_dir=odir, backend=backend + ) + hls_model.compile() + + pytorch_prediction = model(torch.Tensor(data_2d)).detach().numpy().flatten() + hls_prediction = hls_model.predict(data_2d).flatten() + + np.testing.assert_allclose(hls_prediction, pytorch_prediction, rtol=1e-2, atol=0.01) From 1b72b19905befc84bea30a2d17ad7d28c8dc0022 Mon Sep 17 00:00:00 2001 
From: Vladimir Loncar Date: Mon, 15 Apr 2024 20:58:25 +0200 Subject: [PATCH 028/103] Split Catapult types into separate file --- hls4ml/backends/catapult/catapult_backend.py | 3 +- hls4ml/backends/catapult/catapult_types.py | 92 +++++++++++++++++++ .../catapult/passes/transform_types.py | 6 +- hls4ml/backends/fpga/fpga_types.py | 65 ------------- 4 files changed, 96 insertions(+), 70 deletions(-) create mode 100644 hls4ml/backends/catapult/catapult_types.py diff --git a/hls4ml/backends/catapult/catapult_backend.py b/hls4ml/backends/catapult/catapult_backend.py index 5556154dcb..0583e80dab 100644 --- a/hls4ml/backends/catapult/catapult_backend.py +++ b/hls4ml/backends/catapult/catapult_backend.py @@ -4,7 +4,8 @@ import numpy as np from hls4ml.backends import FPGABackend -from hls4ml.backends.fpga.fpga_types import ACTypeConverter, CatapultArrayVariableConverter, HLSTypeConverter +from hls4ml.backends.catapult.catapult_types import CatapultArrayVariableConverter +from hls4ml.backends.fpga.fpga_types import ACTypeConverter, HLSTypeConverter from hls4ml.model.attributes import ChoiceAttribute, ConfigurableAttribute, TypeAttribute from hls4ml.model.flow import register_flow from hls4ml.model.layers import ( diff --git a/hls4ml/backends/catapult/catapult_types.py b/hls4ml/backends/catapult/catapult_types.py new file mode 100644 index 0000000000..92fbeb2db8 --- /dev/null +++ b/hls4ml/backends/catapult/catapult_types.py @@ -0,0 +1,92 @@ +from hls4ml.backends.fpga.fpga_types import ( + ArrayVariableConverter, + InplaceStreamVariableConverter, + StreamVariableConverter, + StructMemberVariableConverter, + VariableDefinition, +) + +# region ArrayVariable + + +class CatapultArrayVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + return '{type} {name}{suffix}[{shape}] /* {pragma} */'.format( + type=self.type.name, name=self.name, suffix=name_suffix, shape=self.size_cpp(), pragma=self.pragma + ) + + +class 
CatapultInplaceArrayVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'auto& {self.name} = {self.input_var.name}' + + +class CatapultArrayVariableConverter(ArrayVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='Catapult', definition_cls=CatapultArrayVariableDefinition) + + +class CatapultInplaceArrayVariableConverter(ArrayVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='Catapult', definition_cls=CatapultInplaceArrayVariableDefinition + ) + + +# endregion + +# region StructMemberVariable + + +class CatapultStructMemberVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + return '{type} {name}{suffix}[{shape}]'.format( + type=self.type.name, name=self.member_name, suffix=name_suffix, shape=self.size_cpp() + ) + + +class CatapultStructMemberVariableConverter(StructMemberVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='Catapult', definition_cls=CatapultStructMemberVariableDefinition + ) + + +# endregion + +# region StreamVariable + + +class CatapultStreamVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + if as_reference: # Function parameter + return f'ac_channel<{self.type.name}> &{self.name}{name_suffix}' + else: # Declaration (string name arg not implemented in ac_channel) + return 'ac_channel<{type}> {name}{suffix}/*("{name}")*/'.format( + type=self.type.name, name=self.name, suffix=name_suffix + ) + + +class CatapultStreamVariableConverter(StreamVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='Catapult', definition_cls=CatapultStreamVariableDefinition) + + +# endregion + +# region InplaceStreamVariable + + +class 
CatapultInplaceStreamVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'auto& {self.name} = {self.input_var.name}' + + +class CatapultInplaceStreamVariableConverter(InplaceStreamVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='Catapult', definition_cls=CatapultInplaceStreamVariableDefinition + ) + + +# endregion diff --git a/hls4ml/backends/catapult/passes/transform_types.py b/hls4ml/backends/catapult/passes/transform_types.py index 4ef3548cb6..3cbb917a67 100755 --- a/hls4ml/backends/catapult/passes/transform_types.py +++ b/hls4ml/backends/catapult/passes/transform_types.py @@ -1,12 +1,10 @@ -from hls4ml.backends.fpga.fpga_types import ( - ACTypeConverter, +from hls4ml.backends.catapult.catapult_types import ( CatapultArrayVariableConverter, CatapultInplaceArrayVariableConverter, CatapultInplaceStreamVariableConverter, CatapultStreamVariableConverter, - HLSTypeConverter, - StaticWeightVariableConverter, ) +from hls4ml.backends.fpga.fpga_types import ACTypeConverter, HLSTypeConverter, StaticWeightVariableConverter from hls4ml.model.optimizer import GlobalOptimizerPass from hls4ml.model.types import InplaceTensorVariable diff --git a/hls4ml/backends/fpga/fpga_types.py b/hls4ml/backends/fpga/fpga_types.py index 408f1320e4..c5327dab8c 100644 --- a/hls4ml/backends/fpga/fpga_types.py +++ b/hls4ml/backends/fpga/fpga_types.py @@ -248,13 +248,6 @@ def definition_cpp(self, name_suffix='', as_reference=False): ) -class CatapultArrayVariableDefinition(VariableDefinition): - def definition_cpp(self, name_suffix='', as_reference=False): - return '{type} {name}{suffix}[{shape}] /* {pragma} */'.format( - type=self.type.name, name=self.name, suffix=name_suffix, shape=self.size_cpp(), pragma=self.pragma - ) - - class VivadoInplaceArrayVariableDefinition(VariableDefinition): def definition_cpp(self): return f'auto& {self.name} = {self.input_var.name}' @@ -265,11 +258,6 @@ def 
definition_cpp(self): return f'auto& {self.name} = {self.input_var.name}' -class CatapultInplaceArrayVariableDefinition(VariableDefinition): - def definition_cpp(self): - return f'auto& {self.name} = {self.input_var.name}' - - class ArrayVariableConverter: def __init__(self, type_converter, prefix, definition_cls): self.type_converter = type_converter @@ -297,11 +285,6 @@ def __init__(self, type_converter): super().__init__(type_converter=type_converter, prefix='Quartus', definition_cls=QuartusArrayVariableDefinition) -class CatapultArrayVariableConverter(ArrayVariableConverter): - def __init__(self, type_converter): - super().__init__(type_converter=type_converter, prefix='Catapult', definition_cls=CatapultArrayVariableDefinition) - - class VivadoInplaceArrayVariableConverter(ArrayVariableConverter): def __init__(self, type_converter): super().__init__(type_converter=type_converter, prefix='Vivado', definition_cls=VivadoInplaceArrayVariableDefinition) @@ -314,13 +297,6 @@ def __init__(self, type_converter): ) -class CatapultInplaceArrayVariableConverter(ArrayVariableConverter): - def __init__(self, type_converter): - super().__init__( - type_converter=type_converter, prefix='Catapult', definition_cls=CatapultInplaceArrayVariableDefinition - ) - - # endregion # region StructMemberVariable @@ -333,13 +309,6 @@ def definition_cpp(self, name_suffix='', as_reference=False): ) -class CatapultStructMemberVariableDefinition(VariableDefinition): - def definition_cpp(self, name_suffix='', as_reference=False): - return '{type} {name}{suffix}[{shape}]'.format( - type=self.type.name, name=self.member_name, suffix=name_suffix, shape=self.size_cpp() - ) - - class StructMemberVariableConverter: def __init__(self, type_converter, prefix, definition_cls): self.type_converter = type_converter @@ -369,13 +338,6 @@ def __init__(self, type_converter): ) -class CatapultStructMemberVariableConverter(StructMemberVariableConverter): - def __init__(self, type_converter): - super().__init__( 
- type_converter=type_converter, prefix='Catapult', definition_cls=CatapultStructMemberVariableDefinition - ) - - # endregion # region StreamVariable @@ -409,21 +371,6 @@ def definition_cpp(self): return f'auto& {self.name} = {self.input_var.name}' -class CatapultStreamVariableDefinition(VariableDefinition): - def definition_cpp(self, name_suffix='', as_reference=False): - if as_reference: # Function parameter - return f'ac_channel<{self.type.name}> &{self.name}{name_suffix}' - else: # Declaration (string name arg not implemented in ac_channel) - return 'ac_channel<{type}> {name}{suffix}/*("{name}")*/'.format( - type=self.type.name, name=self.name, suffix=name_suffix - ) - - -class CatapultInplaceStreamVariableDefinition(VariableDefinition): - def definition_cpp(self): - return f'auto& {self.name} = {self.input_var.name}' - - class StreamVariableConverter: def __init__(self, type_converter, prefix, definition_cls): self.type_converter = type_converter @@ -455,11 +402,6 @@ def __init__(self, type_converter): super().__init__(type_converter=type_converter, prefix='Quartus', definition_cls=QuartusStreamVariableDefinition) -class CatapultStreamVariableConverter(StreamVariableConverter): - def __init__(self, type_converter): - super().__init__(type_converter=type_converter, prefix='Catapult', definition_cls=CatapultStreamVariableDefinition) - - # endregion # region InplaceStreamVariable @@ -493,13 +435,6 @@ def __init__(self, type_converter): ) -class CatapultInplaceStreamVariableConverter(InplaceStreamVariableConverter): - def __init__(self, type_converter): - super().__init__( - type_converter=type_converter, prefix='Catapult', definition_cls=CatapultInplaceStreamVariableDefinition - ) - - # endregion # region WeightsVariable From 28521d0e5148a039fad9d0fdb7656996ec84dafd Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Mon, 15 Apr 2024 21:10:04 +0200 Subject: [PATCH 029/103] Split Quartus types into separate file --- hls4ml/backends/fpga/fpga_types.py | 63 
------------- .../quartus/passes/transform_types.py | 6 +- hls4ml/backends/quartus/quartus_types.py | 90 +++++++++++++++++++ 3 files changed, 92 insertions(+), 67 deletions(-) create mode 100644 hls4ml/backends/quartus/quartus_types.py diff --git a/hls4ml/backends/fpga/fpga_types.py b/hls4ml/backends/fpga/fpga_types.py index c5327dab8c..16c029828f 100644 --- a/hls4ml/backends/fpga/fpga_types.py +++ b/hls4ml/backends/fpga/fpga_types.py @@ -241,23 +241,11 @@ def definition_cpp(self, name_suffix='', as_reference=False): ) -class QuartusArrayVariableDefinition(VariableDefinition): - def definition_cpp(self, name_suffix='', as_reference=False): - return '{type} {name}{suffix}[{shape}] {pragma}'.format( - type=self.type.name, name=self.name, suffix=name_suffix, shape=self.size_cpp(), pragma=self.pragma - ) - - class VivadoInplaceArrayVariableDefinition(VariableDefinition): def definition_cpp(self): return f'auto& {self.name} = {self.input_var.name}' -class QuartusInplaceArrayVariableDefinition(VariableDefinition): - def definition_cpp(self): - return f'auto& {self.name} = {self.input_var.name}' - - class ArrayVariableConverter: def __init__(self, type_converter, prefix, definition_cls): self.type_converter = type_converter @@ -280,35 +268,16 @@ def __init__(self, type_converter): super().__init__(type_converter=type_converter, prefix='Vivado', definition_cls=VivadoArrayVariableDefinition) -class QuartusArrayVariableConverter(ArrayVariableConverter): - def __init__(self, type_converter): - super().__init__(type_converter=type_converter, prefix='Quartus', definition_cls=QuartusArrayVariableDefinition) - - class VivadoInplaceArrayVariableConverter(ArrayVariableConverter): def __init__(self, type_converter): super().__init__(type_converter=type_converter, prefix='Vivado', definition_cls=VivadoInplaceArrayVariableDefinition) -class QuartusInplaceArrayVariableConverter(ArrayVariableConverter): - def __init__(self, type_converter): - super().__init__( - 
type_converter=type_converter, prefix='Quartus', definition_cls=QuartusInplaceArrayVariableDefinition - ) - - # endregion # region StructMemberVariable -class QuartusStructMemberVariableDefinition(VariableDefinition): - def definition_cpp(self, name_suffix='', as_reference=False): - return '{type} {name}{suffix}[{shape}]'.format( - type=self.type.name, name=self.member_name, suffix=name_suffix, shape=self.size_cpp() - ) - - class StructMemberVariableConverter: def __init__(self, type_converter, prefix, definition_cls): self.type_converter = type_converter @@ -331,13 +300,6 @@ def convert(self, tensor_var, pragma='partition', struct_name=None): return tensor_var -class QuartusStructMemberVariableConverter(StructMemberVariableConverter): - def __init__(self, type_converter): - super().__init__( - type_converter=type_converter, prefix='Quartus', definition_cls=QuartusStructMemberVariableDefinition - ) - - # endregion # region StreamVariable @@ -358,19 +320,6 @@ def definition_cpp(self): return f'auto& {self.name} = {self.input_var.name}' -class QuartusStreamVariableDefinition(VariableDefinition): - def definition_cpp(self, name_suffix='', as_reference=False): - if as_reference: # Function parameter - return f'stream<{self.type.name}> &{self.name}{name_suffix}' - else: # Declaration - return f'stream<{self.type.name}> {self.name}{name_suffix}' - - -class QuartusInplaceStreamVariableDefinition(VariableDefinition): - def definition_cpp(self): - return f'auto& {self.name} = {self.input_var.name}' - - class StreamVariableConverter: def __init__(self, type_converter, prefix, definition_cls): self.type_converter = type_converter @@ -397,11 +346,6 @@ def __init__(self, type_converter): super().__init__(type_converter=type_converter, prefix='Vivado', definition_cls=VivadoStreamVariableDefinition) -class QuartusStreamVariableConverter(StreamVariableConverter): - def __init__(self, type_converter): - super().__init__(type_converter=type_converter, prefix='Quartus', 
definition_cls=QuartusStreamVariableDefinition) - - # endregion # region InplaceStreamVariable @@ -428,13 +372,6 @@ def __init__(self, type_converter): ) -class QuartusInplaceStreamVariableConverter(InplaceStreamVariableConverter): - def __init__(self, type_converter): - super().__init__( - type_converter=type_converter, prefix='Quartus', definition_cls=QuartusInplaceStreamVariableDefinition - ) - - # endregion # region WeightsVariable diff --git a/hls4ml/backends/quartus/passes/transform_types.py b/hls4ml/backends/quartus/passes/transform_types.py index 67de32ab65..041aad8136 100644 --- a/hls4ml/backends/quartus/passes/transform_types.py +++ b/hls4ml/backends/quartus/passes/transform_types.py @@ -1,12 +1,10 @@ -from hls4ml.backends.fpga.fpga_types import ( - ACTypeConverter, - HLSTypeConverter, +from hls4ml.backends.fpga.fpga_types import ACTypeConverter, HLSTypeConverter, StaticWeightVariableConverter +from hls4ml.backends.quartus.quartus_types import ( QuartusArrayVariableConverter, QuartusInplaceArrayVariableConverter, QuartusInplaceStreamVariableConverter, QuartusStreamVariableConverter, QuartusStructMemberVariableConverter, - StaticWeightVariableConverter, ) from hls4ml.model.optimizer import GlobalOptimizerPass from hls4ml.model.types import InplaceTensorVariable diff --git a/hls4ml/backends/quartus/quartus_types.py b/hls4ml/backends/quartus/quartus_types.py new file mode 100644 index 0000000000..e641c9aba7 --- /dev/null +++ b/hls4ml/backends/quartus/quartus_types.py @@ -0,0 +1,90 @@ +from hls4ml.backends.fpga.fpga_types import ( + ArrayVariableConverter, + InplaceStreamVariableConverter, + StreamVariableConverter, + StructMemberVariableConverter, + VariableDefinition, +) + +# region ArrayVariable + + +class QuartusArrayVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + return '{type} {name}{suffix}[{shape}] {pragma}'.format( + type=self.type.name, name=self.name, suffix=name_suffix, 
shape=self.size_cpp(), pragma=self.pragma + ) + + +class QuartusInplaceArrayVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'auto& {self.name} = {self.input_var.name}' + + +class QuartusArrayVariableConverter(ArrayVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='Quartus', definition_cls=QuartusArrayVariableDefinition) + + +class QuartusInplaceArrayVariableConverter(ArrayVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='Quartus', definition_cls=QuartusInplaceArrayVariableDefinition + ) + + +# endregion + +# region StructMemberVariable + + +class QuartusStructMemberVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + return '{type} {name}{suffix}[{shape}]'.format( + type=self.type.name, name=self.member_name, suffix=name_suffix, shape=self.size_cpp() + ) + + +class QuartusStructMemberVariableConverter(StructMemberVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='Quartus', definition_cls=QuartusStructMemberVariableDefinition + ) + + +# endregion + +# region StreamVariable + + +class QuartusStreamVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + if as_reference: # Function parameter + return f'stream<{self.type.name}> &{self.name}{name_suffix}' + else: # Declaration + return f'stream<{self.type.name}> {self.name}{name_suffix}' + + +class QuartusInplaceStreamVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'auto& {self.name} = {self.input_var.name}' + + +class QuartusStreamVariableConverter(StreamVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='Quartus', definition_cls=QuartusStreamVariableDefinition) + + +# endregion + +# region 
InplaceStreamVariable + + +class QuartusInplaceStreamVariableConverter(InplaceStreamVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='Quartus', definition_cls=QuartusInplaceStreamVariableDefinition + ) + + +# endregion From a44707d30e92f855bb15716d17e1918a40d8412f Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Mon, 15 Apr 2024 21:15:40 +0200 Subject: [PATCH 030/103] Split Vivado types into separate file --- hls4ml/backends/fpga/fpga_types.py | 49 ------------- .../backends/vivado/passes/transform_types.py | 6 +- hls4ml/backends/vivado/vivado_backend.py | 3 +- hls4ml/backends/vivado/vivado_types.py | 70 +++++++++++++++++++ 4 files changed, 74 insertions(+), 54 deletions(-) create mode 100644 hls4ml/backends/vivado/vivado_types.py diff --git a/hls4ml/backends/fpga/fpga_types.py b/hls4ml/backends/fpga/fpga_types.py index 16c029828f..15ad386c5a 100644 --- a/hls4ml/backends/fpga/fpga_types.py +++ b/hls4ml/backends/fpga/fpga_types.py @@ -234,18 +234,6 @@ def definition_cpp(self, name_suffix='', as_reference=False): # region ArrayVariable -class VivadoArrayVariableDefinition(VariableDefinition): - def definition_cpp(self, name_suffix='', as_reference=False): - return '{type} {name}{suffix}[{shape}]'.format( - type=self.type.name, name=self.name, suffix=name_suffix, shape=self.size_cpp() - ) - - -class VivadoInplaceArrayVariableDefinition(VariableDefinition): - def definition_cpp(self): - return f'auto& {self.name} = {self.input_var.name}' - - class ArrayVariableConverter: def __init__(self, type_converter, prefix, definition_cls): self.type_converter = type_converter @@ -263,16 +251,6 @@ def convert(self, tensor_var, pragma='partition'): return tensor_var -class VivadoArrayVariableConverter(ArrayVariableConverter): - def __init__(self, type_converter): - super().__init__(type_converter=type_converter, prefix='Vivado', definition_cls=VivadoArrayVariableDefinition) - - -class 
VivadoInplaceArrayVariableConverter(ArrayVariableConverter): - def __init__(self, type_converter): - super().__init__(type_converter=type_converter, prefix='Vivado', definition_cls=VivadoInplaceArrayVariableDefinition) - - # endregion # region StructMemberVariable @@ -305,21 +283,6 @@ def convert(self, tensor_var, pragma='partition', struct_name=None): # region StreamVariable -class VivadoStreamVariableDefinition(VariableDefinition): - def definition_cpp(self, name_suffix='', as_reference=False): - if as_reference: # Function parameter - return f'hls::stream<{self.type.name}> &{self.name}{name_suffix}' - else: # Declaration - return 'hls::stream<{type}> {name}{suffix}("{name}")'.format( - type=self.type.name, name=self.name, suffix=name_suffix - ) - - -class VivadoInplaceStreamVariableDefinition(VariableDefinition): - def definition_cpp(self): - return f'auto& {self.name} = {self.input_var.name}' - - class StreamVariableConverter: def __init__(self, type_converter, prefix, definition_cls): self.type_converter = type_converter @@ -341,11 +304,6 @@ def convert(self, tensor_var, n_pack=1, depth=0): return tensor_var -class VivadoStreamVariableConverter(StreamVariableConverter): - def __init__(self, type_converter): - super().__init__(type_converter=type_converter, prefix='Vivado', definition_cls=VivadoStreamVariableDefinition) - - # endregion # region InplaceStreamVariable @@ -365,13 +323,6 @@ def convert(self, tensor_var, n_pack=1, depth=0): return tensor_var -class VivadoInplaceStreamVariableConverter(InplaceStreamVariableConverter): - def __init__(self, type_converter): - super().__init__( - type_converter=type_converter, prefix='Vivado', definition_cls=VivadoInplaceStreamVariableDefinition - ) - - # endregion # region WeightsVariable diff --git a/hls4ml/backends/vivado/passes/transform_types.py b/hls4ml/backends/vivado/passes/transform_types.py index 3462578e74..7bff3b8efc 100644 --- a/hls4ml/backends/vivado/passes/transform_types.py +++ 
b/hls4ml/backends/vivado/passes/transform_types.py @@ -1,7 +1,5 @@ -from hls4ml.backends.fpga.fpga_types import ( - APTypeConverter, - HLSTypeConverter, - StaticWeightVariableConverter, +from hls4ml.backends.fpga.fpga_types import APTypeConverter, HLSTypeConverter, StaticWeightVariableConverter +from hls4ml.backends.vivado.vivado_types import ( VivadoArrayVariableConverter, VivadoInplaceArrayVariableConverter, VivadoInplaceStreamVariableConverter, diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 64df42bb42..6bd57d6a88 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -4,7 +4,8 @@ import numpy as np from hls4ml.backends import FPGABackend -from hls4ml.backends.fpga.fpga_types import APTypeConverter, HLSTypeConverter, VivadoArrayVariableConverter +from hls4ml.backends.fpga.fpga_types import APTypeConverter, HLSTypeConverter +from hls4ml.backends.vivado.vivado_types import VivadoArrayVariableConverter from hls4ml.model.attributes import ChoiceAttribute, ConfigurableAttribute, TypeAttribute from hls4ml.model.flow import register_flow from hls4ml.model.layers import ( diff --git a/hls4ml/backends/vivado/vivado_types.py b/hls4ml/backends/vivado/vivado_types.py new file mode 100644 index 0000000000..ecac4a46a4 --- /dev/null +++ b/hls4ml/backends/vivado/vivado_types.py @@ -0,0 +1,70 @@ +from hls4ml.backends.fpga.fpga_types import ( + ArrayVariableConverter, + InplaceStreamVariableConverter, + StreamVariableConverter, + VariableDefinition, +) + +# region ArrayVariable + + +class VivadoArrayVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + return '{type} {name}{suffix}[{shape}]'.format( + type=self.type.name, name=self.name, suffix=name_suffix, shape=self.size_cpp() + ) + + +class VivadoInplaceArrayVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'auto& {self.name} = 
{self.input_var.name}' + + +class VivadoArrayVariableConverter(ArrayVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='Vivado', definition_cls=VivadoArrayVariableDefinition) + + +class VivadoInplaceArrayVariableConverter(ArrayVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='Vivado', definition_cls=VivadoInplaceArrayVariableDefinition) + + +# endregion + +# region StreamVariable + + +class VivadoStreamVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + if as_reference: # Function parameter + return f'hls::stream<{self.type.name}> &{self.name}{name_suffix}' + else: # Declaration + return 'hls::stream<{type}> {name}{suffix}("{name}")'.format( + type=self.type.name, name=self.name, suffix=name_suffix + ) + + +class VivadoInplaceStreamVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'auto& {self.name} = {self.input_var.name}' + + +class VivadoStreamVariableConverter(StreamVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='Vivado', definition_cls=VivadoStreamVariableDefinition) + + +# endregion + +# region InplaceStreamVariable + + +class VivadoInplaceStreamVariableConverter(InplaceStreamVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='Vivado', definition_cls=VivadoInplaceStreamVariableDefinition + ) + + +# endregion From cefab60a29e19fdf7036d2b0fa8ed7e7cc75d27d Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Thu, 18 Apr 2024 16:57:54 +0200 Subject: [PATCH 031/103] Increase precision of Softsign test --- test/pytest/test_softsign.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test/pytest/test_softsign.py b/test/pytest/test_softsign.py index 217865fe46..31a2a1c2cf 100644 --- 
a/test/pytest/test_softsign.py +++ b/test/pytest/test_softsign.py @@ -14,11 +14,15 @@ @pytest.mark.parametrize('input_shape, io_type', [((8,), 'io_parallel'), ((8,), 'io_stream'), ((8, 8, 3), 'io_stream')]) def test_softsign(backend, input_shape, io_type): X = np.random.rand(1000, *input_shape) + X = np.round(X * 2**10) * 2**-10 model = tf.keras.models.Sequential() model.add(tf.keras.layers.Activation(input_shape=input_shape, activation='softsign', name='softsign')) model.compile() - cfg = hls4ml.utils.config_from_keras_model(model, granularity='name') + cfg = hls4ml.utils.config_from_keras_model(model, granularity='name', default_precision='fixed<20,4>') + # Since softsign implementation is lookup-based increasing the precision and size of the table helps with accuracy + cfg['LayerName']['softsign']['table_t'] = 'fixed<20,4>' + cfg['LayerName']['softsign']['table_size'] = 2048 odir = str(test_root_path / f'hls4mlprj_softsign_{backend}_{io_type}_{str(input_shape)}') hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=cfg, io_type=io_type, output_dir=odir, backend=backend From 440901b914f3cdc1d9a1aa24f22a2fba810cbd6b Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Thu, 18 Apr 2024 16:58:15 +0200 Subject: [PATCH 032/103] Use quantized input in binary CNN test --- test/pytest/test_binary_cnn.py | 47 +++++++++++++++++----------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/test/pytest/test_binary_cnn.py b/test/pytest/test_binary_cnn.py index 7114e47263..40af056df9 100644 --- a/test/pytest/test_binary_cnn.py +++ b/test/pytest/test_binary_cnn.py @@ -33,57 +33,57 @@ def test_binary_cnn(backend, io_type, strategy): x = QConv2D( 4, (3, 3), - kernel_quantizer="binary", - name="conv2d_1", + kernel_quantizer='binary', + name='conv2d_1', kernel_regularizer=l2(0.0001), use_bias=True, bias_quantizer='quantized_bits(5,2)', )(x_in) x = QBatchNormalization()(x) - x = QActivation("binary", name="act1")(x) + x = QActivation('binary', 
name='act1')(x) - x = QConv2D(8, (3, 3), kernel_quantizer="binary", name="conv2d_2", kernel_regularizer=l2(0.0001), use_bias=False)(x) + x = QConv2D(8, (3, 3), kernel_quantizer='binary', name='conv2d_2', kernel_regularizer=l2(0.0001), use_bias=False)(x) x = QBatchNormalization()(x) - x = QActivation("binary", name="act2")(x) + x = QActivation('binary', name='act2')(x) x = MaxPooling2D(pool_size=(2, 2))(x) - x = QConv2D(8, (3, 3), kernel_quantizer="binary", name="conv2d_3", kernel_regularizer=l2(0.0001), use_bias=False)(x) + x = QConv2D(8, (3, 3), kernel_quantizer='binary', name='conv2d_3', kernel_regularizer=l2(0.0001), use_bias=False)(x) x = QBatchNormalization()(x) - x = QActivation("binary", name="act3")(x) + x = QActivation('binary', name='act3')(x) x = MaxPooling2D(pool_size=(2, 2))(x) x = Flatten()(x) - x = QDense(10, kernel_quantizer="binary", name="q_dense_6", use_bias=False)(x) + x = QDense(10, kernel_quantizer='binary', name='q_dense_6', use_bias=False)(x) x = QBatchNormalization()(x) - x = QActivation("binary_tanh", name="act4")(x) + x = QActivation('binary_tanh', name='act4')(x) - x = QDense(10, kernel_quantizer="binary", activation="linear", name="q_dense_7", use_bias=False)(x) + x = QDense(10, kernel_quantizer='binary', activation='linear', name='q_dense_7', use_bias=False)(x) model2 = Model(inputs=x_in, outputs=x) - model2.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"]) + model2.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy']) model2.summary() - hls_config = hls4ml.utils.config_from_keras_model(model2, granularity="name", default_precision='fixed<32,12>') - hls_config["Model"]["Strategy"] = strategy + hls_config = hls4ml.utils.config_from_keras_model(model2, granularity='name', default_precision='fixed<32,12>') + hls_config['Model']['Strategy'] = strategy - # hls_config["LayerName"]["q_dense_7_softmax"]["Implementation"] = "legacy" + # 
hls_config['LayerName']['q_dense_7_softmax']['Implementation'] = 'legacy' - hls_config["LayerName"]["conv2d_1"]["ReuseFactor"] = 9 - hls_config["LayerName"]["conv2d_2"]["ReuseFactor"] = 36 - hls_config["LayerName"]["conv2d_3"]["ReuseFactor"] = 72 - hls_config["LayerName"]["q_dense_6"]["ReuseFactor"] = 2000 - hls_config["LayerName"]["q_dense_7"]["ReuseFactor"] = 100 + hls_config['LayerName']['conv2d_1']['ReuseFactor'] = 9 + hls_config['LayerName']['conv2d_2']['ReuseFactor'] = 36 + hls_config['LayerName']['conv2d_3']['ReuseFactor'] = 72 + hls_config['LayerName']['q_dense_6']['ReuseFactor'] = 2000 + hls_config['LayerName']['q_dense_7']['ReuseFactor'] = 100 if backend == 'Quartus' and io_type == 'io_parallel': # Winegrad imp[lementation does not support binary - hls_config["LayerName"]["conv2d_1"]["Implementation"] = "im2col" - hls_config["LayerName"]["conv2d_2"]["Implementation"] = "im2col" - hls_config["LayerName"]["conv2d_3"]["Implementation"] = "im2col" + hls_config['LayerName']['conv2d_1']['Implementation'] = 'im2col' + hls_config['LayerName']['conv2d_2']['Implementation'] = 'im2col' + hls_config['LayerName']['conv2d_3']['Implementation'] = 'im2col' - output_dir = str(test_root_path / f"hls4mlprj_binary_cnn_{backend}_{io_type}_{strategy}") + output_dir = str(test_root_path / f'hls4mlprj_binary_cnn_{backend}_{io_type}_{strategy}') hls_model = hls4ml.converters.convert_from_keras_model( model2, hls_config=hls_config, @@ -93,6 +93,7 @@ def test_binary_cnn(backend, io_type, strategy): ) X = np.random.rand(100, 28, 28, 1) + X = np.round(X * 2**10) * 2**-10 hls_model.compile() y = model2.predict(X) # noqa: F841 From c351a0201fe0ae65fed8438bede03f3e695220bb Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Sun, 20 Aug 2023 22:46:26 +0200 Subject: [PATCH 033/103] Add UnspecifiedPrecisionType --- hls4ml/model/types.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/hls4ml/model/types.py b/hls4ml/model/types.py index 
ba926b11dc..fb5cde3863 100644 --- a/hls4ml/model/types.py +++ b/hls4ml/model/types.py @@ -223,6 +223,17 @@ def __str__(self): return typestring +class UnspecifiedPrecisionType(PrecisionType): + """ + Class representing an unspecified precision type. + + Instances of this class are expected to be replaced with concrete precision types during conversion. + """ + + def __init__(self): + super().__init__(width=0, signed=False) + + def find_minimum_width(data, signed=True): """ Helper function to find the minimum integer width to express all entries in the data array @@ -437,7 +448,9 @@ def __next__(self): def update_precision(self, new_precision): self.type.precision = new_precision - if isinstance(new_precision, (IntegerPrecisionType, XnorPrecisionType, ExponentPrecisionType)): + if isinstance(new_precision, UnspecifiedPrecisionType): + self.precision_fmt = '' # Temporarily set precision to undefined value + elif isinstance(new_precision, (IntegerPrecisionType, XnorPrecisionType, ExponentPrecisionType)): self.precision_fmt = '{:.0f}' elif isinstance(new_precision, FixedPrecisionType): decimal_spaces = max(0, new_precision.fractional) From 4d9d35a32e9a12e7be1c388fd76f8364dcb1e0bc Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Sun, 20 Aug 2023 22:49:18 +0200 Subject: [PATCH 034/103] Rudimentary optimizer to infer 'auto' precision --- hls4ml/backends/fpga/fpga_backend.py | 14 +- hls4ml/backends/quartus/quartus_backend.py | 2 +- hls4ml/backends/vivado/vivado_backend.py | 2 +- hls4ml/model/optimizer/__init__.py | 2 + .../model/optimizer/passes/infer_precision.py | 290 ++++++++++++++++++ 5 files changed, 307 insertions(+), 3 deletions(-) create mode 100644 hls4ml/model/optimizer/passes/infer_precision.py diff --git a/hls4ml/backends/fpga/fpga_backend.py b/hls4ml/backends/fpga/fpga_backend.py index 8cfaec8b3f..8d0ed64aad 100644 --- a/hls4ml/backends/fpga/fpga_backend.py +++ b/hls4ml/backends/fpga/fpga_backend.py @@ -34,8 +34,10 @@ ExponentPrecisionType, 
FixedPrecisionType, IntegerPrecisionType, + PrecisionType, RoundingMode, SaturationMode, + UnspecifiedPrecisionType, XnorPrecisionType, ) from hls4ml.writer import get_writer @@ -290,9 +292,12 @@ def get_valid_conv_partition_splits(self, out_height, out_width): @classmethod def convert_precision_string(cls, precision): - if isinstance(precision, IntegerPrecisionType) or isinstance(precision, FixedPrecisionType): + if isinstance(precision, PrecisionType): return precision + if precision.lower() == 'auto': + return cls._convert_auto_type(precision) + if precision.startswith('ac_'): return cls._convert_ac_type(precision) else: @@ -366,6 +371,13 @@ def _convert_ac_type(cls, precision): elif 'int' in precision: return IntegerPrecisionType(width, signed) + @classmethod + def _convert_auto_type(cls, precision): + ''' + Convert a "auto" precision string into the UnspecifiedPrecisionType + ''' + return UnspecifiedPrecisionType() + def product_type(self, data_T, weight_T): ''' Helper function to determine which product implementation to use during inference diff --git a/hls4ml/backends/quartus/quartus_backend.py b/hls4ml/backends/quartus/quartus_backend.py index 8e81e0a2e9..cf84016aee 100644 --- a/hls4ml/backends/quartus/quartus_backend.py +++ b/hls4ml/backends/quartus/quartus_backend.py @@ -72,7 +72,7 @@ def _register_flows(self): 'quartus:inplace_parallel_reshape', 'quartus:inplace_stream_flatten', 'quartus:skip_softmax', - 'quartus:fix_softmax_table_size', + 'infer_precision_types', ] optimization_flow = register_flow('optimize', optimization_passes, requires=[init_flow], backend=self.name) diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 6bd57d6a88..47974e10c3 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -109,7 +109,7 @@ def _register_flows(self): 'vivado:inplace_parallel_reshape', 'vivado:inplace_stream_flatten', 'vivado:skip_softmax', - 
'vivado:fix_softmax_table_size', + 'infer_precision_types', ] optimization_flow = register_flow('optimize', optimization_passes, requires=[init_flow], backend=self.name) diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py index 2e9b197475..5eab99db8a 100644 --- a/hls4ml/model/optimizer/__init__.py +++ b/hls4ml/model/optimizer/__init__.py @@ -33,6 +33,7 @@ register_flow( 'convert', [ + 'infer_precision_types', 'channels_last_converter', 'fuse_bias_add', 'remove_useless_transpose', @@ -51,6 +52,7 @@ 'fuse_consecutive_batch_normalization', 'fuse_batch_normalization', 'replace_multidimensional_dense_with_conv', + 'infer_precision_types', 'set_precision_concat', ], requires=['convert'], diff --git a/hls4ml/model/optimizer/passes/infer_precision.py b/hls4ml/model/optimizer/passes/infer_precision.py new file mode 100644 index 0000000000..5ef1c2dee5 --- /dev/null +++ b/hls4ml/model/optimizer/passes/infer_precision.py @@ -0,0 +1,290 @@ +from copy import deepcopy + +import numpy as np + +from hls4ml.model.optimizer import OptimizerPass +from hls4ml.model.types import FixedPrecisionType, UnspecifiedPrecisionType + + +class InferPrecisionTypes(OptimizerPass): + def match(self, node): + for layer_type in node.types.values(): + if isinstance(layer_type.precision, UnspecifiedPrecisionType): + return True + return False + + def transform(self, model, node): + types_to_infer = [] + for type_name, type_obj in node.types.items(): + if isinstance(type_obj.precision, UnspecifiedPrecisionType): + types_to_infer.append(type_name) + + inferred_types = self._infer_precision(node, types_to_infer) + for type_name in types_to_infer: + if type_name not in inferred_types: + self._infer_default_type(node, type_name) + + return False # No model graph changes made + + def _infer_precision(self, node, types_to_infer): + node_class = node.class_name + if node_class in ['Dense']: + return self._infer_dense_precision(node, types_to_infer) + + if node_class in 
['BatchNormalization']: + return self._infer_bn_precision(node, types_to_infer) + + if node_class in ['Conv1D', 'Conv2D', 'PointwiseConv1D', 'PointwiseConv2D', 'Conv2DBatchnorm']: + return self._infer_conv_precision(node, types_to_infer) + + if node_class in ['SeparableConv1D', 'SeparableConv2D', 'DepthwiseConv2D']: + return self._infer_sepconv_precision(node, types_to_infer) + + if node_class in ['Pooling1D', 'Pooling2D']: + return self._infer_pooling_precision(node, types_to_infer) + + if node_class in ['Clone', 'Reshape', 'Resize', 'Transpose', 'ZeroPadding1D', 'ZeroPadding2D']: + return self._infer_output_matching_precision(node, types_to_infer) + + if node_class in ['Concatenate', 'Merge']: + return self._infer_merge_precision(node, types_to_infer) + + # What about quantized activation layer? Setting it to 'auto' manually will break it here. We should prevent + # this in config_from_* functions + + return [] + + def _infer_default_type(self, node, type_name): + model_config = node.model.config + default_precision = model_config.backend.convert_precision_string(model_config.model_precision['default']) + # No need to change the name of the NamedType since we use the default precision + node.types[type_name].precision = default_precision + + def _infer_output_matching_precision(self, node, types_to_infer): + assert 'result_t' in types_to_infer and len(types_to_infer) == 1 + + in_var = node.get_input_variable() + out_var = node.get_output_variable() + in_out_type = in_var.type.precision + out_var.type.precision = in_out_type + + return ['result_t'] + + def _infer_common_precision(self, node, types_to_infer, n_ops): + inferred_types = [] + + input_precision = node.get_input_variable().type.precision + input_width = input_precision.width + input_integers = input_precision.integer + + if 'weight_t' in types_to_infer: + weight_quantizer = node.get_attr('weight_quantizer', None) + if weight_quantizer is not None: + weight_width = weight_quantizer.bits + weight_integers 
= weight_quantizer.hls_type.integer + node.types['weight_t'].name = node.name + '_weight_t' + node.types['weight_t'].precision = weight_quantizer.hls_type + else: + self._infer_default_type(node, 'weight_t') + weight_width = node.types['weight_t'].precision.width + weight_integers = node.types['weight_t'].precision.integer + node.weights['weight'].update_precision(node.types['weight_t'].precision) + + inferred_types.append('weight_t') + else: + weight_width = node.types['weight_t'].precision.width + weight_integers = node.types['weight_t'].precision.integer + + if 'bias_t' in types_to_infer: + bias_quantizer = node.get_attr('bias_quantizer', None) + if bias_quantizer is not None: + bias_width = bias_quantizer.bits + bias_integers = bias_quantizer.hls_type.integer + node.types['bias_t'].name = node.name + '_bias_t' + node.types['bias_t'].precision = bias_quantizer.hls_type + else: + self._infer_default_type(node, 'bias_t') + bias_width = node.types['bias_t'].precision.width + bias_integers = node.types['bias_t'].precision.integer + node.weights['bias'].update_precision(node.types['bias_t'].precision) + + inferred_types.append('bias_t') + else: + bias_width = node.types['bias_t'].precision.width + bias_integers = node.types['bias_t'].precision.integer + + new_type = FixedPrecisionType( + width=int(max(np.ceil(input_width + weight_width + np.log2(n_ops)), bias_width) + 1), + integer=int(max(np.ceil(input_integers + weight_integers + np.log2(n_ops)), bias_integers) + 1), + ) + + if 'accum_t' in types_to_infer: + node.types['accum_t'].name = node.name + '_accum_t' + node.types['accum_t'].precision = new_type + + inferred_types.append('accum_t') + + if 'result_t' in types_to_infer: + node.types['result_t'].name = node.name + '_result_t' + node.types['result_t'].precision = new_type + + inferred_types.append('result_t') + + return inferred_types + + def _infer_dense_precision(self, node, types_to_infer): + n_ops = node.get_attr('n_in') * node.get_attr('n_out') + return 
self._infer_common_precision(node, types_to_infer, n_ops) + + def _infer_conv_precision(self, node, types_to_infer): + n_ops = node.get_attr('n_chan') * node.get_attr('filt_height', 1) * node.get_attr('filt_width') + return self._infer_common_precision(node, types_to_infer, n_ops) + + def _infer_sepconv_precision(self, node, types_to_infer): + inferred_types = [] + + input_precision = node.get_input_variable().type.precision + input_width = input_precision.width + input_integers = input_precision.integer + + if 'depthwise_t' in types_to_infer: + # TODO Current HLS implementations use data_T (input type) as the result hence this doesn't affect the output + # precision ATM, but this will probably change in the future + depthwise_quantizer = node.get_attr('depthwise_quantizer', None) + if depthwise_quantizer is not None: + node.types['depthwise_t'].name = node.name + '_depthwise_t' + node.types['depthwise_t'].precision = depthwise_quantizer.hls_type + else: + self._infer_default_type(node, 'depthwise_t') + node.weights['depthwise'].update_precision(node.types['depthwise_t'].precision) + + inferred_types.append('depthwise_t') + + if 'pointwise_t' in types_to_infer: + pointwise_quantizer = node.get_attr('pointwise_quantizer', None) + if pointwise_quantizer is not None: + pointwise_width = pointwise_quantizer.bits + pointwise_integers = pointwise_quantizer.hls_type.integer + node.types['pointwise_t'].name = node.name + '_pointwise_t' + node.types['pointwise_t'].precision = pointwise_quantizer.hls_type + else: + self._infer_default_type(node, 'pointwise_t') + pointwise_width = node.types['pointwise_t'].precision.width + pointwise_integers = node.types['pointwise_t'].precision.integer + node.weights['pointwise'].update_precision(node.types['pointwise_t'].precision) + + inferred_types.append('pointwise_t') + else: + pointwise_width = node.types['pointwise_t'].precision.width + pointwise_integers = node.types['pointwise_t'].precision.integer + + if 'bias_t' in 
types_to_infer: + bias_quantizer = node.get_attr('bias_quantizer', None) + if bias_quantizer is not None: + bias_width = bias_quantizer.bits + bias_integers = bias_quantizer.hls_type.integer + node.types['bias_t'].name = node.name + '_bias_t' + node.types['bias_t'].precision = bias_quantizer.hls_type + else: + self._infer_default_type(node, 'bias_t') + bias_width = node.types['bias_t'].precision.width + bias_integers = node.types['bias_t'].precision.integer + node.weights['bias'].update_precision(node.types['bias_t'].precision) + + inferred_types.append('bias_t') + else: + bias_width = node.types['bias_t'].precision.width + bias_integers = node.types['bias_t'].precision.integer + + n_ops = node.get_attr('n_chan') + new_type = FixedPrecisionType( + width=int(max(np.ceil(input_width + pointwise_width + np.log2(n_ops)), bias_width) + 1), + integer=int(max(np.ceil(input_integers + pointwise_integers + np.log2(n_ops)), bias_integers) + 1), + ) + + if 'accum_t' in types_to_infer: + node.types['accum_t'].name = node.name + '_accum_t' + node.types['accum_t'].precision = new_type + + inferred_types.append('accum_t') + + if 'result_t' in types_to_infer: + node.types['result_t'].name = node.name + '_result_t' + node.types['result_t'].precision = new_type + + inferred_types.append('result_t') + + return inferred_types + + def _infer_bn_precision(self, node, types_to_infer): + inferred_types = [] + + if 'scale_t' in types_to_infer: + self._infer_default_type(node, 'scale_t') + node.weights['scale'].update_precision(node.types['scale_t'].precision) + inferred_types.append('scale_t') + + if 'bias_t' in types_to_infer: + self._infer_default_type(node, 'bias_t') + node.weights['bias'].update_precision(node.types['bias_t'].precision) + inferred_types.append('bias_t') + + if 'result_t' in types_to_infer: + scale_precision = node.types['scale_t'].precision + bias_precision = node.types['bias_t'].precision + + out_precision = 
deepcopy(node.get_input_node().get_output_variable().type.precision) + out_precision.integer += scale_precision.integer + out_precision.fractional = max(out_precision.fractional, scale_precision.fractional) + + out_precision.integer = max(out_precision.integer, bias_precision.integer) + 1 + out_precision.fractional = max(out_precision.fractional, bias_precision.fractional) + out_precision.width = out_precision.fractional + out_precision.integer + + node.types['result_t'].name = node.name + '_result_t' + node.types['result_t'].precision = out_precision + + inferred_types.append('result_t') + + return inferred_types + + def _infer_pooling_precision(self, node, types_to_infer): + inferred_types = [] + + if 'accum_t' in types_to_infer: + input_precision = node.get_input_variable().type.precision + input_width = input_precision.width + input_integers = input_precision.integer + + n_ops = node.get_attr('n_filt') * node.get_attr('pool_height', 1) * node.get_attr('pool_width') + + accum_type = FixedPrecisionType( + width=int(np.ceil(input_width + np.log2(n_ops)) + 1), + integer=int(np.ceil(input_integers + np.log2(n_ops)) + 1), + ) + + node.types['accum_t'].name = node.name + '_accum_t' + node.types['accum_t'].precision = accum_type + + inferred_types.append('accum_t') + + if 'result_t' in types_to_infer: + self._infer_output_matching_precision(node, ['result_t']) + inferred_types.append('result_t') + + return inferred_types + + def _infer_merge_precision(self, node, types_to_infer): + assert 'result_t' in types_to_infer and len(types_to_infer) == 1 + + input_1 = node.get_input_variable(node.inputs[0]).type.precision + input_2 = node.get_input_variable(node.inputs[1]).type.precision + + new_width = max(input_1.fractional, input_2.fractional) + max(input_1.integer, input_2.integer) + new_int = max(input_1.integer, input_2.integer) + + out_precision = FixedPrecisionType(new_width, new_int) + node.types['result_t'].name = node.name + '_result_t' + 
node.types['result_t'].precision = out_precision + + return ['result_t'] From 32ae9b6362c8e99538fd23e1d50fb3817170a13b Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Sun, 20 Aug 2023 22:50:00 +0200 Subject: [PATCH 035/103] Auto precision test --- test/pytest/test_auto_precision.py | 255 +++++++++++++++++++++++++++++ 1 file changed, 255 insertions(+) create mode 100644 test/pytest/test_auto_precision.py diff --git a/test/pytest/test_auto_precision.py b/test/pytest/test_auto_precision.py new file mode 100644 index 0000000000..8454d1a7f8 --- /dev/null +++ b/test/pytest/test_auto_precision.py @@ -0,0 +1,255 @@ +from pathlib import Path + +import numpy as np +import pytest +from tensorflow.keras.layers import ( + AveragePooling1D, + AveragePooling2D, + BatchNormalization, + Conv1D, + Conv2D, + Dense, + Flatten, + ReLU, + SeparableConv1D, + SeparableConv2D, +) +from tensorflow.keras.models import Sequential + +import hls4ml + +test_root_path = Path(__file__).parent + +in_height = 10 +in_width = 12 +in_feat = 4 + + +@pytest.fixture(scope='module') +def data_1d(): + X = np.random.rand(100, in_feat) + return X + + +@pytest.fixture(scope='module') +def data_2d(): + X = np.random.rand(100, in_width, in_feat) + return X + + +@pytest.fixture(scope='module') +def data_3d(): + X = np.random.rand(100, in_height, in_width, in_feat) + return X + + +@pytest.fixture(scope='module') +def keras_model_dense(): + model = Sequential() + model.add(Dense(8, activation='relu', input_shape=(in_feat,), name='first_layer')) + model.add(BatchNormalization(name='first_bn')) + model.add(Dense(6, activation='relu', name='middle_layer')) + model.add(BatchNormalization(name='middle_bn')) + model.add(Dense(4, activation='relu', name='last_layer')) + model.compile() + return model + + +@pytest.fixture(scope='module') +def keras_model_conv1d(): + model = Sequential() + model.add(Conv1D(8, kernel_size=3, activation='linear', name='first_layer', input_shape=(in_width, in_feat))) + 
model.add(AveragePooling1D(pool_size=2, name='first_pool')) + model.add(ReLU(name='first_act')) + model.add(Conv1D(4, kernel_size=2, activation='relu', name='middle_layer')) + model.add(Conv1D(4, kernel_size=1, activation='relu', name='last_layer')) # Will become PointwiseConv1D + model.add(Flatten()) + model.add(Dense(4, activation='relu')) + model.compile() + return model + + +@pytest.fixture(scope='module') +def keras_model_conv2d(): + model = Sequential() + model.add( + Conv2D(8, kernel_size=(3, 3), activation='linear', name='first_layer', input_shape=(in_height, in_width, in_feat)) + ) + model.add(AveragePooling2D(pool_size=(2, 2), name='first_pool')) + model.add(ReLU(name='first_act')) + model.add(Conv2D(4, kernel_size=(3, 3), activation='relu', name='middle_layer')) + model.add(Conv2D(4, kernel_size=(1, 1), activation='relu', name='last_layer')) # Will become PointwiseConv2D + model.add(Flatten()) + model.add(Dense(4, activation='relu')) + model.compile() + return model + + +@pytest.fixture(scope='module') +def keras_model_sepconv1d(): + model = Sequential() + model.add(SeparableConv1D(8, kernel_size=3, activation='linear', name='first_layer', input_shape=(in_width, in_feat))) + model.add(AveragePooling1D(pool_size=2, name='first_pool')) + model.add(ReLU(name='first_act')) + model.add(Conv1D(4, kernel_size=2, activation='relu', name='middle_layer')) + model.add(Conv1D(4, kernel_size=1, activation='relu', name='last_layer')) # Will become PointwiseConv1D + model.add(Flatten()) + model.add(Dense(4, activation='relu')) + model.compile() + return model + + +@pytest.fixture(scope='module') +def keras_model_sepconv2d(): + model = Sequential() + model.add( + SeparableConv2D( + 8, kernel_size=(3, 3), activation='linear', name='first_layer', input_shape=(in_height, in_width, in_feat) + ) + ) + model.add(AveragePooling2D(pool_size=(2, 2), name='first_pool')) + model.add(ReLU(name='first_act')) + model.add(Conv2D(4, kernel_size=(3, 3), activation='relu', 
name='middle_layer')) + model.add(Conv2D(4, kernel_size=(1, 1), activation='relu', name='last_layer')) # Will become PointwiseConv2D + model.add(Flatten()) + model.add(Dense(4, activation='relu')) + model.compile() + return model + + +@pytest.mark.parametrize('io_type', ['io_stream', 'io_parallel']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +@pytest.mark.parametrize('model_type', ['conv1d', 'conv2d']) +def test_auto_precision_conv(keras_model_conv1d, keras_model_conv2d, data_2d, data_3d, model_type, io_type, backend): + if model_type == 'conv1d': + model = keras_model_conv1d + data = data_2d + else: + model = keras_model_conv2d + data = data_3d + + config = hls4ml.utils.config_from_keras_model(model, default_precision='ap_fixed<16,6>', granularity='model') + config['LayerName'] = { + # Infer all types of these layers + 'first_layer': { + 'Precision': 'auto', + }, + 'first_pool': { + 'Precision': 'auto', + }, + # Infer only a few specific types for these layers + 'middle_layer': { + 'Precision': { + 'accum': 'auto', + 'weight': 'auto', + }, + }, + 'last_layer': { + 'Precision': { + 'result': 'auto', + }, + }, + } + odir = str(test_root_path / f'hls4mlprj_auto_{model_type}_{backend}_{io_type}') + hls_model = hls4ml.converters.convert_from_keras_model( + model, hls_config=config, io_type=io_type, output_dir=odir, backend=backend + ) + + # Compile will fail if there are still UnspecifiedPrecisionTypes in the model + hls_model.compile() + + # Predict + y_keras = model.predict(data).flatten() + y_hls = hls_model.predict(data).flatten() + np.testing.assert_allclose(y_keras, y_hls, rtol=2e-2, atol=5e-2, verbose=True) + + +@pytest.mark.parametrize('io_type', ['io_stream']) # Until we implement SeparableConv1D/2D for io_parallel +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis']) # No SeparableConv1D/2D in Quartus +@pytest.mark.parametrize('model_type', ['sepconv1d', 'sepconv2d']) +def test_auto_precision_sepconv( + keras_model_sepconv1d, 
keras_model_sepconv2d, data_2d, data_3d, model_type, io_type, backend +): + if model_type == 'sepconv1d': + model = keras_model_sepconv1d + data = data_2d + else: + model = keras_model_sepconv2d + data = data_3d + + config = hls4ml.utils.config_from_keras_model(model, default_precision='ap_fixed<16,6>', granularity='model') + config['LayerName'] = { + # Infer all types of these layers + 'first_layer': { + 'Precision': 'auto', + }, + 'first_pool': { + 'Precision': 'auto', + }, + # Infer only a few specific types for these layers + 'middle_layer': { + 'Precision': { + 'accum': 'auto', + 'weight': 'auto', + }, + }, + 'last_layer': { + 'Precision': { + 'result': 'auto', + }, + }, + } + odir = str(test_root_path / f'hls4mlprj_auto_{model_type}_{backend}_{io_type}') + hls_model = hls4ml.converters.convert_from_keras_model( + model, hls_config=config, io_type=io_type, output_dir=odir, backend=backend + ) + + # Compile will fail if there are still UnspecifiedPrecisionTypes in the model + hls_model.compile() + + # Predict + y_keras = model.predict(data).flatten() + y_hls = hls_model.predict(data).flatten() + np.testing.assert_allclose(y_keras, y_hls, rtol=2e-2, atol=5e-2, verbose=True) + + +@pytest.mark.parametrize('io_type', ['io_stream', 'io_parallel']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +def test_auto_precision_dense(keras_model_dense, data_1d, io_type, backend): + model = keras_model_dense + data = data_1d + + config = hls4ml.utils.config_from_keras_model(model, default_precision='ap_fixed<16,6>', granularity='model') + config['LayerName'] = { + # Infer all types of these layers + 'first_layer': { + 'Precision': 'auto', + }, + 'first_bn': { + 'Precision': 'auto', + }, + # Infer only a few specific types for these layers + 'middle_layer': { + 'Precision': { + 'accum': 'auto', + 'weight': 'auto', + }, + }, + 'last_layer': { + 'Precision': { + 'result': 'auto', + }, + }, + } + odir = str(test_root_path / 
f'hls4mlprj_auto_dense_{backend}_{io_type}') + hls_model = hls4ml.converters.convert_from_keras_model( + model, hls_config=config, io_type=io_type, output_dir=odir, backend=backend + ) + + # Compile will fail if there are still UnspecifiedPrecisionTypes in the model + hls_model.compile() + + # Predict + y_keras = model.predict(data).flatten() + y_hls = hls_model.predict(data).flatten() + np.testing.assert_allclose(y_keras, y_hls, rtol=2e-2, atol=5e-2, verbose=True) From 932b01e235a0ea2e3a7183e15fa41c7cfb8409de Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Sun, 20 Aug 2023 22:50:18 +0200 Subject: [PATCH 036/103] Sepconv fixes --- hls4ml/backends/vivado/passes/convolution_templates.py | 1 + 1 file changed, 1 insertion(+) diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py index 97972be36a..c990f08be0 100644 --- a/hls4ml/backends/vivado/passes/convolution_templates.py +++ b/hls4ml/backends/vivado/passes/convolution_templates.py @@ -285,6 +285,7 @@ def format(self, node): params['nzeros'] = node.get_weights('depthwise').nzeros params['index'] = str(node.index) + '_depthwise' params['weight_t'] = node.get_weights('depthwise').type + params['bias_t'] = node.get_weights('zero_bias').type params['fill_fn'] = 'FillConv1DBuffer' if node.get_attr('unscaled'): From 6a65fed1258c8f7070dc198c9eb49095b7f47511 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 26 Jan 2024 16:55:42 -0600 Subject: [PATCH 037/103] update precision propagation for signed, select im2col for quartus parallel conv --- .../model/optimizer/passes/infer_precision.py | 49 ++++++++++--------- test/pytest/test_auto_precision.py | 7 +++ 2 files changed, 34 insertions(+), 22 deletions(-) diff --git a/hls4ml/model/optimizer/passes/infer_precision.py b/hls4ml/model/optimizer/passes/infer_precision.py index 5ef1c2dee5..d2e166b557 100644 --- a/hls4ml/model/optimizer/passes/infer_precision.py +++ 
b/hls4ml/model/optimizer/passes/infer_precision.py @@ -1,3 +1,4 @@ +import math from copy import deepcopy import numpy as np @@ -70,53 +71,57 @@ def _infer_output_matching_precision(self, node, types_to_infer): return ['result_t'] - def _infer_common_precision(self, node, types_to_infer, n_ops): + def _infer_common_precision(self, node, types_to_infer, n_ops, use_given_weights=False): + '''The option, use_given_weights, allows you to tailor for the given weights, in particular, zero bias''' inferred_types = [] input_precision = node.get_input_variable().type.precision input_width = input_precision.width input_integers = input_precision.integer + input_signed = input_precision.signed if 'weight_t' in types_to_infer: weight_quantizer = node.get_attr('weight_quantizer', None) if weight_quantizer is not None: - weight_width = weight_quantizer.bits - weight_integers = weight_quantizer.hls_type.integer node.types['weight_t'].name = node.name + '_weight_t' node.types['weight_t'].precision = weight_quantizer.hls_type else: self._infer_default_type(node, 'weight_t') - weight_width = node.types['weight_t'].precision.width - weight_integers = node.types['weight_t'].precision.integer node.weights['weight'].update_precision(node.types['weight_t'].precision) - inferred_types.append('weight_t') - else: - weight_width = node.types['weight_t'].precision.width - weight_integers = node.types['weight_t'].precision.integer + + weight_width = node.types['weight_t'].precision.width + weight_integers = node.types['weight_t'].precision.integer + weight_signed = node.types['weight_t'].precision.signed if 'bias_t' in types_to_infer: bias_quantizer = node.get_attr('bias_quantizer', None) if bias_quantizer is not None: - bias_width = bias_quantizer.bits - bias_integers = bias_quantizer.hls_type.integer node.types['bias_t'].name = node.name + '_bias_t' node.types['bias_t'].precision = bias_quantizer.hls_type else: self._infer_default_type(node, 'bias_t') - bias_width = 
node.types['bias_t'].precision.width - bias_integers = node.types['bias_t'].precision.integer node.weights['bias'].update_precision(node.types['bias_t'].precision) - inferred_types.append('bias_t') - else: - bias_width = node.types['bias_t'].precision.width - bias_integers = node.types['bias_t'].precision.integer - new_type = FixedPrecisionType( - width=int(max(np.ceil(input_width + weight_width + np.log2(n_ops)), bias_width) + 1), - integer=int(max(np.ceil(input_integers + weight_integers + np.log2(n_ops)), bias_integers) + 1), - ) + bias_width = node.types['bias_t'].precision.width + bias_integers = node.types['bias_t'].precision.integer + bias_signed = node.types['bias_t'].precision.signed + no_bias = node.weights['bias'].nonzeros == 0 and use_given_weights # no bias + + # using math.ceil instead of np.ceil because it returns an int + bitwidth = weight_width + input_width + math.ceil(np.log2(n_ops)) + integers = weight_integers + input_integers + math.ceil(np.log2(n_ops)) + signed = weight_signed or input_signed + + frac = bitwidth - integers + + if not no_bias: + integers = max(integers + (bias_signed and not signed), bias_integers + (signed and not bias_signed)) + 1 + bitwidth = integers + max(frac, bias_width - bias_integers) + signed = signed or bias_signed + + new_type = FixedPrecisionType(bitwidth, integers, signed) if 'accum_t' in types_to_infer: node.types['accum_t'].name = node.name + '_accum_t' @@ -133,7 +138,7 @@ def _infer_common_precision(self, node, types_to_infer, n_ops): return inferred_types def _infer_dense_precision(self, node, types_to_infer): - n_ops = node.get_attr('n_in') * node.get_attr('n_out') + n_ops = node.get_attr('n_in') return self._infer_common_precision(node, types_to_infer, n_ops) def _infer_conv_precision(self, node, types_to_infer): diff --git a/test/pytest/test_auto_precision.py b/test/pytest/test_auto_precision.py index 8454d1a7f8..cbb74aa12f 100644 --- a/test/pytest/test_auto_precision.py +++ 
b/test/pytest/test_auto_precision.py @@ -150,6 +150,13 @@ def test_auto_precision_conv(keras_model_conv1d, keras_model_conv2d, data_2d, da }, }, } + + # Winograd is not bit-accurate, so avoid it. + if backend == 'Quartus' and io_type == 'io_parallel': + config["LayerName"]["first_layer"]["Implementation"] = "im2col" + config["LayerName"]["middle_layer"]["Implementation"] = "im2col" + config["LayerName"]["last_layer"]["Implementation"] = "im2col" + odir = str(test_root_path / f'hls4mlprj_auto_{model_type}_{backend}_{io_type}') hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=config, io_type=io_type, output_dir=odir, backend=backend From 41b7e98d6386bed647305ace39ff0d07c2599905 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Tue, 6 Feb 2024 22:24:37 +0100 Subject: [PATCH 038/103] Make inferring no_bias a configurable option of the optimizer --- hls4ml/model/optimizer/passes/infer_precision.py | 13 ++++++++----- test/pytest/test_auto_precision.py | 6 +++--- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/hls4ml/model/optimizer/passes/infer_precision.py b/hls4ml/model/optimizer/passes/infer_precision.py index d2e166b557..6f6a72097f 100644 --- a/hls4ml/model/optimizer/passes/infer_precision.py +++ b/hls4ml/model/optimizer/passes/infer_precision.py @@ -3,11 +3,15 @@ import numpy as np -from hls4ml.model.optimizer import OptimizerPass +from hls4ml.model.optimizer import ConfigurableOptimizerPass from hls4ml.model.types import FixedPrecisionType, UnspecifiedPrecisionType -class InferPrecisionTypes(OptimizerPass): +class InferPrecisionTypes(ConfigurableOptimizerPass): + def __init__(self): + # The option, infer_no_bias, allows you to tailor for the given weights, in particular, zero bias + self.infer_no_bias = False + def match(self, node): for layer_type in node.types.values(): if isinstance(layer_type.precision, UnspecifiedPrecisionType): @@ -71,8 +75,7 @@ def _infer_output_matching_precision(self, node, types_to_infer): return 
['result_t'] - def _infer_common_precision(self, node, types_to_infer, n_ops, use_given_weights=False): - '''The option, use_given_weights, allows you to tailor for the given weights, in particular, zero bias''' + def _infer_common_precision(self, node, types_to_infer, n_ops): inferred_types = [] input_precision = node.get_input_variable().type.precision @@ -107,7 +110,7 @@ def _infer_common_precision(self, node, types_to_infer, n_ops, use_given_weights bias_width = node.types['bias_t'].precision.width bias_integers = node.types['bias_t'].precision.integer bias_signed = node.types['bias_t'].precision.signed - no_bias = node.weights['bias'].nonzeros == 0 and use_given_weights # no bias + no_bias = node.weights['bias'].nonzeros == 0 and self.infer_no_bias # no bias # using math.ceil instead of np.ceil because it returns an int bitwidth = weight_width + input_width + math.ceil(np.log2(n_ops)) diff --git a/test/pytest/test_auto_precision.py b/test/pytest/test_auto_precision.py index cbb74aa12f..356be1031a 100644 --- a/test/pytest/test_auto_precision.py +++ b/test/pytest/test_auto_precision.py @@ -153,9 +153,9 @@ def test_auto_precision_conv(keras_model_conv1d, keras_model_conv2d, data_2d, da # Winograd is not bit-accurate, so avoid it. 
if backend == 'Quartus' and io_type == 'io_parallel': - config["LayerName"]["first_layer"]["Implementation"] = "im2col" - config["LayerName"]["middle_layer"]["Implementation"] = "im2col" - config["LayerName"]["last_layer"]["Implementation"] = "im2col" + config['LayerName']['first_layer']['Implementation'] = 'im2col' + config['LayerName']['middle_layer']['Implementation'] = 'im2col' + config['LayerName']['last_layer']['Implementation'] = 'im2col' odir = str(test_root_path / f'hls4mlprj_auto_{model_type}_{backend}_{io_type}') hls_model = hls4ml.converters.convert_from_keras_model( From 24253e1b724f228f287447873eca53dbaf4e3644 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Tue, 16 Apr 2024 15:49:21 -0500 Subject: [PATCH 039/103] updates to infering precision from qonnx branch --- .../model/optimizer/passes/infer_precision.py | 141 ++++++++++++++++-- 1 file changed, 128 insertions(+), 13 deletions(-) diff --git a/hls4ml/model/optimizer/passes/infer_precision.py b/hls4ml/model/optimizer/passes/infer_precision.py index 6f6a72097f..4de58a18c2 100644 --- a/hls4ml/model/optimizer/passes/infer_precision.py +++ b/hls4ml/model/optimizer/passes/infer_precision.py @@ -1,18 +1,29 @@ import math -from copy import deepcopy import numpy as np from hls4ml.model.optimizer import ConfigurableOptimizerPass from hls4ml.model.types import FixedPrecisionType, UnspecifiedPrecisionType +# TODO: The code assumes everything is Fixed or Integer precision. 
Need to add checks + class InferPrecisionTypes(ConfigurableOptimizerPass): def __init__(self): # The option, infer_no_bias, allows you to tailor for the given weights, in particular, zero bias self.infer_no_bias = False + self.count = 0 + self.MAX_COUNT = 1000 def match(self, node): + input_var = node.get_input_variable() + if input_var is not None and isinstance(input_var.type, UnspecifiedPrecisionType): + # need to wait for the input to update + # but check for infinite loops + self.count += 1 + if self.count == self.MAX_COUNT: + raise RuntimeError("There is an infinite loop in the precision inference.") + return False for layer_type in node.types.values(): if isinstance(layer_type.precision, UnspecifiedPrecisionType): return True @@ -29,14 +40,14 @@ def transform(self, model, node): if type_name not in inferred_types: self._infer_default_type(node, type_name) - return False # No model graph changes made + return True # May need to rerun def _infer_precision(self, node, types_to_infer): node_class = node.class_name if node_class in ['Dense']: return self._infer_dense_precision(node, types_to_infer) - if node_class in ['BatchNormalization']: + if node_class in ['BatchNormalization', 'ApplyAlpha']: return self._infer_bn_precision(node, types_to_infer) if node_class in ['Conv1D', 'Conv2D', 'PointwiseConv1D', 'PointwiseConv2D', 'Conv2DBatchnorm']: @@ -51,14 +62,24 @@ def _infer_precision(self, node, types_to_infer): if node_class in ['Clone', 'Reshape', 'Resize', 'Transpose', 'ZeroPadding1D', 'ZeroPadding2D']: return self._infer_output_matching_precision(node, types_to_infer) - if node_class in ['Concatenate', 'Merge']: + if node_class in ['Merge']: return self._infer_merge_precision(node, types_to_infer) + if node_class in ['Concatenate']: + return self._infer_cat_precision(node, types_to_infer) + + if node_class in ['Dot']: + return self._infer_dot_precision(node, types_to_infer) + # What about quantized activation layer? 
Setting it to 'auto' manually will break it here. We should prevent # this in config_from_* functions return [] + def _get_default_precision(self, node): + model_config = node.model.config + return model_config.backend.convert_precision_string(model_config.model_precision['default']) + def _infer_default_type(self, node, type_name): model_config = node.model.config default_precision = model_config.backend.convert_precision_string(model_config.model_precision['default']) @@ -124,6 +145,7 @@ def _infer_common_precision(self, node, types_to_infer, n_ops): bitwidth = integers + max(frac, bias_width - bias_integers) signed = signed or bias_signed + # Note: this is guaranteed to not overflow or need rounding, so it's sufficient to use the simpler form. new_type = FixedPrecisionType(bitwidth, integers, signed) if 'accum_t' in types_to_infer: @@ -225,6 +247,11 @@ def _infer_sepconv_precision(self, node, types_to_infer): return inferred_types def _infer_bn_precision(self, node, types_to_infer): + """ + The batchnormalziation precision here is the more implementation-focused version. It propagates + precision from scale and bias, not mean, variance, etc. 
+ """ + inferred_types = [] if 'scale_t' in types_to_infer: @@ -238,16 +265,28 @@ def _infer_bn_precision(self, node, types_to_infer): inferred_types.append('bias_t') if 'result_t' in types_to_infer: + input_precision = node.get_input_variable().type.precision scale_precision = node.types['scale_t'].precision bias_precision = node.types['bias_t'].precision - out_precision = deepcopy(node.get_input_node().get_output_variable().type.precision) - out_precision.integer += scale_precision.integer - out_precision.fractional = max(out_precision.fractional, scale_precision.fractional) + after_scale_signed = scale_precision.signed or input_precision.signed + after_scale_width = input_precision.width + scale_precision.width + after_scale_integer = input_precision.integer + scale_precision.integer + + out_precision_signed = after_scale_signed or bias_precision.signed + out_precision_integer = ( + max( + after_scale_integer + (bias_precision.signed and not after_scale_signed), + bias_precision.integer + (after_scale_signed and not bias_precision.signed), + ) + + 1 + ) + out_precision_width = out_precision_integer + max( + after_scale_width - after_scale_integer, bias_precision.fractional + ) - out_precision.integer = max(out_precision.integer, bias_precision.integer) + 1 - out_precision.fractional = max(out_precision.fractional, bias_precision.fractional) - out_precision.width = out_precision.fractional + out_precision.integer + # Note: this is guaranteed to not overflow or need rounding, so it's sufficient to use the simpler form. 
+ out_precision = FixedPrecisionType(out_precision_width, out_precision_integer, out_precision_signed) node.types['result_t'].name = node.name + '_result_t' node.types['result_t'].precision = out_precision @@ -288,10 +327,86 @@ def _infer_merge_precision(self, node, types_to_infer): input_1 = node.get_input_variable(node.inputs[0]).type.precision input_2 = node.get_input_variable(node.inputs[1]).type.precision - new_width = max(input_1.fractional, input_2.fractional) + max(input_1.integer, input_2.integer) - new_int = max(input_1.integer, input_2.integer) + op = node.get_attr('op').lower() + if op in ('add', 'subtract', 'average'): + new_signed = input_1.signed or input_2.signed or op == 'subtract' + new_int = ( + max( + input_1.integer + (input_2.signed and not input_1.signed), + input_2.integer + (input_1.signed and not input_2.signed), + ) + + 1 + ) + new_width = new_int + max(input_1.fractional, input_2.fractional) + out_precision = FixedPrecisionType(new_width, new_int, new_signed) + elif op == 'multiply': + new_signed = input_1.signed or input_2.signed + new_int = input_1.integer + input_2.integer + new_width = input_1.width + input_2.width + out_precision = FixedPrecisionType(new_width, new_int, new_signed) + elif op in ('maximum', 'minimum'): + new_signed = input_1.signed or input_2.signed + + input_1_integer = input_1.integer + input_2_integer = input_2.integer + + # add one to integer if unsigned while new is signed + if new_signed and not input_1.signed: + input_1_integer += 1 + if new_signed and not input_2.signed: + input_2_integer += 1 + + new_width = max(input_1.fractional, input_2.fractional) + max(input_1_integer, input_2_integer) + new_int = max(input_1_integer, input_2_integer) + out_precision = FixedPrecisionType(new_width, new_int, new_signed) + else: + print(f'Warning: not propagating weights for type {op}') + out_precision = self._get_default_precision(node) + + node.types['result_t'].name = node.name + '_result_t' + 
node.types['result_t'].precision = out_precision + + return ['result_t'] + + def _infer_cat_precision(self, node, types_to_infer): + assert 'result_t' in types_to_infer and len(types_to_infer) == 1 + + input_1 = node.get_input_variable(node.inputs[0]).type.precision + input_2 = node.get_input_variable(node.inputs[1]).type.precision + + new_signed = input_1.signed or input_2.signed + + input_1_integer = input_1.integer + input_2_integer = input_2.integer + + # add one to integer if unsigned while new is signed + if new_signed and not input_1.signed: + input_1_integer += 1 + if new_signed and not input_2.signed: + input_2_integer += 1 + + new_width = max(input_1.fractional, input_2.fractional) + max(input_1_integer, input_2_integer) + new_int = max(input_1_integer, input_2_integer) + + out_precision = FixedPrecisionType(new_width, new_int, new_signed) + node.types['result_t'].name = node.name + '_result_t' + node.types['result_t'].precision = out_precision + + return ['result_t'] + + def _infer_dot_precision(self, node, types_to_infer): + assert 'result_t' in types_to_infer and len(types_to_infer) == 1 + + input_1 = node.get_input_variable(node.inputs[0]).type.precision + input_2 = node.get_input_variable(node.inputs[1]).type.precision + + n_in = node.get_input_variable(node.inputs[0]).shape[0] + + new_signed = input_1.signed or input_2.signed + new_width = input_1.width + input_2.width + math.ceil(np.log2(n_in)) + new_int = input_1.integer + input_2.integer + math.ceil(np.log2(n_in)) - out_precision = FixedPrecisionType(new_width, new_int) + out_precision = FixedPrecisionType(new_width, new_int, new_signed) node.types['result_t'].name = node.name + '_result_t' node.types['result_t'].precision = out_precision From 6ee81890973dfa92412b946ea17ebccdfbff6303 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 17 Apr 2024 09:59:54 -0500 Subject: [PATCH 040/103] remove count, become more selective on when True is returned --- 
hls4ml/model/optimizer/passes/infer_precision.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/hls4ml/model/optimizer/passes/infer_precision.py b/hls4ml/model/optimizer/passes/infer_precision.py index 4de58a18c2..ee585c42d6 100644 --- a/hls4ml/model/optimizer/passes/infer_precision.py +++ b/hls4ml/model/optimizer/passes/infer_precision.py @@ -12,17 +12,11 @@ class InferPrecisionTypes(ConfigurableOptimizerPass): def __init__(self): # The option, infer_no_bias, allows you to tailor for the given weights, in particular, zero bias self.infer_no_bias = False - self.count = 0 - self.MAX_COUNT = 1000 def match(self, node): input_var = node.get_input_variable() if input_var is not None and isinstance(input_var.type, UnspecifiedPrecisionType): - # need to wait for the input to update - # but check for infinite loops - self.count += 1 - if self.count == self.MAX_COUNT: - raise RuntimeError("There is an infinite loop in the precision inference.") + # only infer types if the input type is known return False for layer_type in node.types.values(): if isinstance(layer_type.precision, UnspecifiedPrecisionType): @@ -40,7 +34,9 @@ def transform(self, model, node): if type_name not in inferred_types: self._infer_default_type(node, type_name) - return True # May need to rerun + # if the return type was set, this may allow InferPrecisionTypes to be run + # on layers it was not previously able to + return 'result_t' in types_to_infer def _infer_precision(self, node, types_to_infer): node_class = node.class_name From b5add0caefe02dfd2412dfa3355d97f8a0a39980 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Tue, 16 Apr 2024 17:08:49 -0700 Subject: [PATCH 041/103] fix pooling precision --- .../model/optimizer/passes/infer_precision.py | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/hls4ml/model/optimizer/passes/infer_precision.py b/hls4ml/model/optimizer/passes/infer_precision.py index ee585c42d6..a38f61914a 100644 --- 
a/hls4ml/model/optimizer/passes/infer_precision.py +++ b/hls4ml/model/optimizer/passes/infer_precision.py @@ -296,15 +296,21 @@ def _infer_pooling_precision(self, node, types_to_infer): if 'accum_t' in types_to_infer: input_precision = node.get_input_variable().type.precision - input_width = input_precision.width - input_integers = input_precision.integer + pool_op = node.attributes['pool_op'].lower() - n_ops = node.get_attr('n_filt') * node.get_attr('pool_height', 1) * node.get_attr('pool_width') + width: int = input_precision.width + integer: int = input_precision.integer + signed: bool = input_precision.signed - accum_type = FixedPrecisionType( - width=int(np.ceil(input_width + np.log2(n_ops)) + 1), - integer=int(np.ceil(input_integers + np.log2(n_ops)) + 1), - ) + pool_size: int = node.get_attr('pool_height', 1) * node.get_attr('pool_width') + if pool_op == 'avg': + extra_bits = int(np.ceil(np.log2(pool_size))) + elif pool_op == 'max': + extra_bits = 0 + else: + raise ValueError(f'Unknown pooling operation: {pool_op}') + + accum_type = FixedPrecisionType(width=width + extra_bits * 2, integer=integer + extra_bits, signed=signed) node.types['accum_t'].name = node.name + '_accum_t' node.types['accum_t'].precision = accum_type From 665c904aee185e0235b96496a1165ca2f581e702 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Wed, 17 Apr 2024 11:07:24 -0700 Subject: [PATCH 042/103] remove typing --- hls4ml/model/optimizer/passes/infer_precision.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hls4ml/model/optimizer/passes/infer_precision.py b/hls4ml/model/optimizer/passes/infer_precision.py index a38f61914a..3bc3a64772 100644 --- a/hls4ml/model/optimizer/passes/infer_precision.py +++ b/hls4ml/model/optimizer/passes/infer_precision.py @@ -298,11 +298,11 @@ def _infer_pooling_precision(self, node, types_to_infer): input_precision = node.get_input_variable().type.precision pool_op = node.attributes['pool_op'].lower() - width: int = 
input_precision.width - integer: int = input_precision.integer - signed: bool = input_precision.signed + width = input_precision.width + integer = input_precision.integer + signed = input_precision.signed - pool_size: int = node.get_attr('pool_height', 1) * node.get_attr('pool_width') + pool_size = node.get_attr('pool_height', 1) * node.get_attr('pool_width') if pool_op == 'avg': extra_bits = int(np.ceil(np.log2(pool_size))) elif pool_op == 'max': From b366d2488f0bb78067a41f2c34152ab6b4c63ab4 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Thu, 18 Apr 2024 21:13:42 +0200 Subject: [PATCH 043/103] Fix avg pooling op check --- hls4ml/model/optimizer/passes/infer_precision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/model/optimizer/passes/infer_precision.py b/hls4ml/model/optimizer/passes/infer_precision.py index 3bc3a64772..51422c534e 100644 --- a/hls4ml/model/optimizer/passes/infer_precision.py +++ b/hls4ml/model/optimizer/passes/infer_precision.py @@ -303,7 +303,7 @@ def _infer_pooling_precision(self, node, types_to_infer): signed = input_precision.signed pool_size = node.get_attr('pool_height', 1) * node.get_attr('pool_width') - if pool_op == 'avg': + if pool_op == 'average': extra_bits = int(np.ceil(np.log2(pool_size))) elif pool_op == 'max': extra_bits = 0 From f0ca86597aaee34ecf2dba4c22a4b0a230c666fd Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Wed, 21 Feb 2024 21:39:00 +0100 Subject: [PATCH 044/103] Optimizer to remove expensive Transpose that serves as Flatten --- hls4ml/model/optimizer/__init__.py | 1 + .../passes/convert_to_channels_last.py | 47 ++++++++++++++++++- 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py index 5eab99db8a..1a150e727d 100644 --- a/hls4ml/model/optimizer/__init__.py +++ b/hls4ml/model/optimizer/__init__.py @@ -35,6 +35,7 @@ [ 'infer_precision_types', 'channels_last_converter', + 
'remove_transpose_before_flatten', 'fuse_bias_add', 'remove_useless_transpose', 'expand_layer_group', diff --git a/hls4ml/model/optimizer/passes/convert_to_channels_last.py b/hls4ml/model/optimizer/passes/convert_to_channels_last.py index 9c19711569..98ae549be5 100644 --- a/hls4ml/model/optimizer/passes/convert_to_channels_last.py +++ b/hls4ml/model/optimizer/passes/convert_to_channels_last.py @@ -2,8 +2,9 @@ # Based on https://github.com/fastmachinelearning/qonnx/blob/ # 12c96a3ded06beacab08e0f554e4ed014476c0aa/src/qonnx/transformation/channels_last.py -from hls4ml.model.layers import Concatenate, Input, Reshape +from hls4ml.model.layers import Concatenate, Dense, Input, Reshape, Transpose from hls4ml.model.optimizer import OptimizerPass +from hls4ml.model.types import WeightVariable class ChannelsLastConverter(OptimizerPass): @@ -133,3 +134,47 @@ def transform(self, model, node): node.channels_last_converted = True return True + + +class RemoveTransposeBeforeFlatten(OptimizerPass): + '''After the channels last conversion, model may have a sequence: Transpose -> Flatten -> Dense. 
+ In this case we can remove the expensive transpose and instead transpose the weights of the Dense layer.''' + + def match(self, node): + if node.model.config.get_config_value('IOType') != 'io_parallel': + return False + + if isinstance(node, Reshape): + input_node = node.get_input_node() + output_nodes = node.get_output_nodes() + if len(node.get_attr('target_shape')) == 1 and isinstance(input_node, Transpose) \ + and len(output_nodes) == 1 and isinstance(output_nodes[0], Dense): + return True + + return False + + def transform(self, model, node): + transpose_node = node.get_input_node() + dense_node = node.get_output_nodes()[0] + input_shape = transpose_node.get_output_variable().shape + + weight_var = dense_node.get_weights('weight') + # Transpose the weights to achieve the same computation with transposed input + weight_data_t = weight_var.data.reshape(*input_shape, -1).transpose(1, 2, 0, 3) + weight_data_t = weight_data_t.reshape(-1, weight_data_t.shape[-1]) + new_weight_var = WeightVariable( + var_name=weight_var.name, + type_name=weight_var.type.name, + precision=weight_var.type.precision, + quantizer=weight_var.quantizer, + data=weight_data_t, + index=dense_node.index + ) + + # Update the weight variable of the node + dense_node.set_attr('weight', new_weight_var) + + # Get rid of the Transpose node + model.remove_node(transpose_node) + + return True \ No newline at end of file From 1e416b5cdb3ee9cb4a75577347fab5820612c731 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Tue, 27 Feb 2024 21:09:04 +0100 Subject: [PATCH 045/103] Generalize removal of Transpose after flatten so it works on 1D as well --- .../passes/convert_to_channels_last.py | 31 ++++++++++++++----- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/hls4ml/model/optimizer/passes/convert_to_channels_last.py b/hls4ml/model/optimizer/passes/convert_to_channels_last.py index 98ae549be5..01e949086e 100644 --- a/hls4ml/model/optimizer/passes/convert_to_channels_last.py +++ 
b/hls4ml/model/optimizer/passes/convert_to_channels_last.py @@ -144,23 +144,38 @@ def match(self, node): if node.model.config.get_config_value('IOType') != 'io_parallel': return False + if hasattr(node, '_channels_last_keep_transpose') and node._channels_last_keep_transpose: + return False + if isinstance(node, Reshape): input_node = node.get_input_node() output_nodes = node.get_output_nodes() - if len(node.get_attr('target_shape')) == 1 and isinstance(input_node, Transpose) \ - and len(output_nodes) == 1 and isinstance(output_nodes[0], Dense): + if ( + len(node.get_attr('target_shape')) == 1 + and isinstance(input_node, Transpose) + and len(output_nodes) == 1 + and isinstance(output_nodes[0], Dense) + ): return True - + return False - + def transform(self, model, node): transpose_node = node.get_input_node() dense_node = node.get_output_nodes()[0] input_shape = transpose_node.get_output_variable().shape + if len(input_shape) == 2: # Usually after Conv1D + tran_axis = [1, 0, 2] + elif len(input_shape) == 3: # Usually after Conv2D + tran_axis = [1, 2, 0, 3] + else: # In this case we bail + node._channels_last_keep_transpose = True + return False + weight_var = dense_node.get_weights('weight') # Transpose the weights to achieve the same computation with transposed input - weight_data_t = weight_var.data.reshape(*input_shape, -1).transpose(1, 2, 0, 3) + weight_data_t = weight_var.data.reshape(*input_shape, -1).transpose(*tran_axis) weight_data_t = weight_data_t.reshape(-1, weight_data_t.shape[-1]) new_weight_var = WeightVariable( var_name=weight_var.name, @@ -168,13 +183,13 @@ def transform(self, model, node): precision=weight_var.type.precision, quantizer=weight_var.quantizer, data=weight_data_t, - index=dense_node.index + index=dense_node.index, ) - + # Update the weight variable of the node dense_node.set_attr('weight', new_weight_var) # Get rid of the Transpose node model.remove_node(transpose_node) - return True \ No newline at end of file + return True From 
2a5d8de2134c7779e617cb981b49108eb99fe45e Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Tue, 27 Feb 2024 21:09:35 +0100 Subject: [PATCH 046/103] Remove transpose of input if n_chan=1 --- hls4ml/model/optimizer/__init__.py | 3 +- .../model/optimizer/passes/transpose_opt.py | 40 +++++++++-- test/pytest/test_pytorch_api.py | 69 +++++++++++++++++++ 3 files changed, 104 insertions(+), 8 deletions(-) diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py index 1a150e727d..3aa247d03f 100644 --- a/hls4ml/model/optimizer/__init__.py +++ b/hls4ml/model/optimizer/__init__.py @@ -36,8 +36,9 @@ 'infer_precision_types', 'channels_last_converter', 'remove_transpose_before_flatten', + 'remove_nop_transpose', + 'remove_single_channel_transpose', 'fuse_bias_add', - 'remove_useless_transpose', 'expand_layer_group', 'output_rounding_saturation_mode', 'qkeras_factorize_alpha', diff --git a/hls4ml/model/optimizer/passes/transpose_opt.py b/hls4ml/model/optimizer/passes/transpose_opt.py index a5bff8a703..1c0328fb34 100644 --- a/hls4ml/model/optimizer/passes/transpose_opt.py +++ b/hls4ml/model/optimizer/passes/transpose_opt.py @@ -1,21 +1,47 @@ -from hls4ml.model.layers import Transpose +from hls4ml.model.layers import Input, Transpose from hls4ml.model.optimizer import OptimizerPass -class RemoveUselessTranspose(OptimizerPass): +class RemoveNopTranspose(OptimizerPass): + """ + Remove a transpose layer if it doesn't do anything to a 1D array. i.e, 1D input and perm = [0] + """ + def match(self, node): is_match = isinstance(node, Transpose) and node.get_attr('perm') == [0] # Useless transpose return is_match def transform(self, model, node): - """ - Remove a transpose layer if it doesn't do anything. i.e 1D input and perm = [0] - """ - print(f"Unnessary {node.name} in the model, optimizing ...") + print(f'Unnecessary transpose node ({node.name}) detected, optimizing ...') if not node.get_output_nodes(): - print(f"WARNING: {node.name} is the output layer! 
No rewiring performed.") + print(f'WARNING: {node.name} is the output layer! No rewiring performed.') model.remove_node(node, rewire=False) # Don't rewire if there is no output layer else: model.remove_node(node, rewire=True) return True + + +class RemoveSingleChannelTranspose(OptimizerPass): + """ + Remove transpose of inputs if the number of channels is 1 as for io_parallel this doesn't affect the array + representation used + """ + + def match(self, node): + if node.model.config.get_config_value('IOType') != 'io_parallel': + return False + + return ( + isinstance(node, Transpose) + and isinstance(node.get_input_node(), Input) + and node.get_input_variable().shape[0] == 1 + ) + + def transform(self, model, node): + # Adjust the input shape and remove the Transpose node + input_var = node.get_input_variable() + input_var.shape.append(input_var.shape.pop(0)) + model.remove_node(node) + + return True diff --git a/test/pytest/test_pytorch_api.py b/test/pytest/test_pytorch_api.py index f9bc175ca2..f5985d0dab 100644 --- a/test/pytest/test_pytorch_api.py +++ b/test/pytest/test_pytorch_api.py @@ -740,3 +740,72 @@ def test_skipped_layers(backend, io_type): hls_prediction = hls_model.predict(hls_input).flatten() np.testing.assert_allclose(hls_prediction, pytorch_prediction, rtol=0, atol=5e-2) + + +@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('io_type', ['io_parallel']) # Only io_parallel for now +@pytest.mark.parametrize('tensor_rank', [2, 3]) +def test_remove_transpose(backend, io_type, tensor_rank): + class TestModel(nn.Module): + def __init__(self, tensor_rank): + super().__init__() + if tensor_rank == 2: + self.conv1 = nn.Conv1d(in_channels=1, out_channels=4, kernel_size=3, bias=False) + self.relu1 = nn.ReLU() + self.flatten = nn.Flatten() + self.fc1 = nn.Linear(in_features=4 * 6, out_features=5, bias=False) + self.relu2 = nn.ReLU() + else: + self.conv1 = nn.Conv2d(in_channels=1, out_channels=4, kernel_size=3, bias=False) + 
self.relu1 = nn.ReLU() + self.flatten = nn.Flatten() + self.fc1 = nn.Linear(in_features=4 * 6 * 6, out_features=5, bias=False) + self.relu2 = nn.ReLU() + + def forward(self, x): + # In the hls4ml model, there should be a Transpose node on the input tensor before conv1 + x = self.conv1(x) + x = self.relu1(x) + x = self.flatten(x) # This should result in a Transpose node that we aim to remove + x = self.fc1(x) + x = self.relu2(x) + return x + + model = TestModel(tensor_rank=tensor_rank) + if tensor_rank == 2: + input_shape = (1, 8) + input_tensor = torch.randn(10, 1, 8) + hls_input = np.ascontiguousarray(torch.permute(input_tensor, (0, 2, 1)).detach().numpy()) + else: + input_shape = (1, 8, 8) + input_tensor = torch.randn(10, 1, 8, 8) + hls_input = np.ascontiguousarray(torch.permute(input_tensor, (0, 2, 3, 1)).detach().numpy()) + + batch_input_shape = (None,) + input_shape + config = config_from_pytorch_model( + model, + default_precision='ap_fixed<32,16>', + inputs_channel_last=False, # Crucial for testing if the first Transpose was removed + transpose_outputs=False, + ) + output_dir = str(test_root_path / f'hls4mlprj_pytorch_api_transpose_nop_{tensor_rank}d_{backend}_{io_type}') + hls_model = convert_from_pytorch_model( + model, + batch_input_shape, + hls_config=config, + output_dir=output_dir, + io_type=io_type, + backend=backend, + ) + + hls_model.compile() + + # Test optimizers removed the two Transpose layers + transpose_layers = [layer for layer in list(hls_model.get_layers()) if layer.class_name == 'Transpose'] + assert len(transpose_layers) == 0 + + # Test predictions match + pytorch_prediction = model(input_tensor).detach().numpy().flatten() + hls_prediction = hls_model.predict(hls_input).flatten() + + np.testing.assert_allclose(hls_prediction, pytorch_prediction, rtol=0, atol=5e-2) From 3969523dd45e624a2e56eabdc4724a6fc90fc9ad Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Mon, 13 May 2024 19:09:47 +0200 Subject: [PATCH 047/103] SepConv1d/2d for 
io_parallel w/ Latency strategy --- hls4ml/backends/fpga/passes/codegen.py | 80 +++++++++++++++-- .../vivado/passes/convolution_templates.py | 24 +++-- .../vivado/nnet_utils/nnet_sepconv1d.h | 46 ++++++++++ .../nnet_utils/nnet_sepconv1d_latency.h | 86 ++++++++++++++++++ .../vivado/nnet_utils/nnet_sepconv2d.h | 51 +++++++++++ .../nnet_utils/nnet_sepconv2d_latency.h | 87 +++++++++++++++++++ 6 files changed, 360 insertions(+), 14 deletions(-) create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d.h create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_latency.h create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d.h create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_latency.h diff --git a/hls4ml/backends/fpga/passes/codegen.py b/hls4ml/backends/fpga/passes/codegen.py index f1f1080996..b8f367137b 100644 --- a/hls4ml/backends/fpga/passes/codegen.py +++ b/hls4ml/backends/fpga/passes/codegen.py @@ -1,4 +1,4 @@ -from hls4ml.model.layers import Conv1D, Conv2D +from hls4ml.model.layers import Conv1D, Conv2D, SeparableConv1D, SeparableConv2D from hls4ml.model.optimizer import OptimizerPass from hls4ml.model.types import Source @@ -7,16 +7,27 @@ class GenerateConvIm2col(OptimizerPass): '''Generates tcode for im2col step of 1D/2d convolution''' def match(self, node): - return isinstance(node, (Conv1D, Conv2D)) and node.model.config.get_config_value('IOType') == 'io_parallel' + return ( + isinstance(node, (Conv1D, Conv2D, SeparableConv1D, SeparableConv2D)) + and node.model.config.get_config_value('IOType') == 'io_parallel' + ) def transform(self, model, node): - node_class = node.__class__.__name__ - if '1D' in node_class: - self._generate_im2col_1d(node) - elif '2D' in node_class: - self._generate_im2col_2d(node) + node_class = node.class_name + if 'Separable' in node_class: + if '1D' in node_class: + self._generate_separable_im2col_1d(node) + elif '2D' in node_class: + self._generate_separable_im2col_2d(node) + 
else: + raise Exception(f'Cannot generate instructions for node {node.name} ({node_class})') else: - raise Exception(f'Cannot generate instructions for node {node.name} ({node_class})') + if '1D' in node_class: + self._generate_im2col_1d(node) + elif '2D' in node_class: + self._generate_im2col_2d(node) + else: + raise Exception(f'Cannot generate instructions for node {node.name} ({node_class})') def _generate_im2col_1d(self, node): code_str = node.model.config.backend.generate_conv1d_line_buffer_fn( @@ -49,3 +60,56 @@ def _generate_im2col_2d(self, node): ) node.set_attr('line_buffer_codegen', Source(code_str)) + + def _generate_separable_im2col_1d(self, node): + dw_code_str = node.model.config.backend.generate_conv1d_line_buffer_fn( + str(node.get_attr('index')) + '_dw', + node.get_attr('n_partitions'), + node.get_input_variable().shape[0], + node.get_input_variable().shape[1], + kernel=node.get_attr('filt_width'), + stride=node.get_attr('stride_width'), + pad=(node.get_attr('pad_left'), node.get_attr('pad_right')), + ) + + node.set_attr('dw_line_buffer_codegen', Source(dw_code_str)) + + pw_code_str = node.model.config.backend.generate_conv1d_line_buffer_fn( + str(node.get_attr('index')) + '_pw', + node.get_attr('n_partitions'), + node.get_output_variable().shape[0], + node.get_output_variable().shape[1], + kernel=1, + ) + + node.set_attr('pw_line_buffer_codegen', Source(pw_code_str)) + + def _generate_separable_im2col_2d(self, node): + dw_code_str = node.model.config.backend.generate_conv2d_line_buffer_fn( + str(node.get_attr('index')) + '_dw', + node.get_attr('n_partitions'), + node.get_input_variable().shape[0], + node.get_input_variable().shape[1], + node.get_input_variable().shape[2], + kernel=(node.get_attr('filt_height'), node.get_attr('filt_width')), + stride=(node.get_attr('stride_height'), node.get_attr('stride_width')), + pad=( + node.get_attr('pad_top'), + node.get_attr('pad_bottom'), + node.get_attr('pad_left'), + node.get_attr('pad_right'), + ), + ) + 
+ node.set_attr('dw_line_buffer_codegen', Source(dw_code_str)) + + pw_code_str = node.model.config.backend.generate_conv2d_line_buffer_fn( + str(node.get_attr('index')) + '_pw', + node.get_attr('n_partitions'), + node.get_output_variable().shape[0], + node.get_output_variable().shape[1], + node.get_input_variable().shape[2], + kernel=(1, 1), + ) + + node.set_attr('pw_line_buffer_codegen', Source(pw_code_str)) diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py index c990f08be0..43a8b4fb7d 100644 --- a/hls4ml/backends/vivado/passes/convolution_templates.py +++ b/hls4ml/backends/vivado/passes/convolution_templates.py @@ -254,8 +254,8 @@ def __init__(self): '{input}, {output}, {d}, {p}, {z}, {b});' ) -sepconv1d_include_list = ['nnet_utils/nnet_conv1d.h', 'nnet_utils/nnet_sepconv1d_stream.h'] -sepconv2d_include_list = ['nnet_utils/nnet_conv2d.h', 'nnet_utils/nnet_sepconv2d_stream.h'] +sepconv1d_include_list = ['nnet_utils/nnet_conv1d.h', 'nnet_utils/nnet_sepconv1d.h', 'nnet_utils/nnet_sepconv1d_stream.h'] +sepconv2d_include_list = ['nnet_utils/nnet_conv2d.h', 'nnet_utils/nnet_sepconv2d.h', 'nnet_utils/nnet_sepconv2d_stream.h'] class SeparableConv1DConfigTemplate(LayerConfigTemplate): @@ -286,7 +286,10 @@ def format(self, node): params['index'] = str(node.index) + '_depthwise' params['weight_t'] = node.get_weights('depthwise').type params['bias_t'] = node.get_weights('zero_bias').type - params['fill_fn'] = 'FillConv1DBuffer' + if node.model.config.get_config_value('IOType') == 'io_parallel': + params['fill_fn'] = f'fill_buffer_{node.index}_dw' + else: + params['fill_fn'] = 'FillConv1DBuffer' if node.get_attr('unscaled'): params['scale_index_type'] = 'scale_index_unscaled' @@ -323,7 +326,10 @@ def format(self, node): params['weight_t'] = node.get_weights('pointwise').type params['min_width'] = params['in_width'] params['instructions'] = '0' - params['fill_fn'] = 'FillConv1DBuffer' + if 
node.model.config.get_config_value('IOType') == 'io_parallel': + params['fill_fn'] = f'fill_buffer_{node.index}_dw' + else: + params['fill_fn'] = 'FillConv1DBuffer' if node.get_attr('unscaled'): params['scale_index_type'] = 'scale_index_unscaled' @@ -402,7 +408,10 @@ def format(self, node): params['nzeros'] = node.get_weights('depthwise').nzeros params['index'] = str(node.index) + '_depthwise' params['weight_t'] = node.get_weights('depthwise').type - params['fill_fn'] = 'FillConv2DBuffer' + if node.model.config.get_config_value('IOType') == 'io_parallel': + params['fill_fn'] = f'fill_buffer_{node.index}_dw' + else: + params['fill_fn'] = 'FillConv2DBuffer' if node.get_attr('unscaled_h'): params['scale_index_height_type'] = 'scale_index_unscaled' @@ -447,7 +456,10 @@ def format(self, node): params['min_height'] = params['in_height'] params['min_width'] = params['in_width'] params['instructions'] = '0' - params['fill_fn'] = 'FillConv2DBuffer' + if node.model.config.get_config_value('IOType') == 'io_parallel': + params['fill_fn'] = f'fill_buffer_{node.index}_pw' + else: + params['fill_fn'] = 'FillConv2DBuffer' if node.get_attr('unscaled_h'): params['scale_index_height_type'] = 'scale_index_unscaled' diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d.h new file mode 100644 index 0000000000..d804af260c --- /dev/null +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d.h @@ -0,0 +1,46 @@ +#ifndef NNET_SEPARABLE_CONV1D_H_ +#define NNET_SEPARABLE_CONV1D_H_ + +#include "nnet_common.h" +#include "nnet_conv1d.h" +#include "nnet_sepconv1d_latency.h" +//#include "nnet_sepconv1d_resource.h" +#include + +namespace nnet { + +template +void depthwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + #pragma HLS 
INLINE recursive + if (CONFIG_T::strategy == nnet::latency) { + depthwise_conv_1d_latency_cl(data, res, weights, biases); + } else { + assert("Resource strategy for DepthwiseConv1D is not supported." && false); + } +} + +template +void separable_conv_1d_cl(data_T data[CONFIG_T::depthwise_config::in_width * CONFIG_T::depthwise_config::n_chan], + res_T res[CONFIG_T::pointwise_config::out_width * CONFIG_T::pointwise_config::n_filt], + typename CONFIG_T::depthwise_config::weight_t + depthwise_weights[CONFIG_T::depthwise_config::filt_width * CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::weight_t + pointwise_weights[CONFIG_T::pointwise_config::n_chan * CONFIG_T::pointwise_config::n_filt], + typename CONFIG_T::depthwise_config::bias_t depthwise_biases[CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::bias_t pointwise_biases[CONFIG_T::pointwise_config::n_filt]) { + #pragma HLS INLINE recursive + + dw_res_T depthwise_res[CONFIG_T::depthwise_config::out_width * CONFIG_T::depthwise_config::n_filt]; + + depthwise_conv_1d_cl(data, depthwise_res, depthwise_weights, + depthwise_biases); + pointwise_conv_1d_cl(depthwise_res, res, pointwise_weights, + pointwise_biases); +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_latency.h new file mode 100644 index 0000000000..c9fe86ea93 --- /dev/null +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_latency.h @@ -0,0 +1,86 @@ +#ifndef NNET_SEPARABLE_CONV1D_LATENCY_H_ +#define NNET_SEPARABLE_CONV1D_LATENCY_H_ + +#include "nnet_common.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +template +void depthwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+ + constexpr unsigned mult_n_in = CONFIG_T::filt_width * CONFIG_T::n_chan; + constexpr unsigned mult_n_acc = CONFIG_T::filt_width; + constexpr unsigned mult_n_out = CONFIG_T::n_filt; + + data_T data_buf[CONFIG_T::n_pixels][mult_n_in]; + #pragma HLS ARRAY_PARTITION variable=data_buf complete dim=0 + + typename CONFIG_T::accum_t mult[mult_n_in]; + #pragma HLS ARRAY_PARTITION variable=mult complete + + typename CONFIG_T::accum_t acc[mult_n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete + + #pragma HLS ARRAY_PARTITION variable=weights complete + #pragma HLS ARRAY_PARTITION variable=biases complete + + // Limit multipliers to control parallelization + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit + +PartitionLoop: + for (int i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor rewind + + CONFIG_T::template fill_buffer::fill_buffer(data, data_buf, i_part); + + PixelLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + + data_T cache; + + // Do the matrix-multiply + Product: + for (int i_in = 0; i_in < mult_n_in; i_in++) { + #pragma HLS UNROLL + cache = data_buf[i_pxl][i_in]; + mult[i_in] = + CONFIG_T::mult_config::template product::product( + cache, weights[i_in]); + } + + // Initialize accumulator with input biases + ResetAccum: + for (int i_acc = 0; i_acc < mult_n_out; i_acc++) { + #pragma HLS UNROLL + acc[i_acc] = (typename CONFIG_T::accum_t)biases[i_acc]; + } + + // Accumulate multiplication result + Accum1: + for (int i_in = 0; i_in < mult_n_acc; i_in++) { + #pragma HLS UNROLL + Accum2: + for (int i_out = 0; i_out < mult_n_out; i_out++) { + #pragma HLS UNROLL + acc[i_out] += mult[i_in * mult_n_out + i_out]; + } + } + + // Cast to "res_t" type + Result: + for (int i_res = 0; i_res < mult_n_out; i_res++) { + #pragma HLS UNROLL + *(res++) = cast(acc[i_res]); + } + } + } +} + +} // namespace nnet +#endif diff --git 
a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d.h new file mode 100644 index 0000000000..9ec638375d --- /dev/null +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d.h @@ -0,0 +1,51 @@ +#ifndef NNET_SEPARABLE_CONV2D_H_ +#define NNET_SEPARABLE_CONV2D_H_ + +#include "nnet_common.h" +#include "nnet_conv2d.h" +#include "nnet_sepconv2d_latency.h" +//#include "nnet_sepconv2d_resource.h" +#include + +namespace nnet { + +template +void depthwise_conv_2d_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + #pragma HLS INLINE recursive + if (CONFIG_T::strategy == nnet::latency) { + depthwise_conv_2d_latency_cl(data, res, weights, biases); + } else { + assert("Resource strategy for DepthwiseConv2D is not supported." 
&& false); + } +} + +template +void separable_conv_2d_cl(data_T data[CONFIG_T::depthwise_config::in_height * CONFIG_T::depthwise_config::in_width * + CONFIG_T::depthwise_config::n_chan], + res_T res[CONFIG_T::pointwise_config::out_height * CONFIG_T::pointwise_config::out_width * + CONFIG_T::pointwise_config::n_filt], + typename CONFIG_T::depthwise_config::weight_t + depthwise_weights[CONFIG_T::depthwise_config::filt_height * + CONFIG_T::depthwise_config::filt_width * CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::weight_t + pointwise_weights[CONFIG_T::pointwise_config::n_chan * CONFIG_T::pointwise_config::n_filt], + typename CONFIG_T::depthwise_config::bias_t depthwise_biases[CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::bias_t pointwise_biases[CONFIG_T::pointwise_config::n_filt]) { + #pragma HLS INLINE recursive + + dw_res_T depthwise_res[CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width * + CONFIG_T::depthwise_config::n_filt]; + + depthwise_conv_2d_cl(data, depthwise_res, depthwise_weights, + depthwise_biases); + pointwise_conv_2d_cl(depthwise_res, res, pointwise_weights, + pointwise_biases); +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_latency.h new file mode 100644 index 0000000000..161cc2c834 --- /dev/null +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_latency.h @@ -0,0 +1,87 @@ +#ifndef NNET_SEPARABLE_CONV2D_LATENCY_H_ +#define NNET_SEPARABLE_CONV2D_LATENCY_H_ + +#include "nnet_common.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +template +void depthwise_conv_2d_latency_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], + typename 
CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + + constexpr unsigned mult_n_in = CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan; + constexpr unsigned mult_n_acc = CONFIG_T::filt_height * CONFIG_T::filt_width; + constexpr unsigned mult_n_out = CONFIG_T::n_filt; + + data_T data_buf[CONFIG_T::n_pixels][mult_n_in]; + #pragma HLS ARRAY_PARTITION variable=data_buf complete dim=0 + + typename CONFIG_T::accum_t mult[mult_n_in]; + #pragma HLS ARRAY_PARTITION variable=mult complete + + typename CONFIG_T::accum_t acc[mult_n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete + + #pragma HLS ARRAY_PARTITION variable=weights complete + #pragma HLS ARRAY_PARTITION variable=biases complete + + // Limit multipliers to control parallelization + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit + +PartitionLoop: + for (int i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor rewind + + CONFIG_T::template fill_buffer::fill_buffer(data, data_buf, i_part); + + PixelLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + + data_T cache; + + // Do the matrix-multiply + Product: + for (int i_in = 0; i_in < mult_n_in; i_in++) { + #pragma HLS UNROLL + cache = data_buf[i_pxl][i_in]; + mult[i_in] = + CONFIG_T::mult_config::template product::product( + cache, weights[i_in]); + } + + // Initialize accumulator with input biases + ResetAccum: + for (int i_acc = 0; i_acc < mult_n_out; i_acc++) { + #pragma HLS UNROLL + acc[i_acc] = (typename CONFIG_T::accum_t)biases[i_acc]; + } + + // Accumulate multiplication result + Accum1: + for (int i_in = 0; i_in < mult_n_acc; i_in++) { + #pragma HLS UNROLL + Accum2: + for (int i_out = 0; i_out < mult_n_out; i_out++) { + #pragma HLS UNROLL + acc[i_out] += mult[i_in * mult_n_out + i_out]; + } + } + + // Cast to "res_t" type + Result: + for (int i_res = 0; i_res < mult_n_out; i_res++) { + #pragma HLS UNROLL 
+ *(res++) = cast(acc[i_res]); + } + } + } +} + +} // namespace nnet +#endif From 52252ca8cc5898b83fa0a7fc124bff18ca384ca8 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Mon, 13 May 2024 20:29:59 +0200 Subject: [PATCH 048/103] Cosmetic parameter config fixes --- hls4ml/backends/fpga/fpga_backend.py | 4 ++-- hls4ml/backends/fpga/passes/codegen.py | 2 +- hls4ml/backends/vivado/passes/convolution_templates.py | 5 ++++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/hls4ml/backends/fpga/fpga_backend.py b/hls4ml/backends/fpga/fpga_backend.py index 8d0ed64aad..87309ff4e5 100644 --- a/hls4ml/backends/fpga/fpga_backend.py +++ b/hls4ml/backends/fpga/fpga_backend.py @@ -685,7 +685,7 @@ def generate_conv1d_line_buffer_fn(self, layer_idx, n_partitions, in_W, in_C, ke The HLS compiler produces suboptimal designs for a im2col algorithm implementation, so a trick we use is to generate a resulting a result of im2col transformation explicitly, instead of relying on loops. Since - the result depends on the paraleters of the convolution layer (the input size, the kernel size, stride etc), + the result depends on the parameters of the convolution layer (the input size, the kernel size, stride etc), we need to do this for every convolution layer. Args: @@ -782,7 +782,7 @@ def generate_conv2d_line_buffer_fn( The HLS compiler produces suboptimal designs for a im2col algorithm implementation, so a trick we use is to generate a resulting a result of im2col transformation explicitly, instead of relying on loops. Since - the result depends on the paraleters of the convolution layer (the input size, the kernel size, stride etc), + the result depends on the parameters of the convolution layer (the input size, the kernel size, stride etc), we need to do this for every convolution layer. 
Args: diff --git a/hls4ml/backends/fpga/passes/codegen.py b/hls4ml/backends/fpga/passes/codegen.py index b8f367137b..c951a02b80 100644 --- a/hls4ml/backends/fpga/passes/codegen.py +++ b/hls4ml/backends/fpga/passes/codegen.py @@ -78,7 +78,7 @@ def _generate_separable_im2col_1d(self, node): str(node.get_attr('index')) + '_pw', node.get_attr('n_partitions'), node.get_output_variable().shape[0], - node.get_output_variable().shape[1], + node.get_input_variable().shape[1], kernel=1, ) diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py index 43a8b4fb7d..037f2d5eb2 100644 --- a/hls4ml/backends/vivado/passes/convolution_templates.py +++ b/hls4ml/backends/vivado/passes/convolution_templates.py @@ -320,6 +320,7 @@ def format(self, node): params['filt_width'] = 1 params['stride_width'] = 1 + params['pad_left'] = params['pad_right'] = 0 params['dilation'] = node.get_attr('dilation', 1) params['nzeros'] = node.get_weights('pointwise').nzeros params['index'] = str(node.index) + '_pointwise' @@ -327,7 +328,7 @@ def format(self, node): params['min_width'] = params['in_width'] params['instructions'] = '0' if node.model.config.get_config_value('IOType') == 'io_parallel': - params['fill_fn'] = f'fill_buffer_{node.index}_dw' + params['fill_fn'] = f'fill_buffer_{node.index}_pw' else: params['fill_fn'] = 'FillConv1DBuffer' @@ -449,6 +450,8 @@ def format(self, node): params['filt_height'] = params['filt_width'] = 1 params['stride_height'] = params['stride_width'] = 1 + params['pad_left'] = params['pad_right'] = 0 + params['pad_top'] = params['pad_bottom'] = 0 params['dilation'] = node.get_attr('dilation', 1) params['nzeros'] = node.get_weights('pointwise').nzeros params['index'] = str(node.index) + '_pointwise' From be56b9347873600296d65aecb8a0ca115e212871 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Mon, 13 May 2024 20:32:07 +0200 Subject: [PATCH 049/103] Tests for SepConv io_parallel --- 
test/pytest/test_sepconv1d.py | 4 ++-- test/pytest/test_sepconv2d.py | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/test/pytest/test_sepconv1d.py b/test/pytest/test_sepconv1d.py index 64b72db48a..e64bd06a76 100644 --- a/test/pytest/test_sepconv1d.py +++ b/test/pytest/test_sepconv1d.py @@ -12,9 +12,9 @@ keras_conv1d = [SeparableConv1D] padds_options = ['same', 'valid'] chans_options = ['channels_last'] -io_type_options = ['io_stream'] +io_type_options = ['io_parallel', 'io_stream'] strides_options = [(1), (2)] -kernel_options = [(1), (3)] +kernel_options = [(2), (3)] bias_options = [False] diff --git a/test/pytest/test_sepconv2d.py b/test/pytest/test_sepconv2d.py index 2fa2d94afe..da87488aa2 100644 --- a/test/pytest/test_sepconv2d.py +++ b/test/pytest/test_sepconv2d.py @@ -12,19 +12,19 @@ keras_conv2d = [SeparableConv2D] padds_options = ['same', 'valid'] chans_options = ['channels_last'] -io_type_options = ['io_stream'] +io_type_options = ['io_parallel', 'io_stream'] strides_options = [(1, 1), (2, 2)] kernel_options = [(2, 2), (3, 3)] bias_options = [False] -@pytest.mark.parametrize("conv2d", keras_conv2d) -@pytest.mark.parametrize("chans", chans_options) -@pytest.mark.parametrize("padds", padds_options) -@pytest.mark.parametrize("strides", strides_options) -@pytest.mark.parametrize("kernels", kernel_options) -@pytest.mark.parametrize("bias", bias_options) -@pytest.mark.parametrize("io_type", io_type_options) +@pytest.mark.parametrize('conv2d', keras_conv2d) +@pytest.mark.parametrize('chans', chans_options) +@pytest.mark.parametrize('padds', padds_options) +@pytest.mark.parametrize('strides', strides_options) +@pytest.mark.parametrize('kernels', kernel_options) +@pytest.mark.parametrize('bias', bias_options) +@pytest.mark.parametrize('io_type', io_type_options) @pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Catapult']) def test_sepconv2d(conv2d, chans, padds, strides, kernels, bias, io_type, backend): model = 
tf.keras.models.Sequential() From b0085a11d38ab8f1ca8ec239e4b3b20128e4f64f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 29 Apr 2024 20:11:49 +0000 Subject: [PATCH 050/103] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/psf/black: 24.4.0 → 24.4.2](https://github.com/psf/black/compare/24.4.0...24.4.2) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 275b349422..6db9312eb3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ exclude: (^hls4ml\/templates\/(vivado|quartus)\/(ap_types|ac_types)\/|^test/pyte repos: - repo: https://github.com/psf/black - rev: 24.4.0 + rev: 24.4.2 hooks: - id: black language_version: python3 From 44bc8f33f350c652ba2ae60edec7ad96f5d26d40 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 26 Apr 2024 18:23:10 -0500 Subject: [PATCH 051/103] Update pytest docker image to 0.5.4 --- test/pytest/ci-template.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/pytest/ci-template.yml b/test/pytest/ci-template.yml index 50e9f799f6..fa4e7c9d8a 100644 --- a/test/pytest/ci-template.yml +++ b/test/pytest/ci-template.yml @@ -1,10 +1,11 @@ .pytest: stage: test - image: gitlab-registry.cern.ch/fastmachinelearning/hls4ml-testing:0.4.base + image: gitlab-registry.cern.ch/fastmachinelearning/hls4ml-testing:0.5.4.base tags: - k8s-default before_script: - source ~/.bashrc + - git config --global --add safe.directory /builds/fastmachinelearning/hls4ml - git submodule update --init --recursive hls4ml/templates/catapult/ - if [ $EXAMPLEMODEL == 1 ]; then git submodule update --init example-models; fi - conda activate hls4ml-testing From a7826e07425ab2ec703cbe1150485c99c9837198 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: 
Wed, 1 May 2024 13:58:43 -0500 Subject: [PATCH 052/103] bump to 0.5.5 --- test/pytest/ci-template.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/pytest/ci-template.yml b/test/pytest/ci-template.yml index fa4e7c9d8a..afaf90da4d 100644 --- a/test/pytest/ci-template.yml +++ b/test/pytest/ci-template.yml @@ -1,6 +1,6 @@ .pytest: stage: test - image: gitlab-registry.cern.ch/fastmachinelearning/hls4ml-testing:0.5.4.base + image: gitlab-registry.cern.ch/fastmachinelearning/hls4ml-testing:0.5.5.base tags: - k8s-default before_script: From 41ab6af33228f4c8a3ce3ea33dabf20811dbab96 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Thu, 25 Apr 2024 16:03:05 -0500 Subject: [PATCH 053/103] fix pre-commit warning --- test/pytest/test_weight_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/pytest/test_weight_writer.py b/test/pytest/test_weight_writer.py index 168b781a67..431f10970b 100644 --- a/test/pytest/test_weight_writer.py +++ b/test/pytest/test_weight_writer.py @@ -29,5 +29,5 @@ def test_weight_writer(k, i, f): print(w_paths[0]) assert len(w_paths) == 1 w_loaded = np.loadtxt(w_paths[0], delimiter=',').reshape(1, 1) - print(f'{w[0,0]:.14}', f'{w_loaded[0,0]:.14}') + print(f'{w[0, 0]:.14}', f'{w_loaded[0, 0]:.14}') assert np.all(w == w_loaded) From c0f8d9f3bd0be0ee340168c483c9995c705752fb Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 26 Apr 2024 18:35:04 -0500 Subject: [PATCH 054/103] change writing of obsolete ".h5" to ".keras" files --- hls4ml/writer/catapult_writer.py | 2 +- hls4ml/writer/quartus_writer.py | 2 +- hls4ml/writer/vivado_writer.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/hls4ml/writer/catapult_writer.py b/hls4ml/writer/catapult_writer.py index 48d44e4a59..af3f28a59e 100755 --- a/hls4ml/writer/catapult_writer.py +++ b/hls4ml/writer/catapult_writer.py @@ -884,7 +884,7 @@ def write_yml(self, model): """ def keras_model_representer(dumper, keras_model): - 
model_path = model.config.get_output_dir() + '/keras_model.h5' + model_path = model.config.get_output_dir() + '/keras_model.keras' keras_model.save(model_path) return dumper.represent_scalar('!keras_model', model_path) diff --git a/hls4ml/writer/quartus_writer.py b/hls4ml/writer/quartus_writer.py index f8f3d76188..8c0217f924 100644 --- a/hls4ml/writer/quartus_writer.py +++ b/hls4ml/writer/quartus_writer.py @@ -1322,7 +1322,7 @@ def write_yml(self, model): """ def keras_model_representer(dumper, keras_model): - model_path = model.config.get_output_dir() + '/keras_model.h5' + model_path = model.config.get_output_dir() + '/keras_model.keras' keras_model.save(model_path) return dumper.represent_scalar('!keras_model', model_path) diff --git a/hls4ml/writer/vivado_writer.py b/hls4ml/writer/vivado_writer.py index 412bb8d667..38b9de15f6 100644 --- a/hls4ml/writer/vivado_writer.py +++ b/hls4ml/writer/vivado_writer.py @@ -686,7 +686,7 @@ def write_yml(self, model): """ def keras_model_representer(dumper, keras_model): - model_path = model.config.get_output_dir() + '/keras_model.h5' + model_path = model.config.get_output_dir() + '/keras_model.keras' keras_model.save(model_path) return dumper.represent_scalar('!keras_model', model_path) From bcfd6858d0ef3d1b575712f2acac6559274654fe Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Wed, 1 May 2024 20:21:44 +0200 Subject: [PATCH 055/103] Fix extension test for Keras v3 --- docs/advanced/extension.rst | 4 ++++ test/pytest/test_extensions.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/docs/advanced/extension.rst b/docs/advanced/extension.rst index ad86051d82..b6acc4ac6a 100644 --- a/docs/advanced/extension.rst +++ b/docs/advanced/extension.rst @@ -35,6 +35,10 @@ For concreteness, let's say our custom layer ``KReverse`` is implemented in Kera def call(self, inputs): return tf.reverse(inputs, axis=[-1]) + def get_config(self): + return super().get_config() + +Make sure you define a ``get_config()`` method for your 
custom layer as this is needed for correct parsing. We can define the equivalent layer in hls4ml ``HReverse``, which inherits from ``hls4ml.model.layers.Layer``. .. code-block:: Python diff --git a/test/pytest/test_extensions.py b/test/pytest/test_extensions.py index 0820a58c7c..bf5c7e2981 100644 --- a/test/pytest/test_extensions.py +++ b/test/pytest/test_extensions.py @@ -19,6 +19,10 @@ def __init__(self): def call(self, inputs): return tf.reverse(inputs, axis=[-1]) + def get_config(self): + # Breaks serialization and parsing in hls4ml if not defined + return super().get_config() + # hls4ml layer implementation class HReverse(hls4ml.model.layers.Layer): From 8c0959567e92633bdfc95f71bbee0d8941d8eb29 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Wed, 15 May 2024 22:15:28 +0200 Subject: [PATCH 056/103] Support ParallelizationFactor in SepConv1D/2D --- hls4ml/backends/vivado/vivado_backend.py | 53 ++++++++++++++++++++---- 1 file changed, 44 insertions(+), 9 deletions(-) diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 47974e10c3..0b24393134 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -294,9 +294,20 @@ def init_sepconv1d(self, layer): else: layer.set_attr('strategy', 'latency') - layer.set_attr( - 'n_partitions', 1 - ) # TODO Once we have SeparableConv implementation for io_parallel this should be set properly + out_width = layer.get_output_variable().shape[0] + chosen_pf = layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', 1) + valid_pf = self.get_valid_conv_partition_splits(1, out_width) + if chosen_pf not in valid_pf: + closest_pf = self.get_closest_reuse_factor(valid_pf, chosen_pf) + valid_pf_str = ','.join(map(str, valid_pf)) + print( + f'WARNING: Invalid ParallelizationFactor={chosen_pf} in layer "{layer.name}".' + f'Using ParallelizationFactor={closest_pf} instead. Valid ParallelizationFactor(s): {valid_pf_str}.' 
+ ) + else: + closest_pf = chosen_pf + layer.set_attr('n_partitions', out_width // closest_pf) + layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) # Set the output type of the depthwise phase @@ -349,9 +360,21 @@ def init_sepconv2d(self, layer): else: layer.set_attr('strategy', 'latency') - layer.set_attr( - 'n_partitions', 1 - ) # TODO Once we have SeparableConv implementation for io_parallel this should be set properly + out_height = layer.get_output_variable().shape[0] + out_width = layer.get_output_variable().shape[1] + chosen_pf = layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', 1) + valid_pf = self.get_valid_conv_partition_splits(out_height, out_width) + if chosen_pf not in valid_pf: + closest_pf = self.get_closest_reuse_factor(valid_pf, chosen_pf) + valid_pf_str = ','.join(map(str, valid_pf)) + print( + f'WARNING: Invalid ParallelizationFactor={chosen_pf} in layer "{layer.name}".' + f'Using ParallelizationFactor={closest_pf} instead. Valid ParallelizationFactor(s): {valid_pf_str}.' 
+ ) + else: + closest_pf = chosen_pf + layer.set_attr('n_partitions', out_height * out_width // closest_pf) + layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) # Set the output type of the depthwise phase @@ -372,9 +395,21 @@ def init_depconv2d(self, layer): else: layer.set_attr('strategy', 'latency') - layer.set_attr( - 'n_partitions', 1 - ) # TODO Once we have SeparableConv implementation for io_parallel this should be set properly + out_height = layer.get_output_variable().shape[0] + out_width = layer.get_output_variable().shape[1] + chosen_pf = layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', 1) + valid_pf = self.get_valid_conv_partition_splits(out_height, out_width) + if chosen_pf not in valid_pf: + closest_pf = self.get_closest_reuse_factor(valid_pf, chosen_pf) + valid_pf_str = ','.join(map(str, valid_pf)) + print( + f'WARNING: Invalid ParallelizationFactor={chosen_pf} in layer "{layer.name}".' + f'Using ParallelizationFactor={closest_pf} instead. Valid ParallelizationFactor(s): {valid_pf_str}.' 
+ ) + else: + closest_pf = chosen_pf + layer.set_attr('n_partitions', out_height * out_width // closest_pf) + layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) def _set_pooling_accum_t(self, layer, pool_size): From 11819acc24cc56868e7082928e1014fb020e3060 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Thu, 30 May 2024 18:51:16 -0500 Subject: [PATCH 057/103] updated pytest docker image --- test/pytest/ci-template.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/pytest/ci-template.yml b/test/pytest/ci-template.yml index afaf90da4d..f6aa700415 100644 --- a/test/pytest/ci-template.yml +++ b/test/pytest/ci-template.yml @@ -1,6 +1,6 @@ .pytest: stage: test - image: gitlab-registry.cern.ch/fastmachinelearning/hls4ml-testing:0.5.5.base + image: gitlab-registry.cern.ch/fastmachinelearning/hls4ml-testing:0.5.6.base tags: - k8s-default before_script: From 39d923295daf792ba035fe2b761c938ff9cad935 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Mon, 3 Jun 2024 18:59:39 +0200 Subject: [PATCH 058/103] Don't test io_parallel for Catapult test and reduce the size of test to speed it up --- test/pytest/test_sepconv1d.py | 17 ++++++++++++----- test/pytest/test_sepconv2d.py | 16 ++++++++++++---- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/test/pytest/test_sepconv1d.py b/test/pytest/test_sepconv1d.py index e64bd06a76..a0d6abae4d 100644 --- a/test/pytest/test_sepconv1d.py +++ b/test/pytest/test_sepconv1d.py @@ -12,7 +12,6 @@ keras_conv1d = [SeparableConv1D] padds_options = ['same', 'valid'] chans_options = ['channels_last'] -io_type_options = ['io_parallel', 'io_stream'] strides_options = [(1), (2)] kernel_options = [(2), (3)] bias_options = [False] @@ -24,14 +23,22 @@ @pytest.mark.parametrize('strides', strides_options) @pytest.mark.parametrize('kernels', kernel_options) @pytest.mark.parametrize('bias', bias_options) -@pytest.mark.parametrize('io_type', io_type_options) 
-@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Catapult']) +@pytest.mark.parametrize( + 'backend, io_type', + [ + ('Vivado', 'io_parallel'), + ('Vitis', 'io_parallel'), + ('Vivado', 'io_stream'), + ('Vitis', 'io_stream'), + ('Catapult', 'io_stream'), + ], +) def test_sepconv1d(conv1d, chans, padds, strides, kernels, bias, io_type, backend): model = tf.keras.models.Sequential() - input_shape = (28, 3) + input_shape = (16, 3) model.add( conv1d( - filters=32, + filters=8, kernel_size=kernels, strides=strides, padding=padds, diff --git a/test/pytest/test_sepconv2d.py b/test/pytest/test_sepconv2d.py index da87488aa2..9c0ece575f 100644 --- a/test/pytest/test_sepconv2d.py +++ b/test/pytest/test_sepconv2d.py @@ -24,14 +24,22 @@ @pytest.mark.parametrize('strides', strides_options) @pytest.mark.parametrize('kernels', kernel_options) @pytest.mark.parametrize('bias', bias_options) -@pytest.mark.parametrize('io_type', io_type_options) -@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Catapult']) +@pytest.mark.parametrize( + 'backend, io_type', + [ + ('Vivado', 'io_parallel'), + ('Vitis', 'io_parallel'), + ('Vivado', 'io_stream'), + ('Vitis', 'io_stream'), + ('Catapult', 'io_stream'), + ], +) def test_sepconv2d(conv2d, chans, padds, strides, kernels, bias, io_type, backend): model = tf.keras.models.Sequential() - input_shape = (28, 28, 3) + input_shape = (16, 16, 3) model.add( conv2d( - filters=32, + filters=8, kernel_size=kernels, strides=strides, padding=padds, From 68a83d636f4b74b4f082022160692128d2c8e028 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Sun, 9 Jun 2024 14:03:33 +0200 Subject: [PATCH 059/103] Add explicit DepthwiseConv tests and simpligy SepConv tests --- test/pytest/test_depthconv1d.py | 66 ++++++++++++++++++++++++++++++++ test/pytest/test_depthconv2d.py | 67 +++++++++++++++++++++++++++++++++ test/pytest/test_sepconv1d.py | 11 ++---- test/pytest/test_sepconv2d.py | 11 ++---- 4 files changed, 141 insertions(+), 14 deletions(-) create 
mode 100644 test/pytest/test_depthconv1d.py create mode 100644 test/pytest/test_depthconv2d.py diff --git a/test/pytest/test_depthconv1d.py b/test/pytest/test_depthconv1d.py new file mode 100644 index 0000000000..3734815af0 --- /dev/null +++ b/test/pytest/test_depthconv1d.py @@ -0,0 +1,66 @@ +from pathlib import Path + +import numpy as np +import pytest +import tensorflow as tf + +import hls4ml + +test_root_path = Path(__file__).parent + +padds_options = ['same', 'valid'] +chans_options = ['channels_last'] +strides_options = [(1), (2)] +kernel_options = [(2), (3)] +bias_options = [False] + + +@pytest.mark.parametrize('chans', chans_options) +@pytest.mark.parametrize('padds', padds_options) +@pytest.mark.parametrize('strides', strides_options) +@pytest.mark.parametrize('kernels', kernel_options) +@pytest.mark.parametrize('bias', bias_options) +@pytest.mark.parametrize( + 'backend, io_type', + [ + ('Vivado', 'io_parallel'), + ('Vitis', 'io_parallel'), + ('Vivado', 'io_stream'), + ('Vitis', 'io_stream'), + ('Catapult', 'io_stream'), + ], +) +def test_depthconv1d(chans, padds, strides, kernels, bias, io_type, backend): + model = tf.keras.models.Sequential() + input_shape = (16, 3) + model.add( + tf.keras.layers.DepthwiseConv1D( + kernel_size=kernels, + strides=strides, + padding=padds, + input_shape=input_shape, + kernel_initializer='normal', + use_bias=bias, + data_format=chans, + ) + ) + + model.compile(optimizer='adam', loss='mse') + X_input = np.random.rand(100, *input_shape) + keras_prediction = model.predict(X_input) + config = hls4ml.utils.config_from_keras_model(model, default_precision='ap_fixed<32,16>') + stride_cfg = str(strides).replace(', ', '_').replace('(', '').replace(')', '') + kernel_cfg = str(kernels).replace(', ', '_').replace('(', '').replace(')', '') + output_dir = str( + test_root_path + / 'hls4mlprj_depthconv1d_{}_strides_{}_kernels_{}_{}_padding_{}_{}'.format( + chans, stride_cfg, kernel_cfg, padds, backend, io_type + ) + ) + hls_model = 
hls4ml.converters.convert_from_keras_model( + model, hls_config=config, output_dir=output_dir, io_type=io_type, backend=backend + ) + hls_model.compile() + hls_prediction = hls_model.predict(X_input).reshape(keras_prediction.shape) + + np.testing.assert_allclose(hls_prediction, keras_prediction, rtol=0, atol=0.001) diff --git a/test/pytest/test_depthconv2d.py b/test/pytest/test_depthconv2d.py new file mode 100644 index 0000000000..9178edf368 --- /dev/null +++ b/test/pytest/test_depthconv2d.py @@ -0,0 +1,67 @@ +from pathlib import Path + +import numpy as np +import pytest +import tensorflow as tf + +import hls4ml + +test_root_path = Path(__file__).parent + +padds_options = ['same', 'valid'] +chans_options = ['channels_last'] +io_type_options = ['io_parallel', 'io_stream'] +strides_options = [(1, 1), (2, 2)] +kernel_options = [(2, 2), (3, 3)] +bias_options = [False] + + +@pytest.mark.parametrize('chans', chans_options) +@pytest.mark.parametrize('padds', padds_options) +@pytest.mark.parametrize('strides', strides_options) +@pytest.mark.parametrize('kernels', kernel_options) +@pytest.mark.parametrize('bias', bias_options) +@pytest.mark.parametrize( + 'backend, io_type', + [ + ('Vivado', 'io_parallel'), + ('Vitis', 'io_parallel'), + ('Vivado', 'io_stream'), + ('Vitis', 'io_stream'), + ('Catapult', 'io_stream'), + ], +) +def test_depthconv2d(chans, padds, strides, kernels, bias, io_type, backend): + model = tf.keras.models.Sequential() + input_shape = (16, 16, 3) + model.add( + tf.keras.layers.DepthwiseConv2D( + kernel_size=kernels, + strides=strides, + padding=padds, + input_shape=input_shape, + kernel_initializer='normal', + use_bias=bias, + data_format=chans, + ) + ) + + model.compile(optimizer='adam', loss='mse') + X_input = np.random.rand(100, *input_shape) + keras_prediction = model.predict(X_input) + config = hls4ml.utils.config_from_keras_model(model, default_precision='ap_fixed<32,16>') + stride_cfg = str(strides).replace(', ', '_').replace('(', '').replace(')', 
'') + kernel_cfg = str(kernels).replace(', ', '_').replace('(', '').replace(')', '') + output_dir = str( + test_root_path + / 'hls4mlprj_depthconv2d_{}_strides_{}_kernels_{}_{}_padding_{}_{}'.format( + chans, stride_cfg, kernel_cfg, padds, backend, io_type + ) + ) + hls_model = hls4ml.converters.convert_from_keras_model( + model, hls_config=config, output_dir=output_dir, io_type=io_type, backend=backend + ) + hls_model.compile() + hls_prediction = hls_model.predict(X_input).reshape(keras_prediction.shape) + + np.testing.assert_allclose(hls_prediction, keras_prediction, rtol=0, atol=0.001) diff --git a/test/pytest/test_sepconv1d.py b/test/pytest/test_sepconv1d.py index a0d6abae4d..64312e9932 100644 --- a/test/pytest/test_sepconv1d.py +++ b/test/pytest/test_sepconv1d.py @@ -3,13 +3,11 @@ import numpy as np import pytest import tensorflow as tf -from tensorflow.keras.layers import SeparableConv1D import hls4ml test_root_path = Path(__file__).parent -keras_conv1d = [SeparableConv1D] padds_options = ['same', 'valid'] chans_options = ['channels_last'] strides_options = [(1), (2)] @@ -17,7 +15,6 @@ bias_options = [False] -@pytest.mark.parametrize('conv1d', keras_conv1d) @pytest.mark.parametrize('chans', chans_options) @pytest.mark.parametrize('padds', padds_options) @pytest.mark.parametrize('strides', strides_options) @@ -33,11 +30,11 @@ ('Catapult', 'io_stream'), ], ) -def test_sepconv1d(conv1d, chans, padds, strides, kernels, bias, io_type, backend): +def test_sepconv1d(chans, padds, strides, kernels, bias, io_type, backend): model = tf.keras.models.Sequential() input_shape = (16, 3) model.add( - conv1d( + tf.keras.layers.SeparableConv1D( filters=8, kernel_size=kernels, strides=strides, @@ -57,8 +54,8 @@ def test_sepconv1d(conv1d, chans, padds, strides, kernels, bias, io_type, backen kernel_cfg = str(kernels).replace(', ', '_').replace('(', '').replace(')', '') output_dir = str( test_root_path - / 'hls4mlprj_{}_{}_strides_{}_kernels_{}_{}_padding_{}_{}'.format( - 
conv1d.__name__.lower(), chans, stride_cfg, kernel_cfg, padds, backend, io_type + / 'hls4mlprj_sepconv1d_{}_strides_{}_kernels_{}_{}_padding_{}_{}'.format( + chans, stride_cfg, kernel_cfg, padds, backend, io_type ) ) hls_model = hls4ml.converters.convert_from_keras_model( diff --git a/test/pytest/test_sepconv2d.py b/test/pytest/test_sepconv2d.py index 9c0ece575f..58e63fec8a 100644 --- a/test/pytest/test_sepconv2d.py +++ b/test/pytest/test_sepconv2d.py @@ -3,13 +3,11 @@ import numpy as np import pytest import tensorflow as tf -from tensorflow.keras.layers import SeparableConv2D import hls4ml test_root_path = Path(__file__).parent -keras_conv2d = [SeparableConv2D] padds_options = ['same', 'valid'] chans_options = ['channels_last'] io_type_options = ['io_parallel', 'io_stream'] @@ -18,7 +16,6 @@ bias_options = [False] -@pytest.mark.parametrize('conv2d', keras_conv2d) @pytest.mark.parametrize('chans', chans_options) @pytest.mark.parametrize('padds', padds_options) @pytest.mark.parametrize('strides', strides_options) @@ -34,11 +31,11 @@ ('Catapult', 'io_stream'), ], ) -def test_sepconv2d(conv2d, chans, padds, strides, kernels, bias, io_type, backend): +def test_sepconv2d(chans, padds, strides, kernels, bias, io_type, backend): model = tf.keras.models.Sequential() input_shape = (16, 16, 3) model.add( - conv2d( + tf.keras.layers.SeparableConv2D( filters=8, kernel_size=kernels, strides=strides, @@ -58,8 +55,8 @@ def test_sepconv2d(conv2d, chans, padds, strides, kernels, bias, io_type, backen kernel_cfg = str(kernels).replace(', ', '_').replace('(', '').replace(')', '') output_dir = str( test_root_path - / 'hls4mlprj_{}_{}_strides_{}_kernels_{}_{}_padding_{}_{}'.format( - conv2d.__name__.lower(), chans, stride_cfg, kernel_cfg, padds, backend, io_type + / 'hls4mlprj_sepconv2d_{}_strides_{}_kernels_{}_{}_padding_{}_{}'.format( + chans, stride_cfg, kernel_cfg, padds, backend, io_type ) ) hls_model = hls4ml.converters.convert_from_keras_model( From 
8a9d5568f42f80e631228d3647452715e1e97b6d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 10 Jun 2024 20:06:27 +0000 Subject: [PATCH 060/103] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/asottile/pyupgrade: v3.15.2 → v3.16.0](https://github.com/asottile/pyupgrade/compare/v3.15.2...v3.16.0) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6db9312eb3..aa9e58da38 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -30,7 +30,7 @@ repos: args: ["--profile", "black", --line-length=125] - repo: https://github.com/asottile/pyupgrade - rev: v3.15.2 + rev: v3.16.0 hooks: - id: pyupgrade args: ["--py36-plus"] From ad86387dc2aebcb78a2097965c19bb479aa8da09 Mon Sep 17 00:00:00 2001 From: steltze Date: Fri, 19 Apr 2024 10:31:04 +0200 Subject: [PATCH 061/103] Initial commit --- .../passes/fifo_depth_optimization.py | 69 +++++++++++++++++++ .../vitis_accelerator/supported_boards.json | 28 ++++++++ 2 files changed, 97 insertions(+) create mode 100644 hls4ml/backends/vitis_accelerator/passes/fifo_depth_optimization.py diff --git a/hls4ml/backends/vitis_accelerator/passes/fifo_depth_optimization.py b/hls4ml/backends/vitis_accelerator/passes/fifo_depth_optimization.py new file mode 100644 index 0000000000..e983ca49fb --- /dev/null +++ b/hls4ml/backends/vitis_accelerator/passes/fifo_depth_optimization.py @@ -0,0 +1,69 @@ +# from hls4ml.backends.vivado.passes.fifo_depth_optimization import ( +# generate_max_depth_file, +# get_vcd_data, +# populate_values, +# set_big_fifos, +# set_fifo_depth, +# ) +# from hls4ml.model.optimizer.optimizer import ConfigurableOptimizerPass, ModelOptimizerPass + + +# class FifoDepthOptimization(ConfigurableOptimizerPass, ModelOptimizerPass): +# def __init__(self): +# 
self.values = [] + +# def transform(self, model): +# # use `large_fifo_depth = 0` to keep the default fifo depth +# profiling_fifo_depth = getattr(self, 'profiling_fifo_depth', 100_000) + +# # check axi-stream or io-stream, if not one the 2 exit +# if not ( +# model.config.get_config_value('IOType') == 'io_stream' +# or model.config.get_config_value('AcceleratorConfig')['Interface'] == 'axi_stream' +# or model.config.get_config_value('AcceleratorConfig')['Interface'] == 'axi_master' +# ): +# raise Exception( +# 'To use this optimization you have to set `IOType` field to `io_stream` in the HLS config ' +# 'or `axi_stream` or `axi_master` in `AcceleratorConfig` interface field' +# ) + +# # initialize all the fifos to 10000 so that they will be automatically implemented in BRAMs and so they will be +# # profiled + +# if profiling_fifo_depth: +# set_big_fifos(model.output_vars, profiling_fifo_depth) + +# data = get_vcd_data(model) + +# for i in range(1, len(data['children'][0]['children'][0]['children'])): +# # wrapper fifos +# populate_values( +# self.values, +# data['children'][0]['children'][0]['children'][i]['name'], +# data['children'][0]['children'][0]['children'][i]['children'][0]['data'], +# data['children'][0]['children'][0]['children'][i]['children'][1]['data'], +# ) + +# n_elem = len(data['children'][0]['children'][0]['children'][0]['children']) +# for i in range(n_elem): +# name = data['children'][0]['children'][0]['children'][0]['children'][i]['name'] +# data_p = data['children'][0]['children'][0]['children'][0]['children'][i]['children'][0]['data'] +# depth = data['children'][0]['children'][0]['children'][0]['children'][i]['children'][1]['data'] +# populate_values(self.values, name, data_p, depth) + +# maxs = [{'name': i['name'], 'max': i['max'], 'depth': i['depth']} for i in self.values] + +# generate_max_depth_file(model, maxs) + +# set_fifo_depth(model, maxs) + +# inp = model.get_input_variables()[0] +# out = model.get_output_variables()[0] +# for x in 
maxs: +# if 'in_local' in x['name']: +# inp.pragma = (inp.pragma[0], x['max'] + 1) +# elif 'out_local' in x['name']: +# out.pragma = (out.pragma[0], x['max'] + 1) + +# print('[hls4ml] - FIFO optimization completed') +# return False diff --git a/hls4ml/backends/vitis_accelerator/supported_boards.json b/hls4ml/backends/vitis_accelerator/supported_boards.json index 4a54ea2924..1279ec22d0 100644 --- a/hls4ml/backends/vitis_accelerator/supported_boards.json +++ b/hls4ml/backends/vitis_accelerator/supported_boards.json @@ -10,5 +10,33 @@ "tcl_scripts": { "axi_stream": "axi_stream_design.tcl"}, "python_drivers": {"axi_stream": "axi_stream_driver.py"}, "c_drivers": {} + }, + "alveo-u50": { + "part": "xcu50-fsvh2104-2-e", + "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, + "python_drivers": {"axi_stream": "axi_stream_driver.py"}, + "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, + "c_drivers": {} + }, + "alveo-u250": { + "part": "xcu250-figd2104-2L-e", + "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, + "python_drivers": {"axi_stream": "axi_stream_driver.py"}, + "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, + "c_drivers": {} + }, + "alveo-u200": { + "part": "xcu200-fsgd2104-2-e", + "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, + "python_drivers": {"axi_stream": "axi_stream_driver.py"}, + "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, + "c_drivers": {} + }, + "alveo-u280": { + "part": "xcu280-fsvh2892-2L-e", + "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, + "python_drivers": {"axi_stream": "axi_stream_driver.py"}, + "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, + "c_drivers": {} } } From 4ea329be3c1f75d579db5e82f56f1284cee0b651 Mon Sep 17 00:00:00 2001 From: dgburnette <36940078+dgburnette@users.noreply.github.com> Date: Mon, 15 Apr 2024 07:12:17 -0700 Subject: [PATCH 062/103] Stage initial set of changes for the Catapult backend (#956) * Stage initial set of changes for the Catapult backend * applied some changes for issues 
reported by pre-commit. But pre-commit still reorders backends/__init__.py incorrectly * final changes for clean pre-commit * minor edits * Checkin * Add file * pre-commit format * add in nnet_utils files * format changes for pre-commit * run flows by netlist type * update design pragmas on some blocks. cleaned up TCL script * move AC submodules under hls4ml/templates/catapult * merged in latest changes from mainline * remove bad submodules * recreate AC submodules in hls4ml/templates/catapult * pre-commit fixes * pre-commit fixes * turn on Catapult backend testing * removed io_parallel testing for Catapult backend * add Catapult * added Catapult * added Catapult * added Catapult to some pytests * Added concept of ProjectDir to distinguish the project directory of the HLS tool from the ProjectName which is used for the cpp file and top function name * better handling of c++ testbench data files. enhanced directory naming. * fix syntax * workaround from Giuseppe * Add concept of ProjectDir for Catapult which is different from ProjectName that gets used for the top function name and the cpp files * add new file from Giuseppe * improvements to project management, reporting and testbench * include new file in generation of parameters.h * add hard_tanh for io_parallel. formatting * Full path to the header nnet_helpers.h is necessary in the include (check if this is not an issue with recent versions of Catapult) * Avoid ceiling function from the math library: ceil(n/d) ---> (n+d-1)/n * These are mostly workarounds for the BUP synyhesis of a testing model (should these changes make in something more general?) 
* revert format back to what clang-format currently enforces * simplification from Giuesspe * Fixes for bottom-up handling of libraries * pre-commit format fixes * fix loops * consolidate prj scripts * cleanup pragmas * switch from using ssh to https for submodules * fix include path for non-catapult install * update pytest environment * CL 1100381 * CL 1098112 * roll in latest changes. turn off Catapult variants of test_binary_cnn and test_cnn_mnist_qkeras for now * fix test failure * disable Catapult test for pytorch until it is supported * disable Catapult for pytorch tests * Simpler submodule initialization for CI --------- Co-authored-by: David Burnette Co-authored-by: Giuseppe Di Guglielmo Co-authored-by: Jovan Mitrevski Co-authored-by: Vladimir Loncar --- hls4ml/backends/catapult/catapult_backend.py | 3 +- .../catapult/passes/transform_types.py | 6 +- hls4ml/backends/fpga/fpga_types.py | 177 ++++++++++++++++++ hls4ml/writer/catapult_writer.py | 2 +- test/pytest/ci-template.yml | 1 - test/pytest/test_cnn_mnist.py | 2 +- test/pytest/test_sepconv1d.py | 14 +- test/pytest/test_sepconv2d.py | 25 +-- 8 files changed, 196 insertions(+), 34 deletions(-) diff --git a/hls4ml/backends/catapult/catapult_backend.py b/hls4ml/backends/catapult/catapult_backend.py index 0583e80dab..5556154dcb 100644 --- a/hls4ml/backends/catapult/catapult_backend.py +++ b/hls4ml/backends/catapult/catapult_backend.py @@ -4,8 +4,7 @@ import numpy as np from hls4ml.backends import FPGABackend -from hls4ml.backends.catapult.catapult_types import CatapultArrayVariableConverter -from hls4ml.backends.fpga.fpga_types import ACTypeConverter, HLSTypeConverter +from hls4ml.backends.fpga.fpga_types import ACTypeConverter, CatapultArrayVariableConverter, HLSTypeConverter from hls4ml.model.attributes import ChoiceAttribute, ConfigurableAttribute, TypeAttribute from hls4ml.model.flow import register_flow from hls4ml.model.layers import ( diff --git a/hls4ml/backends/catapult/passes/transform_types.py 
b/hls4ml/backends/catapult/passes/transform_types.py index 3cbb917a67..4ef3548cb6 100755 --- a/hls4ml/backends/catapult/passes/transform_types.py +++ b/hls4ml/backends/catapult/passes/transform_types.py @@ -1,10 +1,12 @@ -from hls4ml.backends.catapult.catapult_types import ( +from hls4ml.backends.fpga.fpga_types import ( + ACTypeConverter, CatapultArrayVariableConverter, CatapultInplaceArrayVariableConverter, CatapultInplaceStreamVariableConverter, CatapultStreamVariableConverter, + HLSTypeConverter, + StaticWeightVariableConverter, ) -from hls4ml.backends.fpga.fpga_types import ACTypeConverter, HLSTypeConverter, StaticWeightVariableConverter from hls4ml.model.optimizer import GlobalOptimizerPass from hls4ml.model.types import InplaceTensorVariable diff --git a/hls4ml/backends/fpga/fpga_types.py b/hls4ml/backends/fpga/fpga_types.py index 15ad386c5a..408f1320e4 100644 --- a/hls4ml/backends/fpga/fpga_types.py +++ b/hls4ml/backends/fpga/fpga_types.py @@ -234,6 +234,42 @@ def definition_cpp(self, name_suffix='', as_reference=False): # region ArrayVariable +class VivadoArrayVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + return '{type} {name}{suffix}[{shape}]'.format( + type=self.type.name, name=self.name, suffix=name_suffix, shape=self.size_cpp() + ) + + +class QuartusArrayVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + return '{type} {name}{suffix}[{shape}] {pragma}'.format( + type=self.type.name, name=self.name, suffix=name_suffix, shape=self.size_cpp(), pragma=self.pragma + ) + + +class CatapultArrayVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + return '{type} {name}{suffix}[{shape}] /* {pragma} */'.format( + type=self.type.name, name=self.name, suffix=name_suffix, shape=self.size_cpp(), pragma=self.pragma + ) + + +class VivadoInplaceArrayVariableDefinition(VariableDefinition): + def 
definition_cpp(self): + return f'auto& {self.name} = {self.input_var.name}' + + +class QuartusInplaceArrayVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'auto& {self.name} = {self.input_var.name}' + + +class CatapultInplaceArrayVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'auto& {self.name} = {self.input_var.name}' + + class ArrayVariableConverter: def __init__(self, type_converter, prefix, definition_cls): self.type_converter = type_converter @@ -251,11 +287,59 @@ def convert(self, tensor_var, pragma='partition'): return tensor_var +class VivadoArrayVariableConverter(ArrayVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='Vivado', definition_cls=VivadoArrayVariableDefinition) + + +class QuartusArrayVariableConverter(ArrayVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='Quartus', definition_cls=QuartusArrayVariableDefinition) + + +class CatapultArrayVariableConverter(ArrayVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='Catapult', definition_cls=CatapultArrayVariableDefinition) + + +class VivadoInplaceArrayVariableConverter(ArrayVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='Vivado', definition_cls=VivadoInplaceArrayVariableDefinition) + + +class QuartusInplaceArrayVariableConverter(ArrayVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='Quartus', definition_cls=QuartusInplaceArrayVariableDefinition + ) + + +class CatapultInplaceArrayVariableConverter(ArrayVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='Catapult', definition_cls=CatapultInplaceArrayVariableDefinition + ) + + # endregion # region 
StructMemberVariable +class QuartusStructMemberVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + return '{type} {name}{suffix}[{shape}]'.format( + type=self.type.name, name=self.member_name, suffix=name_suffix, shape=self.size_cpp() + ) + + +class CatapultStructMemberVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + return '{type} {name}{suffix}[{shape}]'.format( + type=self.type.name, name=self.member_name, suffix=name_suffix, shape=self.size_cpp() + ) + + class StructMemberVariableConverter: def __init__(self, type_converter, prefix, definition_cls): self.type_converter = type_converter @@ -278,11 +362,68 @@ def convert(self, tensor_var, pragma='partition', struct_name=None): return tensor_var +class QuartusStructMemberVariableConverter(StructMemberVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='Quartus', definition_cls=QuartusStructMemberVariableDefinition + ) + + +class CatapultStructMemberVariableConverter(StructMemberVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='Catapult', definition_cls=CatapultStructMemberVariableDefinition + ) + + # endregion # region StreamVariable +class VivadoStreamVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + if as_reference: # Function parameter + return f'hls::stream<{self.type.name}> &{self.name}{name_suffix}' + else: # Declaration + return 'hls::stream<{type}> {name}{suffix}("{name}")'.format( + type=self.type.name, name=self.name, suffix=name_suffix + ) + + +class VivadoInplaceStreamVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'auto& {self.name} = {self.input_var.name}' + + +class QuartusStreamVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', 
as_reference=False): + if as_reference: # Function parameter + return f'stream<{self.type.name}> &{self.name}{name_suffix}' + else: # Declaration + return f'stream<{self.type.name}> {self.name}{name_suffix}' + + +class QuartusInplaceStreamVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'auto& {self.name} = {self.input_var.name}' + + +class CatapultStreamVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + if as_reference: # Function parameter + return f'ac_channel<{self.type.name}> &{self.name}{name_suffix}' + else: # Declaration (string name arg not implemented in ac_channel) + return 'ac_channel<{type}> {name}{suffix}/*("{name}")*/'.format( + type=self.type.name, name=self.name, suffix=name_suffix + ) + + +class CatapultInplaceStreamVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'auto& {self.name} = {self.input_var.name}' + + class StreamVariableConverter: def __init__(self, type_converter, prefix, definition_cls): self.type_converter = type_converter @@ -304,6 +445,21 @@ def convert(self, tensor_var, n_pack=1, depth=0): return tensor_var +class VivadoStreamVariableConverter(StreamVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='Vivado', definition_cls=VivadoStreamVariableDefinition) + + +class QuartusStreamVariableConverter(StreamVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='Quartus', definition_cls=QuartusStreamVariableDefinition) + + +class CatapultStreamVariableConverter(StreamVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='Catapult', definition_cls=CatapultStreamVariableDefinition) + + # endregion # region InplaceStreamVariable @@ -323,6 +479,27 @@ def convert(self, tensor_var, n_pack=1, depth=0): return tensor_var +class 
VivadoInplaceStreamVariableConverter(InplaceStreamVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='Vivado', definition_cls=VivadoInplaceStreamVariableDefinition + ) + + +class QuartusInplaceStreamVariableConverter(InplaceStreamVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='Quartus', definition_cls=QuartusInplaceStreamVariableDefinition + ) + + +class CatapultInplaceStreamVariableConverter(InplaceStreamVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='Catapult', definition_cls=CatapultInplaceStreamVariableDefinition + ) + + # endregion # region WeightsVariable diff --git a/hls4ml/writer/catapult_writer.py b/hls4ml/writer/catapult_writer.py index af3f28a59e..48d44e4a59 100755 --- a/hls4ml/writer/catapult_writer.py +++ b/hls4ml/writer/catapult_writer.py @@ -884,7 +884,7 @@ def write_yml(self, model): """ def keras_model_representer(dumper, keras_model): - model_path = model.config.get_output_dir() + '/keras_model.keras' + model_path = model.config.get_output_dir() + '/keras_model.h5' keras_model.save(model_path) return dumper.represent_scalar('!keras_model', model_path) diff --git a/test/pytest/ci-template.yml b/test/pytest/ci-template.yml index f6aa700415..a9391709cb 100644 --- a/test/pytest/ci-template.yml +++ b/test/pytest/ci-template.yml @@ -5,7 +5,6 @@ - k8s-default before_script: - source ~/.bashrc - - git config --global --add safe.directory /builds/fastmachinelearning/hls4ml - git submodule update --init --recursive hls4ml/templates/catapult/ - if [ $EXAMPLEMODEL == 1 ]; then git submodule update --init example-models; fi - conda activate hls4ml-testing diff --git a/test/pytest/test_cnn_mnist.py b/test/pytest/test_cnn_mnist.py index ab3365f228..27b966f51d 100644 --- a/test/pytest/test_cnn_mnist.py +++ b/test/pytest/test_cnn_mnist.py @@ -61,7 +61,7 @@ 
def keras_model(mnist_data): ('Vitis', 'io_parallel', 'resource'), ('Vitis', 'io_parallel', 'latency'), ('Vitis', 'io_stream', 'latency'), - ('Vitis', 'io_stream', 'resource'), + ('Vitis', 'io_stream', 'latency'), ], ) def test_mnist_cnn(keras_model, mnist_data, backend, io_type, strategy): diff --git a/test/pytest/test_sepconv1d.py b/test/pytest/test_sepconv1d.py index 64312e9932..1afdfbae67 100644 --- a/test/pytest/test_sepconv1d.py +++ b/test/pytest/test_sepconv1d.py @@ -20,17 +20,9 @@ @pytest.mark.parametrize('strides', strides_options) @pytest.mark.parametrize('kernels', kernel_options) @pytest.mark.parametrize('bias', bias_options) -@pytest.mark.parametrize( - 'backend, io_type', - [ - ('Vivado', 'io_parallel'), - ('Vitis', 'io_parallel'), - ('Vivado', 'io_stream'), - ('Vitis', 'io_stream'), - ('Catapult', 'io_stream'), - ], -) -def test_sepconv1d(chans, padds, strides, kernels, bias, io_type, backend): +@pytest.mark.parametrize('io_type', io_type_options) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Catapult']) +def test_sepconv1d(conv1d, chans, padds, strides, kernels, bias, io_type, backend): model = tf.keras.models.Sequential() input_shape = (16, 3) model.add( diff --git a/test/pytest/test_sepconv2d.py b/test/pytest/test_sepconv2d.py index 58e63fec8a..bee2227a86 100644 --- a/test/pytest/test_sepconv2d.py +++ b/test/pytest/test_sepconv2d.py @@ -16,22 +16,15 @@ bias_options = [False] -@pytest.mark.parametrize('chans', chans_options) -@pytest.mark.parametrize('padds', padds_options) -@pytest.mark.parametrize('strides', strides_options) -@pytest.mark.parametrize('kernels', kernel_options) -@pytest.mark.parametrize('bias', bias_options) -@pytest.mark.parametrize( - 'backend, io_type', - [ - ('Vivado', 'io_parallel'), - ('Vitis', 'io_parallel'), - ('Vivado', 'io_stream'), - ('Vitis', 'io_stream'), - ('Catapult', 'io_stream'), - ], -) -def test_sepconv2d(chans, padds, strides, kernels, bias, io_type, backend): +@pytest.mark.parametrize("conv2d", 
keras_conv2d) +@pytest.mark.parametrize("chans", chans_options) +@pytest.mark.parametrize("padds", padds_options) +@pytest.mark.parametrize("strides", strides_options) +@pytest.mark.parametrize("kernels", kernel_options) +@pytest.mark.parametrize("bias", bias_options) +@pytest.mark.parametrize("io_type", io_type_options) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Catapult']) +def test_sepconv2d(conv2d, chans, padds, strides, kernels, bias, io_type, backend): model = tf.keras.models.Sequential() input_shape = (16, 16, 3) model.add( From 992b9b766a79ec6f49a65b9655a4e7c46e949452 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Sun, 20 Aug 2023 22:49:18 +0200 Subject: [PATCH 063/103] Rudimentary optimizer to infer 'auto' precision --- .../model/optimizer/passes/infer_precision.py | 211 ++++-------------- 1 file changed, 43 insertions(+), 168 deletions(-) diff --git a/hls4ml/model/optimizer/passes/infer_precision.py b/hls4ml/model/optimizer/passes/infer_precision.py index 51422c534e..5ef1c2dee5 100644 --- a/hls4ml/model/optimizer/passes/infer_precision.py +++ b/hls4ml/model/optimizer/passes/infer_precision.py @@ -1,23 +1,13 @@ -import math +from copy import deepcopy import numpy as np -from hls4ml.model.optimizer import ConfigurableOptimizerPass +from hls4ml.model.optimizer import OptimizerPass from hls4ml.model.types import FixedPrecisionType, UnspecifiedPrecisionType -# TODO: The code assumes everything is Fixed or Integer precision. 
Need to add checks - - -class InferPrecisionTypes(ConfigurableOptimizerPass): - def __init__(self): - # The option, infer_no_bias, allows you to tailor for the given weights, in particular, zero bias - self.infer_no_bias = False +class InferPrecisionTypes(OptimizerPass): def match(self, node): - input_var = node.get_input_variable() - if input_var is not None and isinstance(input_var.type, UnspecifiedPrecisionType): - # only infer types if the input type is known - return False for layer_type in node.types.values(): if isinstance(layer_type.precision, UnspecifiedPrecisionType): return True @@ -34,16 +24,14 @@ def transform(self, model, node): if type_name not in inferred_types: self._infer_default_type(node, type_name) - # if the return type was set, this may allow InferPrecisionTypes to be run - # on layers it was not previously able to - return 'result_t' in types_to_infer + return False # No model graph changes made def _infer_precision(self, node, types_to_infer): node_class = node.class_name if node_class in ['Dense']: return self._infer_dense_precision(node, types_to_infer) - if node_class in ['BatchNormalization', 'ApplyAlpha']: + if node_class in ['BatchNormalization']: return self._infer_bn_precision(node, types_to_infer) if node_class in ['Conv1D', 'Conv2D', 'PointwiseConv1D', 'PointwiseConv2D', 'Conv2DBatchnorm']: @@ -58,24 +46,14 @@ def _infer_precision(self, node, types_to_infer): if node_class in ['Clone', 'Reshape', 'Resize', 'Transpose', 'ZeroPadding1D', 'ZeroPadding2D']: return self._infer_output_matching_precision(node, types_to_infer) - if node_class in ['Merge']: + if node_class in ['Concatenate', 'Merge']: return self._infer_merge_precision(node, types_to_infer) - if node_class in ['Concatenate']: - return self._infer_cat_precision(node, types_to_infer) - - if node_class in ['Dot']: - return self._infer_dot_precision(node, types_to_infer) - # What about quantized activation layer? Setting it to 'auto' manually will break it here. 
We should prevent # this in config_from_* functions return [] - def _get_default_precision(self, node): - model_config = node.model.config - return model_config.backend.convert_precision_string(model_config.model_precision['default']) - def _infer_default_type(self, node, type_name): model_config = node.model.config default_precision = model_config.backend.convert_precision_string(model_config.model_precision['default']) @@ -98,51 +76,47 @@ def _infer_common_precision(self, node, types_to_infer, n_ops): input_precision = node.get_input_variable().type.precision input_width = input_precision.width input_integers = input_precision.integer - input_signed = input_precision.signed if 'weight_t' in types_to_infer: weight_quantizer = node.get_attr('weight_quantizer', None) if weight_quantizer is not None: + weight_width = weight_quantizer.bits + weight_integers = weight_quantizer.hls_type.integer node.types['weight_t'].name = node.name + '_weight_t' node.types['weight_t'].precision = weight_quantizer.hls_type else: self._infer_default_type(node, 'weight_t') + weight_width = node.types['weight_t'].precision.width + weight_integers = node.types['weight_t'].precision.integer node.weights['weight'].update_precision(node.types['weight_t'].precision) - inferred_types.append('weight_t') - weight_width = node.types['weight_t'].precision.width - weight_integers = node.types['weight_t'].precision.integer - weight_signed = node.types['weight_t'].precision.signed + inferred_types.append('weight_t') + else: + weight_width = node.types['weight_t'].precision.width + weight_integers = node.types['weight_t'].precision.integer if 'bias_t' in types_to_infer: bias_quantizer = node.get_attr('bias_quantizer', None) if bias_quantizer is not None: + bias_width = bias_quantizer.bits + bias_integers = bias_quantizer.hls_type.integer node.types['bias_t'].name = node.name + '_bias_t' node.types['bias_t'].precision = bias_quantizer.hls_type else: self._infer_default_type(node, 'bias_t') + bias_width 
= node.types['bias_t'].precision.width + bias_integers = node.types['bias_t'].precision.integer node.weights['bias'].update_precision(node.types['bias_t'].precision) - inferred_types.append('bias_t') - - bias_width = node.types['bias_t'].precision.width - bias_integers = node.types['bias_t'].precision.integer - bias_signed = node.types['bias_t'].precision.signed - no_bias = node.weights['bias'].nonzeros == 0 and self.infer_no_bias # no bias - # using math.ceil instead of np.ceil because it returns an int - bitwidth = weight_width + input_width + math.ceil(np.log2(n_ops)) - integers = weight_integers + input_integers + math.ceil(np.log2(n_ops)) - signed = weight_signed or input_signed - - frac = bitwidth - integers - - if not no_bias: - integers = max(integers + (bias_signed and not signed), bias_integers + (signed and not bias_signed)) + 1 - bitwidth = integers + max(frac, bias_width - bias_integers) - signed = signed or bias_signed + inferred_types.append('bias_t') + else: + bias_width = node.types['bias_t'].precision.width + bias_integers = node.types['bias_t'].precision.integer - # Note: this is guaranteed to not overflow or need rounding, so it's sufficient to use the simpler form. 
- new_type = FixedPrecisionType(bitwidth, integers, signed) + new_type = FixedPrecisionType( + width=int(max(np.ceil(input_width + weight_width + np.log2(n_ops)), bias_width) + 1), + integer=int(max(np.ceil(input_integers + weight_integers + np.log2(n_ops)), bias_integers) + 1), + ) if 'accum_t' in types_to_infer: node.types['accum_t'].name = node.name + '_accum_t' @@ -159,7 +133,7 @@ def _infer_common_precision(self, node, types_to_infer, n_ops): return inferred_types def _infer_dense_precision(self, node, types_to_infer): - n_ops = node.get_attr('n_in') + n_ops = node.get_attr('n_in') * node.get_attr('n_out') return self._infer_common_precision(node, types_to_infer, n_ops) def _infer_conv_precision(self, node, types_to_infer): @@ -243,11 +217,6 @@ def _infer_sepconv_precision(self, node, types_to_infer): return inferred_types def _infer_bn_precision(self, node, types_to_infer): - """ - The batchnormalziation precision here is the more implementation-focused version. It propagates - precision from scale and bias, not mean, variance, etc. 
- """ - inferred_types = [] if 'scale_t' in types_to_infer: @@ -261,28 +230,16 @@ def _infer_bn_precision(self, node, types_to_infer): inferred_types.append('bias_t') if 'result_t' in types_to_infer: - input_precision = node.get_input_variable().type.precision scale_precision = node.types['scale_t'].precision bias_precision = node.types['bias_t'].precision - after_scale_signed = scale_precision.signed or input_precision.signed - after_scale_width = input_precision.width + scale_precision.width - after_scale_integer = input_precision.integer + scale_precision.integer - - out_precision_signed = after_scale_signed or bias_precision.signed - out_precision_integer = ( - max( - after_scale_integer + (bias_precision.signed and not after_scale_signed), - bias_precision.integer + (after_scale_signed and not bias_precision.signed), - ) - + 1 - ) - out_precision_width = out_precision_integer + max( - after_scale_width - after_scale_integer, bias_precision.fractional - ) + out_precision = deepcopy(node.get_input_node().get_output_variable().type.precision) + out_precision.integer += scale_precision.integer + out_precision.fractional = max(out_precision.fractional, scale_precision.fractional) - # Note: this is guaranteed to not overflow or need rounding, so it's sufficient to use the simpler form. 
- out_precision = FixedPrecisionType(out_precision_width, out_precision_integer, out_precision_signed) + out_precision.integer = max(out_precision.integer, bias_precision.integer) + 1 + out_precision.fractional = max(out_precision.fractional, bias_precision.fractional) + out_precision.width = out_precision.fractional + out_precision.integer node.types['result_t'].name = node.name + '_result_t' node.types['result_t'].precision = out_precision @@ -296,21 +253,15 @@ def _infer_pooling_precision(self, node, types_to_infer): if 'accum_t' in types_to_infer: input_precision = node.get_input_variable().type.precision - pool_op = node.attributes['pool_op'].lower() - - width = input_precision.width - integer = input_precision.integer - signed = input_precision.signed + input_width = input_precision.width + input_integers = input_precision.integer - pool_size = node.get_attr('pool_height', 1) * node.get_attr('pool_width') - if pool_op == 'average': - extra_bits = int(np.ceil(np.log2(pool_size))) - elif pool_op == 'max': - extra_bits = 0 - else: - raise ValueError(f'Unknown pooling operation: {pool_op}') + n_ops = node.get_attr('n_filt') * node.get_attr('pool_height', 1) * node.get_attr('pool_width') - accum_type = FixedPrecisionType(width=width + extra_bits * 2, integer=integer + extra_bits, signed=signed) + accum_type = FixedPrecisionType( + width=int(np.ceil(input_width + np.log2(n_ops)) + 1), + integer=int(np.ceil(input_integers + np.log2(n_ops)) + 1), + ) node.types['accum_t'].name = node.name + '_accum_t' node.types['accum_t'].precision = accum_type @@ -329,86 +280,10 @@ def _infer_merge_precision(self, node, types_to_infer): input_1 = node.get_input_variable(node.inputs[0]).type.precision input_2 = node.get_input_variable(node.inputs[1]).type.precision - op = node.get_attr('op').lower() - if op in ('add', 'subtract', 'average'): - new_signed = input_1.signed or input_2.signed or op == 'subtract' - new_int = ( - max( - input_1.integer + (input_2.signed and not 
input_1.signed), - input_2.integer + (input_1.signed and not input_2.signed), - ) - + 1 - ) - new_width = new_int + max(input_1.fractional, input_2.fractional) - out_precision = FixedPrecisionType(new_width, new_int, new_signed) - elif op == 'multiply': - new_signed = input_1.signed or input_2.signed - new_int = input_1.integer + input_2.integer - new_width = input_1.width + input_2.width - out_precision = FixedPrecisionType(new_width, new_int, new_signed) - elif op in ('maximum', 'minimum'): - new_signed = input_1.signed or input_2.signed - - input_1_integer = input_1.integer - input_2_integer = input_2.integer - - # add one to integer if unsigned while new is signed - if new_signed and not input_1.signed: - input_1_integer += 1 - if new_signed and not input_2.signed: - input_2_integer += 1 - - new_width = max(input_1.fractional, input_2.fractional) + max(input_1_integer, input_2_integer) - new_int = max(input_1_integer, input_2_integer) - out_precision = FixedPrecisionType(new_width, new_int, new_signed) - else: - print(f'Warning: not propagating weights for type {op}') - out_precision = self._get_default_precision(node) - - node.types['result_t'].name = node.name + '_result_t' - node.types['result_t'].precision = out_precision - - return ['result_t'] - - def _infer_cat_precision(self, node, types_to_infer): - assert 'result_t' in types_to_infer and len(types_to_infer) == 1 - - input_1 = node.get_input_variable(node.inputs[0]).type.precision - input_2 = node.get_input_variable(node.inputs[1]).type.precision - - new_signed = input_1.signed or input_2.signed - - input_1_integer = input_1.integer - input_2_integer = input_2.integer - - # add one to integer if unsigned while new is signed - if new_signed and not input_1.signed: - input_1_integer += 1 - if new_signed and not input_2.signed: - input_2_integer += 1 - - new_width = max(input_1.fractional, input_2.fractional) + max(input_1_integer, input_2_integer) - new_int = max(input_1_integer, input_2_integer) - - 
out_precision = FixedPrecisionType(new_width, new_int, new_signed) - node.types['result_t'].name = node.name + '_result_t' - node.types['result_t'].precision = out_precision - - return ['result_t'] - - def _infer_dot_precision(self, node, types_to_infer): - assert 'result_t' in types_to_infer and len(types_to_infer) == 1 - - input_1 = node.get_input_variable(node.inputs[0]).type.precision - input_2 = node.get_input_variable(node.inputs[1]).type.precision - - n_in = node.get_input_variable(node.inputs[0]).shape[0] - - new_signed = input_1.signed or input_2.signed - new_width = input_1.width + input_2.width + math.ceil(np.log2(n_in)) - new_int = input_1.integer + input_2.integer + math.ceil(np.log2(n_in)) + new_width = max(input_1.fractional, input_2.fractional) + max(input_1.integer, input_2.integer) + new_int = max(input_1.integer, input_2.integer) - out_precision = FixedPrecisionType(new_width, new_int, new_signed) + out_precision = FixedPrecisionType(new_width, new_int) node.types['result_t'].name = node.name + '_result_t' node.types['result_t'].precision = out_precision From 8174465998afd6689bcb3b59d70472c7b52278bd Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Sun, 20 Aug 2023 22:50:18 +0200 Subject: [PATCH 064/103] Sepconv fixes --- hls4ml/backends/vivado/passes/convolution_templates.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py index 037f2d5eb2..4b46546971 100644 --- a/hls4ml/backends/vivado/passes/convolution_templates.py +++ b/hls4ml/backends/vivado/passes/convolution_templates.py @@ -286,10 +286,7 @@ def format(self, node): params['index'] = str(node.index) + '_depthwise' params['weight_t'] = node.get_weights('depthwise').type params['bias_t'] = node.get_weights('zero_bias').type - if node.model.config.get_config_value('IOType') == 'io_parallel': - params['fill_fn'] = f'fill_buffer_{node.index}_dw' - else: - 
params['fill_fn'] = 'FillConv1DBuffer' + params['fill_fn'] = 'FillConv1DBuffer' if node.get_attr('unscaled'): params['scale_index_type'] = 'scale_index_unscaled' From 84ff2c6b21ef0f74c9585111a0518ff0049636cd Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Wed, 21 Feb 2024 21:39:00 +0100 Subject: [PATCH 065/103] Optimizer to remove expensive Transpose that serves as Flatten --- hls4ml/model/optimizer/__init__.py | 2 -- .../passes/convert_to_channels_last.py | 29 +++++-------------- 2 files changed, 7 insertions(+), 24 deletions(-) diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py index 3aa247d03f..247e799ec6 100644 --- a/hls4ml/model/optimizer/__init__.py +++ b/hls4ml/model/optimizer/__init__.py @@ -36,8 +36,6 @@ 'infer_precision_types', 'channels_last_converter', 'remove_transpose_before_flatten', - 'remove_nop_transpose', - 'remove_single_channel_transpose', 'fuse_bias_add', 'expand_layer_group', 'output_rounding_saturation_mode', diff --git a/hls4ml/model/optimizer/passes/convert_to_channels_last.py b/hls4ml/model/optimizer/passes/convert_to_channels_last.py index 01e949086e..c283e28c92 100644 --- a/hls4ml/model/optimizer/passes/convert_to_channels_last.py +++ b/hls4ml/model/optimizer/passes/convert_to_channels_last.py @@ -144,38 +144,23 @@ def match(self, node): if node.model.config.get_config_value('IOType') != 'io_parallel': return False - if hasattr(node, '_channels_last_keep_transpose') and node._channels_last_keep_transpose: - return False - if isinstance(node, Reshape): input_node = node.get_input_node() output_nodes = node.get_output_nodes() - if ( - len(node.get_attr('target_shape')) == 1 - and isinstance(input_node, Transpose) - and len(output_nodes) == 1 - and isinstance(output_nodes[0], Dense) - ): + if len(node.get_attr('target_shape')) == 1 and isinstance(input_node, Transpose) \ + and len(output_nodes) == 1 and isinstance(output_nodes[0], Dense): return True - + return False - + def transform(self, model, 
node): transpose_node = node.get_input_node() dense_node = node.get_output_nodes()[0] input_shape = transpose_node.get_output_variable().shape - if len(input_shape) == 2: # Usually after Conv1D - tran_axis = [1, 0, 2] - elif len(input_shape) == 3: # Usually after Conv2D - tran_axis = [1, 2, 0, 3] - else: # In this case we bail - node._channels_last_keep_transpose = True - return False - weight_var = dense_node.get_weights('weight') # Transpose the weights to achieve the same computation with transposed input - weight_data_t = weight_var.data.reshape(*input_shape, -1).transpose(*tran_axis) + weight_data_t = weight_var.data.reshape(*input_shape, -1).transpose(1, 2, 0, 3) weight_data_t = weight_data_t.reshape(-1, weight_data_t.shape[-1]) new_weight_var = WeightVariable( var_name=weight_var.name, @@ -183,9 +168,9 @@ def transform(self, model, node): precision=weight_var.type.precision, quantizer=weight_var.quantizer, data=weight_data_t, - index=dense_node.index, + index=dense_node.index ) - + # Update the weight variable of the node dense_node.set_attr('weight', new_weight_var) From 518796d12f1209c4798e1ee934a4a66e0a130971 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Tue, 27 Feb 2024 21:09:35 +0100 Subject: [PATCH 066/103] Remove transpose of input if n_chan=1 --- hls4ml/model/optimizer/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py index 247e799ec6..3aa247d03f 100644 --- a/hls4ml/model/optimizer/__init__.py +++ b/hls4ml/model/optimizer/__init__.py @@ -36,6 +36,8 @@ 'infer_precision_types', 'channels_last_converter', 'remove_transpose_before_flatten', + 'remove_nop_transpose', + 'remove_single_channel_transpose', 'fuse_bias_add', 'expand_layer_group', 'output_rounding_saturation_mode', From 238e35cf7595bf9e6b4d2a4a87d2f4d187c9add2 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Wed, 21 Feb 2024 21:39:00 +0100 Subject: [PATCH 067/103] Optimizer to remove expensive 
Transpose that serves as Flatten --- hls4ml/model/optimizer/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py index 3aa247d03f..247e799ec6 100644 --- a/hls4ml/model/optimizer/__init__.py +++ b/hls4ml/model/optimizer/__init__.py @@ -36,8 +36,6 @@ 'infer_precision_types', 'channels_last_converter', 'remove_transpose_before_flatten', - 'remove_nop_transpose', - 'remove_single_channel_transpose', 'fuse_bias_add', 'expand_layer_group', 'output_rounding_saturation_mode', From c10dd8212eaeed54fbb726614373ccb84db4c11b Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Tue, 27 Feb 2024 21:09:35 +0100 Subject: [PATCH 068/103] Remove transpose of input if n_chan=1 --- hls4ml/model/optimizer/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py index 247e799ec6..3aa247d03f 100644 --- a/hls4ml/model/optimizer/__init__.py +++ b/hls4ml/model/optimizer/__init__.py @@ -36,6 +36,8 @@ 'infer_precision_types', 'channels_last_converter', 'remove_transpose_before_flatten', + 'remove_nop_transpose', + 'remove_single_channel_transpose', 'fuse_bias_add', 'expand_layer_group', 'output_rounding_saturation_mode', From d6fe369a31154a7d7fbebff72d873257a6a569fd Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Thu, 13 Jun 2024 15:21:12 -0500 Subject: [PATCH 069/103] fix up automatic precision inferrence --- .../model/optimizer/passes/infer_precision.py | 9 +- .../optimizer/passes/seperable_to_dw_conv.py | 127 ++++++++++++++++++ 2 files changed, 135 insertions(+), 1 deletion(-) create mode 100644 hls4ml/model/optimizer/passes/seperable_to_dw_conv.py diff --git a/hls4ml/model/optimizer/passes/infer_precision.py b/hls4ml/model/optimizer/passes/infer_precision.py index 5ef1c2dee5..0b323abc35 100644 --- a/hls4ml/model/optimizer/passes/infer_precision.py +++ b/hls4ml/model/optimizer/passes/infer_precision.py @@ -37,7 +37,10 @@ def 
_infer_precision(self, node, types_to_infer): if node_class in ['Conv1D', 'Conv2D', 'PointwiseConv1D', 'PointwiseConv2D', 'Conv2DBatchnorm']: return self._infer_conv_precision(node, types_to_infer) - if node_class in ['SeparableConv1D', 'SeparableConv2D', 'DepthwiseConv2D']: + if node_class in ['DepthwiseConv1D', 'DepthwiseConv2D']: + return self._infer_depthconv_precision(node, types_to_infer) + + if node_class in ['SeparableConv1D', 'SeparableConv2D']: return self._infer_sepconv_precision(node, types_to_infer) if node_class in ['Pooling1D', 'Pooling2D']: @@ -140,6 +143,10 @@ def _infer_conv_precision(self, node, types_to_infer): n_ops = node.get_attr('n_chan') * node.get_attr('filt_height', 1) * node.get_attr('filt_width') return self._infer_common_precision(node, types_to_infer, n_ops) + def _infer_depthconv_precision(self, node, types_to_infer): + n_ops = node.get_attr('filt_height', 1) * node.get_attr('filt_width') + return self._infer_common_precision(node, types_to_infer, n_ops) + def _infer_sepconv_precision(self, node, types_to_infer): inferred_types = [] diff --git a/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py b/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py new file mode 100644 index 0000000000..0142f686d0 --- /dev/null +++ b/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py @@ -0,0 +1,127 @@ +""" +This optimizer converts a seperable convolution to a depthwise followed by a regular convolution. +For backends with a custom pointwise implementations the regular convolution will subsequently +be converted to a pointwise convolution by a different optimizer. 
+""" + +import copy + +from hls4ml.model.layers import SeparableConv1D, SeparableConv2D +from hls4ml.model.optimizer import OptimizerPass + + +class SeperableToDepthwiseAndConv(OptimizerPass): + """Convert Seperable to DepthwiseConv + Conv (potentially later Pointwise)""" + + _dw_attributes = ( + 'in_width', + 'out_width', + 'n_chan', + 'depth_multiplier', + 'pad_left', + 'pad_right', + 'filt_width', + 'stride_width', + 'dilation_width', + 'in_height', + 'out_height', + 'pad_top', + 'pad_bottom', + 'filt_height', + 'stride_height', + 'dilation_height', + 'data_format', + 'depthwise_data', + 'depthwise_quantizer', + 'padding', + ) + + _pw_attributes = ('out_width', 'n_filt', 'dilation_width', 'out_height', 'dilation_height', 'data_format', 'use_bias') + + def match(self, node): + return isinstance(node, (SeparableConv1D, SeparableConv2D)) + + def transform(self, model, node): + dim = node.__class__.__name__[-2:] # '1D' or '2D' + + # get the layer configuration name + layer_config = model.config.get_layer_config(node) + + # First do depthwise + dw_name = f'{node.name}_depthwise' + + # now the layer config (so that set configuration get copied) + dw_layer_config = copy.deepcopy(layer_config) + + if dw_layer_config: + dw_precision_cfg = dw_layer_config.setdefault('Precision', {}) + if isinstance(dw_precision_cfg, dict): + if 'depthwise' in dw_precision_cfg: + dw_precision_cfg['weight'] = dw_precision_cfg['depthwise'] + del dw_precision_cfg['depthwise'] + if 'depthwise_accum' in dw_precision_cfg: + dw_precision_cfg['accum'] = dw_precision_cfg['depthwise_accum'] + del dw_precision_cfg['depthwise_accum'] + if 'depthwise_result' in dw_precision_cfg: + dw_precision_cfg['result'] = dw_precision_cfg['depthwise_result'] + del dw_precision_cfg['depthwise_result'] + dw_precision_cfg.pop('pointwise', None) + dw_precision_cfg.pop('pointwise_accum', None) + model.config.set_name_config(dw_name, dw_layer_config) + model.config.parse_name_config(dw_name, dw_layer_config) + + # 
creating the attributes + dw_attributes = {k: node.attributes[k] for k in SeperableToDepthwiseAndConv._dw_attributes if k in node.attributes} + + dw_attributes['use_bias'] = False + + new_dw = model.make_node('DepthwiseConv' + dim, dw_name, dw_attributes, [node.inputs[0]]) + + # Then do convolution + pw_name = f'{node.name}_pointwise' + + # now the layer config (so that set configuration get copied) + pw_layer_config = copy.deepcopy(layer_config) + + if pw_layer_config: + pw_precision_cfg = pw_layer_config.setdefault('Precision', {}) + if isinstance(pw_precision_cfg, dict): + if 'pointwise' in pw_precision_cfg: + pw_precision_cfg['weight'] = pw_precision_cfg['pointwise'] + del pw_precision_cfg['pointwise'] + if 'pointwise_accum' in pw_precision_cfg: + pw_precision_cfg['accum'] = pw_precision_cfg['pointwise_accum'] + del pw_precision_cfg['pointwise_accum'] + if 'pointwise_result' in pw_precision_cfg: + pw_precision_cfg['result'] = pw_precision_cfg['pointwise_result'] + del pw_precision_cfg['pointwise_result'] + pw_precision_cfg.pop('depthwise', None) + pw_precision_cfg.pop('depthwise_accum', None) + model.config.set_name_config(pw_name, pw_layer_config) + model.config.parse_name_config(pw_name, pw_layer_config) + + # creating the attributes + pw_attributes = {k: node.attributes[k] for k in SeperableToDepthwiseAndConv._pw_attributes if k in node.attributes} + pw_attributes['filt_width'] = 1 + pw_attributes['filt_height'] = 1 + pw_attributes['stride_width'] = 1 + pw_attributes['stride_height'] = 1 + pw_attributes['pad_left'] = 0 + pw_attributes['pad_right'] = 0 + pw_attributes['pad_top'] = 0 + pw_attributes['pad_bottom'] = 0 + pw_attributes['in_width'] = pw_attributes['out_width'] + pw_attributes['in_height'] = pw_attributes.get('out_height', 1) + pw_attributes['n_chan'] = node.get_attr('n_chan') * node.get_attr('depth_multiplier') + pw_attributes['weight_data'] = node.get_attr('pointwise_data') + pw_attributes['weight_quantizer'] = 
node.get_attr('pointwise_quantizer') + pw_attributes['bias_data'] = node.get_attr('bias_data') + pw_attributes['bias_quantizer'] = node.get_attr('bias_quantizer') + + # note this is just regular convolution. It is replaced by a special pointwise implementation + # if available by another optimizer + new_pw = model.make_node('Conv' + dim, pw_name, pw_attributes, [dw_name]) + + model.split_node(node, new_dw, new_pw) + + return True From 7290a29167c2e044912a3c6c8ea326f1621a41ea Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Mon, 10 Jun 2024 19:13:15 -0500 Subject: [PATCH 070/103] starting towards being able to split seperable --- hls4ml/backends/fpga/fpga_backend.py | 10 ++++ hls4ml/backends/vivado/vivado_backend.py | 6 --- hls4ml/model/graph.py | 62 ++++++++++++++---------- hls4ml/model/layers.py | 16 ++++++ 4 files changed, 62 insertions(+), 32 deletions(-) diff --git a/hls4ml/backends/fpga/fpga_backend.py b/hls4ml/backends/fpga/fpga_backend.py index 87309ff4e5..672627e35f 100644 --- a/hls4ml/backends/fpga/fpga_backend.py +++ b/hls4ml/backends/fpga/fpga_backend.py @@ -79,6 +79,16 @@ def __init__(self, name): attrs.append(ConfigurableAttribute('reuse_factor', default=1)) self.attribute_map[layer] = attrs + # seperable is kind of special because it is effectively two layers that will be split + for layer in (SeparableConv1D, SeparableConv2D): + attrs = self.attribute_map.get(layer, []) + attrs.append(TypeAttribute('depthwise_accum')) + attrs.append(TypeAttribute('pointwise_accum')) + attrs.append(TypeAttribute('depthwise_result')) + attrs.append(ConfigurableAttribute('depthwise_reuse_factor', default=1)) + attrs.append(ConfigurableAttribute('pointwise_reuse_factor', default=1)) + self.attribute_map[layer] = attrs + act_attrs = self.attribute_map.get(Activation, []) act_attrs.append(ConfigurableAttribute('table_size', default=1024)) act_attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8))) diff --git a/hls4ml/backends/vivado/vivado_backend.py 
b/hls4ml/backends/vivado/vivado_backend.py index 0b24393134..b80c6664af 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -76,12 +76,6 @@ def _register_layer_attributes(self): attrs.append(ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded'], default='LineBuffer')) self.attribute_map[layer] = attrs - sep_conv_layers = [SeparableConv1D, SeparableConv2D] - for layer in sep_conv_layers: - attrs = self.attribute_map.get(layer, []) - attrs.append(TypeAttribute('dw_output', default=FixedPrecisionType(18, 8))) - self.attribute_map[layer] = attrs - def _register_flows(self): initializers = self._get_layer_initializers() init_flow = register_flow('init_layers', initializers, requires=['optimize'], backend=self.name) diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index 04ec33294d..d1722eaae1 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -100,6 +100,12 @@ def get_layer_config(self, layer): return layer_config + def set_name_config(self, name, config): + """sets hls_config["LayerName"][name] = config""" + hls_config = self.config['HLSConfig'] + layer_config = hls_config.setdefault('LayerName', {}) + layer_config[name] = config + def get_precision(self, layer, var='default'): precision = self.layer_name_precision.get(layer.name.lower() + '_' + var) type_name = layer.name.lower() + '_' + var + '_t' @@ -183,6 +189,35 @@ def get_compression(self, layer): return compression + def parse_name_config(self, layer_name, layer_cfg): + """This is used by _parse_hls_config below, but also in optimizers when a new layer config is created""" + precision_cfg = layer_cfg.get('Precision') + if isinstance(precision_cfg, dict): + for var, precision in precision_cfg.items(): + self.layer_name_precision[layer_name.lower() + '_' + var] = precision + else: + self.layer_name_precision[layer_name.lower() + '_default'] = precision_cfg + + rf = layer_cfg.get('ReuseFactor') + if rf is not None: + 
self.layer_name_rf[layer_name.lower()] = rf + + targ_cycles = layer_cfg.get('TargetCycles') + if targ_cycles is not None: + self.layer_name_targ_cycles[layer_name.lower()] = targ_cycles + + strategy = layer_cfg.get('Strategy') + if strategy is not None: + self.layer_name_strategy[layer_name.lower()] = strategy + + conv_implementation = layer_cfg.get('ConvImplementation') + if conv_implementation is not None: + self.layer_name_conv_implementation[layer_name.lower()] = conv_implementation + + compression = layer_cfg.get('Compression') + if compression is not None: + self.layer_name_compression[layer_name.lower()] = bool(compression) + def _parse_hls_config(self): hls_config = self.config['HLSConfig'] @@ -255,32 +290,7 @@ def _parse_hls_config(self): layer_name_cfg = hls_config.get('LayerName') if layer_name_cfg is not None: for layer_name, layer_cfg in layer_name_cfg.items(): - precision_cfg = layer_cfg.get('Precision') - if isinstance(precision_cfg, dict): - for var, precision in precision_cfg.items(): - self.layer_name_precision[layer_name.lower() + '_' + var] = precision - else: - self.layer_name_precision[layer_name.lower() + '_default'] = precision_cfg - - rf = layer_cfg.get('ReuseFactor') - if rf is not None: - self.layer_name_rf[layer_name.lower()] = rf - - targ_cycles = layer_cfg.get('TargetCycles') - if targ_cycles is not None: - self.layer_name_targ_cycles[layer_name.lower()] = targ_cycles - - strategy = layer_cfg.get('Strategy') - if strategy is not None: - self.layer_name_strategy[layer_name.lower()] = strategy - - conv_implementation = layer_cfg.get('ConvImplementation') - if conv_implementation is not None: - self.layer_name_conv_implementation[layer_name.lower()] = conv_implementation - - compression = layer_cfg.get('Compression') - if compression is not None: - self.layer_name_compression[layer_name.lower()] = bool(compression) + self.parse_name_config(layer_name, layer_cfg) def _validate_hls_config(self): use_dataflow = False diff --git 
a/hls4ml/model/layers.py b/hls4ml/model/layers.py index 0d9cc0622c..f076a1e5f0 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -100,6 +100,7 @@ def __init__(self, model, name, attributes, inputs, outputs=None): layer_config = self.model.config.get_layer_config(self) for config_key, config_value in layer_config.items(): + print(f'{config_key=}, {config_value=}') config_key = convert_to_snake_case(config_key) if config_key in self.attributes: print( @@ -179,6 +180,12 @@ def _set_accum_t(self): accum_t = NamedType(*reversed(self.model.config.get_precision(self, 'accum'))) self.set_attr('accum_t', accum_t) + def _set_type_t(self, name): + has_type_t = any(a for a in self.expected_attributes if a.name == name + '_t' and isinstance(a, TypeAttribute)) + if has_type_t: + type_t = NamedType(*reversed(self.model.config.get_precision(self, name))) + self.set_attr(name + '_t', type_t) + def get_input_node(self, input_name=None): if input_name is None: if len(self.inputs) > 0: @@ -470,6 +477,11 @@ def initialize(self): self.add_bias(quantizer=self.get_attr('bias_quantizer')) + # set the needed types if needed + self._set_type_t('pointwise_accum') + self._set_type_t('depthwise_accum') + self._set_type_t('depthwise_result') + class DepthwiseConv1D(Conv1D): def initialize(self): @@ -616,6 +628,10 @@ def initialize(self): self.add_bias(quantizer=self.get_attr('bias_quantizer')) + self._set_type_t('pointwise_accum') + self._set_type_t('depthwise_accum') + self._set_type_t('depthwise_result') + class DepthwiseConv2D(Conv2D): def initialize(self): From 13fcf0a0c16ea380fad65bf59daaa533029cf68e Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Tue, 11 Jun 2024 19:27:07 -0500 Subject: [PATCH 071/103] complete implementation of seperable -> dw + pw, untested --- .../vivado/passes/convolution_templates.py | 2 +- hls4ml/converters/keras/convolution.py | 3 + hls4ml/model/graph.py | 38 +++++++++++++ hls4ml/model/layers.py | 56 +++++++++++++++++-- 
hls4ml/model/optimizer/__init__.py | 1 + 5 files changed, 95 insertions(+), 5 deletions(-) diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py index 4b46546971..36ec0aa475 100644 --- a/hls4ml/backends/vivado/passes/convolution_templates.py +++ b/hls4ml/backends/vivado/passes/convolution_templates.py @@ -280,7 +280,7 @@ def format(self, node): # Override bias and bias_t since these are zeros in depthwise step of SepConv1D params['bias'] = params['zero_bias'] params['bias_t'] = params['zero_bias_t'] - params['n_filt'] = params['n_chan'] # In depthwise step n_chan == n_filt + params['n_filt'] = params['n_chan'] * node.get_attr('depth_multiplier') # In depthwise step n_chan == n_filt params['dilation'] = node.get_attr('dilation', 1) params['nzeros'] = node.get_weights('depthwise').nzeros params['index'] = str(node.index) + '_depthwise' diff --git a/hls4ml/converters/keras/convolution.py b/hls4ml/converters/keras/convolution.py index 39780f6dc6..0eaa967844 100644 --- a/hls4ml/converters/keras/convolution.py +++ b/hls4ml/converters/keras/convolution.py @@ -60,6 +60,9 @@ def parse_conv2d_layer(keras_layer, input_names, input_shapes, data_reader): layer['bias_data'] = get_weights_data(data_reader, layer['name'], 'bias') + if 'depth_multiplier' in keras_layer['config']: + layer['depth_multiplier'] = keras_layer['config']['depth_multiplier'] + if 'filters' in keras_layer['config']: layer['n_filt'] = keras_layer['config']['filters'] else: diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index d1722eaae1..10b3a0f854 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -615,6 +615,44 @@ def replace_node(self, old_node, new_node): self.graph = OrderedDict((new_node.name, new_node) if k == old_node.name else (k, v) for k, v in self.graph.items()) self._update_model_outputs() + def split_node(self, old_node, new_node1, new_node2): + """Replace an existing node in the graph with two 
nodes in sequence. + + Args: + old_node (Layer): The node to replace + new_node1 (Layer): The first new node in sequence + new_node2 (Layer): The second new node in sequence + + """ + + # fmt: off + assert len(new_node1.inputs) == len(old_node.inputs), \ + f'{new_node1.name} and {old_node.name} have different number of inputs' + assert len(new_node2.outputs) == len(old_node.outputs), \ + f'{new_node2.name} and {old_node.name} have different number of outputs' + # fmt: on + + repl = {old_name: new_name for old_name, new_name in zip(old_node.outputs, new_node2.outputs)} + repl.update({old_name: new_name for old_name, new_name in zip(old_node.inputs, new_node1.inputs)}) + + for node in self.graph.values(): + for i, n in enumerate(node.inputs): + if n in repl: + node.inputs[i] = repl[n] + for i, n in enumerate(node.outputs): + if n in repl: + node.outputs[i] = repl[n] + + new_graph = OrderedDict() + for key, value in self.graph.items(): + if key == old_node.name: + new_graph[new_node1.name] = new_node1 + new_graph[new_node2.name] = new_node2 + else: + new_graph[key] = value + self.graph = new_graph + self._update_model_outputs() + def _update_model_outputs(self): '''Update the model outputs diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index f076a1e5f0..9e80da291f 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -447,6 +447,7 @@ class SeparableConv1D(Layer): Attribute('out_width'), Attribute('n_chan'), Attribute('n_filt'), + Attribute('depth_multiplier', default=1), Attribute('filt_width'), Attribute('stride_width'), Attribute('pad_left'), @@ -484,12 +485,27 @@ def initialize(self): class DepthwiseConv1D(Conv1D): + _expected_attributes = [ + Attribute('in_width'), + Attribute('out_width'), + Attribute('n_chan'), + Attribute('depth_multiplier', default=1), + Attribute('filt_width'), + Attribute('stride_width'), + Attribute('pad_left'), + Attribute('pad_right'), + WeightAttribute('depthwise'), + WeightAttribute('bias'), + 
TypeAttribute('depthwise'), + TypeAttribute('bias'), + ] + def initialize(self): if self.get_attr('data_format') == 'channels_last': - shape = [self.attributes['out_width'], self.attributes['n_chan']] + shape = [self.attributes['out_width'], self.attributes['n_chan'] * self.attributes['depth_multiplier']] dims = [f'OUT_HEIGHT_{self.index}', f'N_CHAN_{self.index}'] else: - shape = [self.attributes['n_chan'], self.attributes['out_width']] + shape = [self.attributes['n_chan'] * self.attributes['depth_multiplier'], self.attributes['out_width']] dims = [f'N_CHAN_{self.index}', f'OUT_WIDTH_{self.index}'] self.add_output_variable(shape, dims) @@ -498,6 +514,7 @@ def initialize(self): ) self.add_bias(quantizer=self.get_attr('bias_quantizer')) + self.set_attr('n_filt', self.get_attr('n_chan') * self.get_attr('depth_multiplier')) class Conv2D(Layer): @@ -594,6 +611,7 @@ class SeparableConv2D(Layer): Attribute('out_width'), Attribute('n_chan'), Attribute('n_filt'), + Attribute('depth_multiplier', default=1), Attribute('filt_height'), Attribute('filt_width'), Attribute('stride_height'), @@ -634,12 +652,41 @@ def initialize(self): class DepthwiseConv2D(Conv2D): + _expected_attributes = [ + Attribute('in_height'), + Attribute('in_width'), + Attribute('out_height'), + Attribute('out_width'), + Attribute('n_chan'), + Attribute('depth_multiplier', default=1), + Attribute('filt_height'), + Attribute('filt_width'), + Attribute('stride_height'), + Attribute('stride_width'), + Attribute('pad_top'), + Attribute('pad_bottom'), + Attribute('pad_left'), + Attribute('pad_right'), + WeightAttribute('weight'), + WeightAttribute('bias'), + TypeAttribute('weight'), + TypeAttribute('bias'), + ] + def initialize(self): if self.get_attr('data_format') == 'channels_last': - shape = [self.attributes['out_height'], self.attributes['out_width'], self.attributes['n_chan']] + shape = [ + self.attributes['out_height'], + self.attributes['out_width'], + self.attributes['n_chan'] * 
self.attributes['depth_multiplier'], + ] dims = [f'OUT_HEIGHT_{self.index}', f'OUT_WIDTH_{self.index}', f'N_CHAN_{self.index}'] else: - shape = [self.attributes['n_chan'], self.attributes['out_height'], self.attributes['out_width']] + shape = [ + self.attributes['n_chan'] * self.attributes['depth_multiplier'], + self.attributes['out_height'], + self.attributes['out_width'], + ] dims = [f'N_CHAN_{self.index}', f'OUT_HEIGHT_{self.index}', f'OUT_WIDTH_{self.index}'] self.add_output_variable(shape, dims) @@ -648,6 +695,7 @@ def initialize(self): ) self.add_bias(quantizer=self.get_attr('bias_quantizer')) + self.set_attr('n_filt', self.get_attr('n_chan') * self.get_attr('depth_multiplier')) class Pooling1D(Layer): diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py index 3aa247d03f..de1b7597df 100644 --- a/hls4ml/model/optimizer/__init__.py +++ b/hls4ml/model/optimizer/__init__.py @@ -33,6 +33,7 @@ register_flow( 'convert', [ + 'seperable_to_depthwise_and_conv', # has to be before precision inference 'infer_precision_types', 'channels_last_converter', 'remove_transpose_before_flatten', From 92e722272dfd2b8162ca003d562a800f8f09c98e Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 12 Jun 2024 18:28:08 -0500 Subject: [PATCH 072/103] make conv_same_pad also trigger on depthwise, varius bug fixes --- hls4ml/backends/vivado/passes/conv_same_pad.py | 6 +++--- hls4ml/model/layers.py | 5 ++--- test/pytest/test_sepconv2d.py | 5 +++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/hls4ml/backends/vivado/passes/conv_same_pad.py b/hls4ml/backends/vivado/passes/conv_same_pad.py index bb8354a3d0..dd282f34e3 100644 --- a/hls4ml/backends/vivado/passes/conv_same_pad.py +++ b/hls4ml/backends/vivado/passes/conv_same_pad.py @@ -1,4 +1,4 @@ -from hls4ml.model.layers import Conv1D, Conv2D, SeparableConv1D, SeparableConv2D +from hls4ml.model.layers import Conv1D, Conv2D, DepthwiseConv1D, DepthwiseConv2D, SeparableConv1D, SeparableConv2D 
from hls4ml.model.optimizer import OptimizerPass @@ -7,7 +7,7 @@ class InsertZeroPaddingBeforeConv1D(OptimizerPass): def match(self, node): is_match = ( - isinstance(node, (Conv1D, SeparableConv1D)) + isinstance(node, (Conv1D, DepthwiseConv1D, SeparableConv1D)) and ((node.get_attr('padding') == 'same') or (node.get_attr('padding') == 'causal')) and node.get_attr('filt_width') != 1 ) @@ -55,7 +55,7 @@ class InsertZeroPaddingBeforeConv2D(OptimizerPass): def match(self, node): is_match = ( - isinstance(node, (Conv2D, SeparableConv2D)) + isinstance(node, (Conv2D, DepthwiseConv2D, SeparableConv2D)) and node.get_attr('padding') == 'same' and node.get_attr('filt_height') != 1 and node.get_attr('filt_width') != 1 diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index 9e80da291f..cb826bb8a1 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -100,7 +100,6 @@ def __init__(self, model, name, attributes, inputs, outputs=None): layer_config = self.model.config.get_layer_config(self) for config_key, config_value in layer_config.items(): - print(f'{config_key=}, {config_value=}') config_key = convert_to_snake_case(config_key) if config_key in self.attributes: print( @@ -494,9 +493,9 @@ class DepthwiseConv1D(Conv1D): Attribute('stride_width'), Attribute('pad_left'), Attribute('pad_right'), - WeightAttribute('depthwise'), + WeightAttribute('weight'), WeightAttribute('bias'), - TypeAttribute('depthwise'), + TypeAttribute('weight'), TypeAttribute('bias'), ] diff --git a/test/pytest/test_sepconv2d.py b/test/pytest/test_sepconv2d.py index bee2227a86..4c46e7ab57 100644 --- a/test/pytest/test_sepconv2d.py +++ b/test/pytest/test_sepconv2d.py @@ -10,7 +10,6 @@ padds_options = ['same', 'valid'] chans_options = ['channels_last'] -io_type_options = ['io_parallel', 'io_stream'] strides_options = [(1, 1), (2, 2)] kernel_options = [(2, 2), (3, 3)] bias_options = [False] @@ -43,7 +42,9 @@ def test_sepconv2d(conv2d, chans, padds, strides, kernels, bias, io_type, backen 
model.compile(optimizer='adam', loss='mse') X_input = np.random.rand(100, *input_shape) keras_prediction = model.predict(X_input) - config = hls4ml.utils.config_from_keras_model(model, default_precision='ap_fixed<32,16>') + config = hls4ml.utils.config_from_keras_model( + model, default_precision='ap_fixed<32,16>', granularity="name", backend=backend + ) stride_cfg = str(strides).replace(', ', '_').replace('(', '').replace(')', '') kernel_cfg = str(kernels).replace(', ', '_').replace('(', '').replace(')', '') output_dir = str( From f12a7ea94e981cef23d059c564c0ff46cb3330f9 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 12 Jun 2024 20:58:51 -0500 Subject: [PATCH 073/103] add parsing of depth multiplier for 1D depthwise conv --- hls4ml/converters/keras/convolution.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hls4ml/converters/keras/convolution.py b/hls4ml/converters/keras/convolution.py index 0eaa967844..2b24613094 100644 --- a/hls4ml/converters/keras/convolution.py +++ b/hls4ml/converters/keras/convolution.py @@ -21,6 +21,9 @@ def parse_conv1d_layer(keras_layer, input_names, input_shapes, data_reader): layer['bias_data'] = get_weights_data(data_reader, layer['name'], 'bias') + if 'depth_multiplier' in keras_layer['config']: + layer['depth_multiplier'] = keras_layer['config']['depth_multiplier'] + if 'filters' in keras_layer['config']: layer['n_filt'] = keras_layer['config']['filters'] else: From e2d270ea3c05ebfc7e08a290b39edeb38c58aef1 Mon Sep 17 00:00:00 2001 From: stzelepi Date: Mon, 26 Aug 2024 17:30:53 +0200 Subject: [PATCH 074/103] Finish resolving conficts with main --- .../backends/vivado/passes/conv_same_pad.py | 6 +- hls4ml/model/layers.py | 2 - .../vivado/nnet_utils/nnet_sepconv_stream.h | 234 +----------------- 3 files changed, 12 insertions(+), 230 deletions(-) diff --git a/hls4ml/backends/vivado/passes/conv_same_pad.py b/hls4ml/backends/vivado/passes/conv_same_pad.py index dd282f34e3..bb8354a3d0 100644 --- 
a/hls4ml/backends/vivado/passes/conv_same_pad.py +++ b/hls4ml/backends/vivado/passes/conv_same_pad.py @@ -1,4 +1,4 @@ -from hls4ml.model.layers import Conv1D, Conv2D, DepthwiseConv1D, DepthwiseConv2D, SeparableConv1D, SeparableConv2D +from hls4ml.model.layers import Conv1D, Conv2D, SeparableConv1D, SeparableConv2D from hls4ml.model.optimizer import OptimizerPass @@ -7,7 +7,7 @@ class InsertZeroPaddingBeforeConv1D(OptimizerPass): def match(self, node): is_match = ( - isinstance(node, (Conv1D, DepthwiseConv1D, SeparableConv1D)) + isinstance(node, (Conv1D, SeparableConv1D)) and ((node.get_attr('padding') == 'same') or (node.get_attr('padding') == 'causal')) and node.get_attr('filt_width') != 1 ) @@ -55,7 +55,7 @@ class InsertZeroPaddingBeforeConv2D(OptimizerPass): def match(self, node): is_match = ( - isinstance(node, (Conv2D, DepthwiseConv2D, SeparableConv2D)) + isinstance(node, (Conv2D, SeparableConv2D)) and node.get_attr('padding') == 'same' and node.get_attr('filt_height') != 1 and node.get_attr('filt_width') != 1 diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index 024c2233cd..d8d1fb9c8f 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -520,7 +520,6 @@ def initialize(self): ) self.add_bias(quantizer=self.get_attr('bias_quantizer')) - self.set_attr('n_filt', self.get_attr('n_chan') * self.get_attr('depth_multiplier')) class Conv2D(Layer): @@ -702,7 +701,6 @@ def initialize(self): ) self.add_bias(quantizer=self.get_attr('bias_quantizer')) - self.set_attr('n_filt', self.get_attr('n_chan') * self.get_attr('depth_multiplier')) class Pooling1D(Layer): diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h index dea028d53b..9c16de1908 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h @@ -4,210 +4,14 @@ #include "hls_stream.h" #include "nnet_common.h" #include "nnet_conv_stream.h" 
-#include namespace nnet { template -void depthwise_product_resource_rf_leq_nchan(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T res[CONFIG_T::n_chan], +void depthwise_product(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T res[CONFIG_T::n_chan], typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { - - const int nin = CONFIG_T::kernel_size * CONFIG_T::n_chan; - const int nout = CONFIG_T::n_chan; - - const int rufactor = MIN(CONFIG_T::reuse_factor, nin); - // const int multfactor = MIN(nin, CONFIG_T::reuse_factor); - // const int multiplier_limit = DIV_ROUNDUP(nin, multfactor); - const int block_factor = DIV_ROUNDUP(nin, CONFIG_T::reuse_factor); - // const int multscale = multiplier_limit; - - // assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); - // assert((multiplier_limit == block_factor) && "This function is correct only for RF <= N_IN"); - - #pragma HLS function_instantiate variable=weights,biases - //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly - #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor - #pragma HLS ARRAY_RESHAPE variable=data block factor=block_factor - - #pragma HLS ARRAY_PARTITION variable=biases complete - - typename CONFIG_T::accum_t acc[CONFIG_T::n_chan]; - #pragma HLS ARRAY_PARTITION variable=acc factor=block_factor - // std::cout << "LEQ IMPLE" << std::endl; - -InitAccum: - for (int iacc = 0; iacc < CONFIG_T::n_chan; iacc++) { - #pragma HLS UNROLL - acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; - } - -int out_index = 0; - -ReuseLoop: - for (int ir = 0; ir < rufactor; ir++) { - #pragma HLS PIPELINE II=1 rewind - - int in_index = ir; - out_index = in_index % CONFIG_T::n_chan; - // int w_index = ir; - // int acc_step = 0; - - MultLoop: - for (int im = 0; im < block_factor; im++) { - #pragma 
HLS UNROLL - - acc[out_index] += static_cast(CONFIG_T::mult_config::template product::product(data[in_index], weights[in_index])); - - in_index+=rufactor; - - out_index+=rufactor; - out_index -= ((out_index) >= CONFIG_T::n_chan)*CONFIG_T::n_chan; - } - } - -// Cast to "res_t" type -Result: - for (int ires = 0; ires < nout; ires++) { - #pragma HLS UNROLL - res[ires] = cast(acc[ires]); - } -} - - -template -void depthwise_product_resource_rf_gt_nchan_rem0(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T res[CONFIG_T::n_chan], - typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], - typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { - - const int nin = CONFIG_T::kernel_size * CONFIG_T::n_chan; - const int nout = CONFIG_T::n_chan; - - const int rufactor = MIN(CONFIG_T::reuse_factor, nin); - // const int multfactor = MIN(nin, CONFIG_T::reuse_factor); - // const int multiplier_limit = DIV_ROUNDUP(nin, multfactor); - const int block_factor = DIV_ROUNDUP(nin, CONFIG_T::reuse_factor); - // const int multscale = multiplier_limit; - - // assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); - // assert((multiplier_limit == block_factor) && "This function is correct only for RF <= N_IN"); - - #pragma HLS function_instantiate variable=weights,biases - //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly - #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor - #pragma HLS ARRAY_RESHAPE variable=data block factor=block_factor - - #pragma HLS ARRAY_PARTITION variable=biases complete - - typename CONFIG_T::accum_t acc[CONFIG_T::n_chan]; - #pragma HLS ARRAY_PARTITION variable=acc factor=block_factor - // std::cout << "REM0 IMPLE" << std::endl; - -InitAccum: - for (int iacc = 0; iacc < CONFIG_T::n_chan; iacc++) { - #pragma HLS UNROLL - acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; - } - -int out_index = 0; 
- -ReuseLoop: - for (int ir = 0; ir < rufactor; ir++) { - #pragma HLS PIPELINE II=1 rewind - - int in_index = ir; - // int w_index = ir; - // int acc_step = 0; - - MultLoop: - for (int im = 0; im < block_factor; im++) { - #pragma HLS UNROLL - - acc[out_index] += static_cast(CONFIG_T::mult_config::template product::product(data[in_index], weights[in_index])); - - in_index+=rufactor; - } - out_index++; - out_index -= ((out_index) == CONFIG_T::n_chan)*CONFIG_T::n_chan; - } - -// Cast to "res_t" type -Result: - for (int ires = 0; ires < nout; ires++) { - #pragma HLS UNROLL - res[ires] = cast(acc[ires]); - } -} - -template -void depthwise_product_resource_rf_gt_nchan(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T res[CONFIG_T::n_chan], - typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], - typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { - - const int nin = CONFIG_T::kernel_size * CONFIG_T::n_chan; - const int nout = CONFIG_T::n_chan; - - const int rufactor = MIN(CONFIG_T::reuse_factor, nin); - // const int multfactor = MIN(nin, CONFIG_T::reuse_factor); - // const int multiplier_limit = DIV_ROUNDUP(nin, multfactor); - const int block_factor = DIV_ROUNDUP(nin, CONFIG_T::reuse_factor); - // const int multscale = multiplier_limit; - - // assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); - // assert((multiplier_limit == block_factor) && "This function is correct only for RF <= N_IN"); - - #pragma HLS function_instantiate variable=weights,biases - //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly - #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor - #pragma HLS ARRAY_RESHAPE variable=data block factor=block_factor - - #pragma HLS ARRAY_PARTITION variable=biases complete - - typename CONFIG_T::accum_t acc[CONFIG_T::n_chan]; - #pragma HLS ARRAY_PARTITION variable=acc factor=block_factor - // 
std::cout << "GT IMPLE" << std::endl; - -InitAccum: - for (int iacc = 0; iacc < CONFIG_T::n_chan; iacc++) { - #pragma HLS UNROLL - acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; - } - -int out_index = 0; - -ReuseLoop: - for (int ir = 0; ir < rufactor; ir++) { - #pragma HLS PIPELINE II=1 rewind - - int in_index = ir; - // int w_index = ir; - // int acc_step = 0; - - MultLoop: - for (int im = 0; im < block_factor; im++) { - #pragma HLS UNROLL - - out_index = in_index % CONFIG_T::n_chan; - acc[out_index] += static_cast(CONFIG_T::mult_config::template product::product(data[in_index], weights[in_index])); - - in_index+=rufactor; - } - } - -// Cast to "res_t" type -Result: - for (int ires = 0; ires < nout; ires++) { - #pragma HLS UNROLL - res[ires] = cast(acc[ires]); - } -} - - -template -void depthwise_product_latency(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T res[CONFIG_T::n_chan], - typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], - typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { - // #pragma HLS INLINE + #pragma HLS INLINE typename CONFIG_T::accum_t mult[CONFIG_T::kernel_size * CONFIG_T::n_chan]; typename CONFIG_T::accum_t acc[CONFIG_T::n_chan]; @@ -239,10 +43,8 @@ void depthwise_product_latency(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_c // Accumulate multiplication result Accum1: for (int ii = 0; ii < CONFIG_T::kernel_size; ii++) { - // #pragma HLS PIPELINE II=1 rewind Accum2: for (int jj = 0; jj < CONFIG_T::n_chan; jj++) { - // #pragma HLS UNROLL int index = ii * CONFIG_T::n_chan + jj; acc[jj] += mult[index]; } @@ -256,22 +58,6 @@ void depthwise_product_latency(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_c } } -template -void depthwise_product_resource(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T res[CONFIG_T::n_chan], - typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], - typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { - - #pragma HLS INLINE 
recursive - - if (CONFIG_T::reuse_factor < CONFIG_T::n_chan) { - depthwise_product_resource_rf_leq_nchan(data, res, weights, biases); - } else if (CONFIG_T::reuse_factor % CONFIG_T::n_chan == 0) { - depthwise_product_resource_rf_gt_nchan_rem0(data, res, weights, biases); - } else { - depthwise_product_resource_rf_gt_nchan(data, res, weights, biases); - } -} - template void depthwise_mult_buffer(hls::stream data_window[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T &res_pack, hls::stream &res_stream, unsigned &outputs_ready, @@ -292,9 +78,9 @@ void depthwise_mult_buffer(hls::stream data_window[ #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { - depthwise_product_latency(data, res, weights, biases); + depthwise_product(data, res, weights, biases); } else { - depthwise_product_resource(data, res, weights, biases); + assert("Resource strategy for DepthwiseConv2D is not supported." && false); } CastLoop: @@ -416,11 +202,10 @@ void compute_depthwise_output_buffer_1d(const data_T &in_elem, hls::stream(kernel_data, res_out, + depthwise_product(kernel_data, res_out, weights, biases); } else { - depthwise_product_resource(kernel_data, res_out, - weights, biases); + assert("Resource strategy for DepthwiseConv1D is not supported." && false); } // Pack output @@ -482,11 +267,10 @@ void compute_depthwise_output_buffer_2d(const data_T &in_elem, // Dense multiply #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { - depthwise_product_latency(kernel_data, res_out, + depthwise_product(kernel_data, res_out, weights, biases); } else { - depthwise_product_resource(kernel_data, res_out, - weights, biases); + assert("Resource strategy for DepthwiseConv2D is not supported." 
&& false); } // Pack output @@ -519,4 +303,4 @@ void compute_depthwise_output_buffer_2d(const data_T &in_elem, } } // namespace nnet -#endif \ No newline at end of file +#endif From fa6bd665bc4c735285ff42619615943e67c27d40 Mon Sep 17 00:00:00 2001 From: steltze Date: Mon, 18 Nov 2024 11:30:22 +0100 Subject: [PATCH 075/103] Supress removing tar for now --- hls4ml/writer/vitis_accelerator_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/writer/vitis_accelerator_writer.py b/hls4ml/writer/vitis_accelerator_writer.py index 70573bb5c2..306de31bb8 100644 --- a/hls4ml/writer/vitis_accelerator_writer.py +++ b/hls4ml/writer/vitis_accelerator_writer.py @@ -378,7 +378,7 @@ def write_driver(self, model): ) def write_new_tar(self, model): - os.remove(model.config.get_output_dir() + '.tar.gz') + # os.remove(model.config.get_output_dir() + '.tar.gz') super().write_tar(model) def write_hls(self, model): From b42210d64cf66bd85e816403bb263bfcf7a9ce60 Mon Sep 17 00:00:00 2001 From: steltze Date: Mon, 18 Nov 2024 17:14:04 +0100 Subject: [PATCH 076/103] Fix csynth and cosim --- .../templates/vitis_accelerator/build_lib.sh | 4 + .../vivado/nnet_utils/nnet_helpers.h | 106 +++++++++++++----- hls4ml/writer/vitis_accelerator_writer.py | 15 ++- 3 files changed, 93 insertions(+), 32 deletions(-) diff --git a/hls4ml/templates/vitis_accelerator/build_lib.sh b/hls4ml/templates/vitis_accelerator/build_lib.sh index 69a2bace57..db929714cf 100644 --- a/hls4ml/templates/vitis_accelerator/build_lib.sh +++ b/hls4ml/templates/vitis_accelerator/build_lib.sh @@ -6,7 +6,11 @@ if [[ "$OSTYPE" == "linux-gnu" ]]; then elif [[ "$OSTYPE" == "darwin"* ]]; then CFLAGS="-O3 -fPIC -std=c++11" fi +VITIS_ACCELERATOR_FLAGS="VITIS_ACCELERATOR" +CFLAGS="$CFLAGS -D$VITIS_ACCELERATOR_FLAGS" + INCFLAGS="-Ifirmware/ap_types/" + PROJECT=myproject LIB_STAMP=mystamp diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h index 
3938af347c..88a6561f7d 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h @@ -2,7 +2,6 @@ #define NNET_HELPERS_H #include "hls_stream.h" -#include "ap_axi_sdata.h" #include #include #include @@ -12,6 +11,9 @@ #include #include +#ifdef VITIS_ACCELERATOR +#include "ap_axi_sdata.h" +#endif namespace nnet { #ifndef __SYNTHESIS__ @@ -162,20 +164,22 @@ template void convert_data(hls::stre } } -template void convert_data(srcType *src, hls::stream> &dst) { +#ifdef VITIS_ACCELERATOR +template void convert_data(srcType *src, hls::stream> &dst) { for (size_t i = 0; i < SIZE; i++) { - hls::axis ctype; + hls::axis ctype; ctype.data = dstType(src[i]); dst.write(ctype); } } -template void convert_data(hls::stream> &src, dstType *dst) { +template void convert_data(hls::stream> &src, dstType *dst) { for (size_t i = 0; i < SIZE; i++) { - hls::axis ctype = src.read(); + hls::axis ctype = src.read(); dst[i] = dstType(ctype.data); } } +#endif extern bool trace_enabled; extern std::map *trace_outputs; @@ -263,8 +267,6 @@ template void save_layer_output(hls::stream &data, const } } -#endif - template void copy_data(std::vector src, dst_T dst[SIZE]) { typename std::vector::const_iterator in_begin = src.cbegin() + OFFSET; typename std::vector::const_iterator in_end = in_begin + SIZE; @@ -287,16 +289,31 @@ void copy_data(std::vector src, hls::stream &dst) { } } -template void copy_data_axi(std::vector src, dst_T dst[SIZE]) { - for (auto i = 0; i < SIZE; i++) +// template void copy_data_axi(std::vector src, dst_T dst[SIZE]) { +// for (auto i = 0; i < SIZE; i++) { +// dst[i].data = src[i]; +// if (i == SIZE - 1) { +// dst[i].last = 1; +// } else { +// dst[i].last = 0; +// } +// } +// } + +// #ifdef VITIS_ACCELERATOR +template void copy_data_axi(std::vector src, hls::stream &dst) { + for (auto i = 0; i < SIZE; i++) { + dst_T pack; + pack.data = src[i]; if (i == SIZE - 1) { - dst[i].data = src[i]; - dst[i].last = 1; + pack.last 
= 1; } else { - dst[i].data = src[i]; - dst[i].last = 0; + pack.last = 0; } + dst.write(pack); + } } +// #endif template void print_result(res_T result[SIZE], std::ostream &out, bool keep = false) { for (int i = 0; i < SIZE; i++) { @@ -305,29 +322,65 @@ template void print_result(res_T result[SIZE], std::o out << std::endl; } -template void print_result(hls::stream &result, std::ostream &out, bool keep = false) { - for (int i = 0; i < SIZE / res_T::size; i++) { - res_T res_pack = result.read(); - for (int j = 0; j < res_T::size; j++) { - out << res_pack[j] << " "; - } - if (keep) - result.write(res_pack); +// template void print_result(hls::stream &result, std::ostream &out, bool keep = false) { +// for (int i = 0; i < SIZE / res_T::size; i++) { +// res_T res_pack = result.read(); +// for (int j = 0; j < res_T::size; j++) { +// out << res_pack[j] << " "; +// } +// if (keep) { +// result.write(res_pack); +// } +// } +// out << std::endl; +// } + +// #ifdef VITIS_ACCELERATOR +template void print_result(hls::stream &result, std::ostream &out, bool keep = false) { + for (int i = 0; i < SIZE / underlying_res_T::size; i++) { + res_T res_pack; + for (int j = 0; j < underlying_res_T::size; j++) { + res_pack = result.read(); + out << res_pack.data << " "; + if (keep) { + result.write(res_pack); + } + } } out << std::endl; } +// #endif template void fill_zero(data_T data[SIZE]) { std::fill_n(data, SIZE, 0.); } -template void fill_zero(hls::stream &data) { - for (int i = 0; i < SIZE / data_T::size; i++) { +// template void fill_zero(hls::stream &data) { +// for (int i = 0; i < SIZE / data_T::size; i++) { +// data_T data_pack; +// for (int j = 0; j < data_T::size; j++) { +// data_pack[j] = 0.; +// } +// data.write(data_pack); +// } +// } + +// #ifdef VITIS_ACCELERATOR +template void fill_zero(hls::stream &data) { + for (int i = 0; i < SIZE / underlying_data_T::size; i++) { data_T data_pack; - for (int j = 0; j < data_T::size; j++) { - data_pack[j] = 0.; + for (int j = 0; j < 
underlying_data_T::size; j++) { + data_pack.data = 0.; + if ((i==(SIZE / underlying_data_T::size-1)) && (j==(underlying_data_T::size-1))) { + data_pack.last = 1; + } + else { + data_pack.last = 0; + } + data.write(data_pack); } - data.write(data_pack); + } } +// #endif template int read_file_1D(const char *filename, dataType data[nrows]) { FILE *fp; @@ -386,6 +439,7 @@ template void hls_stream_debug(hls::stream &dat res << datareg; } } +#endif constexpr int ceillog2(int x) { return (x <= 2) ? 1 : 1 + ceillog2((x + 1) / 2); } diff --git a/hls4ml/writer/vitis_accelerator_writer.py b/hls4ml/writer/vitis_accelerator_writer.py index 306de31bb8..9019021fa2 100644 --- a/hls4ml/writer/vitis_accelerator_writer.py +++ b/hls4ml/writer/vitis_accelerator_writer.py @@ -40,7 +40,7 @@ def write_axi_wrapper(self, model): newline += f'static const unsigned N_IN = {inp.size()};\n' newline += f'static const unsigned N_OUT = {out.size()};\n' if self.vitis_accelerator_config.get_interface() == 'axi_stream': - newline += f'typedef hls::axis<{inp_axi_t}, 0, 0, 0> my_pkt;;\n' + newline += f'typedef hls::axis my_pkt;\n' else: # TODO: handle this case newline += f'typedef {inp_axi_t} input_axi_t;\n' newline += f'typedef {out_axi_t} output_axi_t;\n' @@ -277,20 +277,23 @@ def write_wrapper_test(self, model): newline = indent_amount + f'{model.config.get_project_name()}_axi(inputs,outputs);\n' elif inp.size_cpp() in line or inp.name in line or inp.type.name in line: newline = ( - line.replace(inp.size_cpp(), 'N_IN').replace(inp.name, 'inputs').replace(inp.type.name, 'hls::stream< my_pkt >') + line.replace(inp.size_cpp(), 'N_IN').replace(inp.name, 'inputs').replace(inp.type.name, 'my_pkt') ) elif out.size_cpp() in line or out.name in line or out.type.name in line: newline = ( - line.replace(out.size_cpp(), 'N_OUT').replace(out.name, 'outputs').replace(out.type.name, 'hls::stream< my_pkt >') + line.replace(out.size_cpp(), 'N_OUT').replace(out.name, 'outputs').replace(out.type.name, 'my_pkt') ) 
else: newline = line if self.vitis_accelerator_config.get_interface() == 'axi_stream': if 'nnet::fill_zero' in line: - indent = line.split('n')[0] - newline = indent + 'inputs[N_IN-1].last = 1;\n' + newline = newline.replace("nnet::fill_zero<", f"nnet::fill_zero<{inp.type.name}, ") + # indent = line.split('n')[0] + # newline = indent + indent + 'inputs[N_IN-1].last = 1;\n' if 'copy_data' in line: - newline = newline.replace('copy_data', 'copy_data_axi') + newline = newline.replace('copy_data', 'copy_data_axi').replace("0,", "") + if 'print_result' in line: + newline = newline.replace("print_result<", f"print_result<{out.type.name}, ") fout.write(newline) f.close() From 1303bbaf67f3f756762fc49370602cb3d30f0a6e Mon Sep 17 00:00:00 2001 From: steltze Date: Mon, 18 Nov 2024 17:35:21 +0100 Subject: [PATCH 077/103] Fix tcl script to find cosim report --- hls4ml/templates/vivado/build_prj.tcl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/templates/vivado/build_prj.tcl b/hls4ml/templates/vivado/build_prj.tcl index af37b0f4aa..5714f05f1a 100644 --- a/hls4ml/templates/vivado/build_prj.tcl +++ b/hls4ml/templates/vivado/build_prj.tcl @@ -206,7 +206,7 @@ if {$opt(cosim)} { set time_end [clock clicks -milliseconds] puts "INFO:" - if {[string equal "$backend" "vivadoaccelerator"]} { + if {[string equal "$backend" "vivadoaccelerator"] || [string equal $backend "vitisaccelerator"]} { puts [read [open ${project_name}_prj/solution1/sim/report/${project_name}_axi_cosim.rpt r]] } else { puts [read [open ${project_name}_prj/solution1/sim/report/${project_name}_cosim.rpt r]] From 8d3a1f27a1db1e8fd7ba231621e687041c768f17 Mon Sep 17 00:00:00 2001 From: steltze Date: Tue, 19 Nov 2024 00:35:39 +0100 Subject: [PATCH 078/103] Correct PYNQ Z2 vivado tcl script, bitstream generated --- .../pynq-z2/tcl_scripts/axi_stream_design.tcl | 50 +++++++++++++------ 1 file changed, 36 insertions(+), 14 deletions(-) diff --git 
a/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl index aa06e8a6d2..ecdfb2ac4b 100644 --- a/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl +++ b/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl @@ -4,7 +4,7 @@ source [file join $tcldir project.tcl] create_project project_1 ${project_name}_vitis_accelerator -part xc7z020clg400-1 -force -set_property board_part tul.com.tw:pynq-z2:part0:1.0 [current_project] +# set_property board_part tul.com.tw:pynq-z2:part0:1.0 [current_project] set_property ip_repo_paths ${project_name}_prj [current_project] update_ip_catalog @@ -17,23 +17,29 @@ endgroup apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" } [get_bd_cells processing_system7_0] startgroup -set_property -dict [list CONFIG.PCW_USE_S_AXI_HP0 {1}] [get_bd_cells processing_system7_0] +set_property -dict [list \ + CONFIG.PCW_USE_S_AXI_HP0 {1} \ + CONFIG.PCW_USE_S_AXI_HP2 {1} \ +] [get_bd_cells processing_system7_0] +# set_property -dict [list CONFIG.PCW_USE_S_AXI_HP0 {1}] [get_bd_cells processing_system7_0] endgroup startgroup create_bd_cell -type ip -vlnv xilinx.com:ip:axi_dma:7.1 axi_dma_0 endgroup -set_property -dict [list CONFIG.c_s_axis_s2mm_tdata_width.VALUE_SRC USER] [get_bd_cells axi_dma_0] -set_property -dict [list CONFIG.c_include_sg {0} CONFIG.c_sg_length_width {26} CONFIG.c_sg_include_stscntrl_strm {0} CONFIG.c_m_axi_mm2s_data_width ${bit_width_hls_input} CONFIG.c_m_axis_mm2s_tdata_width ${bit_width_hls_input} CONFIG.c_mm2s_burst_size {256} CONFIG.c_s_axis_s2mm_tdata_width ${bit_width_hls_output} CONFIG.c_s_axis_s2mm_data_width ${bit_width_hls_output} CONFIG.c_s2mm_burst_size {256}] [get_bd_cells axi_dma_0] +set_property -dict [list CONFIG.c_m_axi_s2mm_data_width.VALUE_SRC USER] [get_bd_cells 
axi_dma_0] +set_property -dict [list \ + CONFIG.c_include_sg {0} \ + CONFIG.c_m_axi_mm2s_data_width {64} \ + CONFIG.c_m_axi_s2mm_data_width {64} \ + CONFIG.c_mm2s_burst_size {32} \ + CONFIG.c_sg_length_width {26} \ +] [get_bd_cells axi_dma_0] -startgroup -apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/processing_system7_0/M_AXI_GP0} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins axi_dma_0/S_AXI_LITE] - -apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/processing_system7_0/S_AXI_HP0} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins processing_system7_0/S_AXI_HP0] -endgroup - -apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/processing_system7_0/FCLK_CLK0 (100 MHz)} Clk_xbar {/processing_system7_0/FCLK_CLK0 (100 MHz)} Master {/axi_dma_0/M_AXI_S2MM} Slave {/processing_system7_0/S_AXI_HP0} ddr_seg {Auto} intc_ip {/axi_mem_intercon} master_apm {0}} [get_bd_intf_pins axi_dma_0/M_AXI_S2MM] +# set_property -dict [list CONFIG.c_s_axis_s2mm_tdata_width.VALUE_SRC USER] [get_bd_cells axi_dma_0] +# set_property -dict [list CONFIG.c_include_sg {0} CONFIG.c_sg_length_width {26} CONFIG.c_sg_include_stscntrl_strm {0} CONFIG.c_m_axi_mm2s_data_width ${bit_width_hls_input} CONFIG.c_m_axis_mm2s_tdata_width ${bit_width_hls_input} CONFIG.c_mm2s_burst_size {256} CONFIG.c_s_axis_s2mm_tdata_width ${bit_width_hls_output} CONFIG.c_s_axis_s2mm_data_width ${bit_width_hls_output} CONFIG.c_s2mm_burst_size {256}] [get_bd_cells axi_dma_0] +# startgroup create_bd_cell -type ip -vlnv xilinx.com:hls:${project_name}_axi:1.0 ${project_name}_axi_0 @@ -42,9 +48,23 @@ endgroup connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] [get_bd_intf_pins ${project_name}_axi_0/in_r] connect_bd_intf_net 
[get_bd_intf_pins ${project_name}_axi_0/out_r] [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM] -apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/processing_system7_0/FCLK_CLK0 (100 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins ${project_name}_axi_0/ap_clk] +startgroup +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/processing_system7_0/M_AXI_GP0} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins axi_dma_0/S_AXI_LITE] +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/processing_system7_0/S_AXI_HP0} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins processing_system7_0/S_AXI_HP0] +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_S2MM} Slave {/processing_system7_0/S_AXI_HP2} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins processing_system7_0/S_AXI_HP2] +apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/processing_system7_0/FCLK_CLK0 (50 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins myproject_axi_0/ap_clk] +endgroup + + +# apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/processing_system7_0/FCLK_CLK0 (100 MHz)} Clk_xbar {/processing_system7_0/FCLK_CLK0 (100 MHz)} Master {/axi_dma_0/M_AXI_S2MM} Slave {/processing_system7_0/S_AXI_HP0} ddr_seg {Auto} intc_ip {/axi_mem_intercon} master_apm {0}} [get_bd_intf_pins axi_dma_0/M_AXI_S2MM] + +#todo: make clock a variable +# apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/processing_system7_0/FCLK_CLK0 (100 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins ${project_name}_axi_0/ap_clk] +validate_bd_design 
+ +# group_bd_cells hier_0 [get_bd_cells axi_dma_0] [get_bd_cells ${project_name}_axi_0] -group_bd_cells hier_0 [get_bd_cells axi_dma_0] [get_bd_cells ${project_name}_axi_0] +open_bd_design {./${project_name}_vitis_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd} make_wrapper -files [get_files ./${project_name}_vitis_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top @@ -52,8 +72,10 @@ add_files -norecurse ./${project_name}_vitis_accelerator/project_1.srcs/sources_ reset_run impl_1 reset_run synth_1 -launch_runs impl_1 -to_step write_bitstream -jobs 6 +#todo: make number of jobs a variable +launch_runs impl_1 -to_step write_bitstream -jobs 18 wait_on_run -timeout 360 impl_1 +# open_run impl_1 report_utilization -file util.rpt -hierarchical -hierarchical_percentages From a8e04978bb1ddbf681257ba4322b16f4866c2987 Mon Sep 17 00:00:00 2001 From: steltze Date: Tue, 19 Nov 2024 10:32:24 +0100 Subject: [PATCH 079/103] Clean pynq tcl script --- .../pynq-z2/tcl_scripts/axi_stream_design.tcl | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl index ecdfb2ac4b..c481995dae 100644 --- a/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl +++ b/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl @@ -21,7 +21,6 @@ set_property -dict [list \ CONFIG.PCW_USE_S_AXI_HP0 {1} \ CONFIG.PCW_USE_S_AXI_HP2 {1} \ ] [get_bd_cells processing_system7_0] -# set_property -dict [list CONFIG.PCW_USE_S_AXI_HP0 {1}] [get_bd_cells processing_system7_0] endgroup startgroup @@ -37,10 +36,6 @@ set_property -dict [list \ CONFIG.c_sg_length_width {26} \ ] [get_bd_cells axi_dma_0] -# set_property -dict [list CONFIG.c_s_axis_s2mm_tdata_width.VALUE_SRC USER] [get_bd_cells axi_dma_0] -# set_property -dict [list CONFIG.c_include_sg {0} 
CONFIG.c_sg_length_width {26} CONFIG.c_sg_include_stscntrl_strm {0} CONFIG.c_m_axi_mm2s_data_width ${bit_width_hls_input} CONFIG.c_m_axis_mm2s_tdata_width ${bit_width_hls_input} CONFIG.c_mm2s_burst_size {256} CONFIG.c_s_axis_s2mm_tdata_width ${bit_width_hls_output} CONFIG.c_s_axis_s2mm_data_width ${bit_width_hls_output} CONFIG.c_s2mm_burst_size {256}] [get_bd_cells axi_dma_0] -# - startgroup create_bd_cell -type ip -vlnv xilinx.com:hls:${project_name}_axi:1.0 ${project_name}_axi_0 endgroup @@ -48,6 +43,7 @@ endgroup connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] [get_bd_intf_pins ${project_name}_axi_0/in_r] connect_bd_intf_net [get_bd_intf_pins ${project_name}_axi_0/out_r] [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM] +#todo: make clock a variable startgroup apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/processing_system7_0/M_AXI_GP0} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins axi_dma_0/S_AXI_LITE] apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/processing_system7_0/S_AXI_HP0} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins processing_system7_0/S_AXI_HP0] @@ -55,15 +51,8 @@ apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Cl apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/processing_system7_0/FCLK_CLK0 (50 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins myproject_axi_0/ap_clk] endgroup - -# apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/processing_system7_0/FCLK_CLK0 (100 MHz)} Clk_xbar {/processing_system7_0/FCLK_CLK0 (100 MHz)} Master {/axi_dma_0/M_AXI_S2MM} Slave {/processing_system7_0/S_AXI_HP0} ddr_seg {Auto} intc_ip {/axi_mem_intercon} master_apm {0}} [get_bd_intf_pins 
axi_dma_0/M_AXI_S2MM] - -#todo: make clock a variable -# apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/processing_system7_0/FCLK_CLK0 (100 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins ${project_name}_axi_0/ap_clk] validate_bd_design -# group_bd_cells hier_0 [get_bd_cells axi_dma_0] [get_bd_cells ${project_name}_axi_0] - open_bd_design {./${project_name}_vitis_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd} make_wrapper -files [get_files ./${project_name}_vitis_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top @@ -73,9 +62,8 @@ add_files -norecurse ./${project_name}_vitis_accelerator/project_1.srcs/sources_ reset_run impl_1 reset_run synth_1 #todo: make number of jobs a variable -launch_runs impl_1 -to_step write_bitstream -jobs 18 +launch_runs impl_1 -to_step write_bitstream -jobs 10 wait_on_run -timeout 360 impl_1 -# open_run impl_1 report_utilization -file util.rpt -hierarchical -hierarchical_percentages From 48686d33ed5d37ee9208eab8c361581b467bae6d Mon Sep 17 00:00:00 2001 From: steltze Date: Tue, 19 Nov 2024 12:14:49 +0100 Subject: [PATCH 080/103] Fix compatibility of nnet helper functions with vitis axis --- .../vivado/nnet_utils/nnet_helpers.h | 78 +++++++++---------- 1 file changed, 38 insertions(+), 40 deletions(-) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h index 88a6561f7d..9949ee7d80 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h @@ -11,6 +11,8 @@ #include #include +// this header cannot be included by Vivado HLS +// "VITIS_ACCELERATOR" is defined on the build_lib.sh of the Vitis Accelerator backend files #ifdef VITIS_ACCELERATOR #include "ap_axi_sdata.h" #endif @@ -289,18 +291,17 @@ void copy_data(std::vector src, hls::stream &dst) { } } -// template void copy_data_axi(std::vector src, dst_T dst[SIZE]) { -// for (auto i = 0; i < SIZE; i++) 
{ -// dst[i].data = src[i]; -// if (i == SIZE - 1) { -// dst[i].last = 1; -// } else { -// dst[i].last = 0; -// } -// } -// } - -// #ifdef VITIS_ACCELERATOR +template void copy_data_axi(std::vector src, dst_T dst[SIZE]) { + for (auto i = 0; i < SIZE; i++) { + dst[i].data = src[i]; + if (i == SIZE - 1) { + dst[i].last = 1; + } else { + dst[i].last = 0; + } + } +} + template void copy_data_axi(std::vector src, hls::stream &dst) { for (auto i = 0; i < SIZE; i++) { dst_T pack; @@ -313,7 +314,6 @@ template void copy_data_axi(std::vector< dst.write(pack); } } -// #endif template void print_result(res_T result[SIZE], std::ostream &out, bool keep = false) { for (int i = 0; i < SIZE; i++) { @@ -322,20 +322,20 @@ template void print_result(res_T result[SIZE], std::o out << std::endl; } -// template void print_result(hls::stream &result, std::ostream &out, bool keep = false) { -// for (int i = 0; i < SIZE / res_T::size; i++) { -// res_T res_pack = result.read(); -// for (int j = 0; j < res_T::size; j++) { -// out << res_pack[j] << " "; -// } -// if (keep) { -// result.write(res_pack); -// } -// } -// out << std::endl; -// } - -// #ifdef VITIS_ACCELERATOR +template void print_result(hls::stream &result, std::ostream &out, bool keep = false) { + for (int i = 0; i < SIZE / res_T::size; i++) { + res_T res_pack = result.read(); + for (int j = 0; j < res_T::size; j++) { + out << res_pack[j] << " "; + } + if (keep) { + result.write(res_pack); + } + } + out << std::endl; +} + +// compatible with Vitis Accelerator for res_T = hls::axis template void print_result(hls::stream &result, std::ostream &out, bool keep = false) { for (int i = 0; i < SIZE / underlying_res_T::size; i++) { res_T res_pack; @@ -349,21 +349,20 @@ template void print_result(hl } out << std::endl; } -// #endif template void fill_zero(data_T data[SIZE]) { std::fill_n(data, SIZE, 0.); } -// template void fill_zero(hls::stream &data) { -// for (int i = 0; i < SIZE / data_T::size; i++) { -// data_T data_pack; -// for 
(int j = 0; j < data_T::size; j++) { -// data_pack[j] = 0.; -// } -// data.write(data_pack); -// } -// } - -// #ifdef VITIS_ACCELERATOR +template void fill_zero(hls::stream &data) { + for (int i = 0; i < SIZE / data_T::size; i++) { + data_T data_pack; + for (int j = 0; j < data_T::size; j++) { + data_pack[j] = 0.; + } + data.write(data_pack); + } +} + +// compatible with Vitis Accelerator for res_T = hls::axis template void fill_zero(hls::stream &data) { for (int i = 0; i < SIZE / underlying_data_T::size; i++) { data_T data_pack; @@ -380,7 +379,6 @@ template void fill_zero(hls } } -// #endif template int read_file_1D(const char *filename, dataType data[nrows]) { FILE *fp; From bae450b04b0cd6517d7044b6e2466d6e9949567d Mon Sep 17 00:00:00 2001 From: steltze Date: Tue, 19 Nov 2024 15:04:25 +0100 Subject: [PATCH 081/103] Setup vivado tcl script for zcu102 --- .../pynq-z2/tcl_scripts/axi_lite_design.tcl | 26 ------------- .../zcu102/tcl_scripts/axi_stream_design.tcl | 37 +++++++++++-------- 2 files changed, 22 insertions(+), 41 deletions(-) delete mode 100644 hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_lite_design.tcl diff --git a/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_lite_design.tcl b/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_lite_design.tcl deleted file mode 100644 index 4d23da26cc..0000000000 --- a/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_lite_design.tcl +++ /dev/null @@ -1,26 +0,0 @@ -set tcldir [file dirname [info script]] -source [file join $tcldir project.tcl] - -create_project project_1 ${project_name}_vitis_accelerator -part xc7z020clg400-1 -force - -set_property board_part tul.com.tw:pynq-z2:part0:1.0 [current_project] -set_property ip_repo_paths ${project_name}_prj [current_project] -update_ip_catalog - -# Create Block Designer design -create_bd_design "design_1" -create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 processing_system7_0 -apply_bd_automation -rule 
xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" } [get_bd_cells processing_system7_0] -create_bd_cell -type ip -vlnv xilinx.com:hls:${project_name}_axi:1.0 ${project_name}_axi_0 -apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/processing_system7_0/M_AXI_GP0} Slave {/${project_name}_axi_0/s_axi_AXILiteS} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins ${project_name}_axi_0/s_axi_AXILiteS] - -make_wrapper -files [get_files ./${project_name}_vitis_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top -add_files -norecurse ./${project_name}_vitis_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v - -reset_run impl_1 -reset_run synth_1 -launch_runs impl_1 -to_step write_bitstream -jobs 6 -wait_on_run -timeout 360 impl_1 - -open_run impl_1 -report_utilization -file util.rpt -hierarchical -hierarchical_percentages diff --git a/hls4ml/templates/vitis_accelerator/zcu102/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vitis_accelerator/zcu102/tcl_scripts/axi_stream_design.tcl index 5d886c6f25..103fec0178 100644 --- a/hls4ml/templates/vitis_accelerator/zcu102/tcl_scripts/axi_stream_design.tcl +++ b/hls4ml/templates/vitis_accelerator/zcu102/tcl_scripts/axi_stream_design.tcl @@ -13,37 +13,44 @@ set_property ip_repo_paths ${project_name}_prj/solution1/impl/ip [current_proje update_ip_catalog startgroup -create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.3 zynq_ultra_ps_e_0 +create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.5 zynq_ultra_ps_e_1 endgroup -apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" } [get_bd_cells zynq_ultra_ps_e_0] +apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" } [get_bd_cells zynq_ultra_ps_e_1] -set_property -dict [list 
CONFIG.PSU__USE__S_AXI_GP0 {1} CONFIG.PSU__SAXIGP0__DATA_WIDTH {32}] [get_bd_cells zynq_ultra_ps_e_0] +set_property -dict [list \ + CONFIG.PSU__SAXIGP2__DATA_WIDTH {64} \ + CONFIG.PSU__SAXIGP4__DATA_WIDTH {64} \ + CONFIG.PSU__USE__S_AXI_GP2 {1} \ + CONFIG.PSU__USE__S_AXI_GP4 {1} \ +] [get_bd_cells zynq_ultra_ps_e_1] startgroup create_bd_cell -type ip -vlnv xilinx.com:ip:axi_dma:7.1 axi_dma_0 endgroup -set_property -dict [list CONFIG.c_m_axi_s2mm_data_width.VALUE_SRC USER CONFIG.c_s_axis_s2mm_tdata_width.VALUE_SRC USER] [get_bd_cells axi_dma_0] -set_property -dict [list CONFIG.c_include_sg {0} CONFIG.c_sg_length_width {26} CONFIG.c_sg_include_stscntrl_strm {0} CONFIG.c_m_axi_mm2s_data_width ${bit_width_hls_input} CONFIG.c_m_axis_mm2s_tdata_width ${bit_width_hls_input} CONFIG.c_mm2s_burst_size {256} CONFIG.c_m_axi_s2mm_data_width ${bit_width_hls_output} CONFIG.c_s_axis_s2mm_tdata_width ${bit_width_hls_output} CONFIG.c_s2mm_burst_size {256}] [get_bd_cells axi_dma_0] -startgroup -apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/zynq_ultra_ps_e_0/M_AXI_HPM0_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins axi_dma_0/S_AXI_LITE] -apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/zynq_ultra_ps_e_0/S_AXI_HPC0_FPD} ddr_seg {Auto} intc_ip {New AXI SmartConnect} master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_0/S_AXI_HPC0_FPD] -endgroup +set_property -dict [list CONFIG.c_m_axi_s2mm_data_width.VALUE_SRC USER] [get_bd_cells axi_dma_0] +set_property -dict [list \ + CONFIG.c_include_sg {0} \ + CONFIG.c_m_axi_mm2s_data_width {64} \ + CONFIG.c_m_axi_s2mm_data_width {64} \ + CONFIG.c_mm2s_burst_size {32} \ + CONFIG.c_sg_length_width {26} \ +] [get_bd_cells axi_dma_0] -startgroup -apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master 
{Auto} Clk_slave {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Clk_xbar {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Master {/axi_dma_0/M_AXI_S2MM} Slave {/zynq_ultra_ps_e_0/S_AXI_HPC0_FPD} ddr_seg {Auto} intc_ip {/axi_smc} master_apm {0}} [get_bd_intf_pins axi_dma_0/M_AXI_S2MM] -apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Clk_xbar {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Master {/zynq_ultra_ps_e_0/M_AXI_HPM1_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {/ps8_0_axi_periph} master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_0/M_AXI_HPM1_FPD] -endgroup +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/zynq_ultra_ps_e_1/M_AXI_HPM0_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins axi_dma_0/S_AXI_LITE] +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/zynq_ultra_ps_e_1/S_AXI_HP0_FPD} ddr_seg {Auto} intc_ip {New AXI SmartConnect} master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_1/S_AXI_HP0_FPD] +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_S2MM} Slave {/zynq_ultra_ps_e_1/S_AXI_HP2_FPD} ddr_seg {Auto} intc_ip {New AXI SmartConnect} master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_1/S_AXI_HP2_FPD] +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/zynq_ultra_ps_e_1/pl_clk0 (99 MHz)} Clk_xbar {/zynq_ultra_ps_e_1/pl_clk0 (99 MHz)} Master {/zynq_ultra_ps_e_1/M_AXI_HPM1_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {/ps8_0_axi_periph} master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_1/M_AXI_HPM1_FPD] startgroup create_bd_cell -type ip -vlnv xilinx.com:hls:${project_name}_axi:1.0 ${project_name}_axi_0 endgroup + 
connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] [get_bd_intf_pins ${project_name}_axi_0/in_r] connect_bd_intf_net [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM] [get_bd_intf_pins ${project_name}_axi_0/out_r] -apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins ${project_name}_axi_0/ap_clk] -group_bd_cells hier_0 [get_bd_cells axi_dma_0] [get_bd_cells ${project_name}_axi_0] +apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ultra_ps_e_1/pl_clk0 (99 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins ${project_name}_axi_0/ap_clk] make_wrapper -files [get_files ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top From dde91243bb56815671207aa69ea00c5cee5c608b Mon Sep 17 00:00:00 2001 From: steltze Date: Tue, 19 Nov 2024 15:47:19 +0100 Subject: [PATCH 082/103] Rename backend to VitisAcceleratorIPFLow to prevent conflicts with kernel flow-versal --- hls4ml/backends/__init__.py | 6 +- .../__init__.py | 0 .../passes/__init__.py | 0 .../passes/fifo_depth_optimization.py | 0 .../supported_boards.json | 0 .../vitis_accelerator_ip_flow_backend.py} | 12 ++-- .../vitis_accelerator_ip_flow_config.py} | 16 ++--- .../build_lib.sh | 2 +- .../myproject_axi.cpp | 0 .../myproject_axi.h | 0 .../python_drivers/axi_stream_driver.py | 0 .../pynq-z2/tcl_scripts/axi_stream_design.tcl | 8 +-- .../python_drivers/axi_stream_driver.py | 0 .../zcu102/tcl_scripts/axi_stream_design.tcl | 0 hls4ml/templates/vivado/build_prj.tcl | 2 +- .../vivado/nnet_utils/nnet_helpers.h | 6 +- hls4ml/writer/__init__.py | 4 +- ...py => vitis_accelerator_ip_flow_writer.py} | 60 +++++++++---------- 18 files changed, 58 insertions(+), 58 deletions(-) rename hls4ml/backends/{vitis_accelerator => vitis_accelerator_ip_flow}/__init__.py (100%) rename hls4ml/backends/{vitis_accelerator => 
vitis_accelerator_ip_flow}/passes/__init__.py (100%) rename hls4ml/backends/{vitis_accelerator => vitis_accelerator_ip_flow}/passes/fifo_depth_optimization.py (100%) rename hls4ml/backends/{vitis_accelerator => vitis_accelerator_ip_flow}/supported_boards.json (100%) rename hls4ml/backends/{vitis_accelerator/vitis_accelerator_backend.py => vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py} (95%) rename hls4ml/backends/{vitis_accelerator/vitis_accelerator_config.py => vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_config.py} (90%) rename hls4ml/templates/{vitis_accelerator => vitis_accelerator_ip_flow}/build_lib.sh (92%) rename hls4ml/templates/{vitis_accelerator => vitis_accelerator_ip_flow}/myproject_axi.cpp (100%) rename hls4ml/templates/{vitis_accelerator => vitis_accelerator_ip_flow}/myproject_axi.h (100%) rename hls4ml/templates/{vitis_accelerator => vitis_accelerator_ip_flow}/pynq-z2/python_drivers/axi_stream_driver.py (100%) rename hls4ml/templates/{vitis_accelerator => vitis_accelerator_ip_flow}/pynq-z2/tcl_scripts/axi_stream_design.tcl (88%) rename hls4ml/templates/{vitis_accelerator => vitis_accelerator_ip_flow}/zcu102/python_drivers/axi_stream_driver.py (100%) rename hls4ml/templates/{vitis_accelerator => vitis_accelerator_ip_flow}/zcu102/tcl_scripts/axi_stream_design.tcl (100%) rename hls4ml/writer/{vitis_accelerator_writer.py => vitis_accelerator_ip_flow_writer.py} (89%) diff --git a/hls4ml/backends/__init__.py b/hls4ml/backends/__init__.py index f1eebd3c1f..1f60bdb449 100644 --- a/hls4ml/backends/__init__.py +++ b/hls4ml/backends/__init__.py @@ -10,13 +10,13 @@ from hls4ml.backends.catapult.catapult_backend import CatapultBackend # isort: skip from hls4ml.backends.vitis.vitis_backend import VitisBackend # isort: skip -from hls4ml.backends.vitis_accelerator.vitis_accelerator_backend import VitisAcceleratorBackend # isort: skip -from hls4ml.backends.vitis_accelerator.vitis_accelerator_config import VitisAcceleratorConfig # noqa: F401 
+from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_backend import VitisAcceleratorIPFlowBackend # isort: skip +from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_config import VitisAcceleratorIPFlowConfig # noqa: F401 register_backend('Vivado', VivadoBackend) register_backend('VivadoAccelerator', VivadoAcceleratorBackend) register_backend('Vitis', VitisBackend) -register_backend('VitisAccelerator', VitisAcceleratorBackend) +register_backend('VitisAcceleratorIPFlow', VitisAcceleratorIPFlowBackend) register_backend('Quartus', QuartusBackend) register_backend('Catapult', CatapultBackend) register_backend('SymbolicExpression', SymbolicExpressionBackend) diff --git a/hls4ml/backends/vitis_accelerator/__init__.py b/hls4ml/backends/vitis_accelerator_ip_flow/__init__.py similarity index 100% rename from hls4ml/backends/vitis_accelerator/__init__.py rename to hls4ml/backends/vitis_accelerator_ip_flow/__init__.py diff --git a/hls4ml/backends/vitis_accelerator/passes/__init__.py b/hls4ml/backends/vitis_accelerator_ip_flow/passes/__init__.py similarity index 100% rename from hls4ml/backends/vitis_accelerator/passes/__init__.py rename to hls4ml/backends/vitis_accelerator_ip_flow/passes/__init__.py diff --git a/hls4ml/backends/vitis_accelerator/passes/fifo_depth_optimization.py b/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py similarity index 100% rename from hls4ml/backends/vitis_accelerator/passes/fifo_depth_optimization.py rename to hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py diff --git a/hls4ml/backends/vitis_accelerator/supported_boards.json b/hls4ml/backends/vitis_accelerator_ip_flow/supported_boards.json similarity index 100% rename from hls4ml/backends/vitis_accelerator/supported_boards.json rename to hls4ml/backends/vitis_accelerator_ip_flow/supported_boards.json diff --git a/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py 
b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py similarity index 95% rename from hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py rename to hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py index 2e3de9a1cd..6ade53b39d 100644 --- a/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py +++ b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py @@ -5,9 +5,9 @@ from hls4ml.report import parse_vivado_report -class VitisAcceleratorBackend(VitisBackend): +class VitisAcceleratorIPFlowBackend(VitisBackend): def __init__(self): - super(VivadoBackend, self).__init__(name='VitisAccelerator') + super(VivadoBackend, self).__init__(name='VitisAcceleratorIPFlow') self._register_layer_attributes() self._register_flows() @@ -21,7 +21,7 @@ def build( validation=False, export=False, vsynth=False, - fifo_opt=False, + # fifo_opt=False, bitfile=False, ): # run the VitisBackend build @@ -37,9 +37,9 @@ def build( # fifo_opt=fifo_opt, ) # Get Config to view Board and Platform - from hls4ml.backends import VitisAcceleratorConfig + from hls4ml.backends import VitisAcceleratorIPFlowConfig - vitis_accelerator_config = VitisAcceleratorConfig( + vitis_accelerator_ip_flow_config = VitisAcceleratorIPFlowConfig( model.config, model.get_input_variables(), model.get_output_variables() ) # now make a bitfile @@ -154,7 +154,7 @@ def get_writer_flow(self): def _register_flows(self): vivado_ip = 'vivado:ip' - writer_passes = ['make_stamp', 'vitisaccelerator:write_hls'] + writer_passes = ['make_stamp', 'vitisacceleratoripflow:write_hls'] self._writer_flow = register_flow('write', writer_passes, requires=[vivado_ip], backend=self.name) self._default_flow = vivado_ip diff --git a/hls4ml/backends/vitis_accelerator/vitis_accelerator_config.py b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_config.py similarity index 90% rename from 
hls4ml/backends/vitis_accelerator/vitis_accelerator_config.py rename to hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_config.py index b0bf4e894b..d00e54a284 100644 --- a/hls4ml/backends/vitis_accelerator/vitis_accelerator_config.py +++ b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_config.py @@ -6,7 +6,7 @@ from hls4ml.model.layers import FixedPrecisionType, IntegerPrecisionType -class VitisAcceleratorConfig: +class VitisAcceleratorIPFlowConfig: def __init__(self, config, model_inputs, model_outputs): self.config = config.config self.board = self.config.get('AcceleratorConfig', {}).get('Board', 'pynq-z2') @@ -54,10 +54,10 @@ def __init__(self, config, model_inputs, model_outputs): assert ( len(model_inputs) == 1 - ), "Only models with one input tensor are currently supported by VitisAcceleratorBackend" + ), "Only models with one input tensor are currently supported by VitisAcceleratorIPFlowBackend" assert ( len(model_outputs) == 1 - ), "Only models with one output tensor are currently supported by VitisAcceleratorBackend" + ), "Only models with one output tensor are currently supported by VitisAcceleratorIPFlowBackend" self.inp = model_inputs[0] self.out = model_outputs[0] inp_axi_t = self.input_type @@ -131,16 +131,16 @@ def get_clock_period(self): def get_driver_path(self): if self.board.startswith('alveo'): - return '../templates/vitis_accelerator/' + 'alveo/' + self.driver + '_drivers/' + self.get_driver_file() + return '../templates/vitis_accelerator_ip_flow/' + 'alveo/' + self.driver + '_drivers/' + self.get_driver_file() else: - return '../templates/vitis_accelerator/' + self.board + '/' + self.driver + '_drivers/' + self.get_driver_file() + return '../templates/vitis_accelerator_ip_flow/' + self.board + '/' + self.driver + '_drivers/' + self.get_driver_file() def get_driver_file(self): driver_ext = '.py' if self.driver == 'python' else '.h' return self.interface + '_driver' + driver_ext def 
get_krnl_rtl_src_dir(self): - return '../templates/vitis_accelerator/' + 'alveo/' + '/krnl_rtl_src' + return '../templates/vitis_accelerator_ip_flow/' + 'alveo/' + '/krnl_rtl_src' def get_input_type(self): return self.input_type @@ -157,6 +157,6 @@ def get_tcl_file_path(self): if tcl_script is None: raise Exception('No tcl script definition available for the desired interface in supported_board.json') if self.board.startswith('alveo'): - return '../templates/vitis_accelerator/' + 'alveo/' + '/tcl_scripts/' + tcl_script + return '../templates/vitis_accelerator_ip_flow/' + 'alveo/' + '/tcl_scripts/' + tcl_script else: - return '../templates/vitis_accelerator/' + self.board + '/tcl_scripts/' + tcl_script + return '../templates/vitis_accelerator_ip_flow/' + self.board + '/tcl_scripts/' + tcl_script diff --git a/hls4ml/templates/vitis_accelerator/build_lib.sh b/hls4ml/templates/vitis_accelerator_ip_flow/build_lib.sh similarity index 92% rename from hls4ml/templates/vitis_accelerator/build_lib.sh rename to hls4ml/templates/vitis_accelerator_ip_flow/build_lib.sh index db929714cf..262ce00d63 100644 --- a/hls4ml/templates/vitis_accelerator/build_lib.sh +++ b/hls4ml/templates/vitis_accelerator_ip_flow/build_lib.sh @@ -6,7 +6,7 @@ if [[ "$OSTYPE" == "linux-gnu" ]]; then elif [[ "$OSTYPE" == "darwin"* ]]; then CFLAGS="-O3 -fPIC -std=c++11" fi -VITIS_ACCELERATOR_FLAGS="VITIS_ACCELERATOR" +VITIS_ACCELERATOR_FLAGS="VITIS_ACCELERATOR_IP_FLOW" CFLAGS="$CFLAGS -D$VITIS_ACCELERATOR_FLAGS" INCFLAGS="-Ifirmware/ap_types/" diff --git a/hls4ml/templates/vitis_accelerator/myproject_axi.cpp b/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.cpp similarity index 100% rename from hls4ml/templates/vitis_accelerator/myproject_axi.cpp rename to hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.cpp diff --git a/hls4ml/templates/vitis_accelerator/myproject_axi.h b/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.h similarity index 100% rename from 
hls4ml/templates/vitis_accelerator/myproject_axi.h rename to hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.h diff --git a/hls4ml/templates/vitis_accelerator/pynq-z2/python_drivers/axi_stream_driver.py b/hls4ml/templates/vitis_accelerator_ip_flow/pynq-z2/python_drivers/axi_stream_driver.py similarity index 100% rename from hls4ml/templates/vitis_accelerator/pynq-z2/python_drivers/axi_stream_driver.py rename to hls4ml/templates/vitis_accelerator_ip_flow/pynq-z2/python_drivers/axi_stream_driver.py diff --git a/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vitis_accelerator_ip_flow/pynq-z2/tcl_scripts/axi_stream_design.tcl similarity index 88% rename from hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl rename to hls4ml/templates/vitis_accelerator_ip_flow/pynq-z2/tcl_scripts/axi_stream_design.tcl index c481995dae..e8db1e6782 100644 --- a/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl +++ b/hls4ml/templates/vitis_accelerator_ip_flow/pynq-z2/tcl_scripts/axi_stream_design.tcl @@ -2,7 +2,7 @@ set tcldir [file dirname [info script]] source [file join $tcldir project.tcl] -create_project project_1 ${project_name}_vitis_accelerator -part xc7z020clg400-1 -force +create_project project_1 ${project_name}_vitis_accelerator_ip_flow -part xc7z020clg400-1 -force # set_property board_part tul.com.tw:pynq-z2:part0:1.0 [current_project] set_property ip_repo_paths ${project_name}_prj [current_project] @@ -53,11 +53,11 @@ endgroup validate_bd_design -open_bd_design {./${project_name}_vitis_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd} +open_bd_design {./${project_name}_vitis_accelerator_ip_flow/project_1.srcs/sources_1/bd/design_1/design_1.bd} -make_wrapper -files [get_files ./${project_name}_vitis_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top +make_wrapper -files [get_files 
./${project_name}_vitis_accelerator_ip_flow/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top -add_files -norecurse ./${project_name}_vitis_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v +add_files -norecurse ./${project_name}_vitis_accelerator_ip_flow/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v reset_run impl_1 reset_run synth_1 diff --git a/hls4ml/templates/vitis_accelerator/zcu102/python_drivers/axi_stream_driver.py b/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/python_drivers/axi_stream_driver.py similarity index 100% rename from hls4ml/templates/vitis_accelerator/zcu102/python_drivers/axi_stream_driver.py rename to hls4ml/templates/vitis_accelerator_ip_flow/zcu102/python_drivers/axi_stream_driver.py diff --git a/hls4ml/templates/vitis_accelerator/zcu102/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/tcl_scripts/axi_stream_design.tcl similarity index 100% rename from hls4ml/templates/vitis_accelerator/zcu102/tcl_scripts/axi_stream_design.tcl rename to hls4ml/templates/vitis_accelerator_ip_flow/zcu102/tcl_scripts/axi_stream_design.tcl diff --git a/hls4ml/templates/vivado/build_prj.tcl b/hls4ml/templates/vivado/build_prj.tcl index 5714f05f1a..50596091f2 100644 --- a/hls4ml/templates/vivado/build_prj.tcl +++ b/hls4ml/templates/vivado/build_prj.tcl @@ -206,7 +206,7 @@ if {$opt(cosim)} { set time_end [clock clicks -milliseconds] puts "INFO:" - if {[string equal "$backend" "vivadoaccelerator"] || [string equal $backend "vitisaccelerator"]} { + if {[string equal "$backend" "vivadoaccelerator"] || [string equal $backend "vitisacceleratoripflow"]} { puts [read [open ${project_name}_prj/solution1/sim/report/${project_name}_axi_cosim.rpt r]] } else { puts [read [open ${project_name}_prj/solution1/sim/report/${project_name}_cosim.rpt r]] diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h index 9949ee7d80..2942cf08fa 
100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h @@ -12,8 +12,8 @@ #include // this header cannot be included by Vivado HLS -// "VITIS_ACCELERATOR" is defined on the build_lib.sh of the Vitis Accelerator backend files -#ifdef VITIS_ACCELERATOR +// "VITIS_ACCELERATOR_IP_FLOW" is defined on the build_lib.sh of the `Vitis Accelerator` template files +#ifdef VITIS_ACCELERATOR_IP_FLOW #include "ap_axi_sdata.h" #endif namespace nnet { @@ -166,7 +166,7 @@ template void convert_data(hls::stre } } -#ifdef VITIS_ACCELERATOR +#ifdef VITIS_ACCELERATOR_IP_FLOW template void convert_data(srcType *src, hls::stream> &dst) { for (size_t i = 0; i < SIZE; i++) { hls::axis ctype; diff --git a/hls4ml/writer/__init__.py b/hls4ml/writer/__init__.py index c49b23f58c..70a2eabd75 100644 --- a/hls4ml/writer/__init__.py +++ b/hls4ml/writer/__init__.py @@ -2,7 +2,7 @@ from hls4ml.writer.quartus_writer import QuartusWriter from hls4ml.writer.symbolic_writer import SymbolicExpressionWriter from hls4ml.writer.vitis_writer import VitisWriter -from hls4ml.writer.vitis_accelerator_writer import VitisAcceleratorWriter +from hls4ml.writer.vitis_accelerator_ip_flow_writer import VitisAcceleratorIPFlowWriter from hls4ml.writer.vivado_accelerator_writer import VivadoAcceleratorWriter from hls4ml.writer.vivado_writer import VivadoWriter from hls4ml.writer.writers import Writer, get_writer, register_writer # noqa: F401 @@ -10,7 +10,7 @@ register_writer('Vivado', VivadoWriter) register_writer('VivadoAccelerator', VivadoAcceleratorWriter) register_writer('Vitis', VitisWriter) -register_writer('VitisAccelerator', VitisAcceleratorWriter) +register_writer('VitisAcceleratorIPFlow', VitisAcceleratorIPFlowWriter) register_writer('Quartus', QuartusWriter) register_writer('Catapult', CatapultWriter) register_writer('SymbolicExpression', SymbolicExpressionWriter) diff --git a/hls4ml/writer/vitis_accelerator_writer.py 
b/hls4ml/writer/vitis_accelerator_ip_flow_writer.py similarity index 89% rename from hls4ml/writer/vitis_accelerator_writer.py rename to hls4ml/writer/vitis_accelerator_ip_flow_writer.py index 9019021fa2..e1817b87e5 100644 --- a/hls4ml/writer/vitis_accelerator_writer.py +++ b/hls4ml/writer/vitis_accelerator_ip_flow_writer.py @@ -6,17 +6,17 @@ from hls4ml.writer.vitis_writer import VitisWriter -class VitisAcceleratorWriter(VitisWriter): +class VitisAcceleratorIPFlowWriter(VitisWriter): def __init__(self): super().__init__() - self.vitis_accelerator_config = None + self.vitis_accelerator_ip_flow_config = None def write_axi_wrapper(self, model): '''Write a top level HLS C++ file to wrap the hls4ml project with AXI interfaces Args: model : The ModelGraph to write the wrapper for ''' - inp_axi_t, out_axi_t, inp, out = self.vitis_accelerator_config.get_corrected_types() + inp_axi_t, out_axi_t, inp, out = self.vitis_accelerator_ip_flow_config.get_corrected_types() indent = ' ' ####################### @@ -24,7 +24,7 @@ def write_axi_wrapper(self, model): ####################### filedir = os.path.dirname(os.path.abspath(__file__)) - f = open(os.path.join(filedir, '../templates/vitis_accelerator/myproject_axi.h')) + f = open(os.path.join(filedir, '../templates/vitis_accelerator_ip_flow/myproject_axi.h')) fout = open(f'{model.config.get_output_dir()}/firmware/{model.config.get_project_name()}_axi.h', 'w') for line in f.readlines(): @@ -39,7 +39,7 @@ def write_axi_wrapper(self, model): newline = '' newline += f'static const unsigned N_IN = {inp.size()};\n' newline += f'static const unsigned N_OUT = {out.size()};\n' - if self.vitis_accelerator_config.get_interface() == 'axi_stream': + if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': newline += f'typedef hls::axis my_pkt;\n' else: # TODO: handle this case newline += f'typedef {inp_axi_t} input_axi_t;\n' @@ -54,7 +54,7 @@ def write_axi_wrapper(self, model): # myproject_axi.cpp ####################### - f 
= open(os.path.join(filedir, '../templates/vitis_accelerator/myproject_axi.cpp')) + f = open(os.path.join(filedir, '../templates/vitis_accelerator_ip_flow/myproject_axi.cpp')) fout = open(f'{model.config.get_output_dir()}/firmware/{model.config.get_project_name()}_axi.cpp', 'w') io_type = model.config.get_config_value("IOType") @@ -66,7 +66,7 @@ def write_axi_wrapper(self, model): newline = f'#include "{model.config.get_project_name()}_axi.h"\n' elif '// hls-fpga-machine-learning insert local vars' in line: newline = '' - # if self.vitis_accelerator_config.get_interface() == 'axi_stream': + # if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': # newline += indent + 'bool is_last = false;\n' if io_type == 'io_parallel': # TODO: handle io_parallel newline += indent + inp.type.name + ' in_local[N_IN];\n' @@ -83,12 +83,12 @@ def write_axi_wrapper(self, model): elif '// hls-fpga-machine-learning insert call' in line: newline = indent + f'{model.config.get_project_name()}(in_local, out_local);\n' elif '// hls-fpga-machine-learning insert interface' in line: - if self.vitis_accelerator_config.get_interface() == 'axi_lite': # TODO: handle axi_lite + if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_lite': # TODO: handle axi_lite newline = '' newline += indent + '#pragma HLS INTERFACE ap_ctrl_none port=return\n' newline += indent + '#pragma HLS INTERFACE s_axilite port=in\n' newline += indent + '#pragma HLS INTERFACE s_axilite port=out\n' - elif self.vitis_accelerator_config.get_interface() == 'axi_master': # TODO: handle axi_master + elif self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_master': # TODO: handle axi_master newline = '' newline += indent + '#pragma HLS INTERFACE s_axilite port=return bundle=CTRL_BUS\n' newline += indent + '#pragma HLS INTERFACE m_axi depth={} port=in offset=slave bundle=IN_BUS\n'.format( @@ -97,7 +97,7 @@ def write_axi_wrapper(self, model): newline += indent + '#pragma HLS INTERFACE m_axi 
depth={} port=out offset=slave bundle=OUT_BUS\n'.format( model.get_output_variables()[0].pragma[1] ) - elif self.vitis_accelerator_config.get_interface() == 'axi_stream': + elif self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': newline = '' newline += indent + '#pragma HLS INTERFACE axis port=in\n' newline += indent + '#pragma HLS INTERFACE axis port=out\n' @@ -109,7 +109,7 @@ def write_axi_wrapper(self, model): if io_type == 'io_parallel': # TODO: handle io_parallel newline = '' newline += indent + 'for(unsigned i = 0; i < N_IN; i++){\n' - if self.vitis_accelerator_config.get_interface() == 'axi_stream': + if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': newline += indent + indent + '#pragma HLS PIPELINE\n' newline += indent + indent + 'in_local[i] = in[i].data; // Read input with cast\n' newline += indent + indent + 'is_last |= (in[i].last == 1)? true: false;\n' @@ -130,7 +130,7 @@ def write_axi_wrapper(self, model): # newline += indent + indent + 'pragma HLS aggregate variable=ctype compact=auto' # TODO: check if needed newline += indent + indent + 'for(unsigned j = 0; j < {input_t}::size; j++) {{\n' # newline += indent + indent + indent + '#pragma HLS UNROLL\n' # TODO: check if needed - if self.vitis_accelerator_config.get_interface() == 'axi_stream': + if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': newline += ( indent + indent @@ -164,7 +164,7 @@ def write_axi_wrapper(self, model): if io_type == 'io_parallel': # TODO: handle this case newline = '' newline += indent + 'for(unsigned i = 0; i < N_OUT; i++){\n' - if self.vitis_accelerator_config.get_interface() == 'axi_stream': + if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': newline += indent + indent + '#pragma HLS PIPELINE\n' newline += indent + indent + 'out[i].data = out_local[i]; // Write output with cast\n' newline += indent + indent + 'out[i].last = (is_last && (i == N_OUT - 1))? 
true : false;\n' @@ -179,7 +179,7 @@ def write_axi_wrapper(self, model): newline += indent + indent + '{result_t} ctype = out_local.read();\n' newline += indent + indent + 'for(unsigned j = 0; j < {result_t}::size; j++) {{\n' # newline += indent + indent + indent + '#pragma HLS UNROLL\n' - if self.vitis_accelerator_config.get_interface() == 'axi_stream': + if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': newline += ( indent + indent + indent + f'tmp_b.data = ({inp_axi_t}) (ctype[j]);\n' ) @@ -235,7 +235,7 @@ def modify_build_script(self, model): # build_lib.sh ################### - f = open(os.path.join(filedir, '../templates/vitis_accelerator/build_lib.sh')) + f = open(os.path.join(filedir, '../templates/vitis_accelerator_ip_flow/build_lib.sh')) fout = open(f'{model.config.get_output_dir()}/build_lib.sh', 'w') for line in f.readlines(): @@ -253,7 +253,7 @@ def write_wrapper_test(self, model): oldfile = f'{model.config.get_output_dir()}/{model.config.get_project_name()}_test.cpp' newfile = f'{model.config.get_output_dir()}/{model.config.get_project_name()}_test_wrapper.cpp' - inp_axi_t, out_axi_t, inp, out = self.vitis_accelerator_config.get_corrected_types() + inp_axi_t, out_axi_t, inp, out = self.vitis_accelerator_ip_flow_config.get_corrected_types() f = open(oldfile) fout = open(newfile, 'w') @@ -285,7 +285,7 @@ def write_wrapper_test(self, model): ) else: newline = line - if self.vitis_accelerator_config.get_interface() == 'axi_stream': + if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': if 'nnet::fill_zero' in line: newline = newline.replace("nnet::fill_zero<", f"nnet::fill_zero<{inp.type.name}, ") # indent = line.split('n')[0] @@ -338,16 +338,16 @@ def write_wrapper_test(self, model): def write_board_script(self, model): ''' - Write the tcl scripts and kernel sources to create a Vivado IPI project for the VitisAccelerator + Write the tcl scripts and kernel sources to create a Vivado IPI project for the 
VitisAcceleratorIPFlow ''' filedir = os.path.dirname(os.path.abspath(__file__)) copyfile( - os.path.join(filedir, self.vitis_accelerator_config.get_tcl_file_path()), + os.path.join(filedir, self.vitis_accelerator_ip_flow_config.get_tcl_file_path()), f'{model.config.get_output_dir()}/design.tcl', ) # Generic alveo board - if self.vitis_accelerator_config.get_board().startswith('alveo'): - src_dir = os.path.join(filedir, self.vitis_accelerator_config.get_krnl_rtl_src_dir()) + if self.vitis_accelerator_ip_flow_config.get_board().startswith('alveo'): + src_dir = os.path.join(filedir, self.vitis_accelerator_ip_flow_config.get_krnl_rtl_src_dir()) dst_dir = os.path.abspath(model.config.get_output_dir()) + '/src' copy_tree(src_dir, dst_dir) @@ -358,17 +358,17 @@ def write_board_script(self, model): f.write('variable project_name\n') f.write(f'set project_name "{model.config.get_project_name()}"\n') f.write('variable backend\n') - f.write('set backend "vitisaccelerator"\n') + f.write('set backend "vitisacceleratoripflow"\n') f.write('variable part\n') - f.write(f'set part "{self.vitis_accelerator_config.get_part()}"\n') + f.write(f'set part "{self.vitis_accelerator_ip_flow_config.get_part()}"\n') f.write('variable clock_period\n') f.write('set clock_period {}\n'.format(model.config.get_config_value('ClockPeriod'))) f.write('variable clock_uncertainty\n') f.write('set clock_uncertainty {}\n'.format(model.config.get_config_value('ClockUncertainty', '12.5%'))) f.write('variable version\n') f.write('set version "{}"\n'.format(model.config.get_config_value('Version', '1.0.0'))) - if self.vitis_accelerator_config.get_interface() == 'axi_stream': - in_bit, out_bit = self.vitis_accelerator_config.get_io_bitwidth() + if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': + in_bit, out_bit = self.vitis_accelerator_ip_flow_config.get_io_bitwidth() f.write(f'set bit_width_hls_output {in_bit}\n') f.write(f'set bit_width_hls_input {out_bit}\n') f.close() @@ -376,8 
+376,8 @@ def write_board_script(self, model): def write_driver(self, model): filedir = os.path.dirname(os.path.abspath(__file__)) copyfile( - os.path.join(filedir, self.vitis_accelerator_config.get_driver_path()), - ('{}/' + self.vitis_accelerator_config.get_driver_file()).format(model.config.get_output_dir()), + os.path.join(filedir, self.vitis_accelerator_ip_flow_config.get_driver_path()), + ('{}/' + self.vitis_accelerator_ip_flow_config.get_driver_file()).format(model.config.get_output_dir()), ) def write_new_tar(self, model): @@ -386,12 +386,12 @@ def write_new_tar(self, model): def write_hls(self, model): """ - Write the HLS project. Calls the VivadoBackend writer, and extra steps for VitisAccelerator/AXI interface + Write the HLS project. Calls the VivadoBackend writer, and extra steps for VitisAcceleratorIPFlow/AXI interface """ # TODO temporarily move config import here to avoid cyclic dependency, until config is moved to its own package - from hls4ml.backends import VitisAcceleratorConfig + from hls4ml.backends import VitisAcceleratorIPFlowConfig - self.vitis_accelerator_config = VitisAcceleratorConfig( + self.vitis_accelerator_ip_flow_config = VitisAcceleratorIPFlowConfig( model.config, model.get_input_variables(), model.get_output_variables() ) super().write_hls(model) From 663181ff0f640f0467a1a0507927db3731a1a8f6 Mon Sep 17 00:00:00 2001 From: steltze Date: Wed, 20 Nov 2024 12:05:55 +0100 Subject: [PATCH 083/103] Fix compatiblity between axi stream and io parallel --- .../vivado/nnet_utils/nnet_helpers.h | 43 +++++++++++++- .../vitis_accelerator_ip_flow_writer.py | 56 ++++++++++--------- 2 files changed, 70 insertions(+), 29 deletions(-) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h index 2942cf08fa..389d687089 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h @@ -167,6 +167,7 @@ template void convert_data(hls::stre 
} #ifdef VITIS_ACCELERATOR_IP_FLOW +//todo avoid hardcoding hls::axis and use template template void convert_data(srcType *src, hls::stream> &dst) { for (size_t i = 0; i < SIZE; i++) { hls::axis ctype; @@ -322,7 +323,9 @@ template void print_result(res_T result[SIZE], std::o out << std::endl; } -template void print_result(hls::stream &result, std::ostream &out, bool keep = false) { +template ::value, int>::type = 0> +void print_result(hls::stream &result, std::ostream &out, bool keep = false) { for (int i = 0; i < SIZE / res_T::size; i++) { res_T res_pack = result.read(); for (int j = 0; j < res_T::size; j++) { @@ -335,7 +338,23 @@ template void print_result(hls::stream &result out << std::endl; } -// compatible with Vitis Accelerator for res_T = hls::axis +// compatible with Vitis Accelerator for res_T = hls::axis<...> and io_parallel +template ::value, int>::type = 0> +void print_result(hls::stream &result, std::ostream &out, bool keep = false) { + for (int i = 0; i < SIZE; i++) { + res_T res_pack = result.read(); + + out << res_pack.data << " "; + + if (keep) { + result.write(res_pack); + } + } + out << std::endl; +} + +// compatible with Vitis Accelerator for res_T = hls::axis and io_stream template void print_result(hls::stream &result, std::ostream &out, bool keep = false) { for (int i = 0; i < SIZE / underlying_res_T::size; i++) { res_T res_pack; @@ -352,7 +371,9 @@ template void print_result(hl template void fill_zero(data_T data[SIZE]) { std::fill_n(data, SIZE, 0.); } -template void fill_zero(hls::stream &data) { +template ::value, int>::type = 0> +void fill_zero(hls::stream &data) { for (int i = 0; i < SIZE / data_T::size; i++) { data_T data_pack; for (int j = 0; j < data_T::size; j++) { @@ -362,6 +383,22 @@ template void fill_zero(hls::stream &data) { } } +template ::value, int>::type = 0> +void fill_zero(hls::stream &data) { + for (int i = 0; i < SIZE; i++) { + data_T data_pack; + data_pack.data = 0.; + if (i==SIZE-1) { + data_pack.last = 1; + } + else 
{ + data_pack.last = 0; + } + data.write(data_pack); + } +} + // compatible with Vitis Accelerator for res_T = hls::axis template void fill_zero(hls::stream &data) { for (int i = 0; i < SIZE / underlying_data_T::size; i++) { diff --git a/hls4ml/writer/vitis_accelerator_ip_flow_writer.py b/hls4ml/writer/vitis_accelerator_ip_flow_writer.py index e1817b87e5..535a43b4bc 100644 --- a/hls4ml/writer/vitis_accelerator_ip_flow_writer.py +++ b/hls4ml/writer/vitis_accelerator_ip_flow_writer.py @@ -66,11 +66,12 @@ def write_axi_wrapper(self, model): newline = f'#include "{model.config.get_project_name()}_axi.h"\n' elif '// hls-fpga-machine-learning insert local vars' in line: newline = '' - # if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': - # newline += indent + 'bool is_last = false;\n' + if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': + newline += indent + 'bool is_last = false;\n' if io_type == 'io_parallel': # TODO: handle io_parallel newline += indent + inp.type.name + ' in_local[N_IN];\n' - newline += indent + out.type.name + ' out_local[N_OUT];\n' + newline += indent + out.type.name + ' out_local[N_OUT];\n' + newline += indent + 'my_pkt tmp;\n' elif io_type == 'io_stream': newline += indent + 'hls::stream<' + inp.type.name + '> in_local("input_1");\n' newline += indent + 'hls::stream<' + out.type.name + '> out_local("output_1");\n\n' @@ -111,17 +112,17 @@ def write_axi_wrapper(self, model): newline += indent + 'for(unsigned i = 0; i < N_IN; i++){\n' if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': newline += indent + indent + '#pragma HLS PIPELINE\n' - newline += indent + indent + 'in_local[i] = in[i].data; // Read input with cast\n' - newline += indent + indent + 'is_last |= (in[i].last == 1)? 
true: false;\n' + newline += indent + indent + 'tmp = in.read(); // Read input with cast\n' + newline += indent + indent + 'in_local[i] = tmp.data;\n' + newline += indent + indent + 'is_last = tmp.last;\n' else: newline += indent + indent + '#pragma HLS UNROLL\n' newline += indent + indent + 'in_local[i] = in[i].data; // Read input with cast\n' newline += indent + '}\n' + newline += indent + 'tmp.last = 0;\n' elif io_type == 'io_stream': newline = '' - newline += indent + 'my_pkt tmp_a;\n' - - newline += indent + 'my_pkt tmp_b;\n' + newline += indent + 'my_pkt tmp;\n' newline += indent + 'for(unsigned i = 0; i < N_IN / {input_t}::size; ++i) {{\n' # newline += indent + indent + '#pragma HLS PIPELINE\n' # TODO: check if needed @@ -135,17 +136,17 @@ def write_axi_wrapper(self, model): indent + indent + indent - + 'in.read(tmp_a);\n' + + 'in.read(tmp);\n' ) newline += ( indent + indent + indent - + 'ctype[j] = tmp_a.data;\n' + + 'ctype[j] = tmp.data;\n' + ) + newline += ( + indent + indent + indent + 'is_last = tmp.last;\n' ) - # newline += ( - # indent + indent + indent + 'is_last |= (in[i * input_t::size + j].last == 1)? 
true : false;\n' - # ) else: # TODO: handle this case newline += ( indent @@ -156,8 +157,7 @@ def write_axi_wrapper(self, model): newline += indent + indent + '}}\n' newline += indent + indent + 'in_local.write(ctype);\n' newline += indent + '}}\n' - newline += indent + 'tmp_b = tmp_a;\n' - newline += indent + 'tmp_b.last = 0;\n' + newline += indent + 'tmp.last = 0;\n' newline = newline.format(input_t=inp.type.name) elif '// hls-fpga-machine-learning insert dequeue' in line: io_type = model.config.get_config_value("IOType") @@ -166,8 +166,9 @@ def write_axi_wrapper(self, model): newline += indent + 'for(unsigned i = 0; i < N_OUT; i++){\n' if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': newline += indent + indent + '#pragma HLS PIPELINE\n' - newline += indent + indent + 'out[i].data = out_local[i]; // Write output with cast\n' - newline += indent + indent + 'out[i].last = (is_last && (i == N_OUT - 1))? true : false;\n' + newline += indent + indent + 'tmp.data = out_local[i];\n' + newline += indent + indent + 'tmp.last = (is_last && (i == N_OUT - 1))? 
true : false;\n' + newline += indent + indent + 'out.write(tmp);\n' else: newline += indent + indent + '#pragma HLS UNROLL\n' newline += indent + indent + 'out[i] = out_local[i]; // Write output with cast\n' @@ -181,15 +182,15 @@ def write_axi_wrapper(self, model): # newline += indent + indent + indent + '#pragma HLS UNROLL\n' if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': newline += ( - indent + indent + indent + f'tmp_b.data = ({inp_axi_t}) (ctype[j]);\n' + indent + indent + indent + f'tmp.data = ({inp_axi_t}) (ctype[j]);\n' ) newline += ( - indent + indent + indent + 'if(tmp_a.last == 1) {{tmp_b.last = (((i+1)*(j+1))==N_OUT);}}\n' + indent + indent + indent + 'if(is_last) {{tmp.last = (((i+1)*(j+1))==N_OUT);}}\n' ) newline += ( - indent + indent + indent + 'out.write(tmp_b);\n' + indent + indent + indent + 'out.write(tmp);\n' ) else: newline += indent + indent + indent + 'out[i * {result_t}::size + j] = output_axi_t(ctype[j]);\n' @@ -260,6 +261,7 @@ def write_wrapper_test(self, model): inp = model.get_input_variables()[0] out = model.get_output_variables()[0] + io_type = model.config.get_config_value("IOType") for line in f.readlines(): if f'{model.config.get_project_name()}.h' in line: @@ -286,14 +288,16 @@ def write_wrapper_test(self, model): else: newline = line if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': - if 'nnet::fill_zero' in line: - newline = newline.replace("nnet::fill_zero<", f"nnet::fill_zero<{inp.type.name}, ") - # indent = line.split('n')[0] - # newline = indent + indent + 'inputs[N_IN-1].last = 1;\n' if 'copy_data' in line: newline = newline.replace('copy_data', 'copy_data_axi').replace("0,", "") - if 'print_result' in line: - newline = newline.replace("print_result<", f"print_result<{out.type.name}, ") + + if io_type == 'io_stream': + if 'nnet::fill_zero' in line: + newline = newline.replace("nnet::fill_zero<", f"nnet::fill_zero<{inp.type.name}, ") + # indent = line.split('n')[0] + # 
newline = indent + indent + 'inputs[N_IN-1].last = 1;\n' + if 'print_result' in line: + newline = newline.replace("print_result<", f"print_result<{out.type.name}, ") fout.write(newline) f.close() From e32f4d0b2763452bb82095d61d5f51fba9220187 Mon Sep 17 00:00:00 2001 From: steltze Date: Wed, 20 Nov 2024 13:36:43 +0100 Subject: [PATCH 084/103] Update pynq driver for zcu102 --- .../python_drivers/axi_stream_driver.py | 91 +++++++++++-------- 1 file changed, 51 insertions(+), 40 deletions(-) diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/python_drivers/axi_stream_driver.py b/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/python_drivers/axi_stream_driver.py index 1aac79f2d3..fda308e9ca 100644 --- a/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/python_drivers/axi_stream_driver.py +++ b/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/python_drivers/axi_stream_driver.py @@ -1,39 +1,40 @@ +from pynq import DefaultHierarchy, DefaultIP, allocate +from pynq import Overlay from datetime import datetime - +import pynq.lib.dma import numpy as np -from pynq import Overlay, allocate +from pynq import PL class NeuralNetworkOverlay(Overlay): - def __init__( - self, bitfile_name, x_shape, y_shape, dtype=np.float32, dtbo=None, download=True, ignore_version=False, device=None - ): + def __init__(self, bitfile_name, dtbo=None, download=True, ignore_version=False, device=None): super().__init__(bitfile_name, dtbo=None, download=True, ignore_version=False, device=None) - self.sendchannel = self.hier_0.axi_dma_0.sendchannel - self.recvchannel = self.hier_0.axi_dma_0.recvchannel - self.input_buffer = allocate(shape=x_shape, dtype=dtype) - self.output_buffer = allocate(shape=y_shape, dtype=dtype) - + def _print_dt(self, timea, timeb, N): - dt = timeb - timea + dt = (timeb - timea) dts = dt.seconds + dt.microseconds * 10**-6 rate = N / dts - print(f"Classified {N} samples in {dts} seconds ({rate} inferences / s)") + print("Classified {} samples in {} seconds ({} 
inferences / s)".format(N, dts, rate)) return dts, rate - - def predict(self, X, debug=False, profile=False, encode=None, decode=None): + + def reset_PL(): + PL.reset() + + def predict(self, X, y_shape, dtype=np.float32, debug=None, profile=False, encode=None, decode=None): """ Obtain the predictions of the NN implemented in the FPGA. Parameters: - X : the input vector. Should be numpy ndarray. - - dtype : the data type of the elements of the input/output vectors. - Note: it should be set depending on the interface of the accelerator; if it uses 'float' - types for the 'data' AXI-Stream field, 'np.float32' dtype is the correct one to use. + - y_shape : the shape of the output vector. Needed to the accelerator to set the TLAST bit properly and + for sizing the output vector shape. + - dtype : the data type of the elements of the input/output vectors. + Note: it should be set depending on the interface of the accelerator; if it uses 'float' + types for the 'data' AXI-Stream field, 'np.float32' dtype is the correct one to use. Instead if it uses 'ap_fixed', 'np.intA' is the correct one to use (note that A cannot - any integer value, but it can assume {..., 8, 16, 32, ...} values. Check `numpy` + any integer value, but it can assume {..., 8, 16, 32, ...} values. Check `numpy` doc for more info). - In this case the encoding/decoding has to be computed by the PS. For example for - 'ap_fixed<16,6>' type the following 2 functions are the correct one to use for encode/decode + In this case the encoding/decoding has to be computed by the PS. For example for + 'ap_fixed<16,6>' type the following 2 functions are the correct one to use for encode/decode 'float' -> 'ap_fixed<16,6>': ``` def encode(xi): @@ -48,28 +49,38 @@ def decode(yi): - return: an output array based on `np.ndarray` with a shape equal to `y_shape` and a `dtype` equal to the namesake parameter. 
""" - if profile: - timea = datetime.now() + if encode is not None: X = encode(X) - self.input_buffer[:] = X - self.sendchannel.transfer(self.input_buffer) - self.recvchannel.transfer(self.output_buffer) - if debug: - print("Transfer OK") - self.sendchannel.wait() - if debug: - print("Send OK") - self.recvchannel.wait() - if debug: - print("Receive OK") - # result = self.output_buffer.copy() + with allocate(shape=X.shape, dtype=dtype) as input_buffer, \ + allocate(shape=y_shape, dtype=dtype) as output_buffer: + input_buffer[:] = X + + if profile: + timea = datetime.now() + + self.axi_dma_0.sendchannel.transfer(input_buffer) + self.axi_dma_0.recvchannel.transfer(output_buffer) + if debug: + print("Transfer OK") + self.axi_dma_0.sendchannel.wait() + if debug: + print("Send OK") + self.axi_dma_0.recvchannel.wait() + + if profile: + timeb = datetime.now() + + if debug: + print("Receive OK") + + result = output_buffer.copy() + if decode is not None: - self.output_buffer = decode(self.output_buffer) - + result = decode(result) + if profile: - timeb = datetime.now() dts, rate = self._print_dt(timea, timeb, len(X)) - return self.output_buffer, dts, rate - else: - return self.output_buffer + return result, dts, rate + + return result \ No newline at end of file From c52ec75ade7c59166c163bbc7a7dcce6eaa67601 Mon Sep 17 00:00:00 2001 From: steltze Date: Wed, 20 Nov 2024 14:06:41 +0100 Subject: [PATCH 085/103] Run pre-commit --- hls4ml/backends/__init__.py | 10 ++- .../vitis_accelerator_ip_flow_backend.py | 15 ++--- .../vitis_accelerator_ip_flow_config.py | 9 ++- .../myproject_axi.cpp | 2 +- .../vitis_accelerator_ip_flow/myproject_axi.h | 2 +- .../python_drivers/axi_stream_driver.py | 47 +++++++------- .../vivado/nnet_utils/nnet_helpers.h | 46 +++++++------- hls4ml/writer/__init__.py | 2 +- .../vitis_accelerator_ip_flow_writer.py | 62 +++++++------------ 9 files changed, 88 insertions(+), 107 deletions(-) diff --git a/hls4ml/backends/__init__.py b/hls4ml/backends/__init__.py 
index 1f60bdb449..68562d75ed 100644 --- a/hls4ml/backends/__init__.py +++ b/hls4ml/backends/__init__.py @@ -2,16 +2,20 @@ from hls4ml.backends.fpga.fpga_backend import FPGABackend # noqa: F401 from hls4ml.backends.quartus.quartus_backend import QuartusBackend from hls4ml.backends.symbolic.symbolic_backend import SymbolicExpressionBackend - +from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_config import ( # noqa: F401 + VitisAcceleratorIPFlowConfig, +) from hls4ml.backends.vivado.vivado_backend import VivadoBackend from hls4ml.backends.vivado_accelerator.vivado_accelerator_backend import VivadoAcceleratorBackend from hls4ml.backends.vivado_accelerator.vivado_accelerator_config import VivadoAcceleratorConfig # noqa: F401 +from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_backend import ( # isort: skip + VitisAcceleratorIPFlowBackend, +) + from hls4ml.backends.catapult.catapult_backend import CatapultBackend # isort: skip from hls4ml.backends.vitis.vitis_backend import VitisBackend # isort: skip -from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_backend import VitisAcceleratorIPFlowBackend # isort: skip -from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_config import VitisAcceleratorIPFlowConfig # noqa: F401 register_backend('Vivado', VivadoBackend) register_backend('VivadoAccelerator', VivadoAcceleratorBackend) diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py index 6ade53b39d..cd57df5a4a 100644 --- a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py +++ b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py @@ -37,22 +37,19 @@ def build( # fifo_opt=fifo_opt, ) # Get Config to view Board and Platform - from hls4ml.backends import VitisAcceleratorIPFlowConfig + # from hls4ml.backends import 
VitisAcceleratorIPFlowConfig - vitis_accelerator_ip_flow_config = VitisAcceleratorIPFlowConfig( - model.config, model.get_input_variables(), model.get_output_variables() - ) + # vitis_accelerator_ip_flow_config = VitisAcceleratorIPFlowConfig( + # model.config, model.get_input_variables(), model.get_output_variables() + # ) # now make a bitfile if bitfile: - # if vitis_accelerator_config.get_board().startswith('alveo'): - # self.make_xclbin(model, vitis_accelerator_config.get_platform()) - # else: curr_dir = os.getcwd() os.chdir(model.config.get_output_dir()) try: - os.system('vivado -mode batch -source design.tcl') # check if this is accepted as a command + os.system('vivado -mode batch -source design.tcl') # check if this is accepted as a command except Exception: - print("Something went wrong, check the Vivado logs") + print("Something went wrong, check the Vivado logs") os.chdir(curr_dir) return parse_vivado_report(model.config.get_output_dir()) diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_config.py b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_config.py index d00e54a284..07961a9b6f 100644 --- a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_config.py +++ b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_config.py @@ -133,7 +133,14 @@ def get_driver_path(self): if self.board.startswith('alveo'): return '../templates/vitis_accelerator_ip_flow/' + 'alveo/' + self.driver + '_drivers/' + self.get_driver_file() else: - return '../templates/vitis_accelerator_ip_flow/' + self.board + '/' + self.driver + '_drivers/' + self.get_driver_file() + return ( + '../templates/vitis_accelerator_ip_flow/' + + self.board + + '/' + + self.driver + + '_drivers/' + + self.get_driver_file() + ) def get_driver_file(self): driver_ext = '.py' if self.driver == 'python' else '.h' diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.cpp 
b/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.cpp index 01238643ed..cf6c0b9c25 100644 --- a/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.cpp +++ b/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.cpp @@ -1,6 +1,6 @@ // hls-fpga-machine-learning insert include -void myproject_axi(hls::stream< my_pkt > &in, hls::stream< my_pkt > &out) { +void myproject_axi(hls::stream &in, hls::stream &out) { // hls-fpga-machine-learning insert interface diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.h b/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.h index d49f98ba14..d0d88bfecf 100644 --- a/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.h +++ b/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.h @@ -6,5 +6,5 @@ // hls-fpga-machine-learning insert definitions -void myproject_axi(hls::stream< my_pkt > &in, hls::stream< my_pkt > &out); +void myproject_axi(hls::stream &in, hls::stream &out); #endif diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/python_drivers/axi_stream_driver.py b/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/python_drivers/axi_stream_driver.py index fda308e9ca..1d70e55406 100644 --- a/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/python_drivers/axi_stream_driver.py +++ b/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/python_drivers/axi_stream_driver.py @@ -1,25 +1,23 @@ -from pynq import DefaultHierarchy, DefaultIP, allocate -from pynq import Overlay from datetime import datetime -import pynq.lib.dma + import numpy as np -from pynq import PL +from pynq import PL, Overlay, allocate class NeuralNetworkOverlay(Overlay): def __init__(self, bitfile_name, dtbo=None, download=True, ignore_version=False, device=None): super().__init__(bitfile_name, dtbo=None, download=True, ignore_version=False, device=None) - + def _print_dt(self, timea, timeb, N): - dt = (timeb - timea) + dt = timeb - timea dts = dt.seconds + dt.microseconds * 10**-6 rate = N / dts - 
print("Classified {} samples in {} seconds ({} inferences / s)".format(N, dts, rate)) + print(f"Classified {N} samples in {dts} seconds ({rate} inferences / s)") return dts, rate - + def reset_PL(): PL.reset() - + def predict(self, X, y_shape, dtype=np.float32, debug=None, profile=False, encode=None, decode=None): """ Obtain the predictions of the NN implemented in the FPGA. @@ -27,14 +25,14 @@ def predict(self, X, y_shape, dtype=np.float32, debug=None, profile=False, encod - X : the input vector. Should be numpy ndarray. - y_shape : the shape of the output vector. Needed to the accelerator to set the TLAST bit properly and for sizing the output vector shape. - - dtype : the data type of the elements of the input/output vectors. - Note: it should be set depending on the interface of the accelerator; if it uses 'float' - types for the 'data' AXI-Stream field, 'np.float32' dtype is the correct one to use. + - dtype : the data type of the elements of the input/output vectors. + Note: it should be set depending on the interface of the accelerator; if it uses 'float' + types for the 'data' AXI-Stream field, 'np.float32' dtype is the correct one to use. Instead if it uses 'ap_fixed', 'np.intA' is the correct one to use (note that A cannot - any integer value, but it can assume {..., 8, 16, 32, ...} values. Check `numpy` + any integer value, but it can assume {..., 8, 16, 32, ...} values. Check `numpy` doc for more info). - In this case the encoding/decoding has to be computed by the PS. For example for - 'ap_fixed<16,6>' type the following 2 functions are the correct one to use for encode/decode + In this case the encoding/decoding has to be computed by the PS. 
For example for + 'ap_fixed<16,6>' type the following 2 functions are the correct one to use for encode/decode 'float' -> 'ap_fixed<16,6>': ``` def encode(xi): @@ -52,13 +50,12 @@ def decode(yi): if encode is not None: X = encode(X) - with allocate(shape=X.shape, dtype=dtype) as input_buffer, \ - allocate(shape=y_shape, dtype=dtype) as output_buffer: + with allocate(shape=X.shape, dtype=dtype) as input_buffer, allocate(shape=y_shape, dtype=dtype) as output_buffer: input_buffer[:] = X - + if profile: timea = datetime.now() - + self.axi_dma_0.sendchannel.transfer(input_buffer) self.axi_dma_0.recvchannel.transfer(output_buffer) if debug: @@ -67,20 +64,20 @@ def decode(yi): if debug: print("Send OK") self.axi_dma_0.recvchannel.wait() - + if profile: timeb = datetime.now() - + if debug: print("Receive OK") - + result = output_buffer.copy() - + if decode is not None: result = decode(result) - + if profile: dts, rate = self._print_dt(timea, timeb, len(X)) return result, dts, rate - return result \ No newline at end of file + return result diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h index 389d687089..2a695d4e5a 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h @@ -167,8 +167,9 @@ template void convert_data(hls::stre } #ifdef VITIS_ACCELERATOR_IP_FLOW -//todo avoid hardcoding hls::axis and use template -template void convert_data(srcType *src, hls::stream> &dst) { +// todo avoid hardcoding hls::axis and use template +template +void convert_data(srcType *src, hls::stream> &dst) { for (size_t i = 0; i < SIZE; i++) { hls::axis ctype; ctype.data = dstType(src[i]); @@ -176,7 +177,8 @@ template void convert_data(srcTyp } } -template void convert_data(hls::stream> &src, dstType *dst) { +template +void convert_data(hls::stream> &src, dstType *dst) { for (size_t i = 0; i < SIZE; i++) { hls::axis ctype = src.read(); dst[i] = dstType(ctype.data); 
@@ -323,8 +325,7 @@ template void print_result(res_T result[SIZE], std::o out << std::endl; } -template ::value, int>::type = 0> +template ::value, int>::type = 0> void print_result(hls::stream &result, std::ostream &out, bool keep = false) { for (int i = 0; i < SIZE / res_T::size; i++) { res_T res_pack = result.read(); @@ -333,29 +334,29 @@ void print_result(hls::stream &result, std::ostream &out, bool keep = fal } if (keep) { result.write(res_pack); - } + } } out << std::endl; } // compatible with Vitis Accelerator for res_T = hls::axis<...> and io_parallel -template ::value, int>::type = 0> +template ::value, int>::type = 0> void print_result(hls::stream &result, std::ostream &out, bool keep = false) { for (int i = 0; i < SIZE; i++) { res_T res_pack = result.read(); - + out << res_pack.data << " "; - + if (keep) { result.write(res_pack); - } + } } out << std::endl; } // compatible with Vitis Accelerator for res_T = hls::axis and io_stream -template void print_result(hls::stream &result, std::ostream &out, bool keep = false) { +template +void print_result(hls::stream &result, std::ostream &out, bool keep = false) { for (int i = 0; i < SIZE / underlying_res_T::size; i++) { res_T res_pack; for (int j = 0; j < underlying_res_T::size; j++) { @@ -363,16 +364,15 @@ template void print_result(hl out << res_pack.data << " "; if (keep) { result.write(res_pack); - } - } + } + } } out << std::endl; } template void fill_zero(data_T data[SIZE]) { std::fill_n(data, SIZE, 0.); } -template ::value, int>::type = 0> +template ::value, int>::type = 0> void fill_zero(hls::stream &data) { for (int i = 0; i < SIZE / data_T::size; i++) { data_T data_pack; @@ -383,16 +383,14 @@ void fill_zero(hls::stream &data) { } } -template ::value, int>::type = 0> +template ::value, int>::type = 0> void fill_zero(hls::stream &data) { for (int i = 0; i < SIZE; i++) { data_T data_pack; data_pack.data = 0.; - if (i==SIZE-1) { + if (i == SIZE - 1) { data_pack.last = 1; - } - else { + } else { 
data_pack.last = 0; } data.write(data_pack); @@ -405,15 +403,13 @@ template void fill_zero(hls data_T data_pack; for (int j = 0; j < underlying_data_T::size; j++) { data_pack.data = 0.; - if ((i==(SIZE / underlying_data_T::size-1)) && (j==(underlying_data_T::size-1))) { + if ((i == (SIZE / underlying_data_T::size - 1)) && (j == (underlying_data_T::size - 1))) { data_pack.last = 1; - } - else { + } else { data_pack.last = 0; } data.write(data_pack); } - } } diff --git a/hls4ml/writer/__init__.py b/hls4ml/writer/__init__.py index 70a2eabd75..31238b18c8 100644 --- a/hls4ml/writer/__init__.py +++ b/hls4ml/writer/__init__.py @@ -1,8 +1,8 @@ from hls4ml.writer.catapult_writer import CatapultWriter from hls4ml.writer.quartus_writer import QuartusWriter from hls4ml.writer.symbolic_writer import SymbolicExpressionWriter -from hls4ml.writer.vitis_writer import VitisWriter from hls4ml.writer.vitis_accelerator_ip_flow_writer import VitisAcceleratorIPFlowWriter +from hls4ml.writer.vitis_writer import VitisWriter from hls4ml.writer.vivado_accelerator_writer import VivadoAcceleratorWriter from hls4ml.writer.vivado_writer import VivadoWriter from hls4ml.writer.writers import Writer, get_writer, register_writer # noqa: F401 diff --git a/hls4ml/writer/vitis_accelerator_ip_flow_writer.py b/hls4ml/writer/vitis_accelerator_ip_flow_writer.py index 535a43b4bc..78e1fa982d 100644 --- a/hls4ml/writer/vitis_accelerator_ip_flow_writer.py +++ b/hls4ml/writer/vitis_accelerator_ip_flow_writer.py @@ -40,8 +40,10 @@ def write_axi_wrapper(self, model): newline += f'static const unsigned N_IN = {inp.size()};\n' newline += f'static const unsigned N_OUT = {out.size()};\n' if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': - newline += f'typedef hls::axis my_pkt;\n' - else: # TODO: handle this case + newline += 'typedef hls::axis my_pkt;\n' + # might need to make "float" a variable according to the + # configuration set by the user and the DMA available data widths + else: # 
TODO: handle this case newline += f'typedef {inp_axi_t} input_axi_t;\n' newline += f'typedef {out_axi_t} output_axi_t;\n' else: @@ -68,9 +70,9 @@ def write_axi_wrapper(self, model): newline = '' if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': newline += indent + 'bool is_last = false;\n' - if io_type == 'io_parallel': # TODO: handle io_parallel + if io_type == 'io_parallel': # TODO: handle io_parallel newline += indent + inp.type.name + ' in_local[N_IN];\n' - newline += indent + out.type.name + ' out_local[N_OUT];\n' + newline += indent + out.type.name + ' out_local[N_OUT];\n' newline += indent + 'my_pkt tmp;\n' elif io_type == 'io_stream': newline += indent + 'hls::stream<' + inp.type.name + '> in_local("input_1");\n' @@ -84,12 +86,12 @@ def write_axi_wrapper(self, model): elif '// hls-fpga-machine-learning insert call' in line: newline = indent + f'{model.config.get_project_name()}(in_local, out_local);\n' elif '// hls-fpga-machine-learning insert interface' in line: - if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_lite': # TODO: handle axi_lite + if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_lite': # TODO: handle axi_lite newline = '' newline += indent + '#pragma HLS INTERFACE ap_ctrl_none port=return\n' newline += indent + '#pragma HLS INTERFACE s_axilite port=in\n' newline += indent + '#pragma HLS INTERFACE s_axilite port=out\n' - elif self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_master': # TODO: handle axi_master + elif self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_master': # TODO: handle axi_master newline = '' newline += indent + '#pragma HLS INTERFACE s_axilite port=return bundle=CTRL_BUS\n' newline += indent + '#pragma HLS INTERFACE m_axi depth={} port=in offset=slave bundle=IN_BUS\n'.format( @@ -107,7 +109,7 @@ def write_axi_wrapper(self, model): newline += indent + '#pragma HLS DATAFLOW\n' elif '// hls-fpga-machine-learning insert enqueue' in line: io_type = 
model.config.get_config_value("IOType") - if io_type == 'io_parallel': # TODO: handle io_parallel + if io_type == 'io_parallel': # TODO: handle io_parallel newline = '' newline += indent + 'for(unsigned i = 0; i < N_IN; i++){\n' if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': @@ -132,22 +134,10 @@ def write_axi_wrapper(self, model): newline += indent + indent + 'for(unsigned j = 0; j < {input_t}::size; j++) {{\n' # newline += indent + indent + indent + '#pragma HLS UNROLL\n' # TODO: check if needed if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': - newline += ( - indent - + indent - + indent - + 'in.read(tmp);\n' - ) - newline += ( - indent - + indent - + indent - + 'ctype[j] = tmp.data;\n' - ) - newline += ( - indent + indent + indent + 'is_last = tmp.last;\n' - ) - else: # TODO: handle this case + newline += indent + indent + indent + 'in.read(tmp);\n' + newline += indent + indent + indent + 'ctype[j] = tmp.data;\n' + newline += indent + indent + indent + 'is_last = tmp.last;\n' + else: # TODO: handle this case newline += ( indent + indent @@ -181,17 +171,11 @@ def write_axi_wrapper(self, model): newline += indent + indent + 'for(unsigned j = 0; j < {result_t}::size; j++) {{\n' # newline += indent + indent + indent + '#pragma HLS UNROLL\n' if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': - newline += ( - indent + indent + indent + f'tmp.data = ({inp_axi_t}) (ctype[j]);\n' - ) + newline += indent + indent + indent + f'tmp.data = ({inp_axi_t}) (ctype[j]);\n' - newline += ( - indent + indent + indent + 'if(is_last) {{tmp.last = (((i+1)*(j+1))==N_OUT);}}\n' - ) + newline += indent + indent + indent + 'if(is_last) {{tmp.last = (((i+1)*(j+1))==N_OUT);}}\n' - newline += ( - indent + indent + indent + 'out.write(tmp);\n' - ) + newline += indent + indent + indent + 'out.write(tmp);\n' else: newline += indent + indent + indent + 'out[i * {result_t}::size + j] = output_axi_t(ctype[j]);\n' newline += 
indent + indent + '}}\n' @@ -253,7 +237,7 @@ def write_wrapper_test(self, model): ################### oldfile = f'{model.config.get_output_dir()}/{model.config.get_project_name()}_test.cpp' newfile = f'{model.config.get_output_dir()}/{model.config.get_project_name()}_test_wrapper.cpp' - + inp_axi_t, out_axi_t, inp, out = self.vitis_accelerator_ip_flow_config.get_corrected_types() f = open(oldfile) @@ -278,19 +262,15 @@ def write_wrapper_test(self, model): indent_amount = line.split(model.config.get_project_name())[0] newline = indent_amount + f'{model.config.get_project_name()}_axi(inputs,outputs);\n' elif inp.size_cpp() in line or inp.name in line or inp.type.name in line: - newline = ( - line.replace(inp.size_cpp(), 'N_IN').replace(inp.name, 'inputs').replace(inp.type.name, 'my_pkt') - ) + newline = line.replace(inp.size_cpp(), 'N_IN').replace(inp.name, 'inputs').replace(inp.type.name, 'my_pkt') elif out.size_cpp() in line or out.name in line or out.type.name in line: - newline = ( - line.replace(out.size_cpp(), 'N_OUT').replace(out.name, 'outputs').replace(out.type.name, 'my_pkt') - ) + newline = line.replace(out.size_cpp(), 'N_OUT').replace(out.name, 'outputs').replace(out.type.name, 'my_pkt') else: newline = line if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': if 'copy_data' in line: newline = newline.replace('copy_data', 'copy_data_axi').replace("0,", "") - + if io_type == 'io_stream': if 'nnet::fill_zero' in line: newline = newline.replace("nnet::fill_zero<", f"nnet::fill_zero<{inp.type.name}, ") @@ -331,7 +311,7 @@ def write_wrapper_test(self, model): elif inp.size_cpp() in line or inp.name in line or inp.type.name in line: newline = line.replace(inp.size_cpp(), 'N_IN').replace(inp.type.name, inp_axi_t) elif out.size_cpp() in line or out.name in line or out.type.name in line: - newline = line.replace(out.size_cpp(), 'N_OUT').replace(out.type.name, out_axi_t) + newline = line.replace(out.size_cpp(), 'N_OUT').replace(out.type.name, 
out_axi_t) else: newline = line fout.write(newline) From 9d9e6454c195505068504ae0f6d84d9c39d418c1 Mon Sep 17 00:00:00 2001 From: steltze Date: Wed, 20 Nov 2024 14:30:02 +0100 Subject: [PATCH 086/103] Remove unused file --- .../passes/fifo_depth_optimization.py | 69 ------------------- 1 file changed, 69 deletions(-) delete mode 100644 hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py b/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py deleted file mode 100644 index e983ca49fb..0000000000 --- a/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py +++ /dev/null @@ -1,69 +0,0 @@ -# from hls4ml.backends.vivado.passes.fifo_depth_optimization import ( -# generate_max_depth_file, -# get_vcd_data, -# populate_values, -# set_big_fifos, -# set_fifo_depth, -# ) -# from hls4ml.model.optimizer.optimizer import ConfigurableOptimizerPass, ModelOptimizerPass - - -# class FifoDepthOptimization(ConfigurableOptimizerPass, ModelOptimizerPass): -# def __init__(self): -# self.values = [] - -# def transform(self, model): -# # use `large_fifo_depth = 0` to keep the default fifo depth -# profiling_fifo_depth = getattr(self, 'profiling_fifo_depth', 100_000) - -# # check axi-stream or io-stream, if not one the 2 exit -# if not ( -# model.config.get_config_value('IOType') == 'io_stream' -# or model.config.get_config_value('AcceleratorConfig')['Interface'] == 'axi_stream' -# or model.config.get_config_value('AcceleratorConfig')['Interface'] == 'axi_master' -# ): -# raise Exception( -# 'To use this optimization you have to set `IOType` field to `io_stream` in the HLS config ' -# 'or `axi_stream` or `axi_master` in `AcceleratorConfig` interface field' -# ) - -# # initialize all the fifos to 10000 so that they will be automatically implemented in BRAMs and so they will be -# # profiled - -# if profiling_fifo_depth: -# 
set_big_fifos(model.output_vars, profiling_fifo_depth) - -# data = get_vcd_data(model) - -# for i in range(1, len(data['children'][0]['children'][0]['children'])): -# # wrapper fifos -# populate_values( -# self.values, -# data['children'][0]['children'][0]['children'][i]['name'], -# data['children'][0]['children'][0]['children'][i]['children'][0]['data'], -# data['children'][0]['children'][0]['children'][i]['children'][1]['data'], -# ) - -# n_elem = len(data['children'][0]['children'][0]['children'][0]['children']) -# for i in range(n_elem): -# name = data['children'][0]['children'][0]['children'][0]['children'][i]['name'] -# data_p = data['children'][0]['children'][0]['children'][0]['children'][i]['children'][0]['data'] -# depth = data['children'][0]['children'][0]['children'][0]['children'][i]['children'][1]['data'] -# populate_values(self.values, name, data_p, depth) - -# maxs = [{'name': i['name'], 'max': i['max'], 'depth': i['depth']} for i in self.values] - -# generate_max_depth_file(model, maxs) - -# set_fifo_depth(model, maxs) - -# inp = model.get_input_variables()[0] -# out = model.get_output_variables()[0] -# for x in maxs: -# if 'in_local' in x['name']: -# inp.pragma = (inp.pragma[0], x['max'] + 1) -# elif 'out_local' in x['name']: -# out.pragma = (out.pragma[0], x['max'] + 1) - -# print('[hls4ml] - FIFO optimization completed') -# return False From 80697c0a8931d206eec5cf0b2f6bb3f918c99cee Mon Sep 17 00:00:00 2001 From: steltze Date: Wed, 20 Nov 2024 14:33:05 +0100 Subject: [PATCH 087/103] Remove unused xclbin generator --- .../vitis_accelerator_ip_flow_backend.py | 38 ------------------- 1 file changed, 38 deletions(-) diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py index cd57df5a4a..0372a75b75 100644 --- a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py +++ 
b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py @@ -54,44 +54,6 @@ def build( return parse_vivado_report(model.config.get_output_dir()) - # def make_xclbin(self, model, platform='xilinx_u250_xdma_201830_2'): - # """Create the xclbin for the given model and target platform. - - # Args: - # model (ModelGraph): Compiled and build model. - # platform (str, optional): Development/Deployment target platform, must be installed first. - # The host machine only requires the deployment target platform. Refer to the Getting Started section of - # the Alveo guide. Defaults to 'xilinx_u250_xdma_201830_2'. - # """ - # curr_dir = os.getcwd() - # abs_path_dir = os.path.abspath(model.config.get_output_dir()) - # os.chdir(abs_path_dir) - # os.makedirs('xo_files', exist_ok=True) - # try: - # os.system('vivado -mode batch -source design.tcl') - # except Exception: - # print("Something went wrong, check the Vivado logs") - # project_name = model.config.get_project_name() - # ip_repo_path = abs_path_dir + '/' + project_name + '_prj' + '/solution1/impl/ip' - # os.makedirs('xclbin_files', exist_ok=True) - # os.chdir(abs_path_dir + '/xclbin_files') - # # TODO Add other platforms - # vitis_cmd = ( - # "v++ -t hw --platform " - # + platform - # + " --link ../xo_files/" - # + project_name - # + "_kernel.xo -o'" - # + project_name - # + "_kernel.xclbin' --user_ip_repo_paths " - # + ip_repo_path - # ) - # try: - # os.system(vitis_cmd) - # except Exception: - # print("Something went wrong, check the Vitis/Vivado logs") - # os.chdir(curr_dir) - def create_initial_config( self, board='pynq-z2', From f46782919774f7d5298b0e7376e0eba2c87111b2 Mon Sep 17 00:00:00 2001 From: steltze Date: Wed, 27 Nov 2024 09:58:00 +0100 Subject: [PATCH 088/103] Clean backends init --- hls4ml/backends/__init__.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/hls4ml/backends/__init__.py b/hls4ml/backends/__init__.py index 68562d75ed..ca3fff0e77 100644 --- 
a/hls4ml/backends/__init__.py +++ b/hls4ml/backends/__init__.py @@ -2,19 +2,16 @@ from hls4ml.backends.fpga.fpga_backend import FPGABackend # noqa: F401 from hls4ml.backends.quartus.quartus_backend import QuartusBackend from hls4ml.backends.symbolic.symbolic_backend import SymbolicExpressionBackend -from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_config import ( # noqa: F401 - VitisAcceleratorIPFlowConfig, -) from hls4ml.backends.vivado.vivado_backend import VivadoBackend from hls4ml.backends.vivado_accelerator.vivado_accelerator_backend import VivadoAcceleratorBackend from hls4ml.backends.vivado_accelerator.vivado_accelerator_config import VivadoAcceleratorConfig # noqa: F401 - -from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_backend import ( # isort: skip +from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_backend import ( VitisAcceleratorIPFlowBackend, ) - +from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_config import ( + VitisAcceleratorIPFlowConfig, +) from hls4ml.backends.catapult.catapult_backend import CatapultBackend # isort: skip - from hls4ml.backends.vitis.vitis_backend import VitisBackend # isort: skip register_backend('Vivado', VivadoBackend) From 4c7455092a72d83ffb5f4f1234530e5c70e1453b Mon Sep 17 00:00:00 2001 From: steltze Date: Wed, 27 Nov 2024 10:20:31 +0100 Subject: [PATCH 089/103] Fix backend import sequence --- hls4ml/backends/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hls4ml/backends/__init__.py b/hls4ml/backends/__init__.py index ca3fff0e77..7ba2ad4fbb 100644 --- a/hls4ml/backends/__init__.py +++ b/hls4ml/backends/__init__.py @@ -5,14 +5,14 @@ from hls4ml.backends.vivado.vivado_backend import VivadoBackend from hls4ml.backends.vivado_accelerator.vivado_accelerator_backend import VivadoAcceleratorBackend from hls4ml.backends.vivado_accelerator.vivado_accelerator_config import VivadoAcceleratorConfig # noqa: 
F401 +from hls4ml.backends.catapult.catapult_backend import CatapultBackend # isort: skip +from hls4ml.backends.vitis.vitis_backend import VitisBackend # isort: skip from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_backend import ( VitisAcceleratorIPFlowBackend, ) from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_config import ( VitisAcceleratorIPFlowConfig, ) -from hls4ml.backends.catapult.catapult_backend import CatapultBackend # isort: skip -from hls4ml.backends.vitis.vitis_backend import VitisBackend # isort: skip register_backend('Vivado', VivadoBackend) register_backend('VivadoAccelerator', VivadoAcceleratorBackend) From 542b9508b20356f561c4ea29ebb3e58f6b5ab716 Mon Sep 17 00:00:00 2001 From: steltze Date: Wed, 19 Feb 2025 14:39:19 +0100 Subject: [PATCH 090/103] Start cleaning up code --- .../supported_boards.json | 28 ------------------- .../vitis_accelerator_ip_flow_backend.py | 9 +----- .../vitis/nnet_utils/nnet_sepconv1d_stream.h | 2 +- .../vitis/nnet_utils/nnet_sepconv2d_stream.h | 2 +- .../myproject_axi.cpp | 2 +- .../vitis_accelerator_ip_flow/myproject_axi.h | 2 +- .../vivado/nnet_utils/nnet_sepconv1d_stream.h | 2 +- .../vivado/nnet_utils/nnet_sepconv2d_stream.h | 2 +- .../vitis_accelerator_ip_flow_writer.py | 18 ++++++------ 9 files changed, 16 insertions(+), 51 deletions(-) diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/supported_boards.json b/hls4ml/backends/vitis_accelerator_ip_flow/supported_boards.json index 1279ec22d0..4a54ea2924 100644 --- a/hls4ml/backends/vitis_accelerator_ip_flow/supported_boards.json +++ b/hls4ml/backends/vitis_accelerator_ip_flow/supported_boards.json @@ -10,33 +10,5 @@ "tcl_scripts": { "axi_stream": "axi_stream_design.tcl"}, "python_drivers": {"axi_stream": "axi_stream_driver.py"}, "c_drivers": {} - }, - "alveo-u50": { - "part": "xcu50-fsvh2104-2-e", - "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, - "python_drivers": {"axi_stream": "axi_stream_driver.py"}, - 
"krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, - "c_drivers": {} - }, - "alveo-u250": { - "part": "xcu250-figd2104-2L-e", - "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, - "python_drivers": {"axi_stream": "axi_stream_driver.py"}, - "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, - "c_drivers": {} - }, - "alveo-u200": { - "part": "xcu200-fsgd2104-2-e", - "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, - "python_drivers": {"axi_stream": "axi_stream_driver.py"}, - "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, - "c_drivers": {} - }, - "alveo-u280": { - "part": "xcu280-fsvh2892-2L-e", - "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, - "python_drivers": {"axi_stream": "axi_stream_driver.py"}, - "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, - "c_drivers": {} } } diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py index 0372a75b75..872f6383e4 100644 --- a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py +++ b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py @@ -21,7 +21,7 @@ def build( validation=False, export=False, vsynth=False, - # fifo_opt=False, + fifo_opt=False, bitfile=False, ): # run the VitisBackend build @@ -36,12 +36,7 @@ def build( vsynth=vsynth, # fifo_opt=fifo_opt, ) - # Get Config to view Board and Platform - # from hls4ml.backends import VitisAcceleratorIPFlowConfig - # vitis_accelerator_ip_flow_config = VitisAcceleratorIPFlowConfig( - # model.config, model.get_input_variables(), model.get_output_variables() - # ) # now make a bitfile if bitfile: curr_dir = os.getcwd() @@ -100,8 +95,6 @@ def create_initial_config( config['AcceleratorConfig']['Precision']['Output'] = {} config['AcceleratorConfig']['Precision']['Input'] = input_type # float, double or ap_fixed config['AcceleratorConfig']['Precision']['Output'] = output_type # float, double or 
ap_fixed - # if board.startswith('alveo'): - # config['AcceleratorConfig']['Platform'] = platform return config diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h index aad5d9a430..20b6fecb49 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h @@ -86,7 +86,7 @@ void separable_conv_1d_cl(hls::stream &data, hls::stream &res, #pragma HLS DATAFLOW hls::stream depthwise_res; - const unsigned res_depth = CONFIG_T::depthwise_config::out_width; + unsigned res_depth = CONFIG_T::depthwise_config::out_width; #pragma HLS STREAM variable=depthwise_res depth=res_depth depthwise_conv_1d_buffer_cl(data, depthwise_res, diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h index a119fb9e2a..a3747990e0 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h @@ -120,7 +120,7 @@ void separable_conv_2d_cl(hls::stream &data, hls::stream &res, #pragma HLS DATAFLOW hls::stream depthwise_res; - const unsigned res_depth = CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width; + unsigned res_depth = CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width; #pragma HLS STREAM variable=depthwise_res depth=res_depth depthwise_conv_2d_buffer_cl(data, depthwise_res, diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.cpp b/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.cpp index cf6c0b9c25..1655ce506b 100644 --- a/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.cpp +++ b/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.cpp @@ -1,6 +1,6 @@ // hls-fpga-machine-learning insert include -void myproject_axi(hls::stream &in, hls::stream &out) { +void myproject_axi(hls::stream &in, hls::stream &out) { 
// hls-fpga-machine-learning insert interface diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.h b/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.h index d0d88bfecf..1c019b5f10 100644 --- a/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.h +++ b/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.h @@ -6,5 +6,5 @@ // hls-fpga-machine-learning insert definitions -void myproject_axi(hls::stream &in, hls::stream &out); +void myproject_axi(hls::stream &in, hls::stream &out); #endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h index 11622efbf0..ca3143d01e 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h @@ -109,7 +109,7 @@ void separable_conv_1d_cl(hls::stream &data, hls::stream &res, #pragma HLS DATAFLOW hls::stream depthwise_res; - const unsigned res_depth = CONFIG_T::depthwise_config::out_width; + unsigned res_depth = CONFIG_T::depthwise_config::out_width; #pragma HLS STREAM variable=depthwise_res depth=res_depth depthwise_conv_1d_cl(data, depthwise_res, depthwise_weights, diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h index f5cafd2ee7..7f4dd866c9 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h @@ -133,7 +133,7 @@ void separable_conv_2d_cl(hls::stream &data, hls::stream &res, #pragma HLS DATAFLOW hls::stream depthwise_res; - const unsigned res_depth = CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width; + unsigned res_depth = CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width; #pragma HLS STREAM variable=depthwise_res depth=res_depth depthwise_conv_2d_cl(data, depthwise_res, depthwise_weights, diff --git 
a/hls4ml/writer/vitis_accelerator_ip_flow_writer.py b/hls4ml/writer/vitis_accelerator_ip_flow_writer.py index 78e1fa982d..4f96e38f33 100644 --- a/hls4ml/writer/vitis_accelerator_ip_flow_writer.py +++ b/hls4ml/writer/vitis_accelerator_ip_flow_writer.py @@ -40,7 +40,7 @@ def write_axi_wrapper(self, model): newline += f'static const unsigned N_IN = {inp.size()};\n' newline += f'static const unsigned N_OUT = {out.size()};\n' if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': - newline += 'typedef hls::axis my_pkt;\n' + newline += 'typedef hls::axis dma_data_packet;\n' # might need to make "float" a variable according to the # configuration set by the user and the DMA available data widths else: # TODO: handle this case @@ -73,7 +73,7 @@ def write_axi_wrapper(self, model): if io_type == 'io_parallel': # TODO: handle io_parallel newline += indent + inp.type.name + ' in_local[N_IN];\n' newline += indent + out.type.name + ' out_local[N_OUT];\n' - newline += indent + 'my_pkt tmp;\n' + newline += indent + 'dma_data_packet tmp;\n' elif io_type == 'io_stream': newline += indent + 'hls::stream<' + inp.type.name + '> in_local("input_1");\n' newline += indent + 'hls::stream<' + out.type.name + '> out_local("output_1");\n\n' @@ -124,7 +124,7 @@ def write_axi_wrapper(self, model): newline += indent + 'tmp.last = 0;\n' elif io_type == 'io_stream': newline = '' - newline += indent + 'my_pkt tmp;\n' + newline += indent + 'dma_data_packet tmp;\n' newline += indent + 'for(unsigned i = 0; i < N_IN / {input_t}::size; ++i) {{\n' # newline += indent + indent + '#pragma HLS PIPELINE\n' # TODO: check if needed @@ -252,19 +252,19 @@ def write_wrapper_test(self, model): newline = line.replace(f'{model.config.get_project_name()}.h', f'{model.config.get_project_name()}_axi.h') elif inp.definition_cpp() in line: newline = line.replace( - inp.definition_cpp(), 'hls::stream< my_pkt > inputs' + inp.definition_cpp(), 'hls::stream< dma_data_packet > inputs' ) # TODO instead of 
replacing strings, how about we use proper variables and their definition? elif out.definition_cpp() in line: - newline = line.replace(out.definition_cpp(), 'hls::stream< my_pkt > outputs') + newline = line.replace(out.definition_cpp(), 'hls::stream< dma_data_packet > outputs') elif 'unsigned short' in line: newline = '' elif f'{model.config.get_project_name()}(' in line: indent_amount = line.split(model.config.get_project_name())[0] newline = indent_amount + f'{model.config.get_project_name()}_axi(inputs,outputs);\n' elif inp.size_cpp() in line or inp.name in line or inp.type.name in line: - newline = line.replace(inp.size_cpp(), 'N_IN').replace(inp.name, 'inputs').replace(inp.type.name, 'my_pkt') + newline = line.replace(inp.size_cpp(), 'N_IN').replace(inp.name, 'inputs').replace(inp.type.name, 'dma_data_packet') elif out.size_cpp() in line or out.name in line or out.type.name in line: - newline = line.replace(out.size_cpp(), 'N_OUT').replace(out.name, 'outputs').replace(out.type.name, 'my_pkt') + newline = line.replace(out.size_cpp(), 'N_OUT').replace(out.name, 'outputs').replace(out.type.name, 'dma_data_packet') else: newline = line if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': @@ -300,9 +300,9 @@ def write_wrapper_test(self, model): if f'{model.config.get_project_name()}.h' in line: newline = line.replace(f'{model.config.get_project_name()}.h', f'{model.config.get_project_name()}_axi.h') elif inp.definition_cpp(name_suffix='_ap') in line: - newline = line.replace(inp.definition_cpp(name_suffix='_ap'), f'hls::stream< my_pkt > {inp.name}_ap') + newline = line.replace(inp.definition_cpp(name_suffix='_ap'), f'hls::stream< dma_data_packet > {inp.name}_ap') elif out.definition_cpp(name_suffix='_ap') in line: - newline = line.replace(out.definition_cpp(name_suffix='_ap'), f'hls::stream< my_pkt > {out.name}_ap') + newline = line.replace(out.definition_cpp(name_suffix='_ap'), f'hls::stream< dma_data_packet > {out.name}_ap') elif 
f'{model.config.get_project_name()}(' in line: indent_amount = line.split(model.config.get_project_name())[0] newline = indent_amount + '{}_axi({}_ap,{}_ap);\n'.format( From c78aec2d7c17346e7ff1806e6c4b994748c5cfdb Mon Sep 17 00:00:00 2001 From: steltze Date: Wed, 19 Feb 2025 14:59:22 +0100 Subject: [PATCH 091/103] Start integrating FIFO depth optimizer --- .../passes/fifo_depth_optimization.py | 247 ++++++++++++++++++ .../vitis_accelerator_ip_flow_backend.py | 11 +- hls4ml/templates/vivado/build_prj.tcl | 4 + .../vitis_accelerator_ip_flow_writer.py | 6 +- .../test_optimization/test_fifo_depth.py | 195 ++++++++++++++ 5 files changed, 453 insertions(+), 10 deletions(-) create mode 100644 hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py create mode 100644 test/pytest/test_optimization/test_fifo_depth.py diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py b/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py new file mode 100644 index 0000000000..de7b61075e --- /dev/null +++ b/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py @@ -0,0 +1,247 @@ +import json +import os + +from hls4ml.model.optimizer.optimizer import ConfigurableOptimizerPass, ModelOptimizerPass + + +def initialize_large_fifos(model, profiling_fifo_depth): + """Set all FIFO depths equal to a large value so that they can be profiled. + + Args: + model (ModelGraph): The model to which FIFO depth optimization is applied. + profiling_fifo_depth (int): A large non-negative integer, must be larger than the max expected depth of the FIFOs. + + Returns: + Dict[str, int]: A dictionary containing FIFO names as keys and their initial depths as values is returned for + comparison with the optimized depths. 
+ """ + + # filter all the output variables and keep only the internal FIFOs, excluding output objects that are not FIFOs and the + # input and output FIFOs as they can't be profiled and are implementation dependant i.e AXI Stream, AXI Master or + # connected to another IP + vars_to_profile = { + output_variable_name: output_variable + for output_variable_name, output_variable in model.output_vars.items() + if ("VivadoStreamVariable" in str(type(output_variable))) + and output_variable != model.get_output_variables()[0] + and output_variable != model.get_input_variables()[0] + } + + # initialize all the fifos to `profiling_fifo_depth` so that they will be automatically implemented in BRAMs and so + # they will be profiled. Alternatively, "config_dataflow -override_user_fifo_depth profiling_fifo_depth" can be + # used inside build_prj.tcl to override all FIFO depths with the specified value + initial_fifo_depths = {} + for output_variable in vars_to_profile.values(): + if output_variable.pragma: + initial_fifo_depths[output_variable.name] = int(output_variable.pragma[1]) + output_variable.pragma = (output_variable.pragma[0], profiling_fifo_depth) + return initial_fifo_depths + + +def override_test_bench(model): + """In order for the FIFO depth profiling to produce correct results, it is necessary for the cosimulation to + call the top function - Vitis IP at **least twice**. The test bench produced by the Vivado Writer is + overwritten by adding a for-loop over the top function. + + Args: + model (ModelGraph): The model to which FIFO depth optimization is applied. 
+ """ + indent = " " + path_to_old_test_bench = f"{model.config.get_output_dir()}/{model.config.get_project_name()}_test.cpp" + path_to_new_test_bench = f"{model.config.get_output_dir()}/{model.config.get_project_name()}_new_test.cpp" + + newline = "" + second_part_of_testbench = False + with open(path_to_old_test_bench) as old_test_bench: + file_iterator = iter(old_test_bench) + for line in file_iterator: + + if "// hls-fpga-machine-learning insert zero" in line: + newline += indent + indent + "const unsigned PROFILING_ITERATIONS = 2;\n" + newline += ( + indent + + indent + + "for(unsigned batch_iteration = 0; batch_iteration < PROFILING_ITERATIONS; ++batch_iteration) {\n" + ) + newline += line + second_part_of_testbench = True + elif ("// hls-fpga-machine-learning insert tb-output" in line) and second_part_of_testbench: + newline += line + newline += next(file_iterator) + newline += indent + "}\n" + else: + newline += line + + with open(path_to_new_test_bench, "w+") as new_test_bench: + new_test_bench.write(newline) + + # replace the old test bench with the new test bench that includes a for-loop + os.replace(path_to_new_test_bench, path_to_old_test_bench) + return + + +def execute_cosim_to_profile_fifos(model): + """Execute a cosimulation with a testh bench that calls the top function - Vitis IP at **least twice**, + to properly profile the max FIFO depths. The function will momentarily replace the initial test bench + with a suitable one for the optimization, and after the optimizer pass, the original test bench reinitialized. + + Args: + model (ModelGraph): The model to which FIFO depth optimization is applied. 
+ """ + model.write() + + override_test_bench(model) + + model.build( + reset=False, + csim=False, + synth=True, + cosim=True, + validation=False, + export=False, + vsynth=False, + fifo_opt=True, + ) + + return + + +def get_vitis_optimized_fifo_depths(model): + """Parse the files generated by the cosimulation to retrieve the optimized depths for the FIFOs. + Attention, only the FIFOs between the layers are profiled! + + Args: + model (ModelGraph): The model to which FIFO depth optimization is applied. + + Returns: + Dict[str, int]: A dictionary that contains the FIFO names as keys and the optimized depths as values. + """ + # channel.zip is generated after the cosimulation and contains the chan_status*.csv files + # in the chan_status*.csv files the max depth achieved during cosimulation can be found at the last (4th) line + path_to_zip_file = ( + model.config.get_output_dir() + + "/" + + model.config.get_project_name() + + "_prj" + + "/solution1/.autopilot/db/channel_depth_info/" + ) + + os.system(f"unzip -q -o {path_to_zip_file}channel.zip -d {path_to_zip_file}") + + # the channel_info.csv file contains the mapping of each fifo name (i.e layer4_out_U) to the respective + # chan_status*.csv file + names_file_path = ( + model.config.get_output_dir() + + "/" + + model.config.get_project_name() + + "_prj" + + "/solution1/.autopilot/db/channel_info.csv" + ) + + csv_fifo_depth_files = {} + with open(names_file_path) as names_file: + for line in names_file: + layer_name = line.split(",")[1] + csv_file_name = line.split(",")[3][:-1] + csv_fifo_depth_files[layer_name] = csv_file_name + + optmized_fifo_depths = {} + for layer_name, file_name in csv_fifo_depth_files.items(): + with open(path_to_zip_file + file_name) as chan_status_file: + lines = chan_status_file.readlines() + optmized_fifo_depths[layer_name[:-2]] = int( + lines[-1] + ) # remove "_U" from the layer name string and keep the last line of the file that contains the max depth + + return optmized_fifo_depths + + 
+def generate_depths_file(model, initial_fifo_depths, optimized_fifo_depths): + """Generate a json file with the names of the FIFOs, the initial depths set by hls4ml and their optimized depths, + for post-processing. The json file is not used by the rest of the pipeline, it is only produced for the user. + + Args: + model (ModelGraph): The model to which FIFO depth optimization is applied. + initial_fifo_depths (Dict[str, int]): A dictionary that contains the FIFO names as keys and the initial + depths as values. + optmized_fifo_depths (Dict[str, int]): A dictionary that contains the FIFO names as keys and the optimized + depths as values. + """ + depths = {} + for fifo_name in initial_fifo_depths.keys(): + depths[fifo_name] = {} + depths[fifo_name]['initial'] = initial_fifo_depths[fifo_name] + depths[fifo_name]['optimized'] = optimized_fifo_depths[fifo_name] + + with open(model.config.get_output_dir() + "/fifo_depths.json", "w") as f: + json.dump(depths, f, indent=4) + + +def set_optimized_fifo_depths(model, optimized_fifo_depths): + """Set the new optimized FIFO depths. + + Args: + model (ModelGraph): The model to which FIFO depth optimization is applied. + optmized_fifo_depths (Dict[str, int]): A dictionary that contains the FIFO names as keys and the optimized + depths as values. 
+ """ + + # iterate through the layer output FIFOs + for output_variable in model.output_vars.values(): + if "VivadoStreamVariable" in str(type(output_variable)): + if output_variable.pragma: + + if output_variable.name not in optimized_fifo_depths.keys(): + continue + + filtered_depth = optimized_fifo_depths[output_variable.name] + output_variable.pragma = (output_variable.pragma[0], filtered_depth) + return + + +class FifoDepthOptimization(ConfigurableOptimizerPass, ModelOptimizerPass): + def __init__(self): + pass + + def transform(self, model): + """Perform FIFO depth optimization between the FIFOs of all layers to reduce resource utilization as the + initial FIFOs set by hls4ml might be larger than required. At the end of the optimization the FIFOs will + have the largest depths achieved during cosimulation without causing any deadlocks between the layers + (producer-consumer), thus no additional delays between the layers. In some cases, this optimization + might lead to bigger FIFOs than initially set by the hls4ml tool in order to prevent deadlocks. + + Args: + model (ModelGraph): The model to which FIFO depth optimization is applied. + + Raises: + ValueError: If the FIFO depth for profiling provided by the user is not a non-negative integer. + RuntimeError: If the IO type is not set to "io_stream". 
+ + Returns: + bool: The execution state of the Optimzer Pass + """ + + # use `large_fifo_depth = 0` to keep the default fifo depth + # consider changing 100_000 either with a very very large value > of any total bram storage space + # or via vitis 2023.2 c-simulation + profiling_fifo_depth = getattr(self, "profiling_fifo_depth", 100_000) + + if not isinstance(profiling_fifo_depth, int) or profiling_fifo_depth <= 0: + raise ValueError("The FIFO depth for profiling (profiling_fifo_depth variable) must be a non-negative integer.") + + # check axi-stream or io-stream + if not (model.config.get_config_value("IOType") == "io_stream"): + raise RuntimeError("To use this optimization you have to set `IOType` field to `io_stream` in the HLS config.") + + initial_fifo_depths = initialize_large_fifos(model, profiling_fifo_depth) + + execute_cosim_to_profile_fifos(model) + + optimized_fifo_depths = get_vitis_optimized_fifo_depths(model) + + generate_depths_file(model, initial_fifo_depths, optimized_fifo_depths) + + set_optimized_fifo_depths(model, optimized_fifo_depths) + + print("[hls4ml] - FIFO optimization completed") + return False diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py index 872f6383e4..08eeef0032 100644 --- a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py +++ b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py @@ -34,7 +34,6 @@ def build( validation=validation, export=export, vsynth=vsynth, - # fifo_opt=fifo_opt, ) # now make a bitfile @@ -59,8 +58,7 @@ def create_initial_config( interface='axi_stream', driver='python', input_type='float', - output_type='float', - platform='xilinx_u250_xdma_201830_2', + output_type='float' ): ''' Create initial accelerator config with default parameters @@ -110,6 +108,9 @@ def _register_flows(self): self._writer_flow = register_flow('write', 
writer_passes, requires=[vivado_ip], backend=self.name) self._default_flow = vivado_ip - # fifo_depth_opt_passes = ['vivadoaccelerator:fifo_depth_optimization'] + writer_passes + # Register the fifo depth optimization flow which is different from the one for vivado + fifo_depth_opt_passes = [ + 'vitisacceleratoripflow:fifo_depth_optimization' + ] + writer_passes # After optimization, a new project will be written - # register_flow('fifo_depth_optimization', fifo_depth_opt_passes, requires=[vivado_ip], backend=self.name) + register_flow('fifo_depth_optimization', fifo_depth_opt_passes, requires=['vitisacceleratoripflow:ip'], backend=self.name) diff --git a/hls4ml/templates/vivado/build_prj.tcl b/hls4ml/templates/vivado/build_prj.tcl index 50596091f2..9dbab5b9d6 100644 --- a/hls4ml/templates/vivado/build_prj.tcl +++ b/hls4ml/templates/vivado/build_prj.tcl @@ -196,6 +196,10 @@ if {$opt(cosim)} { if {$opt(fifo_opt)} { puts "\[hls4ml\] - FIFO optimization started" add_vcd_instructions_tcl + + if {[string equal "$backend" "vivado"] || [string equal $backend "vivadoaccelerator"]} { + add_vcd_instructions_tcl + } } remove_recursive_log_wave diff --git a/hls4ml/writer/vitis_accelerator_ip_flow_writer.py b/hls4ml/writer/vitis_accelerator_ip_flow_writer.py index 4f96e38f33..977a6d6e04 100644 --- a/hls4ml/writer/vitis_accelerator_ip_flow_writer.py +++ b/hls4ml/writer/vitis_accelerator_ip_flow_writer.py @@ -329,11 +329,6 @@ def write_board_script(self, model): os.path.join(filedir, self.vitis_accelerator_ip_flow_config.get_tcl_file_path()), f'{model.config.get_output_dir()}/design.tcl', ) - # Generic alveo board - if self.vitis_accelerator_ip_flow_config.get_board().startswith('alveo'): - src_dir = os.path.join(filedir, self.vitis_accelerator_ip_flow_config.get_krnl_rtl_src_dir()) - dst_dir = os.path.abspath(model.config.get_output_dir()) + '/src' - copy_tree(src_dir, dst_dir) ################### # project.tcl @@ -356,6 +351,7 @@ def write_board_script(self, model): 
f.write(f'set bit_width_hls_output {in_bit}\n') f.write(f'set bit_width_hls_input {out_bit}\n') f.close() + return def write_driver(self, model): filedir = os.path.dirname(os.path.abspath(__file__)) diff --git a/test/pytest/test_optimization/test_fifo_depth.py b/test/pytest/test_optimization/test_fifo_depth.py new file mode 100644 index 0000000000..1e99a7adc9 --- /dev/null +++ b/test/pytest/test_optimization/test_fifo_depth.py @@ -0,0 +1,195 @@ +import json +import os +import re +from pathlib import Path + +import numpy as np +import pytest +import qonnx.core.onnx_exec as oxe +from qonnx.core.modelwrapper import ModelWrapper +from tensorflow.keras.layers import SeparableConv2D +from tensorflow.keras.models import Sequential + +import hls4ml +from hls4ml.backends.vitis.passes.fifo_depth_optimization import override_test_bench + +test_root_path = Path(__file__).parent +example_model_path = (test_root_path / '../../../example-models').resolve() + +backend_options = ['Vitis'] + + +def parse_cosim_report(project_path): + """Parse the cosimulation report to check whether the cosimulation passed or failed and therefore a deadlock is + detected. 
+ """ + prj_dir = None + top_func_name = None + + project_tcl_path = project_path + '/project.tcl' + + with open(project_tcl_path) as f: + for line in f.readlines(): + if 'set project_name' in line: + top_func_name = line.split('"')[-2] + prj_dir = top_func_name + '_prj' + + cosim_file_path = project_path + '/' + prj_dir + f'/solution1/sim/report/{top_func_name}_cosim.rpt' + + if os.path.isfile(cosim_file_path): + return cosim_file_path + else: + raise FileNotFoundError("Co-simulation report not found.") + + +def run_fifo_depth_optimization_keras(backend, profiling_fifo_depth, io_type): + """Execute the FIFO depth optimization sequence on a dummy Keras model.""" + + # create a keras model + input_shape = (128, 128, 3) + activation = 'relu' + kernel_size = (3, 3) + padding = 'same' + + model = Sequential() + model.add( + SeparableConv2D(filters=4, kernel_size=kernel_size, padding=padding, activation=activation, input_shape=input_shape) + ) + model.add(SeparableConv2D(filters=8, kernel_size=kernel_size, padding=padding, activation=activation)) + model.compile(optimizer='adam', loss='mse') + + X_input = np.random.rand(1, *input_shape) + keras_prediction = model.predict(X_input) + + config = hls4ml.utils.config_from_keras_model(model, default_precision='ap_fixed<32, 16>') + + # include the FIFO Depth optimizer do the flows + config['Flows'] = ['vitis:fifo_depth_optimization'] + hls4ml.model.optimizer.get_optimizer('vitis:fifo_depth_optimization').configure( + profiling_fifo_depth=profiling_fifo_depth + ) + + output_dir = str(test_root_path / f'hls4mlprj_fifo_depth_optimization_keras_backend_{backend}') + + # execute fifo optimization + hls_model = hls4ml.converters.convert_from_keras_model( + model, io_type=io_type, hls_config=config, output_dir=output_dir, backend=backend + ) + + hls_model.compile() + hls_prediction = hls_model.predict(X_input).reshape(keras_prediction.shape) + + np.testing.assert_allclose(hls_prediction, keras_prediction, rtol=0, atol=0.01) + + # 
check that the FIFOs have been optimized succesfully + fifo_depth_optimization_checks(hls_model) + + +def fifo_depth_optimization_checks(hls_model): + """Execute the FIFO depth optimization sequence on an hls4ml model.""" + + # force the top-function to execute twice in the cosimulation, to verify no deadlocks occur even + # when streaming multiple inputs into the network + override_test_bench(hls_model) + + # build the new project with optimized depths and execute cosimulation to check for deadlocks + # due to the new FIFO depths + hls_model.build(reset=False, csim=False, synth=True, cosim=True) + + # checks if the fifo depths decreased/were optimized + fifo_depths = {} + with open(hls_model.config.get_output_dir() + "/fifo_depths.json") as fifo_depths_file: + fifo_depths = json.load(fifo_depths_file) + + fifo_depths_decreased = all(fifo['optimized'] < fifo['initial'] for fifo in fifo_depths.values()) + + # checks that the cosimulation ran succesfully without detecting deadlocks + cosim_report_path = parse_cosim_report(hls_model.config.get_output_dir()) + + with open(cosim_report_path) as cosim_report_file: + cosim_succesful = any("Pass" in line for line in cosim_report_file) + + assert fifo_depths_decreased and cosim_succesful + + +def expect_exception(error, message, backend, profiling_fifo_depth, io_type): + with pytest.raises(error, match=re.escape(message)): + run_fifo_depth_optimization_keras(backend, profiling_fifo_depth, io_type) + + +@pytest.mark.skip(reason='Skipping synthesis tests for now') +@pytest.mark.parametrize('backend', backend_options) +@pytest.mark.parametrize('profiling_fifo_depth', [-2, 3.14, "a"]) +def test_value_error(backend, profiling_fifo_depth): + """Test the FIFO depth optimizer with faulty inputs of profiling_fifo_depth to verify that an exception is raised.""" + message = "The FIFO depth for profiling (profiling_fifo_depth variable) must be a non-negative integer." 
+ expect_exception(ValueError, message, backend, profiling_fifo_depth, io_type='io_stream') + + +@pytest.mark.skip(reason='Skipping synthesis tests for now') +@pytest.mark.parametrize('backend', backend_options) +def test_runtime_error(backend): + """Test the FIFO depth optimizer with io_type='io_parallel' to verify that an exception is raised.""" + message = "To use this optimization you have to set `IOType` field to `io_stream` in the HLS config." + expect_exception(RuntimeError, message, backend, profiling_fifo_depth=200_000, io_type='io_parallel') + + +@pytest.mark.skip(reason='Skipping synthesis tests for now') +@pytest.mark.parametrize('backend', backend_options) +def test_successful_execution_of_dummy_keras(backend): + """Test the correct execution of the FIFO depth optimizer.""" + run_fifo_depth_optimization_keras(backend, profiling_fifo_depth=200_000, io_type='io_stream') + + +def get_branched_model(): + """ + Load branched model, already channels-last and cleaned + """ + dl_file = str(example_model_path / "onnx/branched_model_ch_last.onnx") + assert os.path.isfile(dl_file) + model = ModelWrapper(dl_file) + return model + + +def run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth, io_type, model): + """Execute the FIFO depth optimization sequence on a ONNX/QONNX model.""" + + ishape = tuple(model.get_tensor_shape(model.graph.input[0].name)) + X = np.random.uniform(low=0, high=1, size=np.prod(ishape)).reshape(ishape) + X = (np.round(X * 2**16) * 2**-16).astype(np.float32) + idict = {model.graph.input[0].name: X} + y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] + + config = hls4ml.utils.config.config_from_onnx_model( + model, granularity='name', backend=backend, default_precision='ap_fixed<15,2,AP_RND_CONV>' + ) + + # add this line to remove the linear layer that quantizes the input of the NN + config['LayerName']['global_in']['Precision']['result'] = 'fixed<4,0,AP_RND_CONV,AP_SAT,0>' + + config['Flows'] = 
['vitis:fifo_depth_optimization'] + hls4ml.model.optimizer.get_optimizer('vitis:fifo_depth_optimization').configure( + profiling_fifo_depth=profiling_fifo_depth + ) + + output_dir = str(test_root_path / f'hls4mlprj_fifo_depth_optimization_branched_model_backend_{backend}') + + hls_model = hls4ml.converters.convert_from_onnx_model( + model, + output_dir=output_dir, + io_type=io_type, + backend=backend, + hls_config=config, + ) + hls_model.compile() + y_hls4ml = hls_model.predict(np.ascontiguousarray(X)) + np.testing.assert_array_equal(y_qonnx.ravel(), y_hls4ml.ravel()) + + fifo_depth_optimization_checks(hls_model) + + +@pytest.mark.skip(reason='Skipping synthesis tests for now') +@pytest.mark.parametrize('backend', backend_options) +def test_successful_execution_of_tiny_unet(backend): + """Test the correct execution of the FIFO depth optimizer.""" + run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth=200_000, io_type='io_stream', model=get_branched_model()) From 62b5c277c328a04eea577925795b554662d1180d Mon Sep 17 00:00:00 2001 From: steltze Date: Thu, 20 Feb 2025 15:00:01 +0100 Subject: [PATCH 092/103] Fix FIFO depth optimizer --- hls4ml/backends/vitis/vitis_backend.py | 24 ++++- .../passes/fifo_depth_optimization.py | 18 +++- .../vitis_accelerator_ip_flow_backend.py | 9 +- .../test_optimization/test_fifo_depth.py | 102 +++++++++--------- 4 files changed, 94 insertions(+), 59 deletions(-) diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py index 89484237f3..ff2104c795 100644 --- a/hls4ml/backends/vitis/vitis_backend.py +++ b/hls4ml/backends/vitis/vitis_backend.py @@ -75,7 +75,18 @@ def create_initial_config( return config - def build(self, model, reset=False, csim=True, synth=True, cosim=False, validation=False, export=False, vsynth=False): + def build( + self, + model, + reset=False, + csim=True, + synth=True, + cosim=False, + validation=False, + export=False, + vsynth=False, + fifo_opt=False, + ): if 'linux' in 
sys.platform: found = os.system('command -v vitis_hls > /dev/null') if found != 0: @@ -87,7 +98,16 @@ def build(self, model, reset=False, csim=True, synth=True, cosim=False, validati ( 'vitis_hls -f build_prj.tcl "reset={reset} csim={csim} synth={synth} cosim={cosim} ' 'validation={validation} export={export} vsynth={vsynth}"' - ).format(reset=reset, csim=csim, synth=synth, cosim=cosim, validation=validation, export=export, vsynth=vsynth) + ).format( + reset=reset, + csim=csim, + synth=synth, + cosim=cosim, + validation=validation, + export=export, + vsynth=vsynth, + fifo_opt=fifo_opt, + ) ) os.chdir(curr_dir) diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py b/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py index de7b61075e..38706047a7 100644 --- a/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py +++ b/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py @@ -35,6 +35,14 @@ def initialize_large_fifos(model, profiling_fifo_depth): if output_variable.pragma: initial_fifo_depths[output_variable.name] = int(output_variable.pragma[1]) output_variable.pragma = (output_variable.pragma[0], profiling_fifo_depth) + + inp = model.get_input_variables()[0] + initial_fifo_depths['in_local'] = int(inp.pragma[1]) + inp.pragma = (inp.pragma[0], profiling_fifo_depth) + + outp = model.get_output_variables()[0] + initial_fifo_depths['out_local'] = int(outp.pragma[1]) + outp.pragma = (outp.pragma[0], profiling_fifo_depth) return initial_fifo_depths @@ -188,7 +196,7 @@ def set_optimized_fifo_depths(model, optimized_fifo_depths): # iterate through the layer output FIFOs for output_variable in model.output_vars.values(): - if "VivadoStreamVariable" in str(type(output_variable)): + if ("VivadoStreamVariable" in str(type(output_variable))) or (output_variable.name == 'in_local') or (output_variable.name == 'out_local'): if output_variable.pragma: if output_variable.name not 
in optimized_fifo_depths.keys(): @@ -196,6 +204,12 @@ def set_optimized_fifo_depths(model, optimized_fifo_depths): filtered_depth = optimized_fifo_depths[output_variable.name] output_variable.pragma = (output_variable.pragma[0], filtered_depth) + + inp = model.get_input_variables()[0] + inp.pragma = (inp.pragma[0], optimized_fifo_depths['in_local']) + + outp = model.get_output_variables()[0] + outp.pragma = (inp.pragma[0], optimized_fifo_depths['out_local']) return @@ -227,7 +241,7 @@ def transform(self, model): profiling_fifo_depth = getattr(self, "profiling_fifo_depth", 100_000) if not isinstance(profiling_fifo_depth, int) or profiling_fifo_depth <= 0: - raise ValueError("The FIFO depth for profiling (profiling_fifo_depth variable) must be a non-negative integer.") + raise ValueError("The FIFO depth for profiling (profiling_fifo_depth variable) must be a positive integer.") # check axi-stream or io-stream if not (model.config.get_config_value("IOType") == "io_stream"): diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py index 08eeef0032..c5dff6f789 100644 --- a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py +++ b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py @@ -34,6 +34,7 @@ def build( validation=validation, export=export, vsynth=vsynth, + fifo_opt=True, ) # now make a bitfile @@ -103,14 +104,14 @@ def get_writer_flow(self): return self._writer_flow def _register_flows(self): - vivado_ip = 'vivado:ip' + # vivado_ip = 'vivado:ip' writer_passes = ['make_stamp', 'vitisacceleratoripflow:write_hls'] - self._writer_flow = register_flow('write', writer_passes, requires=[vivado_ip], backend=self.name) - self._default_flow = vivado_ip + self._writer_flow = register_flow('write', writer_passes, requires=['vitis:ip'], backend=self.name) + # self._default_flow = vivado_ip # Register the fifo 
depth optimization flow which is different from the one for vivado fifo_depth_opt_passes = [ 'vitisacceleratoripflow:fifo_depth_optimization' ] + writer_passes # After optimization, a new project will be written - register_flow('fifo_depth_optimization', fifo_depth_opt_passes, requires=['vitisacceleratoripflow:ip'], backend=self.name) + register_flow('fifo_depth_optimization', fifo_depth_opt_passes, requires=['vitis:ip'], backend=self.name) diff --git a/test/pytest/test_optimization/test_fifo_depth.py b/test/pytest/test_optimization/test_fifo_depth.py index 1e99a7adc9..8589acbe51 100644 --- a/test/pytest/test_optimization/test_fifo_depth.py +++ b/test/pytest/test_optimization/test_fifo_depth.py @@ -5,18 +5,18 @@ import numpy as np import pytest -import qonnx.core.onnx_exec as oxe -from qonnx.core.modelwrapper import ModelWrapper +# import qonnx.core.onnx_exec as oxe +# from qonnx.core.modelwrapper import ModelWrapper from tensorflow.keras.layers import SeparableConv2D from tensorflow.keras.models import Sequential import hls4ml -from hls4ml.backends.vitis.passes.fifo_depth_optimization import override_test_bench +from hls4ml.backends.vitis_accelerator_ip_flow.passes.fifo_depth_optimization import override_test_bench test_root_path = Path(__file__).parent example_model_path = (test_root_path / '../../../example-models').resolve() -backend_options = ['Vitis'] +backend_options = ['VitisAcceleratorIPFlow'] def parse_cosim_report(project_path): @@ -34,7 +34,7 @@ def parse_cosim_report(project_path): top_func_name = line.split('"')[-2] prj_dir = top_func_name + '_prj' - cosim_file_path = project_path + '/' + prj_dir + f'/solution1/sim/report/{top_func_name}_cosim.rpt' + cosim_file_path = project_path + '/' + prj_dir + f'/solution1/sim/report/{top_func_name}_axi_cosim.rpt' if os.path.isfile(cosim_file_path): return cosim_file_path @@ -46,7 +46,7 @@ def run_fifo_depth_optimization_keras(backend, profiling_fifo_depth, io_type): """Execute the FIFO depth optimization 
sequence on a dummy Keras model.""" # create a keras model - input_shape = (128, 128, 3) + input_shape = (32, 32, 3) activation = 'relu' kernel_size = (3, 3) padding = 'same' @@ -64,8 +64,8 @@ def run_fifo_depth_optimization_keras(backend, profiling_fifo_depth, io_type): config = hls4ml.utils.config_from_keras_model(model, default_precision='ap_fixed<32, 16>') # include the FIFO Depth optimizer do the flows - config['Flows'] = ['vitis:fifo_depth_optimization'] - hls4ml.model.optimizer.get_optimizer('vitis:fifo_depth_optimization').configure( + config['Flows'] = ['vitisacceleratoripflow:fifo_depth_optimization'] + hls4ml.model.optimizer.get_optimizer('vitisacceleratoripflow:fifo_depth_optimization').configure( profiling_fifo_depth=profiling_fifo_depth ) @@ -73,7 +73,7 @@ def run_fifo_depth_optimization_keras(backend, profiling_fifo_depth, io_type): # execute fifo optimization hls_model = hls4ml.converters.convert_from_keras_model( - model, io_type=io_type, hls_config=config, output_dir=output_dir, backend=backend + model, io_type=io_type, hls_config=config, output_dir=output_dir, backend=backend, clock_period=10 ) hls_model.compile() @@ -134,62 +134,62 @@ def test_runtime_error(backend): expect_exception(RuntimeError, message, backend, profiling_fifo_depth=200_000, io_type='io_parallel') -@pytest.mark.skip(reason='Skipping synthesis tests for now') +# @pytest.mark.skip(reason='Skipping synthesis tests for now') @pytest.mark.parametrize('backend', backend_options) def test_successful_execution_of_dummy_keras(backend): """Test the correct execution of the FIFO depth optimizer.""" run_fifo_depth_optimization_keras(backend, profiling_fifo_depth=200_000, io_type='io_stream') -def get_branched_model(): - """ - Load branched model, already channels-last and cleaned - """ - dl_file = str(example_model_path / "onnx/branched_model_ch_last.onnx") - assert os.path.isfile(dl_file) - model = ModelWrapper(dl_file) - return model +# def get_branched_model(): +# """ +# Load branched 
model, already channels-last and cleaned +# """ +# dl_file = str(example_model_path / "onnx/branched_model_ch_last.onnx") +# assert os.path.isfile(dl_file) +# model = ModelWrapper(dl_file) +# return model -def run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth, io_type, model): - """Execute the FIFO depth optimization sequence on a ONNX/QONNX model.""" +# def run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth, io_type, model): +# """Execute the FIFO depth optimization sequence on a ONNX/QONNX model.""" - ishape = tuple(model.get_tensor_shape(model.graph.input[0].name)) - X = np.random.uniform(low=0, high=1, size=np.prod(ishape)).reshape(ishape) - X = (np.round(X * 2**16) * 2**-16).astype(np.float32) - idict = {model.graph.input[0].name: X} - y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] +# ishape = tuple(model.get_tensor_shape(model.graph.input[0].name)) +# X = np.random.uniform(low=0, high=1, size=np.prod(ishape)).reshape(ishape) +# X = (np.round(X * 2**16) * 2**-16).astype(np.float32) +# idict = {model.graph.input[0].name: X} +# y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] - config = hls4ml.utils.config.config_from_onnx_model( - model, granularity='name', backend=backend, default_precision='ap_fixed<15,2,AP_RND_CONV>' - ) +# config = hls4ml.utils.config.config_from_onnx_model( +# model, granularity='name', backend=backend, default_precision='ap_fixed<15,2,AP_RND_CONV>' +# ) - # add this line to remove the linear layer that quantizes the input of the NN - config['LayerName']['global_in']['Precision']['result'] = 'fixed<4,0,AP_RND_CONV,AP_SAT,0>' +# # add this line to remove the linear layer that quantizes the input of the NN +# config['LayerName']['global_in']['Precision']['result'] = 'fixed<4,0,AP_RND_CONV,AP_SAT,0>' - config['Flows'] = ['vitis:fifo_depth_optimization'] - hls4ml.model.optimizer.get_optimizer('vitis:fifo_depth_optimization').configure( - 
profiling_fifo_depth=profiling_fifo_depth - ) +# config['Flows'] = ['vitisacceleratoripflow:fifo_depth_optimization'] +# hls4ml.model.optimizer.get_optimizer('vitisacceleratoripflow:fifo_depth_optimization').configure( +# profiling_fifo_depth=profiling_fifo_depth +# ) - output_dir = str(test_root_path / f'hls4mlprj_fifo_depth_optimization_branched_model_backend_{backend}') +# output_dir = str(test_root_path / f'hls4mlprj_fifo_depth_optimization_branched_model_backend_{backend}') - hls_model = hls4ml.converters.convert_from_onnx_model( - model, - output_dir=output_dir, - io_type=io_type, - backend=backend, - hls_config=config, - ) - hls_model.compile() - y_hls4ml = hls_model.predict(np.ascontiguousarray(X)) - np.testing.assert_array_equal(y_qonnx.ravel(), y_hls4ml.ravel()) +# hls_model = hls4ml.converters.convert_from_onnx_model( +# model, +# output_dir=output_dir, +# io_type=io_type, +# backend=backend, +# hls_config=config, +# ) +# hls_model.compile() +# y_hls4ml = hls_model.predict(np.ascontiguousarray(X)) +# np.testing.assert_array_equal(y_qonnx.ravel(), y_hls4ml.ravel()) - fifo_depth_optimization_checks(hls_model) +# fifo_depth_optimization_checks(hls_model) -@pytest.mark.skip(reason='Skipping synthesis tests for now') -@pytest.mark.parametrize('backend', backend_options) -def test_successful_execution_of_tiny_unet(backend): - """Test the correct execution of the FIFO depth optimizer.""" - run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth=200_000, io_type='io_stream', model=get_branched_model()) +# @pytest.mark.skip(reason='Skipping synthesis tests for now') +# @pytest.mark.parametrize('backend', backend_options) +# def test_successful_execution_of_tiny_unet(backend): +# """Test the correct execution of the FIFO depth optimizer.""" +# run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth=200_000, io_type='io_stream', model=get_branched_model()) From d5f2192bb217c68097e0ba58229e0efa1b8dd95e Mon Sep 17 00:00:00 2001 From: steltze Date: Thu, 
20 Feb 2025 15:20:41 +0100 Subject: [PATCH 093/103] Run precommit --- hls4ml/backends/__init__.py | 5 +- hls4ml/backends/vitis/vitis_backend.py | 2 +- .../passes/fifo_depth_optimization.py | 14 +-- .../vitis_accelerator_ip_flow_backend.py | 4 +- .../vitis_accelerator_ip_flow_writer.py | 22 +++-- .../test_optimization/test_fifo_depth.py | 86 +++++++++---------- 6 files changed, 74 insertions(+), 59 deletions(-) diff --git a/hls4ml/backends/__init__.py b/hls4ml/backends/__init__.py index 7ba2ad4fbb..031c775c64 100644 --- a/hls4ml/backends/__init__.py +++ b/hls4ml/backends/__init__.py @@ -5,12 +5,13 @@ from hls4ml.backends.vivado.vivado_backend import VivadoBackend from hls4ml.backends.vivado_accelerator.vivado_accelerator_backend import VivadoAcceleratorBackend from hls4ml.backends.vivado_accelerator.vivado_accelerator_config import VivadoAcceleratorConfig # noqa: F401 + from hls4ml.backends.catapult.catapult_backend import CatapultBackend # isort: skip from hls4ml.backends.vitis.vitis_backend import VitisBackend # isort: skip -from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_backend import ( +from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_backend import ( # isort: skip VitisAcceleratorIPFlowBackend, ) -from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_config import ( +from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_config import ( # isort: skip # noqa: F401 VitisAcceleratorIPFlowConfig, ) diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py index ff2104c795..d1c094ca96 100644 --- a/hls4ml/backends/vitis/vitis_backend.py +++ b/hls4ml/backends/vitis/vitis_backend.py @@ -97,7 +97,7 @@ def build( os.system( ( 'vitis_hls -f build_prj.tcl "reset={reset} csim={csim} synth={synth} cosim={cosim} ' - 'validation={validation} export={export} vsynth={vsynth}"' + 'validation={validation} export={export} vsynth={vsynth} fifo_opt={fifo_opt}"' 
).format( reset=reset, csim=csim, diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py b/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py index 38706047a7..077d3683c5 100644 --- a/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py +++ b/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py @@ -35,11 +35,11 @@ def initialize_large_fifos(model, profiling_fifo_depth): if output_variable.pragma: initial_fifo_depths[output_variable.name] = int(output_variable.pragma[1]) output_variable.pragma = (output_variable.pragma[0], profiling_fifo_depth) - + inp = model.get_input_variables()[0] initial_fifo_depths['in_local'] = int(inp.pragma[1]) inp.pragma = (inp.pragma[0], profiling_fifo_depth) - + outp = model.get_output_variables()[0] initial_fifo_depths['out_local'] = int(outp.pragma[1]) outp.pragma = (outp.pragma[0], profiling_fifo_depth) @@ -196,7 +196,11 @@ def set_optimized_fifo_depths(model, optimized_fifo_depths): # iterate through the layer output FIFOs for output_variable in model.output_vars.values(): - if ("VivadoStreamVariable" in str(type(output_variable))) or (output_variable.name == 'in_local') or (output_variable.name == 'out_local'): + if ( + ("VivadoStreamVariable" in str(type(output_variable))) + or (output_variable.name == 'in_local') + or (output_variable.name == 'out_local') + ): if output_variable.pragma: if output_variable.name not in optimized_fifo_depths.keys(): @@ -204,10 +208,10 @@ def set_optimized_fifo_depths(model, optimized_fifo_depths): filtered_depth = optimized_fifo_depths[output_variable.name] output_variable.pragma = (output_variable.pragma[0], filtered_depth) - + inp = model.get_input_variables()[0] inp.pragma = (inp.pragma[0], optimized_fifo_depths['in_local']) - + outp = model.get_output_variables()[0] outp.pragma = (inp.pragma[0], optimized_fifo_depths['out_local']) return diff --git 
a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py index c5dff6f789..66411489fc 100644 --- a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py +++ b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py @@ -59,7 +59,7 @@ def create_initial_config( interface='axi_stream', driver='python', input_type='float', - output_type='float' + output_type='float', ): ''' Create initial accelerator config with default parameters @@ -106,7 +106,7 @@ def get_writer_flow(self): def _register_flows(self): # vivado_ip = 'vivado:ip' writer_passes = ['make_stamp', 'vitisacceleratoripflow:write_hls'] - self._writer_flow = register_flow('write', writer_passes, requires=['vitis:ip'], backend=self.name) + self._writer_flow = register_flow('write', writer_passes, requires=['vitis:ip'], backend=self.name) # self._default_flow = vivado_ip # Register the fifo depth optimization flow which is different from the one for vivado diff --git a/hls4ml/writer/vitis_accelerator_ip_flow_writer.py b/hls4ml/writer/vitis_accelerator_ip_flow_writer.py index 977a6d6e04..9805c5b33f 100644 --- a/hls4ml/writer/vitis_accelerator_ip_flow_writer.py +++ b/hls4ml/writer/vitis_accelerator_ip_flow_writer.py @@ -1,8 +1,6 @@ import os -from distutils.dir_util import copy_tree from shutil import copyfile -# from hls4ml.writer.vivado_writer import VivadoWriter from hls4ml.writer.vitis_writer import VitisWriter @@ -262,9 +260,17 @@ def write_wrapper_test(self, model): indent_amount = line.split(model.config.get_project_name())[0] newline = indent_amount + f'{model.config.get_project_name()}_axi(inputs,outputs);\n' elif inp.size_cpp() in line or inp.name in line or inp.type.name in line: - newline = line.replace(inp.size_cpp(), 'N_IN').replace(inp.name, 'inputs').replace(inp.type.name, 'dma_data_packet') + newline = ( + line.replace(inp.size_cpp(), 'N_IN') + 
.replace(inp.name, 'inputs') + .replace(inp.type.name, 'dma_data_packet') + ) elif out.size_cpp() in line or out.name in line or out.type.name in line: - newline = line.replace(out.size_cpp(), 'N_OUT').replace(out.name, 'outputs').replace(out.type.name, 'dma_data_packet') + newline = ( + line.replace(out.size_cpp(), 'N_OUT') + .replace(out.name, 'outputs') + .replace(out.type.name, 'dma_data_packet') + ) else: newline = line if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': @@ -300,9 +306,13 @@ def write_wrapper_test(self, model): if f'{model.config.get_project_name()}.h' in line: newline = line.replace(f'{model.config.get_project_name()}.h', f'{model.config.get_project_name()}_axi.h') elif inp.definition_cpp(name_suffix='_ap') in line: - newline = line.replace(inp.definition_cpp(name_suffix='_ap'), f'hls::stream< dma_data_packet > {inp.name}_ap') + newline = line.replace( + inp.definition_cpp(name_suffix='_ap'), f'hls::stream< dma_data_packet > {inp.name}_ap' + ) elif out.definition_cpp(name_suffix='_ap') in line: - newline = line.replace(out.definition_cpp(name_suffix='_ap'), f'hls::stream< dma_data_packet > {out.name}_ap') + newline = line.replace( + out.definition_cpp(name_suffix='_ap'), f'hls::stream< dma_data_packet > {out.name}_ap' + ) elif f'{model.config.get_project_name()}(' in line: indent_amount = line.split(model.config.get_project_name())[0] newline = indent_amount + '{}_axi({}_ap,{}_ap);\n'.format( diff --git a/test/pytest/test_optimization/test_fifo_depth.py b/test/pytest/test_optimization/test_fifo_depth.py index 8589acbe51..6e03e3bf0e 100644 --- a/test/pytest/test_optimization/test_fifo_depth.py +++ b/test/pytest/test_optimization/test_fifo_depth.py @@ -5,8 +5,8 @@ import numpy as np import pytest -# import qonnx.core.onnx_exec as oxe -# from qonnx.core.modelwrapper import ModelWrapper +import qonnx.core.onnx_exec as oxe +from qonnx.core.modelwrapper import ModelWrapper from tensorflow.keras.layers import SeparableConv2D 
from tensorflow.keras.models import Sequential @@ -141,55 +141,55 @@ def test_successful_execution_of_dummy_keras(backend): run_fifo_depth_optimization_keras(backend, profiling_fifo_depth=200_000, io_type='io_stream') -# def get_branched_model(): -# """ -# Load branched model, already channels-last and cleaned -# """ -# dl_file = str(example_model_path / "onnx/branched_model_ch_last.onnx") -# assert os.path.isfile(dl_file) -# model = ModelWrapper(dl_file) -# return model +def get_branched_model(): + """ + Load branched model, already channels-last and cleaned + """ + dl_file = str(example_model_path / "onnx/branched_model_ch_last.onnx") + assert os.path.isfile(dl_file) + model = ModelWrapper(dl_file) + return model -# def run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth, io_type, model): -# """Execute the FIFO depth optimization sequence on a ONNX/QONNX model.""" +def run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth, io_type, model): + """Execute the FIFO depth optimization sequence on a ONNX/QONNX model.""" -# ishape = tuple(model.get_tensor_shape(model.graph.input[0].name)) -# X = np.random.uniform(low=0, high=1, size=np.prod(ishape)).reshape(ishape) -# X = (np.round(X * 2**16) * 2**-16).astype(np.float32) -# idict = {model.graph.input[0].name: X} -# y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] + ishape = tuple(model.get_tensor_shape(model.graph.input[0].name)) + X = np.random.uniform(low=0, high=1, size=np.prod(ishape)).reshape(ishape) + X = (np.round(X * 2**16) * 2**-16).astype(np.float32) + idict = {model.graph.input[0].name: X} + y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] -# config = hls4ml.utils.config.config_from_onnx_model( -# model, granularity='name', backend=backend, default_precision='ap_fixed<15,2,AP_RND_CONV>' -# ) + config = hls4ml.utils.config.config_from_onnx_model( + model, granularity='name', backend=backend, default_precision='ap_fixed<15,2,AP_RND_CONV>' + ) -# # add 
this line to remove the linear layer that quantizes the input of the NN -# config['LayerName']['global_in']['Precision']['result'] = 'fixed<4,0,AP_RND_CONV,AP_SAT,0>' + # add this line to remove the linear layer that quantizes the input of the NN + config['LayerName']['global_in']['Precision']['result'] = 'fixed<4,0,AP_RND_CONV,AP_SAT,0>' -# config['Flows'] = ['vitisacceleratoripflow:fifo_depth_optimization'] -# hls4ml.model.optimizer.get_optimizer('vitisacceleratoripflow:fifo_depth_optimization').configure( -# profiling_fifo_depth=profiling_fifo_depth -# ) + config['Flows'] = ['vitisacceleratoripflow:fifo_depth_optimization'] + hls4ml.model.optimizer.get_optimizer('vitisacceleratoripflow:fifo_depth_optimization').configure( + profiling_fifo_depth=profiling_fifo_depth + ) -# output_dir = str(test_root_path / f'hls4mlprj_fifo_depth_optimization_branched_model_backend_{backend}') + output_dir = str(test_root_path / f'hls4mlprj_fifo_depth_optimization_branched_model_backend_{backend}') -# hls_model = hls4ml.converters.convert_from_onnx_model( -# model, -# output_dir=output_dir, -# io_type=io_type, -# backend=backend, -# hls_config=config, -# ) -# hls_model.compile() -# y_hls4ml = hls_model.predict(np.ascontiguousarray(X)) -# np.testing.assert_array_equal(y_qonnx.ravel(), y_hls4ml.ravel()) + hls_model = hls4ml.converters.convert_from_onnx_model( + model, + output_dir=output_dir, + io_type=io_type, + backend=backend, + hls_config=config, + ) + hls_model.compile() + y_hls4ml = hls_model.predict(np.ascontiguousarray(X)) + np.testing.assert_array_equal(y_qonnx.ravel(), y_hls4ml.ravel()) -# fifo_depth_optimization_checks(hls_model) + fifo_depth_optimization_checks(hls_model) -# @pytest.mark.skip(reason='Skipping synthesis tests for now') -# @pytest.mark.parametrize('backend', backend_options) -# def test_successful_execution_of_tiny_unet(backend): -# """Test the correct execution of the FIFO depth optimizer.""" -# run_fifo_depth_optimization_onnx(backend, 
profiling_fifo_depth=200_000, io_type='io_stream', model=get_branched_model()) +@pytest.mark.skip(reason='Skipping synthesis tests for now') +@pytest.mark.parametrize('backend', backend_options) +def test_successful_execution_of_tiny_unet(backend): + """Test the correct execution of the FIFO depth optimizer.""" + run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth=200_000, io_type='io_stream', model=get_branched_model()) From 14b413e38ba618668cdfadc124ddf209e77ee00f Mon Sep 17 00:00:00 2001 From: Stelios Tzelepis <79508119+steltze@users.noreply.github.com> Date: Fri, 21 Feb 2025 12:41:47 +0100 Subject: [PATCH 094/103] Update build_prj.tcl --- hls4ml/templates/vivado/build_prj.tcl | 1 - 1 file changed, 1 deletion(-) diff --git a/hls4ml/templates/vivado/build_prj.tcl b/hls4ml/templates/vivado/build_prj.tcl index cd398319c9..6018ba5171 100644 --- a/hls4ml/templates/vivado/build_prj.tcl +++ b/hls4ml/templates/vivado/build_prj.tcl @@ -195,7 +195,6 @@ if {$opt(cosim)} { if {$opt(fifo_opt)} { puts "\[hls4ml\] - FIFO optimization started" - add_vcd_instructions_tcl if {[string equal "$backend" "vivado"] || [string equal $backend "vivadoaccelerator"]} { add_vcd_instructions_tcl From 9f1c8b3be3f8d8f5b5c779367c247a7857f8eb0e Mon Sep 17 00:00:00 2001 From: steltze Date: Thu, 6 Mar 2025 16:11:50 +0100 Subject: [PATCH 095/103] Address pr comments and merge main --- .../vitis_accelerator_ip_flow_backend.py | 4 ++-- hls4ml/templates/vitis_accelerator_ip_flow/build_lib.sh | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py index 66411489fc..ab0f49f585 100644 --- a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py +++ b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py @@ -104,10 +104,10 @@ def get_writer_flow(self): return 
self._writer_flow def _register_flows(self): - # vivado_ip = 'vivado:ip' + vitis_ip = 'vitis:ip' writer_passes = ['make_stamp', 'vitisacceleratoripflow:write_hls'] self._writer_flow = register_flow('write', writer_passes, requires=['vitis:ip'], backend=self.name) - # self._default_flow = vivado_ip + self._default_flow = vitis_ip # Register the fifo depth optimization flow which is different from the one for vivado fifo_depth_opt_passes = [ diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/build_lib.sh b/hls4ml/templates/vitis_accelerator_ip_flow/build_lib.sh index 262ce00d63..c8314badb0 100644 --- a/hls4ml/templates/vitis_accelerator_ip_flow/build_lib.sh +++ b/hls4ml/templates/vitis_accelerator_ip_flow/build_lib.sh @@ -13,9 +13,11 @@ INCFLAGS="-Ifirmware/ap_types/" PROJECT=myproject LIB_STAMP=mystamp +BASEDIR="$(cd "$(dirname "$0")" && pwd)" +WEIGHTS_DIR="\"${BASEDIR}/firmware/weights\"" -${CC} ${CFLAGS} ${INCFLAGS} -c firmware/${PROJECT}.cpp -o ${PROJECT}.o -${CC} ${CFLAGS} ${INCFLAGS} -c firmware/${PROJECT}_axi.cpp -o ${PROJECT}_axi.o -${CC} ${CFLAGS} ${INCFLAGS} -c ${PROJECT}_bridge.cpp -o ${PROJECT}_bridge.o +${CC} ${CFLAGS} ${INCFLAGS} -D WEIGHTS_DIR="${WEIGHTS_DIR}" -c firmware/${PROJECT}.cpp -o ${PROJECT}.o +${CC} ${CFLAGS} ${INCFLAGS} -D WEIGHTS_DIR="${WEIGHTS_DIR}" -c firmware/${PROJECT}_axi.cpp -o ${PROJECT}_axi.o +${CC} ${CFLAGS} ${INCFLAGS} -D WEIGHTS_DIR="${WEIGHTS_DIR}" -c ${PROJECT}_bridge.cpp -o ${PROJECT}_bridge.o ${CC} ${CFLAGS} ${INCFLAGS} -shared ${PROJECT}.o ${PROJECT}_axi.o ${PROJECT}_bridge.o -o firmware/${PROJECT}-${LIB_STAMP}.so rm -f *.o From 47636924a2d26dcc2520caaf5f85199185598a2e Mon Sep 17 00:00:00 2001 From: steltze Date: Thu, 6 Mar 2025 17:36:45 +0100 Subject: [PATCH 096/103] Include tests without fifo optimization and checks for bitstream generation --- .../test_vitis_accelerator_ip_flow.py} | 90 +++++++++++-------- 1 file changed, 54 insertions(+), 36 deletions(-) rename test/pytest/{test_optimization/test_fifo_depth.py => 
test_backend/test_vitis_accelerator_ip_flow.py} (66%) diff --git a/test/pytest/test_optimization/test_fifo_depth.py b/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py similarity index 66% rename from test/pytest/test_optimization/test_fifo_depth.py rename to test/pytest/test_backend/test_vitis_accelerator_ip_flow.py index 6e03e3bf0e..f855793ecb 100644 --- a/test/pytest/test_optimization/test_fifo_depth.py +++ b/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py @@ -19,7 +19,7 @@ backend_options = ['VitisAcceleratorIPFlow'] -def parse_cosim_report(project_path): +def parse_cosim_report_and_search_for_bitstream(project_path): """Parse the cosimulation report to check whether the cosimulation passed or failed and therefore a deadlock is detected. """ @@ -35,14 +35,21 @@ def parse_cosim_report(project_path): prj_dir = top_func_name + '_prj' cosim_file_path = project_path + '/' + prj_dir + f'/solution1/sim/report/{top_func_name}_axi_cosim.rpt' - - if os.path.isfile(cosim_file_path): - return cosim_file_path - else: + bitsteam_path = project_path + '/' + f"{top_func_name}_vitis_accelerator_ip_flow/project_1.runs/impl_1/design_1_wrapper.bit" + + cosim_report_exists = os.path.isfile(cosim_file_path) + bitstream_exists = os.path.isfile(bitsteam_path) + + if cosim_report_exists and bitstream_exists: + return cosim_file_path, bitstream_exists + elif not cosim_report_exists: raise FileNotFoundError("Co-simulation report not found.") - - -def run_fifo_depth_optimization_keras(backend, profiling_fifo_depth, io_type): + elif not bitstream_exists: + raise FileNotFoundError("Bitstream not found.") + else: + raise FileNotFoundError("Co-simulation report and Bitstream not found.") + +def run_fifo_depth_optimization_keras(backend, profiling_fifo_depth, io_type, run_fifo_depth_optimization): """Execute the FIFO depth optimization sequence on a dummy Keras model.""" # create a keras model @@ -64,12 +71,13 @@ def run_fifo_depth_optimization_keras(backend, 
profiling_fifo_depth, io_type): config = hls4ml.utils.config_from_keras_model(model, default_precision='ap_fixed<32, 16>') # include the FIFO Depth optimizer do the flows - config['Flows'] = ['vitisacceleratoripflow:fifo_depth_optimization'] - hls4ml.model.optimizer.get_optimizer('vitisacceleratoripflow:fifo_depth_optimization').configure( - profiling_fifo_depth=profiling_fifo_depth - ) + if run_fifo_depth_optimization: + config['Flows'] = ['vitisacceleratoripflow:fifo_depth_optimization'] + hls4ml.model.optimizer.get_optimizer('vitisacceleratoripflow:fifo_depth_optimization').configure( + profiling_fifo_depth=profiling_fifo_depth + ) - output_dir = str(test_root_path / f'hls4mlprj_fifo_depth_optimization_keras_backend_{backend}') + output_dir = str(test_root_path / f'hls4mlprj_vitis_accelerator_backend_{backend}') # execute fifo optimization hls_model = hls4ml.converters.convert_from_keras_model( @@ -82,34 +90,32 @@ def run_fifo_depth_optimization_keras(backend, profiling_fifo_depth, io_type): np.testing.assert_allclose(hls_prediction, keras_prediction, rtol=0, atol=0.01) # check that the FIFOs have been optimized succesfully - fifo_depth_optimization_checks(hls_model) + build_and_check(hls_model, run_fifo_depth_optimization) -def fifo_depth_optimization_checks(hls_model): +def build_and_check(hls_model, run_fifo_depth_optimization): """Execute the FIFO depth optimization sequence on an hls4ml model.""" - # force the top-function to execute twice in the cosimulation, to verify no deadlocks occur even - # when streaming multiple inputs into the network - override_test_bench(hls_model) - # build the new project with optimized depths and execute cosimulation to check for deadlocks # due to the new FIFO depths - hls_model.build(reset=False, csim=False, synth=True, cosim=True) + hls_model.build(synth=True, csim=False, export=True, cosim=True, bitfile=True, vsynth=False) # checks if the fifo depths decreased/were optimized - fifo_depths = {} - with 
open(hls_model.config.get_output_dir() + "/fifo_depths.json") as fifo_depths_file: - fifo_depths = json.load(fifo_depths_file) + fifo_depths_decreased = False + if run_fifo_depth_optimization: + fifo_depths = {} + with open(hls_model.config.get_output_dir() + "/fifo_depths.json") as fifo_depths_file: + fifo_depths = json.load(fifo_depths_file) - fifo_depths_decreased = all(fifo['optimized'] < fifo['initial'] for fifo in fifo_depths.values()) + fifo_depths_decreased = all(fifo['optimized'] < fifo['initial'] for fifo in fifo_depths.values()) - # checks that the cosimulation ran succesfully without detecting deadlocks - cosim_report_path = parse_cosim_report(hls_model.config.get_output_dir()) + # checks that the cosimulation ran succesfully without detecting deadlocks and if the bitstream was generated + cosim_report_path, bitstream_exists = parse_cosim_report_and_search_for_bitstream(hls_model.config.get_output_dir()) with open(cosim_report_path) as cosim_report_file: cosim_succesful = any("Pass" in line for line in cosim_report_file) - assert fifo_depths_decreased and cosim_succesful + assert (fifo_depths_decreased or (not run_fifo_depth_optimization)) and cosim_succesful and bitstream_exists def expect_exception(error, message, backend, profiling_fifo_depth, io_type): @@ -138,8 +144,13 @@ def test_runtime_error(backend): @pytest.mark.parametrize('backend', backend_options) def test_successful_execution_of_dummy_keras(backend): """Test the correct execution of the FIFO depth optimizer.""" - run_fifo_depth_optimization_keras(backend, profiling_fifo_depth=200_000, io_type='io_stream') - + run_fifo_depth_optimization_keras(backend, profiling_fifo_depth=200_000, io_type='io_stream', run_fifo_depth_optimization=False) + +@pytest.mark.skip(reason='Skipping synthesis tests for now') +@pytest.mark.parametrize('backend', backend_options) +def test_successful_execution_of_dummy_keras_with_fifo_optimization(backend): + """Test the correct execution of the FIFO depth 
optimizer.""" + run_fifo_depth_optimization_keras(backend, profiling_fifo_depth=200_000, io_type='io_stream', run_fifo_depth_optimization=True) def get_branched_model(): """ @@ -151,7 +162,7 @@ def get_branched_model(): return model -def run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth, io_type, model): +def run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth, io_type, model, run_fifo_depth_optimization): """Execute the FIFO depth optimization sequence on a ONNX/QONNX model.""" ishape = tuple(model.get_tensor_shape(model.graph.input[0].name)) @@ -167,10 +178,11 @@ def run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth, io_type, mod # add this line to remove the linear layer that quantizes the input of the NN config['LayerName']['global_in']['Precision']['result'] = 'fixed<4,0,AP_RND_CONV,AP_SAT,0>' - config['Flows'] = ['vitisacceleratoripflow:fifo_depth_optimization'] - hls4ml.model.optimizer.get_optimizer('vitisacceleratoripflow:fifo_depth_optimization').configure( - profiling_fifo_depth=profiling_fifo_depth - ) + if run_fifo_depth_optimization: + config['Flows'] = ['vitisacceleratoripflow:fifo_depth_optimization'] + hls4ml.model.optimizer.get_optimizer('vitisacceleratoripflow:fifo_depth_optimization').configure( + profiling_fifo_depth=profiling_fifo_depth + ) output_dir = str(test_root_path / f'hls4mlprj_fifo_depth_optimization_branched_model_backend_{backend}') @@ -185,11 +197,17 @@ def run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth, io_type, mod y_hls4ml = hls_model.predict(np.ascontiguousarray(X)) np.testing.assert_array_equal(y_qonnx.ravel(), y_hls4ml.ravel()) - fifo_depth_optimization_checks(hls_model) + build_and_check(hls_model, run_fifo_depth_optimization) @pytest.mark.skip(reason='Skipping synthesis tests for now') @pytest.mark.parametrize('backend', backend_options) def test_successful_execution_of_tiny_unet(backend): """Test the correct execution of the FIFO depth optimizer.""" - 
run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth=200_000, io_type='io_stream', model=get_branched_model()) + run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth=200_000, io_type='io_stream', model=get_branched_model(), run_fifo_depth_optimization=False) + +@pytest.mark.skip(reason='Skipping synthesis tests for now') +@pytest.mark.parametrize('backend', backend_options) +def test_successful_execution_of_tiny_unet_with_fifo_optimization(backend): + """Test the correct execution of the FIFO depth optimizer.""" + run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth=200_000, io_type='io_stream', model=get_branched_model(), run_fifo_depth_optimization=True) From e66ad403dafc855c3ba62db649faf9c847238ee8 Mon Sep 17 00:00:00 2001 From: steltze Date: Thu, 6 Mar 2025 17:52:34 +0100 Subject: [PATCH 097/103] Run precommit and remove unused override testbench --- .../passes/fifo_depth_optimization.py | 44 ------------------- .../test_vitis_accelerator_ip_flow.py | 41 ++++++++++++----- 2 files changed, 31 insertions(+), 54 deletions(-) diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py b/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py index 077d3683c5..4194ae3365 100644 --- a/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py +++ b/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py @@ -46,48 +46,6 @@ def initialize_large_fifos(model, profiling_fifo_depth): return initial_fifo_depths -def override_test_bench(model): - """In order for the FIFO depth profiling to produce correct results, it is necessary for the cosimulation to - call the top function - Vitis IP at **least twice**. The test bench produced by the Vivado Writer is - overwritten by adding a for-loop over the top function. - - Args: - model (ModelGraph): The model to which FIFO depth optimization is applied. 
- """ - indent = " " - path_to_old_test_bench = f"{model.config.get_output_dir()}/{model.config.get_project_name()}_test.cpp" - path_to_new_test_bench = f"{model.config.get_output_dir()}/{model.config.get_project_name()}_new_test.cpp" - - newline = "" - second_part_of_testbench = False - with open(path_to_old_test_bench) as old_test_bench: - file_iterator = iter(old_test_bench) - for line in file_iterator: - - if "// hls-fpga-machine-learning insert zero" in line: - newline += indent + indent + "const unsigned PROFILING_ITERATIONS = 2;\n" - newline += ( - indent - + indent - + "for(unsigned batch_iteration = 0; batch_iteration < PROFILING_ITERATIONS; ++batch_iteration) {\n" - ) - newline += line - second_part_of_testbench = True - elif ("// hls-fpga-machine-learning insert tb-output" in line) and second_part_of_testbench: - newline += line - newline += next(file_iterator) - newline += indent + "}\n" - else: - newline += line - - with open(path_to_new_test_bench, "w+") as new_test_bench: - new_test_bench.write(newline) - - # replace the old test bench with the new test bench that includes a for-loop - os.replace(path_to_new_test_bench, path_to_old_test_bench) - return - - def execute_cosim_to_profile_fifos(model): """Execute a cosimulation with a testh bench that calls the top function - Vitis IP at **least twice**, to properly profile the max FIFO depths. 
The function will momentarily replace the initial test bench @@ -98,8 +56,6 @@ def execute_cosim_to_profile_fifos(model): """ model.write() - override_test_bench(model) - model.build( reset=False, csim=False, diff --git a/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py b/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py index f855793ecb..b775efe634 100644 --- a/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py +++ b/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py @@ -11,7 +11,6 @@ from tensorflow.keras.models import Sequential import hls4ml -from hls4ml.backends.vitis_accelerator_ip_flow.passes.fifo_depth_optimization import override_test_bench test_root_path = Path(__file__).parent example_model_path = (test_root_path / '../../../example-models').resolve() @@ -35,11 +34,13 @@ def parse_cosim_report_and_search_for_bitstream(project_path): prj_dir = top_func_name + '_prj' cosim_file_path = project_path + '/' + prj_dir + f'/solution1/sim/report/{top_func_name}_axi_cosim.rpt' - bitsteam_path = project_path + '/' + f"{top_func_name}_vitis_accelerator_ip_flow/project_1.runs/impl_1/design_1_wrapper.bit" - + bitsteam_path = ( + project_path + '/' + f"{top_func_name}_vitis_accelerator_ip_flow/project_1.runs/impl_1/design_1_wrapper.bit" + ) + cosim_report_exists = os.path.isfile(cosim_file_path) bitstream_exists = os.path.isfile(bitsteam_path) - + if cosim_report_exists and bitstream_exists: return cosim_file_path, bitstream_exists elif not cosim_report_exists: @@ -48,7 +49,8 @@ def parse_cosim_report_and_search_for_bitstream(project_path): raise FileNotFoundError("Bitstream not found.") else: raise FileNotFoundError("Co-simulation report and Bitstream not found.") - + + def run_fifo_depth_optimization_keras(backend, profiling_fifo_depth, io_type, run_fifo_depth_optimization): """Execute the FIFO depth optimization sequence on a dummy Keras model.""" @@ -144,13 +146,19 @@ def test_runtime_error(backend): 
@pytest.mark.parametrize('backend', backend_options) def test_successful_execution_of_dummy_keras(backend): """Test the correct execution of the FIFO depth optimizer.""" - run_fifo_depth_optimization_keras(backend, profiling_fifo_depth=200_000, io_type='io_stream', run_fifo_depth_optimization=False) - + run_fifo_depth_optimization_keras( + backend, profiling_fifo_depth=200_000, io_type='io_stream', run_fifo_depth_optimization=False + ) + + @pytest.mark.skip(reason='Skipping synthesis tests for now') @pytest.mark.parametrize('backend', backend_options) def test_successful_execution_of_dummy_keras_with_fifo_optimization(backend): """Test the correct execution of the FIFO depth optimizer.""" - run_fifo_depth_optimization_keras(backend, profiling_fifo_depth=200_000, io_type='io_stream', run_fifo_depth_optimization=True) + run_fifo_depth_optimization_keras( + backend, profiling_fifo_depth=200_000, io_type='io_stream', run_fifo_depth_optimization=True + ) + def get_branched_model(): """ @@ -204,10 +212,23 @@ def run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth, io_type, mod @pytest.mark.parametrize('backend', backend_options) def test_successful_execution_of_tiny_unet(backend): """Test the correct execution of the FIFO depth optimizer.""" - run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth=200_000, io_type='io_stream', model=get_branched_model(), run_fifo_depth_optimization=False) + run_fifo_depth_optimization_onnx( + backend, + profiling_fifo_depth=200_000, + io_type='io_stream', + model=get_branched_model(), + run_fifo_depth_optimization=False, + ) + @pytest.mark.skip(reason='Skipping synthesis tests for now') @pytest.mark.parametrize('backend', backend_options) def test_successful_execution_of_tiny_unet_with_fifo_optimization(backend): """Test the correct execution of the FIFO depth optimizer.""" - run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth=200_000, io_type='io_stream', model=get_branched_model(), 
run_fifo_depth_optimization=True) + run_fifo_depth_optimization_onnx( + backend, + profiling_fifo_depth=200_000, + io_type='io_stream', + model=get_branched_model(), + run_fifo_depth_optimization=True, + ) From f51be88830c02f21cf6db832047fd705d18ba4c7 Mon Sep 17 00:00:00 2001 From: steltze Date: Fri, 7 Mar 2025 08:49:29 +0100 Subject: [PATCH 098/103] Fix qonnx test --- .../test_backend/test_vitis_accelerator_ip_flow.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py b/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py index b775efe634..6699f5570b 100644 --- a/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py +++ b/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py @@ -100,7 +100,7 @@ def build_and_check(hls_model, run_fifo_depth_optimization): # build the new project with optimized depths and execute cosimulation to check for deadlocks # due to the new FIFO depths - hls_model.build(synth=True, csim=False, export=True, cosim=True, bitfile=True, vsynth=False) + hls_model.build(reset=False, synth=True, csim=False, export=True, cosim=True, bitfile=True, fifo_opt=run_fifo_depth_optimization) # checks if the fifo depths decreased/were optimized fifo_depths_decreased = False @@ -142,7 +142,7 @@ def test_runtime_error(backend): expect_exception(RuntimeError, message, backend, profiling_fifo_depth=200_000, io_type='io_parallel') -# @pytest.mark.skip(reason='Skipping synthesis tests for now') +@pytest.mark.skip(reason='Skipping synthesis tests for now') @pytest.mark.parametrize('backend', backend_options) def test_successful_execution_of_dummy_keras(backend): """Test the correct execution of the FIFO depth optimizer.""" @@ -200,6 +200,9 @@ def run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth, io_type, mod io_type=io_type, backend=backend, hls_config=config, + part="xczu9eg-ffvb1156-2-e", + board='zcu102', + clock_period=10 ) hls_model.compile() 
y_hls4ml = hls_model.predict(np.ascontiguousarray(X)) @@ -221,7 +224,7 @@ def test_successful_execution_of_tiny_unet(backend): ) -@pytest.mark.skip(reason='Skipping synthesis tests for now') +# @pytest.mark.skip(reason='Skipping synthesis tests for now') @pytest.mark.parametrize('backend', backend_options) def test_successful_execution_of_tiny_unet_with_fifo_optimization(backend): """Test the correct execution of the FIFO depth optimizer.""" From 85c233c38536ced1dfc86b3e0097100bca06a9eb Mon Sep 17 00:00:00 2001 From: steltze Date: Fri, 7 Mar 2025 10:05:56 +0100 Subject: [PATCH 099/103] Fix keras fifo optimization test --- .../pynq-z2/tcl_scripts/axi_stream_design.tcl | 2 +- .../zcu102/tcl_scripts/axi_stream_design.tcl | 2 +- .../test_vitis_accelerator_ip_flow.py | 20 +++++++++++++------ 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/pynq-z2/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vitis_accelerator_ip_flow/pynq-z2/tcl_scripts/axi_stream_design.tcl index e8db1e6782..7db291fda6 100644 --- a/hls4ml/templates/vitis_accelerator_ip_flow/pynq-z2/tcl_scripts/axi_stream_design.tcl +++ b/hls4ml/templates/vitis_accelerator_ip_flow/pynq-z2/tcl_scripts/axi_stream_design.tcl @@ -63,7 +63,7 @@ reset_run impl_1 reset_run synth_1 #todo: make number of jobs a variable launch_runs impl_1 -to_step write_bitstream -jobs 10 -wait_on_run -timeout 360 impl_1 +wait_on_run -timeout 480 impl_1 open_run impl_1 report_utilization -file util.rpt -hierarchical -hierarchical_percentages diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/tcl_scripts/axi_stream_design.tcl index 103fec0178..34f5468e7e 100644 --- a/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/tcl_scripts/axi_stream_design.tcl +++ b/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/tcl_scripts/axi_stream_design.tcl @@ -59,7 +59,7 @@ add_files -norecurse 
./${project_name}_vivado_accelerator/project_1.srcs/sources reset_run impl_1 reset_run synth_1 launch_runs impl_1 -to_step write_bitstream -jobs 6 -wait_on_run -timeout 360 impl_1 +wait_on_run -timeout 480 impl_1 open_run impl_1 report_utilization -file util.rpt -hierarchical -hierarchical_percentages diff --git a/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py b/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py index 6699f5570b..4b70589052 100644 --- a/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py +++ b/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py @@ -55,7 +55,7 @@ def run_fifo_depth_optimization_keras(backend, profiling_fifo_depth, io_type, ru """Execute the FIFO depth optimization sequence on a dummy Keras model.""" # create a keras model - input_shape = (32, 32, 3) + input_shape = (16, 16, 3) activation = 'relu' kernel_size = (3, 3) padding = 'same' @@ -100,7 +100,9 @@ def build_and_check(hls_model, run_fifo_depth_optimization): # build the new project with optimized depths and execute cosimulation to check for deadlocks # due to the new FIFO depths - hls_model.build(reset=False, synth=True, csim=False, export=True, cosim=True, bitfile=True, fifo_opt=run_fifo_depth_optimization) + hls_model.build( + reset=False, synth=True, csim=False, export=True, cosim=True, bitfile=True, fifo_opt=run_fifo_depth_optimization + ) # checks if the fifo depths decreased/were optimized fifo_depths_decreased = False @@ -109,7 +111,13 @@ def build_and_check(hls_model, run_fifo_depth_optimization): with open(hls_model.config.get_output_dir() + "/fifo_depths.json") as fifo_depths_file: fifo_depths = json.load(fifo_depths_file) - fifo_depths_decreased = all(fifo['optimized'] < fifo['initial'] for fifo in fifo_depths.values()) + # omit checking for the input and output AXIS FIFOs as they are not always optimized + # as the last kernel e.g pointwise is faster than AXIS speed + fifo_depths_decreased = all( + fifo_depths['optimized'] < 
fifo_depths['initial'] + for fifo_name, fifo_depths in fifo_depths.items() + if fifo_name not in {'in_local', 'out_local'} + ) # checks that the cosimulation ran succesfully without detecting deadlocks and if the bitstream was generated cosim_report_path, bitstream_exists = parse_cosim_report_and_search_for_bitstream(hls_model.config.get_output_dir()) @@ -200,9 +208,9 @@ def run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth, io_type, mod io_type=io_type, backend=backend, hls_config=config, - part="xczu9eg-ffvb1156-2-e", + part="xczu9eg-ffvb1156-2-e", board='zcu102', - clock_period=10 + clock_period=10, ) hls_model.compile() y_hls4ml = hls_model.predict(np.ascontiguousarray(X)) @@ -224,7 +232,7 @@ def test_successful_execution_of_tiny_unet(backend): ) -# @pytest.mark.skip(reason='Skipping synthesis tests for now') +@pytest.mark.skip(reason='Skipping synthesis tests for now') @pytest.mark.parametrize('backend', backend_options) def test_successful_execution_of_tiny_unet_with_fifo_optimization(backend): """Test the correct execution of the FIFO depth optimizer.""" From b91b6414fec78cccd16e7841fe1ba1b9af9f47f8 Mon Sep 17 00:00:00 2001 From: steltze Date: Fri, 7 Mar 2025 14:20:33 +0100 Subject: [PATCH 100/103] Fix test documentation --- .../test_vitis_accelerator_ip_flow.py | 130 ++++++++++-------- 1 file changed, 71 insertions(+), 59 deletions(-) diff --git a/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py b/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py index 4b70589052..9d370186a3 100644 --- a/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py +++ b/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py @@ -20,7 +20,7 @@ def parse_cosim_report_and_search_for_bitstream(project_path): """Parse the cosimulation report to check whether the cosimulation passed or failed and therefore a deadlock is - detected. + detected and check if the bitstream was generated without errors. 
""" prj_dir = None top_func_name = None @@ -43,16 +43,18 @@ def parse_cosim_report_and_search_for_bitstream(project_path): if cosim_report_exists and bitstream_exists: return cosim_file_path, bitstream_exists + elif (not cosim_report_exists) and (not bitstream_exists): + raise FileNotFoundError("Co-simulation report and Bitstream not found.") elif not cosim_report_exists: raise FileNotFoundError("Co-simulation report not found.") - elif not bitstream_exists: - raise FileNotFoundError("Bitstream not found.") else: - raise FileNotFoundError("Co-simulation report and Bitstream not found.") + raise FileNotFoundError("Bitstream not found.") -def run_fifo_depth_optimization_keras(backend, profiling_fifo_depth, io_type, run_fifo_depth_optimization): - """Execute the FIFO depth optimization sequence on a dummy Keras model.""" +def run_bitstream_generation_keras(backend, profiling_fifo_depth, io_type, run_fifo_depth_optimization): + """Execute bitstream generation on a dummy Keras model and the FIFO optimization sequence if + `run_fifo_depth_optimization` is set. 
+ """ # create a keras model input_shape = (16, 16, 3) @@ -72,16 +74,17 @@ def run_fifo_depth_optimization_keras(backend, profiling_fifo_depth, io_type, ru config = hls4ml.utils.config_from_keras_model(model, default_precision='ap_fixed<32, 16>') - # include the FIFO Depth optimizer do the flows + # include the FIFO Depth optimizer do the flows if `run_fifo_depth_optimization` is set if run_fifo_depth_optimization: config['Flows'] = ['vitisacceleratoripflow:fifo_depth_optimization'] hls4ml.model.optimizer.get_optimizer('vitisacceleratoripflow:fifo_depth_optimization').configure( profiling_fifo_depth=profiling_fifo_depth ) - output_dir = str(test_root_path / f'hls4mlprj_vitis_accelerator_backend_{backend}') + output_dir = str( + test_root_path / f'hls4mlprj_keras_model_backend_{backend}_fifo_optimization_{run_fifo_depth_optimization}' + ) - # execute fifo optimization hls_model = hls4ml.converters.convert_from_keras_model( model, io_type=io_type, hls_config=config, output_dir=output_dir, backend=backend, clock_period=10 ) @@ -91,15 +94,16 @@ def run_fifo_depth_optimization_keras(backend, profiling_fifo_depth, io_type, ru np.testing.assert_allclose(hls_prediction, keras_prediction, rtol=0, atol=0.01) - # check that the FIFOs have been optimized succesfully + # build the hls4ml model and check if the bitstream was generated and the FIFOs were optimized if + # `run_fifo_depth_optimization` is set build_and_check(hls_model, run_fifo_depth_optimization) def build_and_check(hls_model, run_fifo_depth_optimization): """Execute the FIFO depth optimization sequence on an hls4ml model.""" - # build the new project with optimized depths and execute cosimulation to check for deadlocks - # due to the new FIFO depths + # try to generate a bitstream. 
Use the optimized FIFO depths and execute cosimulation to check for deadlocks + # due to the new FIFO depths if `run_fifo_depth_optimization` is set hls_model.build( reset=False, synth=True, csim=False, export=True, cosim=True, bitfile=True, fifo_opt=run_fifo_depth_optimization ) @@ -130,47 +134,12 @@ def build_and_check(hls_model, run_fifo_depth_optimization): def expect_exception(error, message, backend, profiling_fifo_depth, io_type): with pytest.raises(error, match=re.escape(message)): - run_fifo_depth_optimization_keras(backend, profiling_fifo_depth, io_type) - - -@pytest.mark.skip(reason='Skipping synthesis tests for now') -@pytest.mark.parametrize('backend', backend_options) -@pytest.mark.parametrize('profiling_fifo_depth', [-2, 3.14, "a"]) -def test_value_error(backend, profiling_fifo_depth): - """Test the FIFO depth optimizer with faulty inputs of profiling_fifo_depth to verify that an exception is raised.""" - message = "The FIFO depth for profiling (profiling_fifo_depth variable) must be a non-negative integer." - expect_exception(ValueError, message, backend, profiling_fifo_depth, io_type='io_stream') - - -@pytest.mark.skip(reason='Skipping synthesis tests for now') -@pytest.mark.parametrize('backend', backend_options) -def test_runtime_error(backend): - """Test the FIFO depth optimizer with io_type='io_parallel' to verify that an exception is raised.""" - message = "To use this optimization you have to set `IOType` field to `io_stream` in the HLS config." 
- expect_exception(RuntimeError, message, backend, profiling_fifo_depth=200_000, io_type='io_parallel') - - -@pytest.mark.skip(reason='Skipping synthesis tests for now') -@pytest.mark.parametrize('backend', backend_options) -def test_successful_execution_of_dummy_keras(backend): - """Test the correct execution of the FIFO depth optimizer.""" - run_fifo_depth_optimization_keras( - backend, profiling_fifo_depth=200_000, io_type='io_stream', run_fifo_depth_optimization=False - ) - - -@pytest.mark.skip(reason='Skipping synthesis tests for now') -@pytest.mark.parametrize('backend', backend_options) -def test_successful_execution_of_dummy_keras_with_fifo_optimization(backend): - """Test the correct execution of the FIFO depth optimizer.""" - run_fifo_depth_optimization_keras( - backend, profiling_fifo_depth=200_000, io_type='io_stream', run_fifo_depth_optimization=True - ) + run_bitstream_generation_keras(backend, profiling_fifo_depth, io_type) def get_branched_model(): """ - Load branched model, already channels-last and cleaned + Load branched model, already channels-last and cleaned. """ dl_file = str(example_model_path / "onnx/branched_model_ch_last.onnx") assert os.path.isfile(dl_file) @@ -178,8 +147,10 @@ def get_branched_model(): return model -def run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth, io_type, model, run_fifo_depth_optimization): - """Execute the FIFO depth optimization sequence on a ONNX/QONNX model.""" +def run_bitstream_generation_onnx(backend, profiling_fifo_depth, io_type, model, run_fifo_depth_optimization): + """Execute bitstream generation on a QONNX branched model and the FIFO optimization sequence if + `run_fifo_depth_optimization` is set. 
+ """ ishape = tuple(model.get_tensor_shape(model.graph.input[0].name)) X = np.random.uniform(low=0, high=1, size=np.prod(ishape)).reshape(ishape) @@ -200,7 +171,9 @@ def run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth, io_type, mod profiling_fifo_depth=profiling_fifo_depth ) - output_dir = str(test_root_path / f'hls4mlprj_fifo_depth_optimization_branched_model_backend_{backend}') + output_dir = str( + test_root_path / f'hls4mlprj_branched_model_backend_{backend}_fifo_optimization_{run_fifo_depth_optimization}' + ) hls_model = hls4ml.converters.convert_from_onnx_model( model, @@ -221,9 +194,48 @@ def run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth, io_type, mod @pytest.mark.skip(reason='Skipping synthesis tests for now') @pytest.mark.parametrize('backend', backend_options) -def test_successful_execution_of_tiny_unet(backend): - """Test the correct execution of the FIFO depth optimizer.""" - run_fifo_depth_optimization_onnx( +@pytest.mark.parametrize('profiling_fifo_depth', [-2, 3.14, "a"]) +def test_value_error(backend, profiling_fifo_depth): + """Test the FIFO depth optimizer with faulty inputs of profiling_fifo_depth to verify that an exception is raised.""" + message = "The FIFO depth for profiling (profiling_fifo_depth variable) must be a non-negative integer." + expect_exception( + ValueError, message, backend, profiling_fifo_depth, io_type='io_stream', run_fifo_depth_optimization=True + ) + + +@pytest.mark.skip(reason='Skipping synthesis tests for now') +@pytest.mark.parametrize('backend', backend_options) +def test_runtime_error(backend): + """Test the FIFO depth optimizer with io_type='io_parallel' to verify that an exception is raised.""" + message = "To use this optimization you have to set `IOType` field to `io_stream` in the HLS config." 
+ expect_exception( + RuntimeError, message, backend, profiling_fifo_depth=200_000, io_type='io_parallel', run_fifo_depth_optimization=True + ) + + +@pytest.mark.skip(reason='Skipping synthesis tests for now') +@pytest.mark.parametrize('backend', backend_options) +def test_successful_execution_of_dummy_keras(backend): + """Test the correct execution of the bitstream generation.""" + run_bitstream_generation_keras( + backend, profiling_fifo_depth=200_000, io_type='io_stream', run_fifo_depth_optimization=False + ) + + +@pytest.mark.skip(reason='Skipping synthesis tests for now') +@pytest.mark.parametrize('backend', backend_options) +def test_successful_execution_of_dummy_keras_with_fifo_optimization(backend): + """Test the correct execution of the bitstream generation with the FIFO depth optimizer.""" + run_bitstream_generation_keras( + backend, profiling_fifo_depth=200_000, io_type='io_stream', run_fifo_depth_optimization=True + ) + + +@pytest.mark.skip(reason='Skipping synthesis tests for now') +@pytest.mark.parametrize('backend', backend_options) +def test_successful_execution_of_branched_model(backend): + """Test the correct execution of the bitstream generation.""" + run_bitstream_generation_onnx( backend, profiling_fifo_depth=200_000, io_type='io_stream', @@ -232,11 +244,11 @@ def test_successful_execution_of_tiny_unet(backend): ) -@pytest.mark.skip(reason='Skipping synthesis tests for now') +# @pytest.mark.skip(reason='Skipping synthesis tests for now') @pytest.mark.parametrize('backend', backend_options) -def test_successful_execution_of_tiny_unet_with_fifo_optimization(backend): - """Test the correct execution of the FIFO depth optimizer.""" - run_fifo_depth_optimization_onnx( +def test_successful_execution_of_branched_model_with_fifo_optimization(backend): + """Test the correct execution of the bitstream generation with the FIFO depth optimizer.""" + run_bitstream_generation_onnx( backend, profiling_fifo_depth=200_000, io_type='io_stream', From 
5bc54d3d08cb2f1c36e24734aa0aca13a5042d8b Mon Sep 17 00:00:00 2001 From: steltze Date: Fri, 7 Mar 2025 16:36:19 +0100 Subject: [PATCH 101/103] Fix vivado project path in the build tcl for zcu102 --- .../zcu102/tcl_scripts/axi_stream_design.tcl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/tcl_scripts/axi_stream_design.tcl index 34f5468e7e..689186eb5f 100644 --- a/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/tcl_scripts/axi_stream_design.tcl +++ b/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/tcl_scripts/axi_stream_design.tcl @@ -2,7 +2,7 @@ set tcldir [file dirname [info script]] source [file join $tcldir project.tcl] -create_project project_1 ${project_name}_vivado_accelerator -part xczu9eg-ffvb1156-2-e -force +create_project project_1 ${project_name}_vitis_accelerator_ip_flow -part xczu9eg-ffvb1156-2-e -force set_property board_part xilinx.com:zcu102:part0:3.3 [current_project] set_property ip_repo_paths ${project_name}_prj [current_project] @@ -52,9 +52,9 @@ connect_bd_intf_net [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM] [get_bd_intf_pins $ apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ultra_ps_e_1/pl_clk0 (99 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins ${project_name}_axi_0/ap_clk] -make_wrapper -files [get_files ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top +make_wrapper -files [get_files ./${project_name}_vitis_accelerator_ip_flow/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top -add_files -norecurse ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v +add_files -norecurse ./${project_name}_vitis_accelerator_ip_flow/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v reset_run impl_1 reset_run synth_1 From 
0a0d7d1a3db19521ef90a59bc714364e5082a264 Mon Sep 17 00:00:00 2001 From: steltze Date: Fri, 7 Mar 2025 18:48:16 +0100 Subject: [PATCH 102/103] Skip all tests --- test/pytest/test_backend/test_vitis_accelerator_ip_flow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py b/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py index 9d370186a3..00af95fce6 100644 --- a/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py +++ b/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py @@ -244,7 +244,7 @@ def test_successful_execution_of_branched_model(backend): ) -# @pytest.mark.skip(reason='Skipping synthesis tests for now') +@pytest.mark.skip(reason='Skipping synthesis tests for now') @pytest.mark.parametrize('backend', backend_options) def test_successful_execution_of_branched_model_with_fifo_optimization(backend): """Test the correct execution of the bitstream generation with the FIFO depth optimizer.""" From e55c52e43e839147c6b140cf91ff15eb45df613b Mon Sep 17 00:00:00 2001 From: steltze Date: Thu, 13 Mar 2025 18:21:50 +0100 Subject: [PATCH 103/103] Link backend fifo optimization options --- .../vitis_accelerator_ip_flow_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py index ab0f49f585..f1f16a1e83 100644 --- a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py +++ b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py @@ -34,7 +34,7 @@ def build( validation=validation, export=export, vsynth=vsynth, - fifo_opt=True, + fifo_opt=fifo_opt, ) # now make a bitfile