From 8be3ccec127524a838b3407e6e20380037038d5e Mon Sep 17 00:00:00 2001
From: EJ Kreinar <ejkreinar@case.edu>
Date: Wed, 22 Aug 2018 21:41:29 -0400
Subject: [PATCH 1/4] nnet_layer: Fix io_serial pragmas for dense layer with
 flattened weights

---
 nnet_utils/nnet_layer.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/nnet_utils/nnet_layer.h b/nnet_utils/nnet_layer.h
index f845c122c..b4b32647f 100644
--- a/nnet_utils/nnet_layer.h
+++ b/nnet_utils/nnet_layer.h
@@ -74,9 +74,10 @@ void compute_layer(
         #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation
 
     } else if (CONFIG_T::io_type == io_serial){
-        #pragma HLS ARRAY_RESHAPE variable=weights complete dim=1
-        #pragma HLS ARRAY_PARTITION variable=mult complete dim=1
-        #pragma HLS ARRAY_PARTITION variable=acc complete dim=1
+        unsigned int cycle_factor = CONFIG_T::n_out;
+        #pragma HLS ARRAY_PARTITION variable=weights cyclic factor=cycle_factor
+        #pragma HLS ARRAY_PARTITION variable=mult cyclic factor=cycle_factor
+        #pragma HLS ARRAY_PARTITION variable=acc complete
         #pragma HLS DATAFLOW
         #pragma HLS STREAM variable=mult depth=1
         #pragma HLS STREAM variable=acc depth=1
-- 
GitLab

From f77c28596d7e473083b37f03ce156817266c6f04 Mon Sep 17 00:00:00 2001
From: EJ Kreinar <ejkreinar@case.edu>
Date: Wed, 29 Aug 2018 22:18:31 -0400
Subject: [PATCH 2/4] nnet_activation: Fix softmax RTL simulation for io_serial

When input format is serial mode, array indexing needs to be performed
in order, one element at a time.

Solution: Create a data_cache variable that pre-loads data in order
before running the nested for-loop. When using io_parallel, the outer
PIPELINE directive should ensure that data_cache = data without taking
any extra clock cycles or resources.
---
 nnet_utils/nnet_activation.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/nnet_utils/nnet_activation.h b/nnet_utils/nnet_activation.h
index 10d5930dd..3df813529 100644
--- a/nnet_utils/nnet_activation.h
+++ b/nnet_utils/nnet_activation.h
@@ -233,12 +233,11 @@ void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in])
     // Index into the lookup table based on data for exponentials
     typename CONFIG_T::table_t exp_res[CONFIG_T::n_in];// different, independent, fixed point precision
     typename CONFIG_T::table_t exp_diff_res[CONFIG_T::n_in][CONFIG_T::n_in];// different, independent, fixed point precision
+    data_T data_cache[CONFIG_T::n_in];
     int data_round;
     int index;
     for (int ii=0; ii<CONFIG_T::n_in; ii++) {
-        if (CONFIG_T::io_type == io_serial){
-            #pragma HLS UNROLL
-        }
+        data_cache[ii] = data[ii];
         exp_res[ii] = 0;
     }
     for (int ii=0; ii<CONFIG_T::n_in; ii++) {
@@ -248,7 +247,7 @@
             }
             if (ii==jj) exp_diff_res[ii][jj] = 1;
             else {
-                data_round = (data[jj]-data[ii])*CONFIG_T::table_size/16;
+                data_round = (data_cache[jj]-data_cache[ii])*CONFIG_T::table_size/16;
                 index = data_round + 8*CONFIG_T::table_size/16;
                 if (index < 0) index = 0;
                 if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1;
-- 
GitLab

From d19fe40cd727f87df65118b71d702d83438a6999 Mon Sep 17 00:00:00 2001
From: EJ Kreinar <ejkreinar@case.edu>
Date: Thu, 30 Aug 2018 22:00:39 -0400
Subject: [PATCH 3/4] nnet_layer: Use BRAMs when synthesizing dense layer

Enable BRAMs with the "store_weights_in_bram" parameter.
There's a little extra logic in here to reduce the consumed BRAMs when
you increase the reuse_factor (but this is only confirmed if output
layer size is evenly divisible by the selected reuse_factor)
---
 nnet_utils/nnet_layer.h | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/nnet_utils/nnet_layer.h b/nnet_utils/nnet_layer.h
index b4b32647f..e8613e256 100644
--- a/nnet_utils/nnet_layer.h
+++ b/nnet_utils/nnet_layer.h
@@ -74,13 +74,23 @@ void compute_layer(
         #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation
 
     } else if (CONFIG_T::io_type == io_serial){
-        unsigned int cycle_factor = CONFIG_T::n_out;
+        // Only reduce cycle_factor if n_out is evenly divisible by reuse_factor
+        // Otherwise, HLS wont be happy
+        int cycle_factor = CONFIG_T::n_out;
+        float reused_cycle = CONFIG_T::n_out / CONFIG_T::reuse_factor;
+        if (reused_cycle == ceil(reused_cycle)){
+            // Dont use "ceil" here; as of 2018.2, HLS crashes mysteriously
+            cycle_factor = cycle_factor / CONFIG_T::reuse_factor;
+        }
         #pragma HLS ARRAY_PARTITION variable=weights cyclic factor=cycle_factor
         #pragma HLS ARRAY_PARTITION variable=mult cyclic factor=cycle_factor
         #pragma HLS ARRAY_PARTITION variable=acc complete
         #pragma HLS DATAFLOW
         #pragma HLS STREAM variable=mult depth=1
         #pragma HLS STREAM variable=acc depth=1
+        if (CONFIG_T::store_weights_in_bram){
+            #pragma HLS RESOURCE variable=weights core=ROM_2P_BRAM
+        }
     }
 
     // Do the matrix-multiply
-- 
GitLab

From 6eac53ed951fcb11a7b26fca0d2bb00ba9446153 Mon Sep 17 00:00:00 2001
From: EJ Kreinar <ejkreinar@case.edu>
Date: Tue, 4 Sep 2018 21:14:10 -0400
Subject: [PATCH 4/4] sublayers: Disable sublayer partitioning if using
 io_serial

- hls_writer: Ignore "n_part" partition when using serial mode
- keras-to-hls: Dont create partitions if using serial mode
---
 hls-writer/hls_writer.py     | 3 ++-
 keras-to-hls/keras-to-hls.py | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/hls-writer/hls_writer.py b/hls-writer/hls_writer.py
index 3b247f7dc..b2c5c669d 100644
--- a/hls-writer/hls_writer.py
+++ b/hls-writer/hls_writer.py
@@ -148,7 +148,8 @@ def hls_writer(layer_list, yamlConfig):
             if yamlConfig["IOType"] == "io_parallel": newline += '    #pragma HLS ARRAY_PARTITION variable=logits{} complete dim=0\n'.format(i)
             if yamlConfig["IOType"] == "io_serial":   newline += '    #pragma HLS STREAM variable=logits{} depth=1\n'.format(i)
 
-            if layer_list[i-1]['n_part']==1:
+            if layer_list[i-1]['n_part']==1 or yamlConfig["IOType"]=="io_serial":
+                # Use one layer if there's only 1 partition, or if we're using serial mode
                 newline += '    nnet::compute_layer<{}, {}, config{}>({}, logits{}, w{}, b{});\n'.format(input_type, output_type, i, input_object, i, i, i, i)
             else:
                 # initialize arrays for sublayer outputs
diff --git a/keras-to-hls/keras-to-hls.py b/keras-to-hls/keras-to-hls.py
index a9825b48c..21af26cd4 100644
--- a/keras-to-hls/keras-to-hls.py
+++ b/keras-to-hls/keras-to-hls.py
@@ -134,7 +134,7 @@ def main():
             # if this layer is too big (more than MAXMULT multiplications);
             # break it out into chunks!
             layer['n_subout']=[weights.shape[1]]
-            if layer['n_in']*layer['n_out']>MAXMULT:
+            if layer['n_in']*layer['n_out']>MAXMULT and yamlConfig["IOType"] != "io_serial":
                 n_subout = int(MAXMULT/layer['n_in'])
                 n_totout = 0
                 layer['n_subout'] = []
-- 
GitLab
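For readers trying to predict the resource impact of patch 3: the commit message's rule is that the cyclic partition factor only shrinks when the output size divides evenly by the reuse factor. Below is a minimal host-side C++ sketch of that rule, not part of the patches themselves. The example_config struct and its values are made up for illustration (real configs are generated by keras-to-hls), and the modulo test is an equivalent restatement of the patch's float/ceil comparison, which the diff uses to work around the 2018.2 ceil crash.

#include <cstdio>

// Hypothetical layer config for illustration only. Field names mirror the
// ones read by the patched compute_layer; the numbers are made up.
struct example_config {
    static const unsigned n_out = 10;
    static const unsigned reuse_factor = 5;
    static const bool store_weights_in_bram = true;
};

// Restates the patch's check with integer arithmetic: the cyclic partition
// factor is reduced only when n_out divides evenly by reuse_factor;
// otherwise it stays at n_out and no BRAM savings are expected.
template <class CONFIG_T>
unsigned choose_cycle_factor() {
    unsigned cycle_factor = CONFIG_T::n_out;
    if (CONFIG_T::n_out % CONFIG_T::reuse_factor == 0) {
        cycle_factor = CONFIG_T::n_out / CONFIG_T::reuse_factor;
    }
    return cycle_factor;
}

int main() {
    // n_out = 10, reuse_factor = 5 -> the weights array is split into
    // 2 cyclic banks instead of 10, so fewer BRAMs are consumed.
    std::printf("cycle_factor = %u\n", choose_cycle_factor<example_config>());
    return 0;
}

With reuse_factor = 3 instead, 10 % 3 != 0, so the factor stays at 10, matching the commit's caveat that the saving is only confirmed when the output layer size is evenly divisible by the selected reuse_factor.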