From 8be3ccec127524a838b3407e6e20380037038d5e Mon Sep 17 00:00:00 2001
From: EJ Kreinar <ejkreinar@case.edu>
Date: Wed, 22 Aug 2018 21:41:29 -0400
Subject: [PATCH 1/4] nnet_layer: Fix io_serial pragmas for dense layer with
 flattened weights

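With the weights now flattened to a 1-D array, a complete
dim=1 reshape touches the entire array; a cyclic
ARRAY_PARTITION of factor n_out instead exposes one bank
per output. As a sketch of how a cyclic partition spreads
elements (hypothetical factor F = 3 over 6 weights):

    bank 0: weights[0], weights[3]
    bank 1: weights[1], weights[4]
    bank 2: weights[2], weights[5]

Element weights[i] lands in bank i % F, so n_out
consecutive weights can be read in a single cycle.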
---
 nnet_utils/nnet_layer.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/nnet_utils/nnet_layer.h b/nnet_utils/nnet_layer.h
index f845c122c..b4b32647f 100644
--- a/nnet_utils/nnet_layer.h
+++ b/nnet_utils/nnet_layer.h
@@ -74,9 +74,10 @@ void compute_layer(
         #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation
 
     } else if (CONFIG_T::io_type == io_serial){
-        #pragma HLS ARRAY_RESHAPE variable=weights complete dim=1
-        #pragma HLS ARRAY_PARTITION variable=mult complete dim=1
-        #pragma HLS ARRAY_PARTITION variable=acc complete dim=1
+        unsigned int cycle_factor = CONFIG_T::n_out;
+        #pragma HLS ARRAY_PARTITION variable=weights cyclic factor=cycle_factor
+        #pragma HLS ARRAY_PARTITION variable=mult cyclic factor=cycle_factor
+        #pragma HLS ARRAY_PARTITION variable=acc complete
         #pragma HLS DATAFLOW
         #pragma HLS STREAM variable=mult depth=1
         #pragma HLS STREAM variable=acc depth=1
-- 
GitLab


From f77c28596d7e473083b37f03ce156817266c6f04 Mon Sep 17 00:00:00 2001
From: EJ Kreinar <ejkreinar@case.edu>
Date: Wed, 29 Aug 2018 22:18:31 -0400
Subject: [PATCH 2/4] nnet_activation: Fix softmax RTL simulation for io_serial

When the input is in serial mode, array indexing must be
performed in order, one element at a time.

Solution: Create a data_cache variable that pre-loads the
data in order before running the nested for-loop.

When using io_parallel, the outer PIPELINE directive
should ensure that the data_cache = data copy takes no
extra clock cycles or resources.
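
A minimal sketch of the resulting access pattern (types
and names as in the diff below):

    data_T data_cache[CONFIG_T::n_in];
    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
        data_cache[ii] = data[ii];  // in-order, one element per read
    }
    // the nested loop can then index data_cache[ii] and
    // data_cache[jj] in any order without violating the
    // serial ordering of reads from data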
---
 nnet_utils/nnet_activation.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/nnet_utils/nnet_activation.h b/nnet_utils/nnet_activation.h
index 10d5930dd..3df813529 100644
--- a/nnet_utils/nnet_activation.h
+++ b/nnet_utils/nnet_activation.h
@@ -233,12 +233,11 @@ void  softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in])
     // Index into the lookup table based on data for exponentials
     typename CONFIG_T::table_t exp_res[CONFIG_T::n_in];// different, independent, fixed point precision
     typename CONFIG_T::table_t exp_diff_res[CONFIG_T::n_in][CONFIG_T::n_in];// different, independent, fixed point precision
+    data_T data_cache[CONFIG_T::n_in];
     int data_round;
     int index;
     for (int ii=0; ii<CONFIG_T::n_in; ii++) {
-      if (CONFIG_T::io_type == io_serial){
-            #pragma HLS UNROLL
-      }
+      data_cache[ii] = data[ii];
       exp_res[ii] = 0;
     }
     for (int ii=0; ii<CONFIG_T::n_in; ii++) {
@@ -248,7 +247,7 @@ void  softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in])
         }
 	if (ii==jj) exp_diff_res[ii][jj] = 1;
 	else {
-	  data_round = (data[jj]-data[ii])*CONFIG_T::table_size/16;
+	  data_round = (data_cache[jj]-data_cache[ii])*CONFIG_T::table_size/16;
 	  index = data_round + 8*CONFIG_T::table_size/16;
 	  if (index < 0)   index = 0;
 	  if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1;
-- 
GitLab


From d19fe40cd727f87df65118b71d702d83438a6999 Mon Sep 17 00:00:00 2001
From: EJ Kreinar <ejkreinar@case.edu>
Date: Thu, 30 Aug 2018 22:00:39 -0400
Subject: [PATCH 3/4] nnet_layer: Use BRAMs when synthesizing dense layer

Enable BRAMs with the "store_weights_in_bram" parameter.

There is some extra logic here to reduce the number of
BRAMs consumed when the reuse_factor is increased (but
this only takes effect if the output layer size is evenly
divisible by the selected reuse_factor).
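
For example (hypothetical sizes): with n_out = 64 and
reuse_factor = 4, 64/4 = 16 is an integer, so cycle_factor
drops from 64 to 16 and the weights are split across 16
cyclic banks instead of 64. With reuse_factor = 3, 64/3 is
not an integer, so cycle_factor stays at 64.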
---
 nnet_utils/nnet_layer.h | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/nnet_utils/nnet_layer.h b/nnet_utils/nnet_layer.h
index b4b32647f..e8613e256 100644
--- a/nnet_utils/nnet_layer.h
+++ b/nnet_utils/nnet_layer.h
@@ -74,13 +74,23 @@ void compute_layer(
         #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation
 
     } else if (CONFIG_T::io_type == io_serial){
-        unsigned int cycle_factor = CONFIG_T::n_out;
+        // Only reduce cycle_factor if n_out is evenly divisible by reuse_factor
+        // Otherwise, HLS won't be happy
+        int cycle_factor = CONFIG_T::n_out;
+        float reused_cycle = (float) CONFIG_T::n_out / CONFIG_T::reuse_factor;
+        if (reused_cycle == ceil(reused_cycle)){
+            // Don't use "ceil" here; as of 2018.2, HLS crashes mysteriously
+            cycle_factor = cycle_factor / CONFIG_T::reuse_factor;
+        }
         #pragma HLS ARRAY_PARTITION variable=weights cyclic factor=cycle_factor
         #pragma HLS ARRAY_PARTITION variable=mult cyclic factor=cycle_factor
         #pragma HLS ARRAY_PARTITION variable=acc complete
         #pragma HLS DATAFLOW
         #pragma HLS STREAM variable=mult depth=1
         #pragma HLS STREAM variable=acc depth=1
+        if (CONFIG_T::store_weights_in_bram){
+            #pragma HLS RESOURCE variable=weights core=ROM_2P_BRAM
+        }
     }
     
     // Do the matrix-multiply
-- 
GitLab


From 6eac53ed951fcb11a7b26fca0d2bb00ba9446153 Mon Sep 17 00:00:00 2001
From: EJ Kreinar <ejkreinar@case.edu>
Date: Tue, 4 Sep 2018 21:14:10 -0400
Subject: [PATCH 4/4] sublayers: Disable sublayer partitioning if using
 io_serial

- hls_writer: Ignore the "n_part" partitioning when using serial mode
- keras-to-hls: Don't create partitions when using serial mode
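
For reference, a hypothetical excerpt of the YAML config
that these checks read (only the relevant key shown):

    IOType: io_serial   # io_parallel keeps the n_part sublayer splitting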
---
 hls-writer/hls_writer.py     | 3 ++-
 keras-to-hls/keras-to-hls.py | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/hls-writer/hls_writer.py b/hls-writer/hls_writer.py
index 3b247f7dc..b2c5c669d 100644
--- a/hls-writer/hls_writer.py
+++ b/hls-writer/hls_writer.py
@@ -148,7 +148,8 @@ def hls_writer(layer_list, yamlConfig):
                     if yamlConfig["IOType"] == "io_parallel": newline += '    #pragma HLS ARRAY_PARTITION variable=logits{} complete dim=0\n'.format(i)
                     if yamlConfig["IOType"] == "io_serial":   newline += '    #pragma HLS STREAM variable=logits{} depth=1\n'.format(i)
                     
-                    if layer_list[i-1]['n_part']==1: 
+                    if layer_list[i-1]['n_part']==1 or yamlConfig["IOType"]=="io_serial":
+                        # Use one layer if there's only 1 partition, or if we're using serial mode
                         newline += '    nnet::compute_layer<{}, {}, config{}>({}, logits{}, w{}, b{});\n'.format(input_type, output_type, i, input_object, i, i, i, i)
                     else:
                         # initialize arrays for sublayer outputs
diff --git a/keras-to-hls/keras-to-hls.py b/keras-to-hls/keras-to-hls.py
index a9825b48c..21af26cd4 100644
--- a/keras-to-hls/keras-to-hls.py
+++ b/keras-to-hls/keras-to-hls.py
@@ -134,7 +134,7 @@ def main():
             # if this layer is too big (more than MAXMULT multiplications); 
             # break it out into chunks!
             layer['n_subout']=[weights.shape[1]]
-            if layer['n_in']*layer['n_out']>MAXMULT:
+            if layer['n_in']*layer['n_out']>MAXMULT and yamlConfig["IOType"] != "io_serial":
                 n_subout = int(MAXMULT/layer['n_in'])
                 n_totout = 0
                 layer['n_subout'] = []
-- 
GitLab