
Commit bbaea6a

Merge pull request #58 from mli_dev
Release 1.0 RC3
2 parents 01f4868 + 8aef628, commit bbaea6a

27 files changed: +778 -341 lines

README.md

Lines changed: 24 additions & 3 deletions
@@ -1,8 +1,8 @@
 embARC Machine Learning Inference Library
 ==================================================

-This repository contains source code of embARC Machine Learning Inference Library (embARC MLI Library),
-examples and documentation.
+This repository contains source code of embARC Machine Learning Inference Library (embARC MLI Library),
+documentation and examples. Read the documentation at [embarc.org](https://embarc.org/embarc_mli).

 ## Release notes
 ----------------
@@ -16,7 +16,7 @@ examples and documentation.
 * Elementwise (add, sub, mul, min, max)
 * Data manipulation (concatanation, permute, 2D padding)
 * ReLU, Leaky ReLu, ReLu1, ReLu6
-* Softmax, Sigmoid, ThanH
+* Softmax, Sigmoid, TanH
 3. Supported data layout CHW (Channel-Height-Width standard for Caffe)

 ## Package structure
@@ -73,8 +73,29 @@ Building of embARC MLI library

 5. Result Quality shall be "S/N=1823.9 (65.2 db)"

+## Optimizations for code size
+------------------------------
+By default the embARC MLI Library is built for optimal speed. If code size needs to be reduced, there are two things that can be done:
+1. For convolution and pooling layers there are specialized functions for specific kernel sizes; they are called by wrapper functions based on the parameters.
+If these parameters are compile-time constants in the application, the application can call the specialized functions directly. This reduces overall code size.
+Please be aware that the list of specializations is not guaranteed to be backwards compatible between releases.
+
+2. Use a different optimization mode when calling the makefile. OPTMODE=size will optimize for size; the default is OPTMODE=speed.
+'gmake TCF_FILE=../../hw/em9d.tcf OPTMODE=size'

 ## Known Issues
 ---------------
 1. Optimal performance for 8-bit data requires version of MetaWare Development Tools 2019.06 or later

+## Frequently Asked Questions
+---------------
+
+Q: Can I use ARC GNU tools to build the embARC MLI library?
+A: No, you cannot. The embARC MLI Library must be built with MetaWare Development Tools only. Read the documentation at [embarc.org](https://embarc.org/embarc_mli/doc/build/html/getting_started/getting_started.html#build-library) for details.
+
+Q: Can I use MetaWare Development Tools Lite to pre-build the embARC MLI library and ARC GNU tools to build an example application?
+A: No, you cannot. The embARC MLI Library must be built with the full version of MetaWare Development Tools. Binaries built with MWDT Lite are not compatible with ARC GNU Tools or the full MetaWare Development Tools. Read the MWDT Lite documentation for details.
+
+Q: I cannot build and run an example application for my Synopsys board (EMSK, IoTDK, etc.). What shall I do?
+A: If you build for Synopsys boards, the documentation at [embarc.org](https://embarc.org/platforms.html) is a good starting point.
+You should also note that the example applications support different configurations of the pre-trained models, and thus different memory requirements; not all configurations can be built and run on Synopsys boards due to memory limitations and HW capabilities. Read the example application README for details. The embARC MLI Library must also be pre-built specifically for your board with MetaWare Development Tools. Please note that the makefiles provided with the examples are configured for IoTDK only if GNU tools are used.
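
The "Optimizations for code size" section added above suggests calling specialized kernels directly when their parameters are compile-time constants. Below is a minimal illustrative sketch in C of that trade-off; the specialization name is taken from the cifar10 example change further down, while the generic entry-point name `mli_krn_maxpool_chw_fx16` and the `mli_api.h` header are assumptions based on the library's usual naming.

```c
#include "mli_api.h"  /* assumed umbrella header for the MLI kernel prototypes */

/* Generic path: a wrapper selects a specialization at run time from the pooling
 * parameters, so every specialization it might dispatch to gets linked in.
 * The generic entry-point name below is an assumption. */
static inline mli_status maxpool(const mli_tensor *in, const mli_pool_cfg *cfg, mli_tensor *out) {
    return mli_krn_maxpool_chw_fx16(in, cfg, out);
}

/* Code-size path: the 3x3 kernel size is a compile-time constant in the
 * application, so the matching specialization (name as in the cifar10 example
 * change below) is called directly and unused specializations can be dropped
 * by the linker. Note the specialization list may change between releases. */
static inline mli_status maxpool_k3x3(const mli_tensor *in, const mli_pool_cfg *cfg, mli_tensor *out) {
    return mli_krn_maxpool_chw_fx16_k3x3_krnpad(in, cfg, out);
}
```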

build/rules.mk

Lines changed: 9 additions & 0 deletions
@@ -64,6 +64,8 @@ quote=$(subst %,$(Q)%, \
 # Global settings
 #=============================================================
 TOOLCHAIN ?= gnu
+# optimization mode
+OPTMODE ?= speed

 export DEBUG_BUILD?=ON
 #export ASM_OUT?=OFF
@@ -76,6 +78,13 @@ endif
 # # CFLAGS += -Hon=Print_var_info
 #endif

+ifeq ($(OPTMODE),size)
+CFLAGS += -O2 -Hlto
+endif
+ifeq ($(OPTMODE),speed)
+CFLAGS += -O3
+endif
+
 #=============================================================
 # Files and directories
 #=============================================================

doc/documents/MLI_kernels/convolution_2d.rst

Lines changed: 1 addition & 1 deletion
@@ -49,7 +49,7 @@ inputs shape.

 For more details on calculations see convolution part of `TensorFlow–Neural Network details`_.

-.. _TensorFlow–Neural Network details: https://www.tensorflow.org/api_guides/python/nn.
+.. _TensorFlow–Neural Network details: https://www.tensorflow.org/versions/r1.11/api_guides/python/nn

 ReLU activation function might be applied to result of convolution. The
 following types of ReLU activations are supported (for more info see

doc/documents/MLI_kernels/pooling_max.rst

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ padding parameters. This logic is similar to convolution 2D operation
 For more information on calculations, see the pooling part of
 `TensorFlow–Neural Network details`_.

-.. _TensorFlow–Neural Network details: https://www.tensorflow.org/api_guides/python/nn
+.. _TensorFlow–Neural Network details: https://www.tensorflow.org/versions/r1.11/api_guides/python/nn

 .. caution::
    Ensure that input and output

doc/documents/library_model/functions.rst

Lines changed: 5 additions & 2 deletions
@@ -137,8 +137,8 @@ Naming convention for the specializations: \
 | | parameters to achieve | |
 | | same output size | |
 | | (similar to ‘SAME’ | |
-| | padding scheme used | |
-| | in TensorFlow [3]) | |
+| | `padding scheme`_ used | |
+| | in TensorFlow) | |
 +-----------------------+---------------------------+-----------------------+
 | ``Input channels`` | [_ch\ *n*] | convolution group, |
 | | | pooling group |
@@ -182,6 +182,9 @@ Naming convention for the specializations: \
 | | specializations. | |
 +-----------------------+---------------------------+-----------------------+

+.. _padding scheme: https://www.tensorflow.org/versions/r1.11/api_guides/python/nn#Notes_on_SAME_Convolution_Padding
+
+

 For example, the function name of a 16bit 2d convolution kernel with
 CHW layout and a kernel size of 3x3 and stride of 1 is:

doc/documents/library_model/hw_dependencies_config.rst

Lines changed: 1 addition & 1 deletion
@@ -109,7 +109,7 @@ round to the nearest even). All parameters are described in *MetaWare
 Fixed-Point Reference for ARC EM and ARC HS*.

 .. note::
-   The MLI Library sets the required DSP mode inside each function where it is needed, but does not restore it to previous state. If another ARC DSP code beside MLI library is used in an application, ensure that you set the required DSP mode before its execution. For more information see “Configuring the ARC DSP Extensions” section of *MetaWare DSP Programming Guide for ARC EM and ARC HS* or “Using the FXAPI” section of entry [5] of *MetaWare Fixed-Point Reference for ARC EM and ARC HS*.
+   The MLI Library sets the required DSP mode inside each function where it is needed, but does not restore it to the previous state. If other ARC DSP code besides the MLI library is used in an application, ensure that you set the required DSP mode before its execution. For more information see the “Configuring the ARC DSP Extensions” section of *MetaWare DSP Programming Guide for ARC EM and ARC HS* or the “Using the FXAPI” section of *MetaWare Fixed-Point Reference for ARC EM and ARC HS*.

 AGU Support
 ^^^^^^^^^^^
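
The note above states that each MLI kernel sets the DSP mode it needs but does not restore the previous state. A minimal sketch of the resulting call ordering, assuming the ReLU kernel prototype `mli_krn_relu_fx16(in, cfg, out)` and the `mli_api.h` header; `app_set_dsp_mode()` and `app_other_dsp_code()` are hypothetical placeholders for the application's own FXAPI / DSP-configuration code described in the MetaWare guides.

```c
#include "mli_api.h"  /* assumed umbrella header for the MLI kernel prototypes */

/* Hypothetical application helpers: stand-ins for the application's own
 * DSP-mode setup and for any non-MLI ARC DSP processing. */
void app_set_dsp_mode(void);
void app_other_dsp_code(void);

void run_pipeline(const mli_tensor *in, const mli_relu_cfg *cfg, mli_tensor *out) {
    mli_krn_relu_fx16(in, cfg, out);  /* MLI configures the DSP mode it needs here,
                                         but does not restore the previous state, */
    app_set_dsp_mode();               /* so re-establish the application's DSP mode */
    app_other_dsp_code();             /* before running other ARC DSP code.          */
}
```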

examples/example_cifar10_caffe/Makefile

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ BUILD_DIR ?= ./obj
 OUT_NAME ?= example_cifar10_caffe
 ifeq ($(TOOLCHAIN),mwdt)
 # MWDT specific options
-CFLAGS = -Hnocopyr -Hpurge -Hheap=8K -Hstack=1K -Hfxapi -e_start -Bgrouplib -Hldopt=-q -O0 -Hsdata0
+CFLAGS = -Hnocopyr -Hpurge -Hheap=8K -Hstack=1K -Hfxapi -e_start -Bgrouplib -Hldopt=-q -Hsdata0 -Xdsp_ctrl=postshift,guard,convergent -Hdense_prologue
 else
 PREBUILT_LIB ?= $(EMBARC_MLI_DIR)/examples/prebuilt/libmli.a

examples/example_cifar10_caffe/README.md

Lines changed: 15 additions & 0 deletions
@@ -106,6 +106,21 @@ More Options on Building and Running
 ---------------------------------------
 CIFAR-10 example application is implemented in the same way as LSTM Based HAR example and provides the same configuration and running abilities. For more details see appropriate HAR example [description part](/examples/example_har_smartphone/README.md#more-options-on-building-and-running).

+Data Memory Requirements
+----------------------------
+
+Example application uses statically allocated memory for model weights, intermediate results (activations) and structures. Requirements for them depend on the model bit depth
+configuration define and are listed in the table below. Before compiling the application for the desired hardware configuration, make sure it has enough memory to hold the data.
+
+| Data | MODEL_BIT_DEPTH=8 | MODEL_BIT_DEPTH=816 | MODEL_BIT_DEPTH=16 |
+| :----------------------------------------------------: | :-------------------: | :-------------------: | :------------------: |
+| Weights <br/>*.mli_model* and *mli_model_p2* sections | 33212 bytes | 33212 bytes | 66420 bytes |
+| Activations 1 <br/>*.Zdata* section | 32768 bytes | 65536 bytes | 65536 bytes |
+| Activations 2 <br/>*.Ydata* section | 8192 bytes | 16384 bytes | 16384 bytes |
+| Structures <br/>*.mli_data* section | 384 bytes | 384 bytes | 384 bytes |
+
+By default, the application uses MODEL_BIT_DEPTH=16 mode. Application code size depends on the target hardware configuration and compilation flags. MLI Library code is wrapped into the *mli_lib* section.
+
 References
 ----------------------------
 CIFAR-10 Dataset:
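
As a rough cross-check of the memory table added above, the static data footprint per configuration is simply the sum of the listed sections (application and MLI library code are extra). The macro names below are illustrative only and are not part of the example sources.

```c
/* Static data totals implied by the table above, in bytes
 * (weights sections + .Zdata + .Ydata + .mli_data). */
#define CIFAR10_STATIC_DATA_8BIT   (33212 + 32768 +  8192 + 384)   /*  74556 bytes */
#define CIFAR10_STATIC_DATA_8W16D  (33212 + 65536 + 16384 + 384)   /* 115516 bytes */
#define CIFAR10_STATIC_DATA_16BIT  (66420 + 65536 + 16384 + 384)   /* 148724 bytes */
```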

examples/example_cifar10_caffe/cifar10_model_chw.c

Lines changed: 2 additions & 2 deletions
@@ -434,7 +434,7 @@ static void check_result(
 //========================================================================================
 #if (MODEL_BIT_DEPTH != MODEL_FX_8)
 static inline mli_status maxpool_chw(const mli_tensor *in, const mli_pool_cfg *cfg, mli_tensor *out) {
-    return mli_krn_maxpool_chw_fx16_k3x3(in, cfg, out);
+    return mli_krn_maxpool_chw_fx16_k3x3_krnpad(in, cfg, out);
 }

 static inline mli_status avepool_chw(const mli_tensor *in, const mli_pool_cfg *cfg, mli_tensor *out) {
@@ -455,7 +455,7 @@ static inline mli_status mli_krn_permute_fx(const mli_tensor *in, const mli_perm

 #else // MODEL_BIT_DEPTH == (MODEL_FX_8W16D || MODEL_FX_8W16D)
 static inline mli_status maxpool_chw(const mli_tensor *in, const mli_pool_cfg *cfg, mli_tensor *out) {
-    return mli_krn_maxpool_chw_fx8_k3x3(in, cfg, out);
+    return mli_krn_maxpool_chw_fx8_k3x3_krnpad(in, cfg, out);
 }

 static inline mli_status avepool_chw(const mli_tensor *in, const mli_pool_cfg *cfg, mli_tensor *out) {

examples/example_har_smartphone/Makefile

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ BUILD_DIR ?= ./obj
 OUT_NAME ?= example_har_smartphone
 ifeq ($(TOOLCHAIN),mwdt)
 # MWDT specific options
-CFLAGS = -Hnocopyr -Hpurge -Hheap=8K -Hstack=1K -Hfxapi -e_start -Bgrouplib -Hldopt=-q -O0 -Hsdata0
+CFLAGS = -Hnocopyr -Hpurge -Hheap=8K -Hstack=1K -Hfxapi -e_start -Bgrouplib -Hldopt=-q -Hsdata0 -Xdsp_ctrl=postshift,guard,convergent -Hdense_prologue
 else
 PREBUILT_LIB ?= $(EMBARC_MLI_DIR)/examples/prebuilt/libmli.a

