[#4] Fixes bugs in nn_conv2d_hstrip_tail_deep() and nn_conv2d_hstrip_tail_deep_padded().

The assembly implementations of those two functions misbehaved under certain conditions. In particular, the failing cases appear to require that the input image has no input channel tail (its channel count is a multiple of 32) and that the convolution window contains multiple non-padding columns. There are likely additional necessary conditions; otherwise I would expect this to have manifested sooner.
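As a sketch of the failure mode: before this change, the "shl c_out_tail, c_out_tail, 1" instruction sat after the .L_*_tail_end labels, so it executed on every column iteration even when the channel-tail block was branched over; the fix moves it into the tail block so it runs only on the path that consumes it. Below is a hypothetical scalar model of the buggy loop (illustrative only, not the kernel code):

#include <stdio.h>

/* Hypothetical model of the buggy column loop; variable names mirror the
 * assembly, but the loop body is invented for illustration. */
int main(void) {
    int c_out_tail = 12;  /* e.g. C_out % 16 */
    int c_in_tail  = 0;   /* C_in % 32 == 0, so the tail block is skipped */
    for (int cols_left = 3; cols_left > 0; cols_left--) {
        if (c_in_tail) {
            /* ...tail VLMACCRs that consume the doubled c_out_tail... */
        }
        c_out_tail <<= 1;  /* pre-fix position: ran every column regardless */
    }
    printf("c_out_tail = %d\n", c_out_tail);  /* silently corrupted: 12 -> 96 */
    return 0;
}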
Aaron Stewart committed Jan 14, 2021
1 parent 3ff4129 commit a5f0c8b
Showing 6 changed files with 324 additions and 10 deletions.
25 changes: 24 additions & 1 deletion lib_nn/src/asm/util/deep/nn_conv2d_hstrip_deep_padded.S
@@ -589,25 +589,48 @@ FUNCTION_NAME:
#else

{ shl r11, _32, 3 ; ldw tmp, sp[STACK_BSO] }
// Change to 16-bit mode
{ ldaw r11, sp[STACK_VEC_TMP2] ; vsetc r11 }
// Apply the shift1 to the accumulators
{ add tmp, tmp, _32 ; vlsat tmp[0] }
// Store the 16-bit post-shift1 values to vec_tmp2
{ ; vstr r11[0] }
// Load scale into vC
{ add tmp, tmp, _32 ; vldc tmp[0] }
// Clear the accumulators
{ ; vclrdr }
// Apply scale to the 16-bit post-shift1 values
{ ; vlmacc r11[0] }
// Load offset_scale into vC
{ add tmp, tmp, _32 ; vldc tmp[0] }
// Add offset*offset_scale to the accumulators
{ add tmp, tmp, _32 ; vlmacc tmp[0] }

// Apply shift2 to the accumulators
{ ; vlsat tmp[0] }
// Store 16-bit post-shift2 values into vec_tmp2
// For values other than -128, this will be the final value, except it's still encoded as
// 16 bits. We'll reduce the bit-depth to 8 bits later.
{ ldaw r11, cp[VPU_VEC_0x007F] ; vstr r11[0] }
// Add 127 to vR
{ ldaw r11, sp[STACK_TMP] ; vladd r11[0] }
// Any elements of vR that were -128 before the previous instruction are now -1,
// and all other elements are non-negative. The vdepth1 creates 16-bit bitmask in
// vR[0] with a 1 for any element that was -128. (The output Y[] has already had
// -128s written to all the elements, so this mask just says which elements to not
// overwrite).
{ mkmsk Q(rows_left), 4 ; vdepth1 }
// Store the bitmask in tmp (note: stored as 32-bit word, with the top 16 bits all being 0)
vstrpv r11[0], Q(rows_left)
{ ldc Q(rows_left), 0 ; ldw Q(cig_left), sp[STACK_Y] }
{ ldaw r11, sp[STACK_VEC_TMP2] ; sub Q(rows_left), Q(rows_left), 8 }
// reload the 16-bit post-shift2 values, left-shifting them all 8 bits (i.e. multiply by 256)
vlashr r11[0], Q(rows_left)
// load the bitmask into a register
{ mkmsk r11, 16 ; ldw Q(rows_left), sp[STACK_TMP] }
// r11 <-- 0xFFFF & ~(bitmask)
// Reduce bit-depth of elements to 8 bits (i.e. divide by 256)
{ andnot r11, Q(rows_left) ; vdepth8 }
// Store final results in Y[]
vstrpv Q(cig_left)[0], r11
{ shl r11, _32, 4 ; ldw tmp, sp[STACK_Y_H_STRIDE] }
{ add Q(cig_left), Q(cig_left), tmp ; vsetc r11 }
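For reference, a scalar restatement of the output stage annotated above (a sketch under assumed semantics; the helper names and exact rounding are illustrative, not lib_nn API). The -128 mask trick exists presumably because the VPU's saturating narrowing cannot produce -128, so those elements are pre-written to Y[] and excluded from the final masked store:

#include <stdint.h>

/* Scalar sketch of the per-channel output stage; illustrative only. */
static int32_t ashr(int32_t x, int s) { return (s >= 0) ? (x >> s) : (x << -s); }
static int16_t sat16(int32_t x) {
    return (int16_t)(x > INT16_MAX ? INT16_MAX : (x < INT16_MIN ? INT16_MIN : x));
}

static int8_t output_stage(int32_t acc, int16_t shift1, int16_t scale,
                           int16_t offset_scale, int16_t offset, int16_t shift2) {
    int16_t v = sat16(ashr(acc, shift1));           /* vlsat with shift1        */
    int32_t a = (int32_t)v * scale;                 /* vlmacc with scale        */
    a += (int32_t)offset * (int32_t)offset_scale;   /* vlmacc with offset_scale */
    int16_t w = sat16(ashr(a, shift2));             /* vlsat with shift2        */
    if (w == -128)       /* vladd 127 / vdepth1 mask: keep the -128 that was */
        return -128;     /* pre-written to Y[]                               */
    /* vlashr by -8 then vdepth8: net saturating 16-to-8-bit narrowing */
    return (int8_t)(w > 127 ? 127 : (w < -127 ? -127 : w));
}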
2 changes: 1 addition & 1 deletion lib_nn/src/asm/util/deep/nn_conv2d_hstrip_tail_deep.S
@@ -217,9 +217,9 @@ FUNCTION_NAME:
{ add tmp, tmp, k_cout_stride ; vlmaccr tmp[0] }
{ add tmp, tmp, k_cout_stride ; vlmaccr tmp[0] }
{ add K, K, c_in_tail ; vlmaccr tmp[0] }
{ shl c_out_tail, c_out_tail, 1 ; }

.L_center_tail_end:
{ shl c_out_tail, c_out_tail, 1 ; }
{ ldaw tmp, sp[STACK_VEC_TMP2] ; bt cols_left, .L_center_start }
.L_center_end:
// // Added to X back up at the start of the row
28 changes: 22 additions & 6 deletions lib_nn/src/asm/util/deep/nn_conv2d_hstrip_tail_deep_padded.S
@@ -90,28 +90,40 @@ FUNCTION_NAME:
ldaw r11, cp[vpu_vects]
{ ldaw r11, cp[0] ; set cp, r11 }
{ ; stw r11, sp[STACK_CP] }

// Push the start addresses of Y, X and K onto the stack
{ ; stw r0, sp[STACK_Y] }
{ ; stw r1, sp[STACK_X] }
{ ldc r0, 32 ; stw r2, sp[STACK_K] }
{ shl r11, r0, 4 ; }
// Set the VPU mode to 8-bit
{ mov r11, r1 ; vsetc r11 }
// Clear vD:vR
{ ldaw r1, sp[STACK_VEC_TMP1] ; vclrdr }
// Store zeros to vec_tmp1
{ ; vstr r1[0] }
// Load biases into vD:vR
{ add r11, r3, r0 ; vldd r3[0] }
{ add r11, r11, r0 ; vldr r11[0] }
// Push BSO pointer onto stack
{ ; stw r11, sp[STACK_BSO] }

// r0 <-- K_h - pad_t - pad_b
// r0 here is the number of non-padding rows within the convolution window
{ ; ldw r0, sp[STACK_K_H] }
{ ; ldw r1, sp[STACK_PAD_T] }
{ sub r0, r0, r1 ; ldw r1, sp[STACK_PAD_B] }
{ sub r0, r0, r1 ; }
{ ; stw r0, sp[STACK_PATCH_ROWS] }

// r0 <-- [Hori stride] * [X channels]
// This is the number of bytes that the convolution window pointer gets incremented for each *output column*.
{ ; ldw r0, sp[STACK_K_h_stride] }
{ ; ldw r1, sp[STACK_C_IN] }
mul r0, r0, r1
{ shr r0, r1, 5 ; stw r0, sp[STACK_WIN_H_STRIDE] }
// [input channel groups] <-- [X channels] / 32
{ zext r1, 5 ; stw r0, sp[STACK_C_IN_GROUPS] }
// [input channel tail] <-- [X channels] % 32
{ ; stw r1, sp[STACK_C_IN_TAIL] }
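// Restated in scalar form (a sketch with illustrative names, not lib_nn API),
// the setup bookkeeping above computes:
//
//   typedef struct {
//       int patch_rows;    /* non-padding rows inside the convolution window   */
//       int win_h_stride;  /* bytes the window pointer advances per out column */
//       int c_in_groups;   /* complete 32-channel input groups                 */
//       int c_in_tail;     /* leftover input channels, C_in % 32               */
//   } hstrip_setup_t;
//
//   static hstrip_setup_t hstrip_setup(int K_h, int pad_t, int pad_b,
//                                      int K_h_stride, int C_in) {
//       hstrip_setup_t s;
//       s.patch_rows   = K_h - pad_t - pad_b;
//       s.win_h_stride = K_h_stride * C_in;
//       s.c_in_groups  = C_in >> 5;  /* shr r0, r1, 5 */
//       s.c_in_tail    = C_in & 31;  /* zext r1, 5    */
//       return s;
//   }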

// To move N accumulators from the beginning of vD:vR to the end of vD:vR (which is where
@@ -219,9 +231,10 @@ FUNCTION_NAME:
{ add tmp, tmp, k_cout_stride ; vlmaccr tmp[0] }
{ add tmp, tmp, k_cout_stride ; vlmaccr tmp[0] }
{ add K, K, c_in_tail ; vlmaccr tmp[0] }
{ shl c_out_tail, c_out_tail, 1 ; }

.L_pad_t_tail_end:
{ shl c_out_tail, c_out_tail, 1 ; bt cols_left, .L_pad_t_col_start }
{ ; bt cols_left, .L_pad_t_col_start }
.L_pad_t_col_end:
{ ; ldw tmp, sp[STACK_X_V_STRIDE] }
{ add X, X, tmp ; bt rows_left, .L_pad_t_row_start }
@@ -311,9 +324,10 @@ FUNCTION_NAME:
{ add tmp, tmp, k_cout_stride ; vlmaccr tmp[0] }
{ add tmp, tmp, k_cout_stride ; vlmaccr tmp[0] }
{ add K, K, c_in_tail ; vlmaccr tmp[0] }
{ shl c_out_tail, c_out_tail, 1 ; }

.L_pad_b_tail_end:
{ shl c_out_tail, c_out_tail, 1 ; bt cols_left, .L_pad_b_col_start }
{ ; bt cols_left, .L_pad_b_col_start }
.L_pad_b_col_end:
{ ldaw Q(X), sp[STACK_VEC_ADJ_B_HI] ; bt rows_left, .L_pad_b_row_start }
.L_pad_b_row_end:
@@ -455,9 +469,10 @@ FUNCTION_NAME:
{ add tmp, tmp, k_cout_stride ; vlmaccr tmp[0] }
{ add tmp, tmp, k_cout_stride ; vlmaccr tmp[0] }
{ add K, K, c_in_tail ; vlmaccr tmp[0] }
{ shl c_out_tail, c_out_tail, 1 ; }

.L_pad_l_tail_end:
{ shl c_out_tail, c_out_tail, 1 ; bt cols_left, .L_pad_l_start }
{ ; bt cols_left, .L_pad_l_start }
.L_pad_l_end:
{ ; ldw cols_left, sp[STACK_CENTER_COLS] }
{ ldaw tmp, sp[STACK_VEC_TMP2] ; bf cols_left, .L_center_end }
@@ -528,9 +543,9 @@ FUNCTION_NAME:
{ add tmp, tmp, k_cout_stride ; vlmaccr tmp[0] }
{ add tmp, tmp, k_cout_stride ; vlmaccr tmp[0] }
{ add K, K, c_in_tail ; vlmaccr tmp[0] }
{ shl c_out_tail, c_out_tail, 1 ; }

.L_center_tail_end:
{ shl c_out_tail, c_out_tail, 1 ; }
{ ldaw tmp, sp[STACK_VEC_TMP2] ; bt cols_left, .L_center_start }
.L_center_end:
{ ; ldw cols_left, sp[STACK_PAD_R] }
@@ -601,9 +616,10 @@ FUNCTION_NAME:
{ add tmp, tmp, k_cout_stride ; vlmaccr tmp[0] }
{ add tmp, tmp, k_cout_stride ; vlmaccr tmp[0] }
{ add K, K, c_in_tail ; vlmaccr tmp[0] }
{ shl c_out_tail, c_out_tail, 1 ; }

.L_pad_r_tail_end:
{ shl c_out_tail, c_out_tail, 1 ; bt cols_left, .L_pad_r_start }
{ ; bt cols_left, .L_pad_r_start }
.L_pad_r_end:
// // Added to X back up at the start of the row
{ sub rows_left, rows_left, 1 ; ldw tmp, sp[STACK_X_V_STRIDE] }
87 changes: 86 additions & 1 deletion test/unit_test/src/adv/deep/test_nn_conv2d_hstrip_tail_deep.c
@@ -482,6 +482,90 @@ void test_nn_conv2d_hstrip_tail_deep_case3()







#define CHANS_IN (32)
#define CHANS_OUT (28)
#define X_HEIGHT (8)
#define X_WIDTH (7)
#define Y_HEIGHT (1)
#define Y_WIDTH (1)
#define K_h (8)
#define K_w (7)
#define K_hstride (1)
void test_nn_conv2d_hstrip_tail_deep_case4()
{
PRINTF("%s...\n", __func__);

struct {
int32_t bias[CHANS_OUT];
int16_t shift1[CHANS_OUT];
int16_t scale[CHANS_OUT];
int16_t offset_scale[CHANS_OUT];
int16_t offset[CHANS_OUT];
int16_t shift2[CHANS_OUT];
} BSO;

nn_image_t WORD_ALIGNED X[X_HEIGHT][X_WIDTH][CHANS_IN];
nn_tensor_t WORD_ALIGNED K[CHANS_OUT][K_h][K_w][CHANS_IN];
nn_bso_block_t bso[BSO_BLOCK_COUNT(CHANS_OUT)];
nn_image_t WORD_ALIGNED Y[Y_HEIGHT][Y_WIDTH][CHANS_OUT];

nn_image_params_t x_params = { X_HEIGHT, X_WIDTH, CHANS_IN };
nn_image_params_t y_params = { Y_HEIGHT, Y_WIDTH, CHANS_OUT };

memset(X, 1, sizeof(X));
memset(K, 1, sizeof(K));

for(int k = 0; k < y_params.channels; k++){
BSO.bias[k] = 32;
BSO.shift1[k] = 0;
BSO.scale[k] = 1;
BSO.offset_scale[k] = 0;
BSO.offset[k] = 0;
BSO.shift2[k] = 5;
}

nn_standard_BSO_layout(bso, (int32_t*) &BSO.bias, (int16_t*) &BSO.shift1,
(int16_t*) &BSO.scale, (int16_t*) &BSO.offset_scale,
(int16_t*) &BSO.offset, (int16_t*) &BSO.shift2, NULL,
y_params.channels);

mem_stride_t k_cout_stride = -K_h*K_w*x_params.channels;
nn_tensor_t* K_init = &K[CHANS_OUT-1][0][0][0];

memset(Y, 0xCC, sizeof(Y));
nn_conv2d_hstrip_tail_deep( &Y[0][0][16], (nn_image_t*) X, K_init,
(nn_bso_block_t*) &bso, K_h, K_w, K_hstride, x_params.channels,
(x_params.width-K_w)*x_params.channels, k_cout_stride,
y_params.channels, Y_WIDTH, y_params.channels % 16);

for(unsigned row = 0; row < y_params.height; row++){
for(unsigned col = 0; col < y_params.width; col++){
for(unsigned chn = 16; chn < y_params.channels; chn++){
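// Expected value (assuming this reading of the pipeline): with X and K all 1s,
// acc = bias + K_h*K_w*CHANS_IN = 32 + 8*7*32 = 1824; shift1 = 0 and scale = 1
// leave it unchanged, and shift2 = 5 gives 1824 >> 5 = 57.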

int8_t y_exp = 57;

TEST_ASSERT_EQUAL(y_exp, Y[row][col][chn]);
}
}
}
}
#undef CHANS_IN
#undef CHANS_OUT
#undef X_HEIGHT
#undef X_WIDTH
#undef Y_HEIGHT
#undef Y_WIDTH
#undef K_h
#undef K_w
#undef K_hstride




void test_nn_conv2d_hstrip_tail_deep()
{
UNITY_SET_FILE();
@@ -490,4 +574,5 @@ void test_nn_conv2d_hstrip_tail_deep()
RUN_TEST(test_nn_conv2d_hstrip_tail_deep_case1);
RUN_TEST(test_nn_conv2d_hstrip_tail_deep_case2);
RUN_TEST(test_nn_conv2d_hstrip_tail_deep_case3);
}
RUN_TEST(test_nn_conv2d_hstrip_tail_deep_case4);
}
test/unit_test/src/adv/deep/test_nn_conv2d_hstrip_tail_deep_padded.c
@@ -1077,6 +1077,101 @@ void test_nn_conv2d_hstrip_tail_deep_padded_case6()





#define CHANS_IN (32)
#define CHANS_OUT (28)
#define X_HEIGHT (8)
#define X_WIDTH (7)
#define Y_HEIGHT (1)
#define Y_WIDTH (1)
#define K_h (9)
#define K_w (7)
#define K_hstride (2)
void test_nn_conv2d_hstrip_tail_deep_padded_case7()
{
PRINTF("%s...\n", __func__);

struct {
int32_t bias[CHANS_OUT];
int16_t shift1[CHANS_OUT];
int16_t scale[CHANS_OUT];
int16_t offset_scale[CHANS_OUT];
int16_t offset[CHANS_OUT];
int16_t shift2[CHANS_OUT];
} BSO;

nn_image_t WORD_ALIGNED X[X_HEIGHT][X_WIDTH][CHANS_IN];
nn_tensor_t WORD_ALIGNED K[CHANS_OUT][K_h][K_w][CHANS_IN];
nn_bso_block_t bso[BSO_BLOCK_COUNT(CHANS_OUT)];
nn_image_t WORD_ALIGNED Y[Y_HEIGHT][Y_WIDTH][CHANS_OUT];

nn_image_params_t x_params = { X_HEIGHT, X_WIDTH, CHANS_IN };
nn_image_params_t y_params = { Y_HEIGHT, Y_WIDTH, CHANS_OUT };

int8_t zero_point_vec[VPU_INT8_EPV];

memset(zero_point_vec, 0, sizeof(zero_point_vec));
memset(X, 1, sizeof(X));
memset(K, 1, sizeof(K));

for(int k = 0; k < y_params.channels; k++){
BSO.bias[k] = 32;
BSO.shift1[k] = 0;
BSO.scale[k] = 1;
BSO.offset_scale[k] = 0;
BSO.offset[k] = 0;
BSO.shift2[k] = 5;
}

nn_standard_BSO_layout(bso, (int32_t*) &BSO.bias, (int16_t*) &BSO.shift1,
(int16_t*) &BSO.scale, (int16_t*) &BSO.offset_scale,
(int16_t*) &BSO.offset, (int16_t*) &BSO.shift2, NULL, y_params.channels);


int pad_t = 0;
int pad_b = 1;
int pad_l = 1; //signed
int pad_r = -1; //signed
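    // Note: pad_l = 1 zero-pads one column on the left, while the negative
    // pad_r means the window's right edge stops one column short of X's right
    // edge, leaving K_w - pad_l = 6 non-padding columns.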

nn_image_t* X_patch_start = &X[-pad_t][-pad_l][0];

mem_stride_t k_cout_stride = -K_h*K_w*x_params.channels;
nn_tensor_t* K_init = &K[CHANS_OUT-1][0][0][0];


memset(Y, 0xCC, sizeof(Y));
nn_conv2d_hstrip_tail_deep_padded(&Y[0][0][16], X_patch_start, K_init,
(nn_bso_block_t*) &bso, K_h, K_w, K_hstride, x_params.channels,
pad_t, pad_b, pad_l, pad_r, (x_params.width-K_w)*x_params.channels,
k_cout_stride, y_params.channels, Y_WIDTH, zero_point_vec, y_params.channels % 16);


for(unsigned row = 0; row < y_params.height; row++){
for(unsigned col = 0; col < y_params.width; col++){
for(unsigned chn = 16; chn < y_params.channels; chn++){
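// Expected value: the non-padding patch is (K_h - pad_b) rows by
// (K_w - pad_l) cols = 8 x 6, so acc = 32 + 8*6*32 = 1568 and, with
// shift2 = 5, 1568 >> 5 = 49 (the zero-point vector is all zeros).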

int8_t y_exp = 49;

TEST_ASSERT_EQUAL(y_exp, Y[row][col][chn]);
}
}
}
}
#undef CHANS_IN
#undef CHANS_OUT
#undef X_HEIGHT
#undef X_WIDTH
#undef Y_HEIGHT
#undef Y_WIDTH
#undef K_h
#undef K_w
#undef K_hstride





void test_nn_conv2d_hstrip_tail_deep_padded()
{
UNITY_SET_FILE();
@@ -1088,4 +1183,5 @@ void test_nn_conv2d_hstrip_tail_deep_padded()
RUN_TEST(test_nn_conv2d_hstrip_tail_deep_padded_case4);
RUN_TEST(test_nn_conv2d_hstrip_tail_deep_padded_case5);
RUN_TEST(test_nn_conv2d_hstrip_tail_deep_padded_case6);
}
RUN_TEST(test_nn_conv2d_hstrip_tail_deep_padded_case7);
}