diff --git a/lib_nn/src/asm/util/deep/nn_conv2d_hstrip_deep_padded.S b/lib_nn/src/asm/util/deep/nn_conv2d_hstrip_deep_padded.S
index e458b26b..87059b45 100644
--- a/lib_nn/src/asm/util/deep/nn_conv2d_hstrip_deep_padded.S
+++ b/lib_nn/src/asm/util/deep/nn_conv2d_hstrip_deep_padded.S
@@ -589,25 +589,48 @@ FUNCTION_NAME:
 #else
     { shl r11, _32, 3 ; ldw tmp, sp[STACK_BSO] }
+    // Change to 16-bit mode
     { ldaw r11, sp[STACK_VEC_TMP2] ; vsetc r11 }
+    // Apply shift1 to the accumulators
     { add tmp, tmp, _32 ; vlsat tmp[0] }
+    // Store the 16-bit post-shift1 values to vec_tmp2
     { ; vstr r11[0] }
+    // Load scale into vC
     { add tmp, tmp, _32 ; vldc tmp[0] }
+    // Clear the accumulators
     { ; vclrdr }
+    // Apply scale to the 16-bit post-shift1 values
     { ; vlmacc r11[0] }
+    // Load offset_scale into vC
     { add tmp, tmp, _32 ; vldc tmp[0] }
+    // Add offset*offset_scale to the accumulators
     { add tmp, tmp, _32 ; vlmacc tmp[0] }
-
+    // Apply shift2 to the accumulators
     { ; vlsat tmp[0] }
+    // Store the 16-bit post-shift2 values into vec_tmp2.
+    // For values other than -128, this will be the final value, except it's still encoded as
+    // 16 bits. We'll reduce the bit-depth to 8 bits later.
     { ldaw r11, cp[VPU_VEC_0x007F] ; vstr r11[0] }
+    // Add 127 to vR
     { ldaw r11, sp[STACK_TMP] ; vladd r11[0] }
+    // Any elements of vR that were -128 before the previous instruction are now -1,
+    // and all other elements are non-negative. The vdepth1 creates a 16-bit bitmask in
+    // vR[0] with a 1 for any element that was -128. (The output Y[] has already had
+    // -128s written to all the elements, so this mask just says which elements not to
+    // overwrite.)
     { mkmsk Q(rows_left), 4 ; vdepth1 }
+    // Store the bitmask in tmp (note: stored as a 32-bit word, with the top 16 bits all being 0)
     vstrpv r11[0], Q(rows_left)
     { ldc Q(rows_left), 0 ; ldw Q(cig_left), sp[STACK_Y] }
     { ldaw r11, sp[STACK_VEC_TMP2] ; sub Q(rows_left), Q(rows_left), 8 }
+    // Reload the 16-bit post-shift2 values, left-shifting them all 8 bits (i.e. multiply by 256)
     vlashr r11[0], Q(rows_left)
+    // Load the bitmask into a register
     { mkmsk r11, 16 ; ldw Q(rows_left), sp[STACK_TMP] }
+    // r11 <-- 0xFFFF & ~(bitmask)
+    // Reduce bit-depth of elements to 8 bits (i.e. divide by 256)
     { andnot r11, Q(rows_left) ; vdepth8 }
+    // Store final results in Y[]
     vstrpv Q(cig_left)[0], r11
     { shl r11, _32, 4 ; ldw tmp, sp[STACK_Y_H_STRIDE] }
     { add Q(cig_left), Q(cig_left), tmp ; vsetc r11 }
diff --git a/lib_nn/src/asm/util/deep/nn_conv2d_hstrip_tail_deep.S b/lib_nn/src/asm/util/deep/nn_conv2d_hstrip_tail_deep.S
index 79e9ebaf..835cca52 100644
--- a/lib_nn/src/asm/util/deep/nn_conv2d_hstrip_tail_deep.S
+++ b/lib_nn/src/asm/util/deep/nn_conv2d_hstrip_tail_deep.S
@@ -217,9 +217,9 @@ FUNCTION_NAME:
     { add tmp, tmp, k_cout_stride ; vlmaccr tmp[0] }
     { add tmp, tmp, k_cout_stride ; vlmaccr tmp[0] }
     { add K, K, c_in_tail ; vlmaccr tmp[0] }
+    { shl c_out_tail, c_out_tail, 1 ; }
 .L_center_tail_end:
-    { shl c_out_tail, c_out_tail, 1 ; }
     { ldaw tmp, sp[STACK_VEC_TMP2] ; bt cols_left, .L_center_start }
 .L_center_end:
     // // Added to X back up at the start of the row
diff --git a/lib_nn/src/asm/util/deep/nn_conv2d_hstrip_tail_deep_padded.S b/lib_nn/src/asm/util/deep/nn_conv2d_hstrip_tail_deep_padded.S
index 0cd8640d..44043543 100644
--- a/lib_nn/src/asm/util/deep/nn_conv2d_hstrip_tail_deep_padded.S
+++ b/lib_nn/src/asm/util/deep/nn_conv2d_hstrip_tail_deep_padded.S
@@ -90,28 +90,40 @@ FUNCTION_NAME:
     ldaw r11, cp[vpu_vects]
     { ldaw r11, cp[0] ; set cp, r11 }
     { ; stw r11, sp[STACK_CP] }
-
+    // Push the start addresses of Y, X and K onto the stack
     { ; stw r0, sp[STACK_Y] }
     { ; stw r1, sp[STACK_X] }
     { ldc r0, 32 ; stw r2, sp[STACK_K] }
     { shl r11, r0, 4 ; }
+    // Set the VPU mode to 8-bit
     { mov r11, r1 ; vsetc r11 }
+    // Clear vD:vR
     { ldaw r1, sp[STACK_VEC_TMP1] ; vclrdr }
+    // Store zeros to vec_tmp1
     { ; vstr r1[0] }
+    // Load biases into vD:vR
     { add r11, r3, r0 ; vldd r3[0] }
     { add r11, r11, r0 ; vldr r11[0] }
+    // Push BSO pointer onto stack
     { ; stw r11, sp[STACK_BSO] }
+    // r0 <-- K_h - pad_t - pad_b
+    // r0 here is the number of non-padding rows within the convolution window
     { ; ldw r0, sp[STACK_K_H] }
     { ; ldw r1, sp[STACK_PAD_T] }
     { sub r0, r0, r1 ; ldw r1, sp[STACK_PAD_B] }
     { sub r0, r0, r1 ; }
     { ; stw r0, sp[STACK_PATCH_ROWS] }
+
+    // r0 <-- [Hori stride] * [X channels]
+    // This is the number of bytes that the convolution window pointer gets incremented by for each *output column*.
     { ; ldw r0, sp[STACK_K_h_stride] }
     { ; ldw r1, sp[STACK_C_IN] }
     mul r0, r0, r1
     { shr r0, r1, 5 ; stw r0, sp[STACK_WIN_H_STRIDE] }
+    // [input channel groups] <-- [X channels] / 32
     { zext r1, 5 ; stw r0, sp[STACK_C_IN_GROUPS] }
+    // [input channel tail] <-- [X channels] % 32
     { ; stw r1, sp[STACK_C_IN_TAIL] }

     // To move N accumulators from the beginning of vD:vR to the end of vD:vR (which is where
@@ -219,9 +231,10 @@ FUNCTION_NAME:
     { add tmp, tmp, k_cout_stride ; vlmaccr tmp[0] }
     { add tmp, tmp, k_cout_stride ; vlmaccr tmp[0] }
     { add K, K, c_in_tail ; vlmaccr tmp[0] }
+    { shl c_out_tail, c_out_tail, 1 ; }
 .L_pad_t_tail_end:
-    { shl c_out_tail, c_out_tail, 1 ; bt cols_left, .L_pad_t_col_start }
+    { ; bt cols_left, .L_pad_t_col_start }
 .L_pad_t_col_end:
     { ; ldw tmp, sp[STACK_X_V_STRIDE] }
     { add X, X, tmp ; bt rows_left, .L_pad_t_row_start }
@@ -311,9 +324,10 @@ FUNCTION_NAME:
     { add tmp, tmp, k_cout_stride ; vlmaccr tmp[0] }
     { add tmp, tmp, k_cout_stride ; vlmaccr tmp[0] }
     { add K, K, c_in_tail ; vlmaccr tmp[0] }
+    { shl c_out_tail, c_out_tail, 1 ; }
 .L_pad_b_tail_end:
-    { shl c_out_tail, c_out_tail, 1 ; bt cols_left, .L_pad_b_col_start }
+    { ; bt cols_left, .L_pad_b_col_start }
 .L_pad_b_col_end:
     { ldaw Q(X), sp[STACK_VEC_ADJ_B_HI] ; bt rows_left, .L_pad_b_row_start }
 .L_pad_b_row_end:
@@ -455,9 +469,10 @@ FUNCTION_NAME:
     { add tmp, tmp, k_cout_stride ; vlmaccr tmp[0] }
     { add tmp, tmp, k_cout_stride ; vlmaccr tmp[0] }
     { add K, K, c_in_tail ; vlmaccr tmp[0] }
+    { shl c_out_tail, c_out_tail, 1 ; }
 .L_pad_l_tail_end:
-    { shl c_out_tail, c_out_tail, 1 ; bt cols_left, .L_pad_l_start }
+    { ; bt cols_left, .L_pad_l_start }
 .L_pad_l_end:
     { ; ldw cols_left, sp[STACK_CENTER_COLS] }
     { ldaw tmp, sp[STACK_VEC_TMP2] ; bf cols_left, .L_center_end }
@@ -528,9 +543,9 @@ FUNCTION_NAME:
     { add tmp, tmp, k_cout_stride ; vlmaccr tmp[0] }
     { add tmp, tmp, k_cout_stride ; vlmaccr tmp[0] }
     { add K, K, c_in_tail ; vlmaccr tmp[0] }
+    { shl c_out_tail, c_out_tail, 1 ; }
 .L_center_tail_end:
-    { shl c_out_tail, c_out_tail, 1 ; }
     { ldaw tmp, sp[STACK_VEC_TMP2] ; bt cols_left, .L_center_start }
 .L_center_end:
     { ; ldw cols_left, sp[STACK_PAD_R] }
@@ -601,9 +616,10 @@ FUNCTION_NAME:
     { add tmp, tmp, k_cout_stride ; vlmaccr tmp[0] }
     { add tmp, tmp, k_cout_stride ; vlmaccr tmp[0] }
     { add K, K, c_in_tail ; vlmaccr tmp[0] }
+    { shl c_out_tail, c_out_tail, 1 ; }
 .L_pad_r_tail_end:
-    { shl c_out_tail, c_out_tail, 1 ; bt cols_left, .L_pad_r_start }
+    { ; bt cols_left, .L_pad_r_start }
 .L_pad_r_end:
     // // Added to X back up at the start of the row
     { sub rows_left, rows_left, 1 ; ldw tmp, sp[STACK_X_V_STRIDE] }
diff --git a/test/unit_test/src/adv/deep/test_nn_conv2d_hstrip_tail_deep.c b/test/unit_test/src/adv/deep/test_nn_conv2d_hstrip_tail_deep.c
index 72c44683..3e344421 100644
--- a/test/unit_test/src/adv/deep/test_nn_conv2d_hstrip_tail_deep.c
+++ b/test/unit_test/src/adv/deep/test_nn_conv2d_hstrip_tail_deep.c
@@ -482,6 +482,90 @@ void test_nn_conv2d_hstrip_tail_deep_case3()



+
+
+
+
+
+#define CHANS_IN  (32)
+#define CHANS_OUT (28)
+#define X_HEIGHT  (8)
+#define X_WIDTH   (7)
+#define Y_HEIGHT  (1)
+#define Y_WIDTH   (1)
+#define K_h       (8)
+#define K_w       (7)
+#define K_hstride (1)
+void test_nn_conv2d_hstrip_tail_deep_case4()
+{
+    PRINTF("%s...\n", __func__);
+
+    struct {
+        int32_t bias[CHANS_OUT];
+        int16_t shift1[CHANS_OUT];
+        int16_t scale[CHANS_OUT];
+        int16_t offset_scale[CHANS_OUT];
+        int16_t offset[CHANS_OUT];
+        int16_t shift2[CHANS_OUT];
+    } BSO;
+
+    nn_image_t WORD_ALIGNED X[X_HEIGHT][X_WIDTH][CHANS_IN];
+    nn_tensor_t WORD_ALIGNED K[CHANS_OUT][K_h][K_w][CHANS_IN];
+    nn_bso_block_t bso[BSO_BLOCK_COUNT(CHANS_OUT)];
+    nn_image_t WORD_ALIGNED Y[Y_HEIGHT][Y_WIDTH][CHANS_OUT];
+
+    nn_image_params_t x_params = { X_HEIGHT, X_WIDTH, CHANS_IN };
+    nn_image_params_t y_params = { Y_HEIGHT, Y_WIDTH, CHANS_OUT };
+
+    memset(X, 1, sizeof(X));
+    memset(K, 1, sizeof(K));
+
+    for(int k = 0; k < y_params.channels; k++){
+        BSO.bias[k] = 32;
+        BSO.shift1[k] = 0;
+        BSO.scale[k] = 1;
+        BSO.offset_scale[k] = 0;
+        BSO.offset[k] = 0;
+        BSO.shift2[k] = 5;
+    }
+
+    nn_standard_BSO_layout(bso, (int32_t*) &BSO.bias, (int16_t*) &BSO.shift1,
+                           (int16_t*) &BSO.scale, (int16_t*) &BSO.offset_scale,
+                           (int16_t*) &BSO.offset, (int16_t*) &BSO.shift2, NULL,
+                           y_params.channels);
+
+    mem_stride_t k_cout_stride = -K_h*K_w*x_params.channels;
+    nn_tensor_t* K_init = &K[CHANS_OUT-1][0][0][0];
+
+    memset(Y, 0xCC, sizeof(Y));
+    nn_conv2d_hstrip_tail_deep( &Y[0][0][16], (nn_image_t*) X, K_init,
+        (nn_bso_block_t*) &bso, K_h, K_w, K_hstride, x_params.channels,
+        (x_params.width-K_w)*x_params.channels, k_cout_stride,
+        y_params.channels, Y_WIDTH, y_params.channels % 16);
+
+    for(unsigned row = 0; row < y_params.height; row++){
+        for(unsigned col = 0; col < y_params.width; col++){
+            for(unsigned chn = 16; chn < y_params.channels; chn++){
+
+                int8_t y_exp = 57;
+
+                TEST_ASSERT_EQUAL(y_exp, Y[row][col][chn]);
+            }
+        }
+    }
+}
+#undef CHANS_IN
+#undef CHANS_OUT
+#undef X_HEIGHT
+#undef X_WIDTH
+#undef Y_HEIGHT
+#undef Y_WIDTH
+#undef K_h
+#undef K_w
+#undef K_hstride
+
+
+

 void test_nn_conv2d_hstrip_tail_deep()
 {
     UNITY_SET_FILE();
@@ -490,4 +574,5 @@ void test_nn_conv2d_hstrip_tail_deep()
     RUN_TEST(test_nn_conv2d_hstrip_tail_deep_case1);
     RUN_TEST(test_nn_conv2d_hstrip_tail_deep_case2);
     RUN_TEST(test_nn_conv2d_hstrip_tail_deep_case3);
-}
\ No newline at end of file
+    RUN_TEST(test_nn_conv2d_hstrip_tail_deep_case4);
+}
diff --git a/test/unit_test/src/adv/deep/test_nn_conv2d_hstrip_tail_deep_padded.c b/test/unit_test/src/adv/deep/test_nn_conv2d_hstrip_tail_deep_padded.c
index aa7d334c..de2dee90 100644
--- a/test/unit_test/src/adv/deep/test_nn_conv2d_hstrip_tail_deep_padded.c
+++ b/test/unit_test/src/adv/deep/test_nn_conv2d_hstrip_tail_deep_padded.c
@@ -1077,6 +1077,101 @@ void test_nn_conv2d_hstrip_tail_deep_padded_case6()



+
+
+
+#define CHANS_IN  (32)
+#define CHANS_OUT (28)
+#define X_HEIGHT  (8)
+#define X_WIDTH   (7)
+#define Y_HEIGHT  (1)
+#define Y_WIDTH   (1)
+#define K_h       (9)
+#define K_w       (7)
+#define K_hstride (2)
+void test_nn_conv2d_hstrip_tail_deep_padded_case7()
+{
+    PRINTF("%s...\n", __func__);
+
+    struct {
+        int32_t bias[CHANS_OUT];
+        int16_t shift1[CHANS_OUT];
+        int16_t scale[CHANS_OUT];
+        int16_t offset_scale[CHANS_OUT];
+        int16_t offset[CHANS_OUT];
+        int16_t shift2[CHANS_OUT];
+    } BSO;
+
+    nn_image_t WORD_ALIGNED X[X_HEIGHT][X_WIDTH][CHANS_IN];
+    nn_tensor_t WORD_ALIGNED K[CHANS_OUT][K_h][K_w][CHANS_IN];
+    nn_bso_block_t bso[BSO_BLOCK_COUNT(CHANS_OUT)];
+    nn_image_t WORD_ALIGNED Y[Y_HEIGHT][Y_WIDTH][CHANS_OUT];
+
+    nn_image_params_t x_params = { X_HEIGHT, X_WIDTH, CHANS_IN };
+    nn_image_params_t y_params = { Y_HEIGHT, Y_WIDTH, CHANS_OUT };
+
+    int8_t zero_point_vec[VPU_INT8_EPV];
+
+    memset(zero_point_vec, 0, sizeof(zero_point_vec));
+    memset(X, 1, sizeof(X));
+    memset(K, 1, sizeof(K));
+
+    for(int k = 0; k < y_params.channels; k++){
+        BSO.bias[k] = 32;
+        BSO.shift1[k] = 0;
+        BSO.scale[k] = 1;
+        BSO.offset_scale[k] = 0;
+        BSO.offset[k] = 0;
+        BSO.shift2[k] = 5;
+    }
+
+    nn_standard_BSO_layout(bso, (int32_t*) &BSO.bias, (int16_t*) &BSO.shift1,
+                           (int16_t*) &BSO.scale, (int16_t*) &BSO.offset_scale,
+                           (int16_t*) &BSO.offset, (int16_t*) &BSO.shift2, NULL, y_params.channels);
+
+
+    int pad_t = 0;
+    int pad_b = 1;
+    int pad_l = 1;  //signed
+    int pad_r = -1; //signed
+
+    nn_image_t* X_patch_start = &X[-pad_t][-pad_l][0];
+
+    mem_stride_t k_cout_stride = -K_h*K_w*x_params.channels;
+    nn_tensor_t* K_init = &K[CHANS_OUT-1][0][0][0];
+
+
+    memset(Y, 0xCC, sizeof(Y));
+    nn_conv2d_hstrip_tail_deep_padded(&Y[0][0][16], X_patch_start, K_init,
+        (nn_bso_block_t*) &bso, K_h, K_w, K_hstride, x_params.channels,
+        pad_t, pad_b, pad_l, pad_r, (x_params.width-K_w)*x_params.channels,
+        k_cout_stride, y_params.channels, Y_WIDTH, zero_point_vec, y_params.channels % 16);
+
+
+    for(unsigned row = 0; row < y_params.height; row++){
+        for(unsigned col = 0; col < y_params.width; col++){
+            for(unsigned chn = 16; chn < y_params.channels; chn++){
+
+                int8_t y_exp = 49;
+
+                TEST_ASSERT_EQUAL(y_exp, Y[row][col][chn]);
+            }
+        }
+    }
+}
+#undef CHANS_IN
+#undef CHANS_OUT
+#undef X_HEIGHT
+#undef X_WIDTH
+#undef Y_HEIGHT
+#undef Y_WIDTH
+#undef K_h
+#undef K_w
+#undef K_hstride
+
+
+
+

 void test_nn_conv2d_hstrip_tail_deep_padded()
 {
     UNITY_SET_FILE();
@@ -1088,4 +1183,5 @@ void test_nn_conv2d_hstrip_tail_deep_padded()
     RUN_TEST(test_nn_conv2d_hstrip_tail_deep_padded_case4);
     RUN_TEST(test_nn_conv2d_hstrip_tail_deep_padded_case5);
     RUN_TEST(test_nn_conv2d_hstrip_tail_deep_padded_case6);
-}
\ No newline at end of file
+    RUN_TEST(test_nn_conv2d_hstrip_tail_deep_padded_case7);
+}
diff --git a/test/unit_test/src/test_conv2d_deep.c b/test/unit_test/src/test_conv2d_deep.c
index 4f830523..b305a3c2 100644
--- a/test/unit_test/src/test_conv2d_deep.c
+++ b/test/unit_test/src/test_conv2d_deep.c
@@ -1872,6 +1872,99 @@ void test_conv2d_deep_case18()



+///////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
+#define CHANS_IN    ( 32 )
+#define CHANS_OUT   ( 28 )
+#define K_H         ( 9 )
+#define K_W         ( 7 )
+#define X_HEIGHT    ( 8 )
+#define X_WIDTH     ( 7 )
+#define Y_HEIGHT    ( 1 )
+#define Y_WIDTH     ( 1 )
+#define K_V_STRIDE  ( 2 )
+#define K_H_STRIDE  ( 2 )
+#define ZERO_POINT  ( 0 )
+void test_conv2d_deep_case19()
+{
+
+    nn_tensor_t WORD_ALIGNED K[CHANS_OUT][K_H][K_W][CHANS_IN];
+    nn_image_t WORD_ALIGNED X[X_HEIGHT][X_WIDTH][CHANS_IN];
+    nn_image_t WORD_ALIGNED Y[Y_HEIGHT][Y_WIDTH][CHANS_OUT];
+
+    struct {
+        int32_t bias[CHANS_OUT];
+        int16_t shift1[CHANS_OUT];
+        int16_t scale[CHANS_OUT];
+        int16_t offset_scale[CHANS_OUT];
+        int16_t offset[CHANS_OUT];
+        int16_t shift2[CHANS_OUT];
+    } BSO;
+
+    nn_bso_block_t bso[BSO_BLOCK_COUNT(CHANS_OUT)];
+
+    PRINTF("%s...\n", __func__);
+
+    nn_window_params_t conv2d_window = { { K_H, K_W }, { 0, -1 }, { K_V_STRIDE, K_H_STRIDE } };
+
+    nn_image_params_t x_params = { X_HEIGHT, X_WIDTH, CHANS_IN };
+    nn_image_params_t y_params = { Y_HEIGHT, Y_WIDTH, CHANS_OUT };
+
+    memset(X, 1, sizeof(X));
+    memset(K, 1, sizeof(K));
+
+    // The single output position covers rows 0..8 and cols -1..5 of X, so only
+    // 8 non-padding rows x 6 non-padding cols contribute (the zero-point is 0).
+    // Expected 32-bit accumulator: 8 * 6 * 32 + bias = 1536 + bias
+
+    for(int k = 0; k < CHANS_OUT; k++){
+        BSO.bias[k] = 32;   // 32-bit acc = 1536 + 32 = 1568
+        BSO.shift1[k] = 0;
+        BSO.scale[k] = 1;
+        BSO.offset_scale[k] = 0;
+        BSO.offset[k] = 0;
+        BSO.shift2[k] = 5;  // 1568 >> shift2 = 49
+    }
+
+    nn_standard_BSO_layout(bso, (int32_t*) &BSO.bias, (int16_t*) &BSO.shift1,
+        (int16_t*) &BSO.scale, (int16_t*) &BSO.offset_scale, (int16_t*) &BSO.offset,
+        (int16_t*) &BSO.shift2, NULL, y_params.channels);
+
+    nn_window_op_job_params_t job_params = {{0,0,0},{Y_HEIGHT, Y_WIDTH, CHANS_OUT}};
+
+    memset(Y, 0xCC, sizeof(Y));
+
+    conv2d_deep_ext((nn_image_t*) Y, (nn_image_t*) X, (nn_tensor_t*) K, bso, ZERO_POINT,
+        &x_params, &y_params, &conv2d_window, &job_params, 0);
+
+
+    for(int row = 0; row < y_params.height; row++){
+        for(int col = 0; col < y_params.width; col++){
+            for(int cout = 0; cout < y_params.channels; cout++){
+
+                int8_t y_exp = 49;
+                check_Y(y_exp, (nn_image_t*) Y, &y_params, row, col, cout, __LINE__);
+            }
+        }
+    }
+}
+#undef CHANS_IN
+#undef CHANS_OUT
+#undef K_H
+#undef K_W
+#undef X_HEIGHT
+#undef X_WIDTH
+#undef Y_HEIGHT
+#undef Y_WIDTH
+#undef K_V_STRIDE
+#undef K_H_STRIDE
+#undef ZERO_POINT
+
+
+
+

 void test_conv2d_deep()
 {
@@ -1896,4 +1989,5 @@ void test_conv2d_deep()
     RUN_TEST(test_conv2d_deep_case16);
     RUN_TEST(test_conv2d_deep_case17);
     RUN_TEST(test_conv2d_deep_case18);
+    RUN_TEST(test_conv2d_deep_case19);
 }
\ No newline at end of file
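
Note (not part of the patch): the expected values 57 and 49 asserted by the new tests follow directly from the BSO parameters they set up. Below is a minimal C sketch of that arithmetic, assuming the simple per-channel post-processing order the assembly comments describe (shift1, scale, offset*offset_scale, shift2). It is an illustrative model, not the library implementation: the function name expected_output and its parameters are hypothetical, and the VPU's rounding and symmetric-saturation details are ignored, which is harmless here because shift1 is 0 and the pre-shift2 values are exact multiples of 32.

#include <stdint.h>
#include <stdio.h>

// Model of the per-channel output computation used by the new test cases,
// where every byte of X and K is memset to 1, so each window element in the
// image contributes 1*1 to the 32-bit accumulator.
static int8_t expected_output(int valid_rows, int valid_cols, int chans_in,
                              int32_t bias, int shift1, int16_t scale,
                              int16_t offset_scale, int16_t offset, int shift2)
{
    int32_t acc = (int32_t)valid_rows * valid_cols * chans_in + bias;
    int32_t v = (acc >> shift1) * scale + (int32_t)offset_scale * offset;
    v >>= shift2;
    if (v >  127) v =  127;   // crude int8 saturation; not exercised by these tests
    if (v < -128) v = -128;
    return (int8_t)v;
}

int main(void)
{
    // test_nn_conv2d_hstrip_tail_deep_case4: full 8x7 window over 32 input channels
    printf("%d\n", expected_output(8, 7, 32, 32, 0, 1, 0, 0, 5));  // prints 57
    // test_nn_conv2d_hstrip_tail_deep_padded_case7 and test_conv2d_deep_case19:
    // only 8 rows x 6 cols of the 9x7 window lie inside X, and the zero-point is 0
    printf("%d\n", expected_output(8, 6, 32, 32, 0, 1, 0, 0, 5));  // prints 49
    return 0;
}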