diff --git a/lib_nn/src/asm/util/deep/nn_conv2d_hstrip_deep_padded.S b/lib_nn/src/asm/util/deep/nn_conv2d_hstrip_deep_padded.S
index e458b26b..87059b45 100644
--- a/lib_nn/src/asm/util/deep/nn_conv2d_hstrip_deep_padded.S
+++ b/lib_nn/src/asm/util/deep/nn_conv2d_hstrip_deep_padded.S
@@ -589,25 +589,48 @@ FUNCTION_NAME:
 #else
     { shl r11, _32, 3 ; ldw tmp, sp[STACK_BSO] }
+    // Change to 16-bit mode
     { ldaw r11, sp[STACK_VEC_TMP2] ; vsetc r11 }
+    // Apply shift1 to the accumulators
     { add tmp, tmp, _32 ; vlsat tmp[0] }
+    // Store the 16-bit post-shift1 values to vec_tmp2
     { ; vstr r11[0] }
+    // Load scale into vC
     { add tmp, tmp, _32 ; vldc tmp[0] }
+    // Clear the accumulators
     { ; vclrdr }
+    // Apply scale to the 16-bit post-shift1 values
     { ; vlmacc r11[0] }
+    // Load offset_scale into vC
     { add tmp, tmp, _32 ; vldc tmp[0] }
+    // Add offset*offset_scale to the accumulators
     { add tmp, tmp, _32 ; vlmacc tmp[0] }
-
+    // Apply shift2 to the accumulators
     { ; vlsat tmp[0] }
+    // Store the 16-bit post-shift2 values into vec_tmp2.
+    // For values other than -128, this will be the final value, except it's still encoded as
+    // 16 bits. We'll reduce the bit-depth to 8 bits later.
     { ldaw r11, cp[VPU_VEC_0x007F] ; vstr r11[0] }
+    // Add 127 to vR
     { ldaw r11, sp[STACK_TMP] ; vladd r11[0] }
+    // Any elements of vR that were -128 before the previous instruction are now -1,
+    // and all other elements are non-negative. The vdepth1 creates a 16-bit bitmask in
+    // vR[0] with a 1 for any element that was -128. (The output Y[] has already had
+    // -128s written to all the elements, so this mask just says which elements not to
+    // overwrite.)
     { mkmsk Q(rows_left), 4 ; vdepth1 }
+    // Store the bitmask in tmp (note: stored as a 32-bit word, with the top 16 bits all being 0)
     vstrpv r11[0], Q(rows_left)
     { ldc Q(rows_left), 0 ; ldw Q(cig_left), sp[STACK_Y] }
     { ldaw r11, sp[STACK_VEC_TMP2] ; sub Q(rows_left), Q(rows_left), 8 }
+    // Reload the 16-bit post-shift2 values, left-shifting them all 8 bits (i.e. multiply by 256)
     vlashr r11[0], Q(rows_left)
+    // Load the bitmask into a register
     { mkmsk r11, 16 ; ldw Q(rows_left), sp[STACK_TMP] }
+    // r11 <-- 0xFFFF & ~(bitmask)
+    // Reduce bit-depth of elements to 8 bits (i.e. divide by 256)
     { andnot r11, Q(rows_left) ; vdepth8 }
+    // Store final results in Y[]
     vstrpv Q(cig_left)[0], r11
     { shl r11, _32, 4 ; ldw tmp, sp[STACK_Y_H_STRIDE] }
     { add Q(cig_left), Q(cig_left), tmp ; vsetc r11 }
diff --git a/lib_nn/src/asm/util/deep/nn_conv2d_hstrip_tail_deep.S b/lib_nn/src/asm/util/deep/nn_conv2d_hstrip_tail_deep.S
index 79e9ebaf..835cca52 100644
--- a/lib_nn/src/asm/util/deep/nn_conv2d_hstrip_tail_deep.S
+++ b/lib_nn/src/asm/util/deep/nn_conv2d_hstrip_tail_deep.S
@@ -217,9 +217,9 @@ FUNCTION_NAME:
     { add tmp, tmp, k_cout_stride ; vlmaccr tmp[0] }
     { add tmp, tmp, k_cout_stride ; vlmaccr tmp[0] }
     { add K, K, c_in_tail ; vlmaccr tmp[0] }
+    { shl c_out_tail, c_out_tail, 1 ; }
 .L_center_tail_end:
-    { shl c_out_tail, c_out_tail, 1 ; }
     { ldaw tmp, sp[STACK_VEC_TMP2] ; bt cols_left, .L_center_start }
 .L_center_end:
     // // Added to X back up at the start of the row
diff --git a/lib_nn/src/asm/util/deep/nn_conv2d_hstrip_tail_deep_padded.S b/lib_nn/src/asm/util/deep/nn_conv2d_hstrip_tail_deep_padded.S
index 0cd8640d..44043543 100644
--- a/lib_nn/src/asm/util/deep/nn_conv2d_hstrip_tail_deep_padded.S
+++ b/lib_nn/src/asm/util/deep/nn_conv2d_hstrip_tail_deep_padded.S
@@ -90,28 +90,40 @@ FUNCTION_NAME:
     ldaw r11, cp[vpu_vects]
     { ldaw r11, cp[0] ; set cp, r11 }
     { ; stw r11, sp[STACK_CP] }
-
+    // Push the start addresses of Y, X and K onto the stack
     { ; stw r0, sp[STACK_Y] }
     { ; stw r1, sp[STACK_X] }
     { ldc r0, 32 ; stw r2, sp[STACK_K] }
     { shl r11, r0, 4 ; }
+    // Set the VPU mode to 8-bit
     { mov r11, r1 ; vsetc r11 }
+    // Clear vD:vR
     { ldaw r1, sp[STACK_VEC_TMP1] ; vclrdr }
+    // Store zeros to vec_tmp1
     { ; vstr r1[0] }
+    // Load biases into vD:vR
     { add r11, r3, r0 ; vldd r3[0] }
     { add r11, r11, r0 ; vldr r11[0] }
+    // Push BSO pointer onto stack
     { ; stw r11, sp[STACK_BSO] }
+    // r0 <-- K_h - pad_t - pad_b
+    // r0 here is the number of non-padding rows within the convolution window
     { ; ldw r0, sp[STACK_K_H] }
     { ; ldw r1, sp[STACK_PAD_T] }
     { sub r0, r0, r1 ; ldw r1, sp[STACK_PAD_B] }
     { sub r0, r0, r1 ; }
     { ; stw r0, sp[STACK_PATCH_ROWS] }
+
+    // r0 <-- [Hori stride] * [X channels]
+    // This is the number of bytes that the convolution window pointer gets incremented by for each *output column*.
     { ; ldw r0, sp[STACK_K_h_stride] }
     { ; ldw r1, sp[STACK_C_IN] }
     mul r0, r0, r1
     { shr r0, r1, 5 ; stw r0, sp[STACK_WIN_H_STRIDE] }
+    // [input channel groups] <-- [X channels] / 32
     { zext r1, 5 ; stw r0, sp[STACK_C_IN_GROUPS] }
+    // [input channel tail] <-- [X channels] % 32
     { ; stw r1, sp[STACK_C_IN_TAIL] }

     // To move N accumulators from the beginning of vD:vR to the end of vD:vR (which is where
@@ -219,9 +231,10 @@ FUNCTION_NAME:
     { add tmp, tmp, k_cout_stride ; vlmaccr tmp[0] }
     { add tmp, tmp, k_cout_stride ; vlmaccr tmp[0] }
     { add K, K, c_in_tail ; vlmaccr tmp[0] }
+    { shl c_out_tail, c_out_tail, 1 ; }
 .L_pad_t_tail_end:
-    { shl c_out_tail, c_out_tail, 1 ; bt cols_left, .L_pad_t_col_start }
+    { ; bt cols_left, .L_pad_t_col_start }
 .L_pad_t_col_end:
     { ; ldw tmp, sp[STACK_X_V_STRIDE] }
     { add X, X, tmp ; bt rows_left, .L_pad_t_row_start }
@@ -311,9 +324,10 @@ FUNCTION_NAME:
     { add tmp, tmp, k_cout_stride ; vlmaccr tmp[0] }
     { add tmp, tmp, k_cout_stride ; vlmaccr tmp[0] }
     { add K, K, c_in_tail ; vlmaccr tmp[0] }
+    { shl c_out_tail, c_out_tail, 1 ; }
 .L_pad_b_tail_end:
-    { shl c_out_tail, c_out_tail, 1 ; bt cols_left, .L_pad_b_col_start }
+    { ; bt cols_left, .L_pad_b_col_start }
 .L_pad_b_col_end:
     { ldaw Q(X), sp[STACK_VEC_ADJ_B_HI] ; bt rows_left, .L_pad_b_row_start }
 .L_pad_b_row_end:
@@ -455,9 +469,10 @@ FUNCTION_NAME:
     { add tmp, tmp, k_cout_stride ; vlmaccr tmp[0] }
     { add tmp, tmp, k_cout_stride ; vlmaccr tmp[0] }
     { add K, K, c_in_tail ; vlmaccr tmp[0] }
+    { shl c_out_tail, c_out_tail, 1 ; }
 .L_pad_l_tail_end:
-    { shl c_out_tail, c_out_tail, 1 ; bt cols_left, .L_pad_l_start }
+    { ; bt cols_left, .L_pad_l_start }
 .L_pad_l_end:
     { ; ldw cols_left, sp[STACK_CENTER_COLS] }
     { ldaw tmp, sp[STACK_VEC_TMP2] ; bf cols_left, .L_center_end }
@@ -528,9 +543,9 @@ FUNCTION_NAME:
     { add tmp, tmp, k_cout_stride ; vlmaccr tmp[0] }
     { add tmp, tmp, k_cout_stride ; vlmaccr tmp[0] }
     { add K, K, c_in_tail ; vlmaccr tmp[0] }
+    { shl c_out_tail, c_out_tail, 1 ; }
 .L_center_tail_end:
-    { shl c_out_tail, c_out_tail, 1 ; }
     { ldaw tmp, sp[STACK_VEC_TMP2] ; bt cols_left, .L_center_start }
 .L_center_end:
     { ; ldw cols_left, sp[STACK_PAD_R] }
@@ -601,9 +616,10 @@ FUNCTION_NAME:
     { add tmp, tmp, k_cout_stride ; vlmaccr tmp[0] }
     { add tmp, tmp, k_cout_stride ; vlmaccr tmp[0] }
     { add K, K, c_in_tail ; vlmaccr tmp[0] }
+    { shl c_out_tail, c_out_tail, 1 ; }
 .L_pad_r_tail_end:
-    { shl c_out_tail, c_out_tail, 1 ; bt cols_left, .L_pad_r_start }
+    { ; bt cols_left, .L_pad_r_start }
 .L_pad_r_end:
     // // Added to X back up at the start of the row
     { sub rows_left, rows_left, 1 ; ldw tmp, sp[STACK_X_V_STRIDE] }
diff --git a/test/unit_test/src/adv/deep/test_nn_conv2d_hstrip_tail_deep.c b/test/unit_test/src/adv/deep/test_nn_conv2d_hstrip_tail_deep.c
index 72c44683..3e344421 100644
--- a/test/unit_test/src/adv/deep/test_nn_conv2d_hstrip_tail_deep.c
+++ b/test/unit_test/src/adv/deep/test_nn_conv2d_hstrip_tail_deep.c
@@ -482,6 +482,90 @@ void test_nn_conv2d_hstrip_tail_deep_case3()



+
+
+
+
+
+#define CHANS_IN  (32)
+#define CHANS_OUT (28)
+#define X_HEIGHT  (8)
+#define X_WIDTH   (7)
+#define Y_HEIGHT  (1)
+#define Y_WIDTH   (1)
+#define K_h       (8)
+#define K_w       (7)
+#define K_hstride (1)
+void test_nn_conv2d_hstrip_tail_deep_case4()
+{
+    PRINTF("%s...\n", __func__);
+
+    struct {
+        int32_t bias[CHANS_OUT];
+        int16_t shift1[CHANS_OUT];
+        int16_t scale[CHANS_OUT];
+        int16_t offset_scale[CHANS_OUT];
+        int16_t offset[CHANS_OUT];
+        int16_t shift2[CHANS_OUT];
+    } BSO;
+
+    nn_image_t WORD_ALIGNED X[X_HEIGHT][X_WIDTH][CHANS_IN];
+    nn_tensor_t WORD_ALIGNED K[CHANS_OUT][K_h][K_w][CHANS_IN];
+    nn_bso_block_t bso[BSO_BLOCK_COUNT(CHANS_OUT)];
+    nn_image_t WORD_ALIGNED Y[Y_HEIGHT][Y_WIDTH][CHANS_OUT];
+
+    nn_image_params_t x_params = { X_HEIGHT, X_WIDTH, CHANS_IN };
+    nn_image_params_t y_params = { Y_HEIGHT, Y_WIDTH, CHANS_OUT };
+
+    memset(X, 1, sizeof(X));
+    memset(K, 1, sizeof(K));
+
+    for(int k = 0; k < y_params.channels; k++){
+        BSO.bias[k] = 32;
+        BSO.shift1[k] = 0;
+        BSO.scale[k] = 1;
+        BSO.offset_scale[k] = 0;
+        BSO.offset[k] = 0;
+        BSO.shift2[k] = 5;
+    }
+
+    nn_standard_BSO_layout(bso, (int32_t*) &BSO.bias, (int16_t*) &BSO.shift1,
+                           (int16_t*) &BSO.scale, (int16_t*) &BSO.offset_scale,
+                           (int16_t*) &BSO.offset, (int16_t*) &BSO.shift2, NULL,
+                           y_params.channels);
+
+    mem_stride_t k_cout_stride = -K_h*K_w*x_params.channels;
+    nn_tensor_t* K_init = &K[CHANS_OUT-1][0][0][0];
+
+    memset(Y, 0xCC, sizeof(Y));
+    nn_conv2d_hstrip_tail_deep( &Y[0][0][16], (nn_image_t*) X, K_init,
+        (nn_bso_block_t*) &bso, K_h, K_w, K_hstride, x_params.channels,
+        (x_params.width-K_w)*x_params.channels, k_cout_stride,
+        y_params.channels, Y_WIDTH, y_params.channels % 16);
+
+    for(unsigned row = 0; row < y_params.height; row++){
+        for(unsigned col = 0; col < y_params.width; col++){
+            for(unsigned chn = 16; chn < y_params.channels; chn++){
+
+                int8_t y_exp = 57;
+
+                TEST_ASSERT_EQUAL(y_exp, Y[row][col][chn]);
+            }
+        }
+    }
+}
+#undef CHANS_IN
+#undef CHANS_OUT
+#undef X_HEIGHT
+#undef X_WIDTH
+#undef Y_HEIGHT
+#undef Y_WIDTH
+#undef K_h
+#undef K_w
+#undef K_hstride
+
+
+

 void test_nn_conv2d_hstrip_tail_deep()
 {
     UNITY_SET_FILE();
@@ -490,4 +574,5 @@ void test_nn_conv2d_hstrip_tail_deep()
     RUN_TEST(test_nn_conv2d_hstrip_tail_deep_case1);
     RUN_TEST(test_nn_conv2d_hstrip_tail_deep_case2);
     RUN_TEST(test_nn_conv2d_hstrip_tail_deep_case3);
-}
\ No newline at end of file
+    RUN_TEST(test_nn_conv2d_hstrip_tail_deep_case4);
+}
diff --git a/test/unit_test/src/adv/deep/test_nn_conv2d_hstrip_tail_deep_padded.c b/test/unit_test/src/adv/deep/test_nn_conv2d_hstrip_tail_deep_padded.c
index aa7d334c..de2dee90 100644
--- a/test/unit_test/src/adv/deep/test_nn_conv2d_hstrip_tail_deep_padded.c
+++ b/test/unit_test/src/adv/deep/test_nn_conv2d_hstrip_tail_deep_padded.c
@@ -1077,6 +1077,101 @@ void test_nn_conv2d_hstrip_tail_deep_padded_case6()



+
+
+
+#define CHANS_IN  (32)
+#define CHANS_OUT (28)
+#define X_HEIGHT  (8)
+#define X_WIDTH   (7)
+#define Y_HEIGHT  (1)
+#define Y_WIDTH   (1)
+#define K_h       (9)
+#define K_w       (7)
+#define K_hstride (2)
+void test_nn_conv2d_hstrip_tail_deep_padded_case7()
+{
+    PRINTF("%s...\n", __func__);
+
+    struct {
+        int32_t bias[CHANS_OUT];
+        int16_t shift1[CHANS_OUT];
+        int16_t scale[CHANS_OUT];
+        int16_t offset_scale[CHANS_OUT];
+        int16_t offset[CHANS_OUT];
+        int16_t shift2[CHANS_OUT];
+    } BSO;
+
+    nn_image_t WORD_ALIGNED X[X_HEIGHT][X_WIDTH][CHANS_IN];
+    nn_tensor_t WORD_ALIGNED K[CHANS_OUT][K_h][K_w][CHANS_IN];
+    nn_bso_block_t bso[BSO_BLOCK_COUNT(CHANS_OUT)];
+    nn_image_t WORD_ALIGNED Y[Y_HEIGHT][Y_WIDTH][CHANS_OUT];
+
+    nn_image_params_t x_params = { X_HEIGHT, X_WIDTH, CHANS_IN };
+    nn_image_params_t y_params = { Y_HEIGHT, Y_WIDTH, CHANS_OUT };
+
+    int8_t zero_point_vec[VPU_INT8_EPV];
+
+    memset(zero_point_vec, 0, sizeof(zero_point_vec));
+    memset(X, 1, sizeof(X));
+    memset(K, 1, sizeof(K));
+
+    for(int k = 0; k < y_params.channels; k++){
+        BSO.bias[k] = 32;
+        BSO.shift1[k] = 0;
+        BSO.scale[k] = 1;
+        BSO.offset_scale[k] = 0;
+        BSO.offset[k] = 0;
+        BSO.shift2[k] = 5;
+    }
+
+    nn_standard_BSO_layout(bso, (int32_t*) &BSO.bias, (int16_t*) &BSO.shift1,
+                           (int16_t*) &BSO.scale, (int16_t*) &BSO.offset_scale,
+                           (int16_t*) &BSO.offset, (int16_t*) &BSO.shift2, NULL, y_params.channels);
+
+
+    int pad_t = 0;
+    int pad_b = 1;
+    int pad_l = 1;  //signed
+    int pad_r = -1; //signed
+
+    nn_image_t* X_patch_start = &X[-pad_t][-pad_l][0];
+
+    mem_stride_t k_cout_stride = -K_h*K_w*x_params.channels;
+    nn_tensor_t* K_init = &K[CHANS_OUT-1][0][0][0];
+
+
+    memset(Y, 0xCC, sizeof(Y));
+    nn_conv2d_hstrip_tail_deep_padded(&Y[0][0][16], X_patch_start, K_init,
+        (nn_bso_block_t*) &bso, K_h, K_w, K_hstride, x_params.channels,
+        pad_t, pad_b, pad_l, pad_r, (x_params.width-K_w)*x_params.channels,
+        k_cout_stride, y_params.channels, Y_WIDTH, zero_point_vec, y_params.channels % 16);
+
+
+    for(unsigned row = 0; row < y_params.height; row++){
+        for(unsigned col = 0; col < y_params.width; col++){
+            for(unsigned chn = 16; chn < y_params.channels; chn++){
+
+                int8_t y_exp = 49;
+
+                TEST_ASSERT_EQUAL(y_exp, Y[row][col][chn]);
+            }
+        }
+    }
+}
+#undef CHANS_IN
+#undef CHANS_OUT
+#undef X_HEIGHT
+#undef X_WIDTH
+#undef Y_HEIGHT
+#undef Y_WIDTH
+#undef K_h
+#undef K_w
+#undef K_hstride
+
+
+
+

 void test_nn_conv2d_hstrip_tail_deep_padded()
 {
     UNITY_SET_FILE();
@@ -1088,4 +1183,5 @@ void test_nn_conv2d_hstrip_tail_deep_padded()
     RUN_TEST(test_nn_conv2d_hstrip_tail_deep_padded_case4);
     RUN_TEST(test_nn_conv2d_hstrip_tail_deep_padded_case5);
     RUN_TEST(test_nn_conv2d_hstrip_tail_deep_padded_case6);
-}
\ No newline at end of file
+    RUN_TEST(test_nn_conv2d_hstrip_tail_deep_padded_case7);
+}
diff --git a/test/unit_test/src/test_conv2d_deep.c b/test/unit_test/src/test_conv2d_deep.c
index 4f830523..b305a3c2 100644
--- a/test/unit_test/src/test_conv2d_deep.c
+++ b/test/unit_test/src/test_conv2d_deep.c
@@ -1872,6 +1872,99 @@ void test_conv2d_deep_case18()



+///////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
+#define CHANS_IN    ( 32 )
+#define CHANS_OUT   ( 28 )
+#define K_H         ( 9 )
+#define K_W         ( 7 )
+#define X_HEIGHT    ( 8 )
+#define X_WIDTH     ( 7 )
+#define Y_HEIGHT    ( 1 )
+#define Y_WIDTH     ( 1 )
+#define K_V_STRIDE  ( 2 )
+#define K_H_STRIDE  ( 2 )
+#define ZERO_POINT  ( 0 )
+void test_conv2d_deep_case19()
+{
+
+    nn_tensor_t WORD_ALIGNED K[CHANS_OUT][K_H][K_W][CHANS_IN];
+    nn_image_t WORD_ALIGNED X[X_HEIGHT][X_WIDTH][CHANS_IN];
+    nn_image_t WORD_ALIGNED Y[Y_HEIGHT][Y_WIDTH][CHANS_OUT];
+
+    struct {
+        int32_t bias[CHANS_OUT];
+        int16_t shift1[CHANS_OUT];
+        int16_t scale[CHANS_OUT];
+        int16_t offset_scale[CHANS_OUT];
+        int16_t offset[CHANS_OUT];
+        int16_t shift2[CHANS_OUT];
+    } BSO;
+
+    nn_bso_block_t bso[BSO_BLOCK_COUNT(CHANS_OUT)];
+
+    PRINTF("%s...\n", __func__);
+
+    nn_window_params_t conv2d_window = { { K_H, K_W }, { 0, -1 }, { K_V_STRIDE, K_H_STRIDE } };
+
+    nn_image_params_t x_params = { X_HEIGHT, X_WIDTH, CHANS_IN };
+    nn_image_params_t y_params = { Y_HEIGHT, Y_WIDTH, CHANS_OUT };
+
+    memset(X, 1, sizeof(X));
+    memset(K, 1, sizeof(K));
+
+    // The single output position covers rows 0..8 and cols -1..5 of X, so only
+    // 8 non-padding rows x 6 non-padding cols contribute (the zero-point is 0).
+    // Expected 32-bit accumulator: 8 * 6 * 32 + bias = 1536 + bias
+
+    for(int k = 0; k < CHANS_OUT; k++){
+        BSO.bias[k] = 32;   // 32-bit acc = 1536 + 32 = 1568
+        BSO.shift1[k] = 0;
+        BSO.scale[k] = 1;
+        BSO.offset_scale[k] = 0;
+        BSO.offset[k] = 0;
+        BSO.shift2[k] = 5;  // 1568 >> shift2 = 49
+    }
+
+    nn_standard_BSO_layout(bso, (int32_t*) &BSO.bias, (int16_t*) &BSO.shift1,
+        (int16_t*) &BSO.scale, (int16_t*) &BSO.offset_scale, (int16_t*) &BSO.offset,
+        (int16_t*) &BSO.shift2, NULL, y_params.channels);
+
+    nn_window_op_job_params_t job_params = {{0,0,0},{Y_HEIGHT, Y_WIDTH, CHANS_OUT}};
+
+    memset(Y, 0xCC, sizeof(Y));
+
+    conv2d_deep_ext((nn_image_t*) Y, (nn_image_t*) X, (nn_tensor_t*) K, bso, ZERO_POINT,
+        &x_params, &y_params, &conv2d_window, &job_params, 0);
+
+
+    for(int row = 0; row < y_params.height; row++){
+        for(int col = 0; col < y_params.width; col++){
+            for(int cout = 0; cout < y_params.channels; cout++){
+
+                int8_t y_exp = 49;
+                check_Y(y_exp, (nn_image_t*) Y, &y_params, row, col, cout, __LINE__);
+            }
+        }
+    }
+}
+#undef CHANS_IN
+#undef CHANS_OUT
+#undef K_H
+#undef K_W
+#undef X_HEIGHT
+#undef X_WIDTH
+#undef Y_HEIGHT
+#undef Y_WIDTH
+#undef K_V_STRIDE
+#undef K_H_STRIDE
+#undef ZERO_POINT
+
+
+
+

 void test_conv2d_deep()
 {
@@ -1896,4 +1989,5 @@ void test_conv2d_deep()
     RUN_TEST(test_conv2d_deep_case16);
     RUN_TEST(test_conv2d_deep_case17);
     RUN_TEST(test_conv2d_deep_case18);
+    RUN_TEST(test_conv2d_deep_case19);
 }
\ No newline at end of file
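
Note (not part of the patch): the expected values 57 and 49 asserted by the new tests follow directly from the BSO parameters they set up. Below is a minimal C sketch of that arithmetic, assuming the simple per-channel post-processing order the assembly comments describe (shift1, scale, offset*offset_scale, shift2). It is an illustrative model, not the library implementation: the function name expected_output and its parameters are hypothetical, and the VPU's rounding and symmetric-saturation details are ignored, which is harmless here because shift1 is 0 and the pre-shift2 values are exact multiples of 32.

#include <stdint.h>
#include <stdio.h>

// Model of the per-channel output computation used by the new test cases,
// where every byte of X and K is memset to 1, so each window element in the
// image contributes 1*1 to the 32-bit accumulator.
static int8_t expected_output(int valid_rows, int valid_cols, int chans_in,
                              int32_t bias, int shift1, int16_t scale,
                              int16_t offset_scale, int16_t offset, int shift2)
{
    int32_t acc = (int32_t)valid_rows * valid_cols * chans_in + bias;
    int32_t v = (acc >> shift1) * scale + (int32_t)offset_scale * offset;
    v >>= shift2;
    if (v >  127) v =  127;   // crude int8 saturation; not exercised by these tests
    if (v < -128) v = -128;
    return (int8_t)v;
}

int main(void)
{
    // test_nn_conv2d_hstrip_tail_deep_case4: full 8x7 window over 32 input channels
    printf("%d\n", expected_output(8, 7, 32, 32, 0, 1, 0, 0, 5));  // prints 57
    // test_nn_conv2d_hstrip_tail_deep_padded_case7 and test_conv2d_deep_case19:
    // only 8 rows x 6 cols of the 9x7 window lie inside X, and the zero-point is 0
    printf("%d\n", expected_output(8, 6, 32, 32, 0, 1, 0, 0, 5));  // prints 49
    return 0;
}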