aom: Add AVX2 for av1_resize_horz_dir()

From 01b991665819deb145adf58edbd49b1c3ee7032f Mon Sep 17 00:00:00 2001
From: Samuthirika S <[EMAIL REDACTED]>
Date: Fri, 3 May 2024 18:22:16 +0530
Subject: [PATCH] Add AVX2 for av1_resize_horz_dir()

This CL adds an AVX2 implementation of the av1_resize_horz_dir()
function, along with a unit test for it.

Resolution       Average scaling w.r.t. C
 3840x2160               3.16x
 2560x1440               3.25x
 1920x1080               3.24x
 1280x720                3.42x
 640x480                 3.80x
 640x360                 3.85x
 256x256                 5.53x

This is a bit-exact change.
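
For reference, a minimal scalar sketch of the 2:1 symmetric-even
downsampling that av1_resize_horz_dir() applies to each row (a sketch,
not the library code: kHalfFilter, clip_u8(), and down2_symeven_model()
are hypothetical names, and the coefficients assume the values of
av1_down2_symeven_half_filter with FILTER_BITS == 7):

  #include <stdint.h>

  #define FILTER_BITS 7
  // Half of the symmetric 8-tap kernel; the full kernel mirrors it.
  static const int16_t kHalfFilter[4] = { 56, 12, -3, -1 };

  static uint8_t clip_u8(int v) {
    return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }

  // Output pixel k is an 8-tap sum centered between inputs 2k and 2k + 1,
  // with indices clamped at the row edges.
  static void down2_symeven_model(const uint8_t *in, int length,
                                  uint8_t *out) {
    for (int i = 0; i < length; i += 2) {
      int sum = 1 << (FILTER_BITS - 1);  // rounding offset
      for (int j = 0; j < 4; ++j) {
        const int left = (i - j < 0) ? 0 : i - j;
        const int right = (i + 1 + j > length - 1) ? length - 1 : i + 1 + j;
        sum += (in[left] + in[right]) * kHalfFilter[j];
      }
      out[i / 2] = clip_u8(sum >> FILTER_BITS);
    }
  }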

Change-Id: I19160f5fe66b3d95abdb53b9ea443500baa71ec6
---
 av1/common/av1_rtcd_defs.pl  |   7 +-
 av1/common/resize.c          |  31 ++--
 av1/common/resize.h          |   3 +
 av1/common/x86/resize_avx2.c | 327 +++++++++++++++++++++++++++++++++--
 av1/common/x86/resize_sse2.c |  13 +-
 test/frame_resize_test.cc    | 117 +++++++++++--
 6 files changed, 449 insertions(+), 49 deletions(-)

diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 3973d919b..7035fb3bd 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -554,8 +554,11 @@ ()
   specialize qw/av1_highbd_warp_affine sse4_1 avx2 neon sve/;
 }
 
-add_proto qw/bool resize_vert_dir/, "uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col";
-specialize qw/resize_vert_dir sse2 avx2/;
+add_proto qw/bool av1_resize_vert_dir/, "uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col";
+specialize qw/av1_resize_vert_dir sse2 avx2/;
+
+add_proto qw/void av1_resize_horz_dir/, "const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2";
+specialize qw/av1_resize_horz_dir avx2/;
 
 add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
 specialize qw/av1_warp_affine sse4_1 avx2 neon neon_i8mm sve/;
diff --git a/av1/common/resize.c b/av1/common/resize.c
index 727f84fdb..505fccd43 100644
--- a/av1/common/resize.c
+++ b/av1/common/resize.c
@@ -337,8 +337,8 @@ static int32_t get_upscale_convolve_x0(int in_length, int out_length,
   return (int32_t)((uint32_t)x0 & RS_SCALE_SUBPEL_MASK);
 }
 
-static void down2_symeven(const uint8_t *const input, int length,
-                          uint8_t *output) {
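+// Down-samples one row of 'length' pixels to length / 2 pixels using the
+// 8-tap symmetric even filter. 'start_offset' is the first (even) input
+// index to process; the result for that pixel is written to output[0].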
+void down2_symeven(const uint8_t *const input, int length, uint8_t *output,
+                   int start_offset) {
   // Actual filter len = 2 * filter_len_half.
   const int16_t *filter = av1_down2_symeven_half_filter;
   const int filter_len_half = sizeof(av1_down2_symeven_half_filter) / 2;
@@ -350,7 +350,7 @@ static void down2_symeven(const uint8_t *const input, int length,
   l2 += (l2 & 1);
   if (l1 > l2) {
     // Short input length.
-    for (i = 0; i < length; i += 2) {
+    for (i = start_offset; i < length; i += 2) {
       int sum = (1 << (FILTER_BITS - 1));
       for (j = 0; j < filter_len_half; ++j) {
         sum +=
@@ -362,7 +362,7 @@ static void down2_symeven(const uint8_t *const input, int length,
     }
   } else {
     // Initial part.
-    for (i = 0; i < l1; i += 2) {
+    for (i = start_offset; i < l1; i += 2) {
       int sum = (1 << (FILTER_BITS - 1));
       for (j = 0; j < filter_len_half; ++j) {
         sum += (input[AOMMAX(i - j, 0)] + input[i + 1 + j]) * filter[j];
@@ -492,7 +492,7 @@ static void resize_multistep(const uint8_t *const input, int length,
       if (filteredlength & 1)
         down2_symodd(in, filteredlength, out);
       else
-        down2_symeven(in, filteredlength, out);
+        down2_symeven(in, filteredlength, out, 0);
       filteredlength = proj_filteredlength;
     }
     if (filteredlength != olength) {
@@ -521,8 +521,8 @@ static void fill_arr_to_col(uint8_t *img, int stride, int len, uint8_t *arr) {
   }
 }
 
-bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride,
-                       int height, int height2, int width2, int start_col) {
+bool av1_resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride,
+                           int height, int height2, int width2, int start_col) {
   bool mem_status = true;
   uint8_t *arrbuf = (uint8_t *)aom_malloc(sizeof(*arrbuf) * height);
   uint8_t *arrbuf2 = (uint8_t *)aom_malloc(sizeof(*arrbuf2) * height2);
@@ -533,7 +533,7 @@ bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride,
 
   for (int i = start_col; i < width2; ++i) {
     fill_col_to_arr(intbuf + i, width2, height, arrbuf);
-    down2_symeven(arrbuf, height, arrbuf2);
+    down2_symeven(arrbuf, height, arrbuf2, 0);
     fill_arr_to_col(output + i, out_stride, height2, arrbuf2);
   }
 
@@ -543,11 +543,12 @@ bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride,
   return mem_status;
 }
 
-static INLINE void resize_horz_dir(const uint8_t *const input, int in_stride,
-                                   uint8_t *intbuf, int height,
-                                   int filtered_length, int width2) {
+void av1_resize_horz_dir_c(const uint8_t *const input, int in_stride,
+                           uint8_t *intbuf, int height, int filtered_length,
+                           int width2) {
   for (int i = 0; i < height; ++i)
-    down2_symeven(input + in_stride * i, filtered_length, intbuf + width2 * i);
+    down2_symeven(input + in_stride * i, filtered_length, intbuf + width2 * i,
+                  0);
 }
 
 bool av1_resize_plane_to_half(const uint8_t *const input, int height, int width,
@@ -559,10 +560,10 @@ bool av1_resize_plane_to_half(const uint8_t *const input, int height, int width,
   }
 
   // Resize in the horizontal direction
-  resize_horz_dir(input, in_stride, intbuf, height, width, width2);
+  av1_resize_horz_dir(input, in_stride, intbuf, height, width, width2);
   // Resize in the vertical direction
-  bool mem_status = resize_vert_dir(intbuf, output, out_stride, height, height2,
-                                    width2, 0 /*start_col*/);
+  bool mem_status = av1_resize_vert_dir(intbuf, output, out_stride, height,
+                                        height2, width2, 0 /*start_col*/);
   aom_free(intbuf);
   return mem_status;
 }
diff --git a/av1/common/resize.h b/av1/common/resize.h
index de71f5d53..6b233f825 100644
--- a/av1/common/resize.h
+++ b/av1/common/resize.h
@@ -101,6 +101,9 @@ bool av1_resize_plane_to_half(const uint8_t *const input, int height, int width,
                               int in_stride, uint8_t *output, int height2,
                               int width2, int out_stride);
 
+void down2_symeven(const uint8_t *const input, int length, uint8_t *output,
+                   int start_offset);
+
 bool should_resize_by_half(int height, int width, int height2, int width2);
 
 // Returns 1 if a superres upscaled frame is scaled and 0 otherwise.
diff --git a/av1/common/x86/resize_avx2.c b/av1/common/x86/resize_avx2.c
index 47f015ae7..f0421703c 100644
--- a/av1/common/x86/resize_avx2.c
+++ b/av1/common/x86/resize_avx2.c
@@ -41,7 +41,7 @@
   s[8] = _mm256_unpackhi_epi8(s68, s79);                                    \
                                                                             \
   __m256i res_out[2] = { 0 };                                               \
-  resize_y_convolve(s, coeffs_y, res_out);                                  \
+  resize_convolve(s, coeffs_y, res_out);                                    \
                                                                             \
   /* r00... r07 */                                                          \
   __m256i res_a_round_1 = _mm256_add_epi32(res_out[0], round_const_bits);   \
@@ -52,7 +52,7 @@
   res_a_round_2 = _mm256_sra_epi32(res_a_round_2, round_shift_bits);        \
                                                                             \
   __m256i res_out_b[2] = { 0 };                                             \
-  resize_y_convolve(s + 5, coeffs_y, res_out_b);                            \
+  resize_convolve(s + 5, coeffs_y, res_out_b);                              \
                                                                             \
   /* r08... r015 */                                                         \
   __m256i res_b_round_1 = _mm256_add_epi32(res_out_b[0], round_const_bits); \
@@ -91,7 +91,7 @@
   s[3] = _mm256_permute2x128_si256(CAST_HI(s67), CAST_HI(s89), 0x20);     \
                                                                           \
   __m256i res_out[2] = { 0 };                                             \
-  resize_y_convolve(s, coeffs_y, res_out);                                \
+  resize_convolve(s, coeffs_y, res_out);                                  \
                                                                           \
   /* r00... r07 */                                                        \
   __m256i res_a_round_1 = _mm256_add_epi32(res_out[0], round_const_bits); \
@@ -108,9 +108,107 @@
   res_a_round_1 = _mm256_min_epu8(res_a_round_1, clip_pixel);             \
   res_a_round_1 = _mm256_max_epu8(res_a_round_1, zero);
 
-static INLINE void resize_y_convolve(const __m256i *const s,
-                                     const __m256i *const coeffs,
-                                     __m256i *res_out) {
+#define PROCESS_RESIZE_X_WD32                                                  \
+  /* a0 a1 ..... a30 a31 */                                                    \
+  __m256i row0 = _mm256_loadu_si256(                                           \
+      (__m256i *)&input[i * in_stride + j - filter_offset]);                   \
+  /* b0 b1 ..... b30 b31 */                                                    \
+  __m256i row1 = _mm256_loadu_si256(                                           \
+      (__m256i *)&input[(i + 1) * in_stride + j - filter_offset]);             \
+  /* a0 .... a15 || b0.... b15 */                                              \
+  __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);                    \
+  /* a16 .... a31 || b16 .... b31 */                                           \
+  __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);                    \
+  filter_offset = 3;                                                           \
+                                                                               \
+  /* Pad start pixels to the left, while processing the first pixels in the    \
+    row. */                                                                    \
+  if (j == 0) {                                                                \
+    /* a0 a0 a0 a0 .... a12 || b0 b0 b0 b0 .... b12 */                         \
+    row0 = _mm256_shuffle_epi8(r0, wd32_start_pad_mask);                       \
+    /* a13 a14 a15 a16.....a28 || b13 b14 b15 b16.....b28 */                   \
+    row1 = _mm256_alignr_epi8(r1, r0, 13);                                     \
+    r0 = row0;                                                                 \
+    r1 = row1;                                                                 \
+  }                                                                            \
+                                                                               \
+  /* a29 a30 a31 a32 a33 a34 a35 a36 0 0 ....*/                                \
+  __m128i row0_0 = _mm_loadl_epi64(                                            \
+      (__m128i *)&input[i * in_stride + 32 + j - filter_offset]);              \
+  /* b29 b30 b31 b32 b33 b34 b35 b36 0 0 .... */                               \
+  __m128i row1_0 = _mm_loadl_epi64(                                            \
+      (__m128i *)&input[(i + 1) * in_stride + 32 + j - filter_offset]);        \
+  __m256i r2 = _mm256_permute2x128_si256(                                      \
+      _mm256_castsi128_si256(row0_0), _mm256_castsi128_si256(row1_0), 0x20);   \
+                                                                               \
+  /* Pad end pixels to the right, while processing the last pixels in the      \
+  row. */                                                                      \
+  const int is_last_cols32 = (j + 32 == filtered_length);                      \
+  if (is_last_cols32) {                                                        \
+    r2 = _mm256_shuffle_epi8(r2, wd32_end_pad_mask);                           \
+  }                                                                            \
+                                                                               \
+  /* Process the first 16 input pixels of both rows. */                        \
+  /* a0 a0 a0 a0 a1 a2 .... a12 | b0 b0 b0 b0 b1 b2 .... b12 */                \
+  s0[0] = _mm256_alignr_epi8(r1, r0, 0);                                       \
+  /* a0 a0 a1 a2 a3 a4 .... a14 | b0 b0 b1 b2 b3 b4 .... b14 */                \
+  s0[1] = _mm256_alignr_epi8(r1, r0, 2);                                       \
+  /* a1 a2 a3 a4 a5 a6 .... a16 | b1 b2 b3 b4 b5 b6 .... b16 */                \
+  s0[2] = _mm256_alignr_epi8(r1, r0, 4);                                       \
+  /* a3 a4 a5 a6 a7 a8 .... a18 | b3 b4 b5 b6 b7 b8 .... b18 */                \
+  s0[3] = _mm256_alignr_epi8(r1, r0, 6);                                       \
+                                                                               \
+  /* Process the next 16 input pixels of both rows. */                         \
+  /* a13 a14 a15 a16  ..... a28 | b13 b14 b15 b16 ..... b28 */                 \
+  s1[0] = _mm256_alignr_epi8(r2, r1, 0);                                       \
+  /* a15 a16 a17 a18  ..... a30 | b15 b16 b17 b18 ..... b30 */                 \
+  s1[1] = _mm256_alignr_epi8(r2, r1, 2);                                       \
+  /* a17 a18 a19 a20  ..... a32 | b17 b18 b19 b20 ..... b32 */                 \
+  s1[2] = _mm256_alignr_epi8(r2, r1, 4);                                       \
+  /* a19 a20 a21 a22  ..... a34 | b19 b20 b21 b22 ..... b34 */                 \
+  s1[3] = _mm256_alignr_epi8(r2, r1, 6);                                       \
+                                                                               \
+  /* res_out_0 holds the filtered results of the first 16 input pixels of      \
+     both rows; res_out_1 holds those of the last 16. */                       \
+  __m256i res_out_0[2], res_out_1[2];                                          \
+  res_out_1[0] = res_out_1[1] = zero;                                          \
+  res_out_0[0] = res_out_0[1] = zero;                                          \
+  resize_convolve(s0, coeffs_x, res_out_0);                                    \
+  resize_convolve(s1, coeffs_x, res_out_1);                                    \
+                                                                               \
+  /* Results of the 16 output pixels of row0. */                               \
+  res_out_0[0] = _mm256_sra_epi32(                                             \
+      _mm256_add_epi32(res_out_0[0], round_const_bits), round_shift_bits);     \
+  res_out_1[0] = _mm256_sra_epi32(                                             \
+      _mm256_add_epi32(res_out_1[0], round_const_bits), round_shift_bits);     \
+  /* r00-r03 r08-r011 | r04-r07 r012-r015 */                                   \
+  __m256i res_out_r0 = _mm256_packus_epi32(res_out_0[0], res_out_1[0]);        \
+                                                                               \
+  /* Results of the 16 output pixels of row1. */                               \
+  res_out_0[1] = _mm256_sra_epi32(                                             \
+      _mm256_add_epi32(res_out_0[1], round_const_bits), round_shift_bits);     \
+  res_out_1[1] = _mm256_sra_epi32(                                             \
+      _mm256_add_epi32(res_out_1[1], round_const_bits), round_shift_bits);     \
+  /* r10-r13 r18-r111 | r14-r17 r112-r115 */                                   \
+  __m256i res_out_r1 = _mm256_packus_epi32(res_out_0[1], res_out_1[1]);        \
+                                                                               \
+  /* Convert the result from 16bit to 8bit */                                  \
+  /* r00-r03 r08-r011 r10-r13 r18-r111 | r04-r07 r012-r015 r14-r17 r112-r115   \
+   */                                                                          \
+  __m256i res_out_r01 = _mm256_packus_epi16(res_out_r0, res_out_r1);           \
+  __m256i res_out_row01 = _mm256_min_epu8(res_out_r01, clip_pixel);            \
+  res_out_row01 = _mm256_max_epu8(res_out_row01, zero);                        \
+  __m128i low_128 = CAST_LOW(res_out_row01);                                   \
+  __m128i high_128 = _mm256_extracti128_si256(res_out_row01, 1);               \
+                                                                               \
+  _mm_storeu_si128((__m128i *)&intbuf[i * dst_stride + j / 2],                 \
+                   _mm_unpacklo_epi32(low_128, high_128));                     \
+  _mm_storeu_si128((__m128i *)&intbuf[(i + 1) * dst_stride + j / 2],           \
+                   _mm_unpackhi_epi32(low_128, high_128));
+
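+// Multiply-accumulates the four source vectors in 's' against the filter-tap
+// vectors in 'coeffs'; the 16-bit products are widened and summed into the
+// two 32-bit result vectors res_out[0] and res_out[1].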
+static INLINE void resize_convolve(const __m256i *const s,
+                                   const __m256i *const coeffs,
+                                   __m256i *res_out) {
   const __m256i res_0 = _mm256_maddubs_epi16(s[0], coeffs[0]);
   const __m256i res_1 = _mm256_maddubs_epi16(s[1], coeffs[1]);
   const __m256i res_2 = _mm256_maddubs_epi16(s[2], coeffs[2]);
@@ -152,8 +250,9 @@ static INLINE void prepare_filter_coeffs(const int16_t *filter,
   coeffs[1] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 4));
 }
 
-bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride,
-                          int height, int height2, int stride, int start_col) {
+bool av1_resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride,
+                              int height, int height2, int stride,
+                              int start_col) {
   assert(start_col <= stride);
   // For the GM tool, the input layer height or width is assured to be an even
   // number. Hence the function 'down2_symodd()' is not invoked and SIMD
@@ -164,8 +263,8 @@ bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride,
   // eliminate the need for conditional statements within the subsequent SIMD
   // code to manage these cases.
   if (height & 1 || height < 8) {
-    return resize_vert_dir_c(intbuf, output, out_stride, height, height2,
-                             stride, start_col);
+    return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2,
+                                 stride, start_col);
   }
 
   __m256i s[10], coeffs_y[4];
@@ -404,8 +503,212 @@ bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride,
   }
 
   if (remain_col)
-    return resize_vert_dir_c(intbuf, output, out_stride, height, height2,
-                             stride, stride - remain_col);
+    return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2,
+                                 stride, stride - remain_col);
 
   return true;
 }
+
+// Shuffle masks that pad the left and right row edges, used by the 32-wide,
+// 16-wide, and 8-wide paths.
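+//
+// e.g., shuffling a 16-byte lane holding {a0 a1 ... a15} with the left mask
+// yields {a0 a0 a0 a0 a1 ... a12}: the edge pixel is replicated for filter
+// taps that would otherwise read before the start of the row.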
+static const uint8_t wd32_left_padding_mask[32] = { 0, 0, 0, 0, 1, 2,  3,  4,
+                                                    5, 6, 7, 8, 9, 10, 11, 12,
+                                                    0, 0, 0, 0, 1, 2,  3,  4,
+                                                    5, 6, 7, 8, 9, 10, 11, 12 };
+
+static const uint8_t wd32_right_padding_mask[32] = { 0, 1, 2, 2, 2, 2, 2, 2,
+                                                     2, 2, 2, 2, 2, 2, 2, 2,
+                                                     0, 1, 2, 2, 2, 2, 2, 2,
+                                                     2, 2, 2, 2, 2, 2, 2, 2 };
+
+static const uint8_t wd8_right_padding_mask[32] = {
+  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 10,
+  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 10
+};
+
+void av1_resize_horz_dir_avx2(const uint8_t *const input, int in_stride,
+                              uint8_t *intbuf, int height, int filtered_length,
+                              int width2) {
+  assert(height % 2 == 0);
+  // For widths less than 32, fall back to the C function. TODO: invoke an
+  // SSE2 implementation here once one is available.
+  if (filtered_length < 32) {
+    av1_resize_horz_dir_c(input, in_stride, intbuf, height, filtered_length,
+                          width2);
+    return;
+  }
+
+  const int filt_length = sizeof(av1_down2_symeven_half_filter);
+  assert(filt_length % 2 == 0);
+  (void)filt_length;
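+  // Note: filt_length feeds only the assert above; the void cast silences
+  // unused-variable warnings in release builds.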
+
+  __m256i s0[4], s1[4], coeffs_x[4];
+
+  const int bits = FILTER_BITS;
+  const int dst_stride = width2;
+  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
+
+  const uint8_t max_pixel = 255;
+  const __m256i clip_pixel = _mm256_set1_epi8((char)max_pixel);
+  const __m256i zero = _mm256_setzero_si256();
+
+  const __m256i wd32_start_pad_mask =
+      _mm256_loadu_si256((__m256i *)wd32_left_padding_mask);
+  const __m256i wd32_end_pad_mask =
+      _mm256_loadu_si256((__m256i *)wd32_right_padding_mask);
+  const __m256i wd8_end_pad_mask =
+      _mm256_loadu_si256((__m256i *)wd8_right_padding_mask);
+  prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_x);
+
+  // The core SIMD loop processes 32 horizontal input pixels of 2 rows at a
+  // time, producing 16 output pixels for each of the 2 rows. To keep the core
+  // loop free of conditional checks, the remaining columns (16 or 8) are
+  // handled separately.
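+  // e.g., with filtered_length == 56 the main loop covers inputs 0..31, the
+  // 16-wide tail covers inputs 32..47, and the 8-wide tail covers 48..55.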
+  if (filtered_length % 32 == 0) {
+    for (int i = 0; i < height; i += 2) {
+      int filter_offset = 0;
+      for (int j = 0; j < filtered_length; j += 32) {
+        PROCESS_RESIZE_X_WD32
+      }
+    }
+  } else {
+    for (int i = 0; i < height; i += 2) {
+      int filter_offset = 0;
+      int remain_col = filtered_length % 32;
+      for (int j = 0; j + 32 <= filtered_length; j += 32) {
+        PROCESS_RESIZE_X_WD32
+      }
+
+      int wd_processed = filtered_length - remain_col;
+      if (remain_col > 15) {
+        remain_col = filtered_length % 16;
+        const int in_idx = i * in_stride + wd_processed - filter_offset;
+        const int out_idx = (i * dst_stride) + wd_processed / 2;
+        // a0 a1 --- a15
+        __m128i row0 = _mm_loadu_si128((__m128i *)&input[in_idx]);
+        // b0 b1 --- b15
+        __m128i row1 = _mm_loadu_si128((__m128i *)&input[in_idx + in_stride]);
+        // a0 a1 --- a15 || b0 b1 --- b15
+        __m256i r0 =
+            _mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20);
+
+        // a16 a17 --- a23
+        row0 = _mm_loadl_epi64((__m128i *)&input[in_idx + 16]);
+        // b16 b17 --- b23
+        row1 = _mm_loadl_epi64((__m128i *)&input[in_idx + 16 + in_stride]);
+
+        // a16-a23 x x x x| b16-b23 x x x x
+        __m256i r1 =
+            _mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20);
+
+        // Pad end pixels to the right, while processing the last pixels in the
+        // row.
+        const int is_last_cols16 = wd_processed + 16 == filtered_length;
+        if (is_last_cols16) {
+          r1 = _mm256_shuffle_epi8(r1, wd32_end_pad_mask);
+        }
+
+        // a0 a1 --- a15 || b0 b1 --- b15
+        s0[0] = r0;
+        // a2 a3 --- a17 || b2 b3 --- b17
+        s0[1] = _mm256_alignr_epi8(r1, r0, 2);
+        // a4 a5 --- a19 || b4 b5 --- b19
+        s0[2] = _mm256_alignr_epi8(r1, r0, 4);
+        // a6 a7 --- a21 || b6 b7 --- b21
+        s0[3] = _mm256_alignr_epi8(r1, r0, 6);
+
+        // result for 16 pixels (a0 to a15) of row0 and row1
+        __m256i res_out_0[2];
+        res_out_0[0] = res_out_0[1] = zero;
+        resize_convolve(s0, coeffs_x, res_out_0);
+
+        // r00 -r07
+        res_out_0[0] = _mm256_sra_epi32(
+            _mm256_add_epi32(res_out_0[0], round_const_bits), round_shift_bits);
+        // r10-r17
+        res_out_0[1] = _mm256_sra_epi32(
+            _mm256_add_epi32(res_out_0[1], round_const_bits), round_shift_bits);
+        // r00-r03 r10-r13 r04-r07 r14-r17
+        __m256i res_out_row01 = _mm256_packus_epi32(res_out_0[0], res_out_0[1]);
+        // r00-r03 r10-r13 r00-r03 r10-r13 | r04-r07 r14-r17 r04-r07 r14-r17
+        res_out_row01 = _mm256_packus_epi16(res_out_row01, res_out_row01);
+        res_out_row01 = _mm256_min_epu8(res_out_row01, clip_pixel);
+        res_out_row01 = _mm256_max_epu8(res_out_row01, zero);
+        // r00-r03 r10-r13 r04-r07 r14-r17
+        __m128i low_result =
+            CAST_LOW(_mm256_permute4x64_epi64(res_out_row01, 0xd8));
+        // r00-r03 r04-r07 r10-r13 r14-r17
+        low_result = _mm_shuffle_epi32(low_result, 0xd8);
+
+        _mm_storel_epi64((__m128i *)&intbuf[out_idx], low_result);
+        _mm_storel_epi64((__m128i *)&intbuf[out_idx + dst_stride],
+                         _mm_unpackhi_epi64(low_result, low_result));
+      }
+
+      wd_processed = filtered_length - remain_col;
+      if (remain_col > 7) {
+        remain_col = filtered_length % 8;
+        const int in_idx = i * in_stride + wd_processed - filter_offset;
+        const int out_idx = (i * dst_stride) + wd_processed / 2;
+        // a0 a1 --- a15
+        __m128i row0 = _mm_loadu_si128((__m128i *)&input[in_idx]);
+        // b0 b1 --- b15
+        __m128i row1 = _mm_loadu_si128((__m128i *)&input[in_idx + in_stride]);
+        // a0 a1 --- a15 || b0 b1 --- b15
+        __m256i r0 =
+            _mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20);
+
+        // Pad end pixels to the right, while processing the last pixels in the
+        // row.
+        const int is_last_cols_8 = wd_processed + 8 == filtered_length;
+        if (is_last_cols_8) r0 = _mm256_shuffle_epi8(r0, wd8_end_pad_mask);
+
+        // a0 a1 a2 a3 a4 a5 a6 a7 | b0 b1 b2 b3 b4 b5 b6 b7
+        s0[0] = r0;
+        // a2 a3 a4 a5 a6 a7 a8 a9 | b2 b3 b4 b5 b6 b7 b8 b9
+        s0[1] = _mm256_bsrli_epi128(r0, 2);
+        // a4 a5 a6 a7 a8 a9 a10 a10 |  b4 b5 b6 b7 b8 b9 b10 b10
+        s0[2] = _mm256_bsrli_epi128(r0, 4);
+        // a6 a7 a8 a9 a10 a10 a10 a10 | b6 b7 b8 b9 b10 b10 b10 b10
+        s0[3] = _mm256_bsrli_epi128(r0, 6);
+        __m256i res_out_0[2];
+        res_out_0[0] = res_out_0[1] = zero;
+        resize_convolve(s0, coeffs_x, res_out_0);
+
+        // r00 - r03 | r10 - r13
+        __m256i res_out =
+            _mm256_permute2x128_si256(res_out_0[0], res_out_0[1], 0x20);
+        // r00 - r03 | r10 - r13
+        res_out = _mm256_sra_epi32(_mm256_add_epi32(res_out, round_const_bits),
+                                   round_shift_bits);
+        // r00-r03 r00-r03 r10-r13 r10-r13
+        __m256i res_out_row01 = _mm256_packus_epi32(res_out, res_out);
+        // r00-r03 r00-r03 r00-r03 r00-r03 r10-r13 r10-r13 r10-r13 r10-r13
+        res_out_row01 = _mm256_packus_epi16(res_out_row01, res_out_row01);
+        res_out_row01 = _mm256_min_epu8(res_out_row01, clip_pixel);
+        res_out_row01 = _mm256_max_epu8(res_out_row01, zero);
+
+        *((int *)(intbuf + out_idx)) =
+            _mm_cvtsi128_si32(CAST_LOW(res_out_row01));
+        *((int *)(intbuf + out_idx + dst_stride)) =
+            _mm_cvtsi128_si32(_mm256_extracti128_si256(res_out_row01, 1));
+      }
+
+      wd_processed = filtered_length - remain_col;
+      // When the remaining width is 2, the code above has not applied the
+      // padding required for the (filtered_length - 4)th pixel, so that
+      // pixel is processed again with the C code.
+      wd_processed = (remain_col == 2) ? wd_processed - 2 : wd_processed;
+      if (remain_col) {
+        const int in_idx = (in_stride * i);
+        const int out_idx = (wd_processed / 2) + width2 * i;
+
+        down2_symeven(input + in_idx, filtered_length, intbuf + out_idx,
+                      wd_processed);
+        down2_symeven(input + in_idx + in_stride, filtered_length,
+                      intbuf + out_idx + width2, wd_processed);
+      }
+    }
+  }
+}
diff --git a/av1/common/x86/resize_sse2.c b/av1/common/x86/resize_sse2.c
index 9714ecf77..c68371cb0 100644
--- a/av1/common/x86/resize_sse2.c
+++ b/av1/common/x86/resize_sse2.c
@@ -81,8 +81,9 @@ static INLINE void prepare_filter_coeffs(const int16_t *filter,
   coeffs[1] = _mm_shuffle_epi32(tmp1, 0x00);
 }
 
-bool resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride,
-                          int height, int height2, int stride, int start_col) {
+bool av1_resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride,
+                              int height, int height2, int stride,
+                              int start_col) {
   // For the GM tool, the input layer height or width is assured to be an even
   // number. Hence the function 'down2_symodd()' is not invoked and SIMD
   // optimization of the same is not implemented.
@@ -92,8 +93,8 @@ bool resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride,
   // eliminate the need for conditional statements within the subsequent SIMD
   // code to manage these cases.
   if (height & 1 || height < 8) {
-    return resize_vert_dir_c(intbuf, output, out_stride, height, height2,
-                             stride, start_col);
+    return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2,
+                                 stride, start_col);
   }
 
   __m128i coeffs_y[2];
@@ -158,8 +159,8 @@ bool resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride,
   }
 
   if (remain_col)
-    return resize_vert_dir_c(intbuf, output, out_stride, height, height2,
-                             stride, stride - remain_col);
+    return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2,
+                                 stride, stride - remain_col);
 
   return true;
 }
diff --git a/test/frame_resize_test.cc b/test/frame_resize_test.cc
index cab6fe354..b0bcb7b97 100644
--- a/test/frame_resize_test.cc
+++ b/test/frame_resize_test.cc
@@ -79,14 +79,12 @@ class AV1ResizeYTest : public ::testing::TestWithParam<ResizeTestParams> {
   }
 
   void RunTest() {
-    int width2 = width_, height2 = height_;
-
     for (int i = 0; i < (width_ / 2) * height_; i++) src_[i] = rng_.Rand8();
     for (int level = 1; level < n_levels_; level++) {
-      width2 = (width_ >> level);
-      height2 = (height_ >> level);
-      resize_vert_dir_c(src_, ref_dest_, width2, height2 << 1, height2, width2,
-                        0);
+      const int width2 = (width_ >> level);
+      const int height2 = (height_ >> level);
+      av1_resize_vert_dir_c(src_, ref_dest_, width2, height2 << 1, height2,
+                            width2, 0);
       test_fun_(src_, test_dest_, width2, height2 << 1, height2, width2, 0);
 
       AssertOutputBufferEq(ref_dest_, test_dest_, width2, height2);
@@ -94,17 +92,15 @@ class AV1ResizeYTest : public ::testing::TestWithParam<ResizeTestParams> {
   }
 
   void SpeedTest() {
-    int width2 = width_, height2 = height_;
-
     for (int i = 0; i < (width_ / 2) * height_; i++) src_[i] = rng_.Rand8();
     for (int level = 1; level < n_levels_; level++) {

(Patch truncated.)