aom: Add AVX2 for resize_vert_dir()

From 60cbd98d3acc7a8d8eec9737917a3a44e9f78e9a Mon Sep 17 00:00:00 2001
From: Samuthirika S <[EMAIL REDACTED]>
Date: Sat, 13 Apr 2024 15:45:35 +0530
Subject: [PATCH] Add AVX2 for resize_vert_dir()

This CL adds an AVX2 implementation of the resize_vert_dir()
function, along with a unit test for it.

Resolution       Average Scaling w.r.t C
 3840x2160               9.28x
 2560x1440              10.35x
 1920x1080               9.28x
 1280x720                9.02x
 640x480                 9.16x
 640x360                 8.91x
 256x256                10.82x

This is a bit-exact change.

Change-Id: Ifb0677d5f99cf03154e6412aca5effd1eb267dc7
---
 av1/av1.cmake                |   1 +
 av1/common/av1_rtcd_defs.pl  |   3 +
 av1/common/resize.c          |  21 +-
 av1/common/resize.h          |   4 +
 av1/common/x86/resize_avx2.c | 411 +++++++++++++++++++++++++++++++++++
 test/frame_resize_test.cc    | 157 +++++++++++++
 test/test.cmake              |   1 +
 7 files changed, 585 insertions(+), 13 deletions(-)
 create mode 100644 av1/common/x86/resize_avx2.c
 create mode 100644 test/frame_resize_test.cc

diff --git a/av1/av1.cmake b/av1/av1.cmake
index c1206e9d6..b6cf974aa 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -302,6 +302,7 @@ list(APPEND AOM_AV1_COMMON_INTRIN_AVX2
             "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_avx2.c"
             "${AOM_ROOT}/av1/common/x86/jnt_convolve_avx2.c"
             "${AOM_ROOT}/av1/common/x86/reconinter_avx2.c"
+            "${AOM_ROOT}/av1/common/x86/resize_avx2.c"
             "${AOM_ROOT}/av1/common/x86/selfguided_avx2.c"
             "${AOM_ROOT}/av1/common/x86/warp_plane_avx2.c"
             "${AOM_ROOT}/av1/common/x86/wiener_convolve_avx2.c")
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 8bf4b0709..e97f39145 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -554,6 +554,9 @@ ()
   specialize qw/av1_highbd_warp_affine sse4_1 avx2 neon sve/;
 }
 
+add_proto qw/bool resize_vert_dir/, "uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col";
+specialize qw/resize_vert_dir avx2/;
+
 add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
 specialize qw/av1_warp_affine sse4_1 avx2 neon neon_i8mm sve/;
 
diff --git a/av1/common/resize.c b/av1/common/resize.c
index ef35fa227..2b48b9fff 100644
--- a/av1/common/resize.c
+++ b/av1/common/resize.c
@@ -18,6 +18,7 @@
 #include <string.h>
 
 #include "config/aom_config.h"
+#include "config/av1_rtcd.h"
 
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/flow_estimation/corner_detect.h"
@@ -216,10 +217,6 @@ const int16_t av1_resize_filter_normative[(
 // Filters for interpolation (full-band) - no filtering for integer pixels
 #define filteredinterp_filters1000 av1_resize_filter_normative
 
-// Filters for factor of 2 downsampling.
-static const int16_t av1_down2_symeven_half_filter[] = { 56, 12, -3, -1 };
-static const int16_t av1_down2_symodd_half_filter[] = { 64, 35, 0, -3 };
-
 static const InterpKernel *choose_interp_filter(int in_length, int out_length) {
   int out_length16 = out_length * 16;
   if (out_length16 >= in_length * 16)
@@ -524,9 +521,8 @@ static void fill_arr_to_col(uint8_t *img, int stride, int len, uint8_t *arr) {
   }
 }
 
-static INLINE bool resize_vert_dir(uint8_t *intbuf, uint8_t *output,
-                                   int out_stride, int height, int height2,
-                                   int width2) {
+bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride,
+                       int height, int height2, int width2, int start_col) {
   bool mem_status = true;
   uint8_t *arrbuf = (uint8_t *)aom_malloc(sizeof(*arrbuf) * height);
   uint8_t *arrbuf2 = (uint8_t *)aom_malloc(sizeof(*arrbuf2) * height2);
@@ -535,7 +531,7 @@ static INLINE bool resize_vert_dir(uint8_t *intbuf, uint8_t *output,
     goto Error;
   }
 
-  for (int i = 0; i < width2; ++i) {
+  for (int i = start_col; i < width2; ++i) {
     fill_col_to_arr(intbuf + i, width2, height, arrbuf);
     down2_symeven(arrbuf, height, arrbuf2);
     fill_arr_to_col(output + i, out_stride, height2, arrbuf2);
@@ -547,9 +543,8 @@ static INLINE bool resize_vert_dir(uint8_t *intbuf, uint8_t *output,
   return mem_status;
 }
 
-static INLINE void resize_horz_dir(const uint8_t *const input, int in_stride,
-                                   uint8_t *intbuf, int height,
-                                   int filtered_length, int width2) {
+void resize_horz_dir(const uint8_t *const input, int in_stride, uint8_t *intbuf,
+                     int height, int filtered_length, int width2) {
   for (int i = 0; i < height; ++i)
     down2_symeven(input + in_stride * i, filtered_length, intbuf + width2 * i);
 }
@@ -565,8 +560,8 @@ bool av1_resize_plane_to_half(const uint8_t *const input, int height, int width,
   // Resize in the horizontal direction
   resize_horz_dir(input, in_stride, intbuf, height, width, width2);
   // Resize in the vertical direction
-  bool mem_status =
-      resize_vert_dir(intbuf, output, out_stride, height, height2, width2);
+  bool mem_status = resize_vert_dir(intbuf, output, out_stride, height, height2,
+                                    width2, 0 /*start_col*/);
   aom_free(intbuf);
   return mem_status;
 }
diff --git a/av1/common/resize.h b/av1/common/resize.h
index 6e7d46e0d..de71f5d53 100644
--- a/av1/common/resize.h
+++ b/av1/common/resize.h
@@ -20,6 +20,10 @@
 extern "C" {
 #endif
 
+// Filters for factor of 2 downsampling.
+static const int16_t av1_down2_symeven_half_filter[] = { 56, 12, -3, -1 };
+static const int16_t av1_down2_symodd_half_filter[] = { 64, 35, 0, -3 };
+
 bool av1_resize_plane(const uint8_t *input, int height, int width,
                       int in_stride, uint8_t *output, int height2, int width2,
                       int out_stride);
diff --git a/av1/common/x86/resize_avx2.c b/av1/common/x86/resize_avx2.c
new file mode 100644
index 000000000..c44edb88d
--- /dev/null
+++ b/av1/common/x86/resize_avx2.c
@@ -0,0 +1,411 @@
+/*
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>
+#include <string.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/resize.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+#define CAST_HI(x) _mm256_castsi128_si256(x)
+#define CAST_LOW(x) _mm256_castsi256_si128(x)
+
+#define PROCESS_RESIZE_Y_WD16                                               \
+  const int idx1 = AOMMIN(height - 1, i + 5);                               \
+  const int idx2 = AOMMIN(height - 1, i + 6);                               \
+  l6 = l10;                                                                 \
+  l7 = l11;                                                                 \
+  l8 = _mm_loadu_si128((__m128i *)(data + idx1 * stride));                  \
+  l9 = _mm_loadu_si128((__m128i *)(data + idx2 * stride));                  \
+                                                                            \
+  /* g0... g15 | i0... i15 */                                               \
+  const __m256i s68 =                                                       \
+      _mm256_permute2x128_si256(CAST_HI(l6), CAST_HI(l8), 0x20);            \
+  /* h0... h15 | j0... j15 */                                               \
+  const __m256i s79 =                                                       \
+      _mm256_permute2x128_si256(CAST_HI(l7), CAST_HI(l9), 0x20);            \
+                                                                            \
+  /* g0h0... g7h7 | i0j0...i7j7 */                                          \
+  s[3] = _mm256_unpacklo_epi8(s68, s79);                                    \
+  /* g8h8... g15h15 | i8j8... i15j15 */                                     \
+  s[8] = _mm256_unpackhi_epi8(s68, s79);                                    \
+                                                                            \
+  __m256i res_out[2] = { 0 };                                               \
+  resize_y_convolve(s, coeffs_y, res_out);                                  \
+                                                                            \
+  /* r00... r07 */                                                          \
+  __m256i res_a_round_1 = _mm256_add_epi32(res_out[0], round_const_bits);   \
+  /* r20... r27 */                                                          \
+  __m256i res_a_round_2 = _mm256_add_epi32(res_out[1], round_const_bits);   \
+                                                                            \
+  res_a_round_1 = _mm256_sra_epi32(res_a_round_1, round_shift_bits);        \
+  res_a_round_2 = _mm256_sra_epi32(res_a_round_2, round_shift_bits);        \
+                                                                            \
+  __m256i res_out_b[2] = { 0 };                                             \
+  resize_y_convolve(s + 5, coeffs_y, res_out_b);                            \
+                                                                            \
+  /* r08... r015 */                                                         \
+  __m256i res_b_round_1 = _mm256_add_epi32(res_out_b[0], round_const_bits); \
+  /* r28... r215 */                                                         \
+  __m256i res_b_round_2 = _mm256_add_epi32(res_out_b[1], round_const_bits); \
+  res_b_round_1 = _mm256_sra_epi32(res_b_round_1, round_shift_bits);        \
+  res_b_round_2 = _mm256_sra_epi32(res_b_round_2, round_shift_bits);        \
+                                                                            \
+  /* r00... r03 r20... r23 | r04... r07 r24... r27 */                       \
+  __m256i res_8bit0 = _mm256_packus_epi32(res_a_round_1, res_a_round_2);    \
+  /* r08... r011 r28... r211 | r012... r015 r212... r215 */                 \
+  __m256i res_8bit1 = _mm256_packus_epi32(res_b_round_1, res_b_round_2);    \
+  /* r00... r07 | r20... r27 */                                             \
+  res_8bit0 = _mm256_permute4x64_epi64(res_8bit0, 0xd8);                    \
+  /* r08... r015 | r28... r215 */                                           \
+  res_8bit1 = _mm256_permute4x64_epi64(res_8bit1, 0xd8);                    \
+  /* r00... r015 | r20... r215 */                                           \
+  res_8bit1 = _mm256_packus_epi16(res_8bit0, res_8bit1);                    \
+  res_8bit0 = _mm256_min_epu8(res_8bit1, clip_pixel);                       \
+  res_8bit0 = _mm256_max_epu8(res_8bit0, zero);
+
+#define PROCESS_RESIZE_Y_WD8                                              \
+  const int idx1 = AOMMIN(height - 1, i + 5);                             \
+  const int idx2 = AOMMIN(height - 1, i + 6);                             \
+  l6 = l10;                                                               \
+  l7 = l11;                                                               \
+  l8 = _mm_loadl_epi64((__m128i *)(data + idx1 * stride));                \
+  l9 = _mm_loadl_epi64((__m128i *)(data + idx2 * stride));                \
+                                                                          \
+  /* g0h0... g7h7 */                                                      \
+  s67 = _mm_unpacklo_epi8(l6, l7);                                        \
+  /* i0j0...i7j7 */                                                       \
+  __m128i s89 = _mm_unpacklo_epi8(l8, l9);                                \
+                                                                          \
+  /* g0h0...g7h7 | i0j0...i7j7 */                                         \
+  s[3] = _mm256_permute2x128_si256(CAST_HI(s67), CAST_HI(s89), 0x20);     \
+                                                                          \
+  __m256i res_out[2] = { 0 };                                             \
+  resize_y_convolve(s, coeffs_y, res_out);                                \
+                                                                          \
+  /* r00... r07 */                                                        \
+  __m256i res_a_round_1 = _mm256_add_epi32(res_out[0], round_const_bits); \
+  /* r20...r27 */                                                         \
+  __m256i res_a_round_2 = _mm256_add_epi32(res_out[1], round_const_bits); \
+  res_a_round_1 = _mm256_sra_epi32(res_a_round_1, round_shift_bits);      \
+  res_a_round_2 = _mm256_sra_epi32(res_a_round_2, round_shift_bits);      \
+                                                                          \
+  /* r00...r03 r20...r23 | r04...r07 r24...r27 */                         \
+  res_a_round_1 = _mm256_packus_epi32(res_a_round_1, res_a_round_2);      \
+  /* r00...r07 | r20...r27 */                                             \
+  res_a_round_1 = _mm256_permute4x64_epi64(res_a_round_1, 0xd8);          \
+  res_a_round_1 = _mm256_packus_epi16(res_a_round_1, res_a_round_1);      \
+  res_a_round_1 = _mm256_min_epu8(res_a_round_1, clip_pixel);             \
+  res_a_round_1 = _mm256_max_epu8(res_a_round_1, zero);
+
+static INLINE void resize_y_convolve(const __m256i *const s,
+                                     const __m256i *const coeffs,
+                                     __m256i *res_out) {
+  const __m256i res_0 = _mm256_maddubs_epi16(s[0], coeffs[0]);
+  const __m256i res_1 = _mm256_maddubs_epi16(s[1], coeffs[1]);
+  const __m256i res_2 = _mm256_maddubs_epi16(s[2], coeffs[2]);
+  const __m256i res_3 = _mm256_maddubs_epi16(s[3], coeffs[3]);
+
+  const __m256i dst_0 = _mm256_add_epi16(res_0, res_1);
+  const __m256i dst_1 = _mm256_add_epi16(res_2, res_3);
+  // The full 8-tap sum can exceed the signed 16-bit range, so the two partial
+  // sums are sign-extended to 32 bits before the final addition.
+  const __m256i dst_00 = _mm256_cvtepi16_epi32(CAST_LOW(dst_0));
+  const __m256i dst_01 =
+      _mm256_cvtepi16_epi32(_mm256_extracti128_si256(dst_0, 1));
+  const __m256i dst_10 = _mm256_cvtepi16_epi32(CAST_LOW(dst_1));
+  const __m256i dst_11 =
+      _mm256_cvtepi16_epi32(_mm256_extracti128_si256(dst_1, 1));
+
+  res_out[0] = _mm256_add_epi32(dst_00, dst_10);
+  res_out[1] = _mm256_add_epi32(dst_01, dst_11);
+}
+
+static INLINE void prepare_filter_coeffs(const int16_t *filter,
+                                         __m256i *const coeffs /* [4] */) {
+  // Expand the 4 half-filter taps into the 8-bit tap pairs used by maddubs.
+  // f0 f1 f2 f3 x x x x
+  const __m128i sym_even_filter = _mm_loadl_epi64((const __m128i *)filter);
+  // f0 f1 f2 f3 f0 f1 f2 f3
+  const __m128i tmp0 = _mm_shuffle_epi32(sym_even_filter, 0x44);
+  // f0 f1 f2 f3 f1 f0 f3 f2
+  const __m128i tmp1 = _mm_shufflehi_epi16(tmp0, 0xb1);
+  const __m128i filter_8bit = _mm_packs_epi16(tmp1, tmp1);
+
+  // f0 f1 f0 f1 ..
+  coeffs[2] = _mm256_broadcastw_epi16(filter_8bit);
+  // f2 f3 f2 f3 ..
+  coeffs[3] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 2));
+  // f3 f2 f3 f2 ..
+  coeffs[0] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 6));
+  // f1 f0 f1 f0 ..
+  coeffs[1] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 4));
+}
+
+bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride,
+                          int height, int height2, int stride, int start_col) {
+  assert(start_col <= stride);
+  // Vertical 2:1 downscale of columns [start_col, stride) of 'intbuf'.
+  // Fall back to the C version when the SIMD code below cannot be used:
+  // - Odd height: for the GM tool the input layer height or width is assured
+  //   to be even, so 'down2_symodd()' is not invoked and has no SIMD version.
+  // - Even height below 8 (2, 4 or 6): needs separate handling due to padding
+  //   requirements; the C function avoids conditionals in the SIMD code.
+  // - start_col not a multiple of 16: the 16-column loop below starts at
+  //   start_col and steps by 16, so it would run past 'stride - remain_col'.
+  if (height & 1 || height < 8 || (start_col & 15)) {
+    return resize_vert_dir_c(intbuf, output, out_stride, height, height2,
+                             stride, start_col);
+  }
+
+  __m256i s[10], coeffs_y[4];
+  const int bits = FILTER_BITS;
+
+  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
+  const uint8_t max_pixel = 255;
+  const __m256i clip_pixel = _mm256_set1_epi8((char)max_pixel);
+  const __m256i zero = _mm256_setzero_si256();
+
+  prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_y);
+
+  const int num_col16 = stride / 16;
+  int remain_col = stride % 16;
+  // The core vertical SIMD processes 4 input rows simultaneously to generate
+  // output corresponding to 2 rows. To streamline the core loop and eliminate
+  // the need for conditional checks, the remaining rows (4 or 6) are processed
+  // separately.
+  const int remain_row = (height % 4 == 0) ? 4 : 6;
+
+  for (int j = start_col; j < stride - remain_col; j += 16) {
+    const uint8_t *data = &intbuf[j];
+    const __m128i l3 = _mm_loadu_si128((__m128i *)(data + 0 * stride));
+    // Padding top 3 rows with the last available row at the top.
+    const __m128i l0 = l3;
+    const __m128i l1 = l3;
+    const __m128i l2 = l3;
+    const __m128i l4 = _mm_loadu_si128((__m128i *)(data + 1 * stride));
+
+    __m128i l6, l7, l8, l9;
+    __m128i l5 = _mm_loadu_si128((__m128i *)(data + 2 * stride));
+    __m128i l10 = _mm_loadu_si128((__m128i *)(data + 3 * stride));
+    __m128i l11 = _mm_loadu_si128((__m128i *)(data + 4 * stride));
+
+    // a0...a15 | c0...c15
+    const __m256i s02 =
+        _mm256_permute2x128_si256(CAST_HI(l0), CAST_HI(l2), 0x20);
+    // b0...b15 | d0...d15
+    const __m256i s13 =
+        _mm256_permute2x128_si256(CAST_HI(l1), CAST_HI(l3), 0x20);
+    // c0...c15 | e0...e15
+    const __m256i s24 =
+        _mm256_permute2x128_si256(CAST_HI(l2), CAST_HI(l4), 0x20);
+    // d0...d15 | f0...f15
+    const __m256i s35 =
+        _mm256_permute2x128_si256(CAST_HI(l3), CAST_HI(l5), 0x20);
+    // e0...e15 | g0...g15
+    const __m256i s46 =
+        _mm256_permute2x128_si256(CAST_HI(l4), CAST_HI(l10), 0x20);
+    // f0...f15 | h0...h15
+    const __m256i s57 =
+        _mm256_permute2x128_si256(CAST_HI(l5), CAST_HI(l11), 0x20);
+
+    // a0b0...a7b7 | c0d0...c7d7
+    s[0] = _mm256_unpacklo_epi8(s02, s13);
+    // c0d0...c7d7 | e0f0...e7f7
+    s[1] = _mm256_unpacklo_epi8(s24, s35);
+    // e0f0...e7f7 | g0h0...g7h7
+    s[2] = _mm256_unpacklo_epi8(s46, s57);
+
+    // a8b8...a15b15 | c8d8...c15d15
+    s[5] = _mm256_unpackhi_epi8(s02, s13);
+    // c8d8...c15d15 | e8f8...e15f15
+    s[6] = _mm256_unpackhi_epi8(s24, s35);
+    // e8f8...e15f15 | g8h8...g15h15
+    s[7] = _mm256_unpackhi_epi8(s46, s57);
+
+    // height to be processed here
+    const int process_ht = height - remain_row;
+    for (int i = 0; i < process_ht; i += 4) {
+      PROCESS_RESIZE_Y_WD16
+
+      _mm_storeu_si128((__m128i *)&output[(i / 2) * out_stride + j],
+                       CAST_LOW(res_8bit0));
+
+      _mm_storeu_si128(
+          (__m128i *)&output[(i / 2) * out_stride + j + out_stride],
+          _mm256_extracti128_si256(res_8bit0, 1));
+
+      // Load the required data for processing of next 4 input rows.
+      const int idx7 = AOMMIN(height - 1, i + 7);
+      const int idx8 = AOMMIN(height - 1, i + 8);
+      l10 = _mm_loadu_si128((__m128i *)(data + idx7 * stride));
+      l11 = _mm_loadu_si128((__m128i *)(data + idx8 * stride));
+
+      const __m256i s810 =
+          _mm256_permute2x128_si256(CAST_HI(l8), CAST_HI(l10), 0x20);
+      const __m256i s911 =
+          _mm256_permute2x128_si256(CAST_HI(l9), CAST_HI(l11), 0x20);
+      // i0j0... i7j7 | k0l0... k7l7
+      s[4] = _mm256_unpacklo_epi8(s810, s911);
+      // i8j8... i15j15 | k8l8... k15l15
+      s[9] = _mm256_unpackhi_epi8(s810, s911);
+
+      s[0] = s[2];
+      s[1] = s[3];
+      s[2] = s[4];
+
+      s[5] = s[7];
+      s[6] = s[8];
+      s[7] = s[9];
+    }
+
+    // Process the remaining last 4 or 6 rows here.
+    int i = process_ht;
+    while (i < height - 1) {
+      PROCESS_RESIZE_Y_WD16
+
+      _mm_storeu_si128((__m128i *)&output[(i / 2) * out_stride + j],
+                       CAST_LOW(res_8bit0));
+      i += 2;
+
+      const int is_store_valid = (i < height - 1);
+      if (is_store_valid)
+        _mm_storeu_si128((__m128i *)&output[(i / 2) * out_stride + j],
+                         _mm256_extracti128_si256(res_8bit0, 1));
+      i += 2;
+
+      // Check if there is any remaining height to process. If so, perform the
+      // necessary data loading for processing the next row.
+      if (i < height - 1) {
+        l10 = l11 = l9;
+        const __m256i s810 =
+            _mm256_permute2x128_si256(CAST_HI(l8), CAST_HI(l10), 0x20);
+        const __m256i s911 =
+            _mm256_permute2x128_si256(CAST_HI(l9), CAST_HI(l11), 0x20);
+        // i0j0... i7j7 | k0l0... k7l7
+        s[4] = _mm256_unpacklo_epi8(s810, s911);
+        // i8j8... i15j15 | k8l8... k15l15
+        s[9] = _mm256_unpackhi_epi8(s810, s911);
+
+        s[0] = s[2];
+        s[1] = s[3];
+        s[2] = s[4];
+
+        s[5] = s[7];
+        s[6] = s[8];
+        s[7] = s[9];
+      }
+    }
+  }
+
+  if (remain_col > 7) {
+    const int processed_wd = num_col16 * 16;
+    remain_col = stride % 8;
+
+    const uint8_t *data = &intbuf[processed_wd];
+
+    const __m128i l3 = _mm_loadl_epi64((__m128i *)(data + 0 * stride));
+    // Padding top 3 rows with available top-most row.
+    const __m128i l0 = l3;
+    const __m128i l1 = l3;
+    const __m128i l2 = l3;
+    const __m128i l4 = _mm_loadl_epi64((__m128i *)(data + 1 * stride));
+
+    __m128i l6, l7, l8, l9;
+    __m128i l5 = _mm_loadl_epi64((__m128i *)(data + 2 * stride));
+    __m128i l10 = _mm_loadl_epi64((__m128i *)(data + 3 * stride));
+    __m128i l11 = _mm_loadl_epi64((__m128i *)(data + 4 * stride));
+
+    // a0b0...a7b7
+    const __m128i s01 = _mm_unpacklo_epi8(l0, l1);
+    // c0d0...c7d7
+    const __m128i s23 = _mm_unpacklo_epi8(l2, l3);
+    // e0f0...e7f7
+    const __m128i s45 = _mm_unpacklo_epi8(l4, l5);
+    // g0h0...g7h7
+    __m128i s67 = _mm_unpacklo_epi8(l10, l11);
+
+    // a0b0...a7b7 | c0d0...c7d7
+    s[0] = _mm256_permute2x128_si256(CAST_HI(s01), CAST_HI(s23), 0x20);
+    // c0d0...c7d7 | e0f0...e7f7
+    s[1] = _mm256_permute2x128_si256(CAST_HI(s23), CAST_HI(s45), 0x20);
+    // e0f0...e7f7 | g0h0...g7h7
+    s[2] = _mm256_permute2x128_si256(CAST_HI(s45), CAST_HI(s67), 0x20);
+
+    // height to be processed here
+    const int process_ht = height - remain_row;
+    for (int i = 0; i < process_ht; i += 4) {
+      PROCESS_RESIZE_Y_WD8
+
+      _mm_storel_epi64((__m128i *)&output[(i / 2) * out_stride + processed_wd],
+                       CAST_LOW(res_a_round_1));
+
+      _mm_storel_epi64(
+          (__m128i *)&output[(i / 2) * out_stride + processed_wd + out_stride],
+          _mm256_extracti128_si256(res_a_round_1, 1));
+
+      const int idx7 = AOMMIN(height - 1, i + 7);
+      const int idx8 = AOMMIN(height - 1, i + 8);
+      l10 = _mm_loadl_epi64((__m128i *)(data + idx7 * stride));
+      l11 = _mm_loadl_epi64((__m128i *)(data + idx8 * stride));
+
+      // k0l0... k7l7
+      const __m128i s10s11 = _mm_unpacklo_epi8(l10, l11);
+      // i0j0... i7j7 | k0l0... k7l7
+      s[4] = _mm256_permute2x128_si256(CAST_HI(s89), CAST_HI(s10s11), 0x20);
+
+      s[0] = s[2];
+      s[1] = s[3];
+      s[2] = s[4];
+    }
+
+    // Process the remaining last 4 or 6 rows here.
+    int i = process_ht;
+    while (i < height - 1) {
+      PROCESS_RESIZE_Y_WD8
+
+      _mm_storel_epi64((__m128i *)&output[(i / 2) * out_stride + processed_wd],
+                       CAST_LOW(res_a_round_1));
+
+      i += 2;
+
+      const int is_store_valid = (i < height - 1);
+      if (is_store_valid)
+        _mm_storel_epi64(
+            (__m128i *)&output[(i / 2) * out_stride + processed_wd],
+            _mm256_extracti128_si256(res_a_round_1, 1));
+      i += 2;
+
+      // Check rows are still remaining for processing. If yes do the required
+      // load of data for the next iteration.
+      if (i < height - 1) {
+        l10 = l11 = l9;
+        // k0l0... k7l7
+        const __m128i s10s11 = _mm_unpacklo_epi8(l10, l11);
+        // i0j0... i7j7 | k0l0... k7l7
+        s[4] = _mm256_permute2x128_si256(CAST_HI(s89), CAST_HI(s10s11), 0x20);
+
+        s[0] = s[2];
+        s[1] = s[3];
+        s[2] = s[4];
+      }
+    }
+  }
+
+  if (remain_col)
+    return resize_vert_dir_c(intbuf, output, out_stride, height, height2,
+                             stride, stride - remain_col);
+
+  return true;
+}
diff --git a/test/frame_resize_test.cc b/test/frame_resize_test.cc
new file mode 100644
index 000000000..889130419
--- /dev/null
+++ b/test/frame_resize_test.cc
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/av1_rtcd.h"
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/bitops.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+using ::testing::Combine;
+using ::testing::Values;
+using ::testing::ValuesIn;
+
+using std::make_tuple;
+using std::tuple;
+
+const int kIters = 1000;
+
+typedef tuple<int, int> FrameDimension;
+
+// Resolutions (width x height) to be tested for resizing.
+const FrameDimension kFrameDim[] = {
+  make_tuple(3840, 2160), make_tuple(2560, 1440), make_tuple(1920, 1080),
+  make_tuple(1280, 720),  make_tuple(640, 480),   make_tuple(640, 360),
+  make_tuple(256, 256),
+};
+
+// Asserts that two tightly packed (stride == width) 8-bit buffers are equal.
+void AssertOutputBufferEq(const uint8_t *p1, const uint8_t *p2, int width,
+                          int height) {
+  ASSERT_TRUE(p1 != p2) << "Buffers must be at different memory locations";
+  for (int j = 0; j < height; ++j) {
+    if (memcmp(p1, p2, sizeof(*p1) * width) == 0) {
+      p1 += width;
+      p2 += width;
+      continue;
+    }
+    for (int i = 0; i < width; ++i) {
+      ASSERT_EQ(p1[i], p2[i])
+          << width << "x" << height << " Pixel mismatch at (" << i << ", " << j
+          << ")";
+    }
+  }
+}
+
+typedef bool (*LowBDResizeFunc)(uint8_t *intbuf, uint8_t *output,
+                                int out_stride, int height, int height2,
+                                int stride, int start_wd);
+// Test parameter list:
+//  <tst_fun, dims>
+typedef tuple<LowBDResizeFunc, FrameDimension> ResizeTestParams;
+
+// Checks a SIMD resize_vert_dir() implementation against the C reference.
+class AV1ResizeYTest : public ::testing::TestWithParam<ResizeTestParams> {
+ public:
+  void SetUp() override {
+    test_fun_ = GET_PARAM(0);
+    frame_dim_ = GET_PARAM(1);
+    width_ = std::get<0>(frame_dim_);
+    height_ = std::get<1>(frame_dim_);
+    const int msb = get_msb(AOMMIN(width_, height_));
+    n_levels_ = AOMMAX(msb - MIN_PYRAMID_SIZE_LOG2, 1);
+    // src_: (width / 2) x height input; *_dest_: (width / 2) x (height / 2).
+    src_ = (uint8_t *)aom_malloc((width_ / 2) * height_ * sizeof(*src_));
+    ref_dest_ =
+        (uint8_t *)aom_calloc((width_ * height_) / 4, sizeof(*ref_dest_));
+    test_dest_ =
+        (uint8_t *)aom_calloc((width_ * height_) / 4, sizeof(*test_dest_));
+    ASSERT_NE(src_, nullptr);
+    ASSERT_NE(ref_dest_, nullptr);
+    ASSERT_NE(test_dest_, nullptr);
+  }
+
+  void RunTest() {
+    for (int i = 0; i < (width_ / 2) * height_; i++) src_[i] = rng_.Rand8();
+    for (int level = 1; level < n_levels_; level++) {
+      const int width2 = (width_ >> level);
+      const int height2 = (height_ >> level);
+      resize_vert_dir_c(src_, ref_dest_, width2, height2 << 1, height2, width2,
+                        0);
+      test_fun_(src_, test_dest_, width2, height2 << 1, height2, width2, 0);
+
+      AssertOutputBufferEq(ref_dest_, test_dest_, width2, height2);
+    }
+  }
+
+  void SpeedTest() {
+    for (int i = 0; i < (width_ / 2) * height_; i++) src_[i] = rng_.Rand8();
+    for (int level = 1; level < n_levels_; level++) {
+      const int width2 = (width_ >> level);
+      const int height2 = (height_ >> level);
+      aom_usec_timer ref_timer;
+      aom_usec_timer_start(&ref_timer);
+      for (int j = 0; j < kIters; j++) {
+        resize_vert_dir_c(src_, ref_dest_, width2, height2 << 1, height2,
+                          width2, 0);
+      }
+      aom_usec_timer_mark(&ref_timer);
+      const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
+
+      aom_usec_timer tst_timer;
+      aom_usec_timer_start(&tst_timer);
+      for (int j = 0; j < kIters; j++) {
+        test_fun_(src_, test_dest_, width2, height2 << 1, height2, width2, 0);
+      }
+      aom_usec_timer_mark(&tst_timer);
+      const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
+
+      std::cout << "level: " << level << " [" << width2 << " x " << height2
+                << "] C time = " << ref_time << " , SIMD time = " << tst_time
+                << " scaling=" << float(1.00) * ref_time / tst_time << "x \n";
+    }
+  }
+
+  void TearDown() override {
+    aom_free(src_);
+    aom_free(ref_dest_);
+    aom_free(test_dest_);
+  }
+
+ private:
+  LowBDResizeFunc test_fun_;
+  FrameDimension frame_dim_;
+  int width_;
+  int height_;
+  int n_levels_;
+  uint8_t *src_;
+  uint8_t *ref_dest_;
+  uint8_t *test_dest_;
+  libaom_test::ACMRandom rng_;
+};
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1ResizeYTest);
+
+TEST_P(AV1ResizeYTest, RunTest) { RunTest(); }
+
+TEST_P(AV1ResizeYTest, DISABLED_SpeedTest) { SpeedTest(); }
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, AV1ResizeYTest,
+    ::testing::Combine(::testing::Values(resize_vert_dir_avx2),
+                       ::testing::ValuesIn(kFrameDim)));
+#endif
+
+}  // namespace
diff --git a/test/test.cmake b/test/test.cmake
index e2f5da570..2631c9fb3 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -209,6 +209,7 @@ if(NOT BUILD_SHARED_LIBS)
               "${AOM_ROOT}/test/fdct4x4_test.cc"
               "${AOM_ROOT}/test/fft_test.cc"
               "${AOM_ROOT}/test/firstpass_test.cc"
+              "${AOM_ROOT}/test/frame_resize_test.cc"
               "${AOM_ROOT}/test/fwht4x4_test.cc"
               "${AOM_ROOT}/test/hadamard_test.cc"
               "${AOM_ROOT}/test/horver_correlation_test.cc"

(Patch may be truncated, please check the link at the top of this post.)