aom: Add SSE2 for av1_resize_horz_dir()

From 1ed584b2c6d51adba7a730e10a0dd2fb4ce997de Mon Sep 17 00:00:00 2001
From: Samuthirika S <[EMAIL REDACTED]>
Date: Mon, 13 May 2024 17:39:46 +0530
Subject: [PATCH] Add SSE2 for av1_resize_horz_dir()

This CL adds an SSE2 implementation of the av1_resize_horz_dir()
function. A corresponding unit test is also added.

This is a bit-exact change.

Change-Id: Ia2da5221913743f34519951235bbfa36aa8465e4
---
 av1/common/av1_rtcd_defs.pl  |   2 +-
 av1/common/x86/resize_avx2.c |   7 +-
 av1/common/x86/resize_sse2.c | 167 +++++++++++++++++++++++++++++++++++
 test/frame_resize_test.cc    |   7 ++
 4 files changed, 178 insertions(+), 5 deletions(-)

diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index eca260cce..8e24bb9c1 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -558,7 +558,7 @@ ()
 specialize qw/av1_resize_vert_dir sse2 avx2/;
 
 add_proto qw/void av1_resize_horz_dir/, "const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filteredlength, int width2";
-specialize qw/av1_resize_horz_dir avx2/;
+specialize qw/av1_resize_horz_dir sse2 avx2/;
 
 add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
 specialize qw/av1_warp_affine sse4_1 avx2 neon neon_i8mm sve/;
diff --git a/av1/common/x86/resize_avx2.c b/av1/common/x86/resize_avx2.c
index 38bbc2626..7c36fca8a 100644
--- a/av1/common/x86/resize_avx2.c
+++ b/av1/common/x86/resize_avx2.c
@@ -530,11 +530,10 @@ void av1_resize_horz_dir_avx2(const uint8_t *const input, int in_stride,
                               uint8_t *intbuf, int height, int filtered_length,
                               int width2) {
   assert(height % 2 == 0);
-  // Currently, Invoking C function for width less than 32. Optimize the below,
-  // by invoking SSE2 once the implementation for the same is available.
+  // Invoke SSE2 for width less than 32.
   if (filtered_length < 32) {
-    av1_resize_horz_dir_c(input, in_stride, intbuf, height, filtered_length,
-                          width2);
+    av1_resize_horz_dir_sse2(input, in_stride, intbuf, height, filtered_length,
+                             width2);
     return;
   }
 
diff --git a/av1/common/x86/resize_sse2.c b/av1/common/x86/resize_sse2.c
index f0470a3f3..1afc96221 100644
--- a/av1/common/x86/resize_sse2.c
+++ b/av1/common/x86/resize_sse2.c
@@ -164,3 +164,170 @@ bool av1_resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride,
 
   return true;
 }
+
+// Blends a and b using mask and returns the result.
+static INLINE __m128i blend(__m128i a, __m128i b, __m128i mask) {
+  const __m128i masked_b = _mm_and_si128(mask, b);
+  const __m128i masked_a = _mm_andnot_si128(mask, a);
+  return (_mm_or_si128(masked_a, masked_b));
+}
+
+// Masks used for width 16 pixels, with left and right padding
+// requirements.
+static const uint8_t left_padding_mask[16] = {
+  255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+static const uint8_t right_padding_mask[16] = { 0,   0,   0,   0,  0,   0,
+                                                0,   0,   0,   0,  255, 255,
+                                                255, 255, 255, 255 };
+
+static const uint8_t mask_16[16] = {
+  255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0,
+};
+
+void av1_resize_horz_dir_sse2(const uint8_t *const input, int in_stride,
+                              uint8_t *intbuf, int height, int filtered_length,
+                              int width2) {
+  assert(height % 2 == 0);
+  // Invoke C for width less than 16. Must return here: falling through would
+  // re-process the row in the tail loop below (and with filtered_length == 2,
+  // wd_processed would go negative, causing an out-of-bounds write).
+  if (filtered_length < 16) {
+    av1_resize_horz_dir_c(input, in_stride, intbuf, height, filtered_length,
+                          width2);
+    return;
+  }
+
+  __m128i coeffs_x[2];
+  const int bits = FILTER_BITS;
+  const int dst_stride = width2;
+  const int remain_col = filtered_length % 16;
+  const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
+  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+
+  const uint8_t max_pixel = 255;
+  const __m128i clip_pixel = _mm_set1_epi8((char)max_pixel);
+  const __m128i zero = _mm_setzero_si128();
+
+  const __m128i start_pad_mask = _mm_loadu_si128((__m128i *)left_padding_mask);
+  const __m128i end_pad_mask = _mm_loadu_si128((__m128i *)right_padding_mask);
+  const __m128i mask_even = _mm_loadu_si128((__m128i *)mask_16);
+  prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_x);
+
+  for (int i = 0; i < height; ++i) {
+    int filter_offset = 0;
+    for (int j = 0; j <= filtered_length - 16; j += 16) {
+      const int in_idx = i * in_stride + j - filter_offset;
+      const int out_idx = i * dst_stride + j / 2;
+
+      // a0 a1 a2 a3 .... a15
+      __m128i row00 = _mm_loadu_si128((__m128i *)&input[in_idx]);
+      // a8 a9 a10 a11 .... a23
+      __m128i row01 =
+          _mm_loadu_si128((__m128i *)&input[in_idx + 5 + filter_offset]);
+      filter_offset = 3;
+
+      // Pad start pixels to the left, while processing the first pixels in the
+      // row.
+      if (j == 0) {
+        const __m128i start_pixel_row0 =
+            _mm_set1_epi8((char)input[i * in_stride]);
+        row00 =
+            blend(_mm_slli_si128(row00, 3), start_pixel_row0, start_pad_mask);
+      }
+
+      // Pad end pixels to the right, while processing the last pixels in the
+      // row.
+      const int is_last_cols16 = (j == filtered_length - 16);
+      if (is_last_cols16) {
+        const __m128i end_pixel_row0 =
+            _mm_set1_epi8((char)input[i * in_stride + filtered_length - 1]);
+        row01 = blend(row01, end_pixel_row0, end_pad_mask);
+      }
+
+      // a2 a3 a4 a5 a6 a7 a8 a9 .... a17
+      const __m128i row0_1 = _mm_unpacklo_epi64(_mm_srli_si128(row00, 2),
+                                                _mm_srli_si128(row01, 2));
+      // a4 a5 a6 a7 a9 10 a11 a12 .... a19
+      const __m128i row0_2 = _mm_unpacklo_epi64(_mm_srli_si128(row00, 4),
+                                                _mm_srli_si128(row01, 4));
+      // a6 a7 a8 a9 a10 a11 a12 a13 .... a21
+      const __m128i row0_3 = _mm_unpacklo_epi64(_mm_srli_si128(row00, 6),
+                                                _mm_srli_si128(row01, 6));
+
+      // a0 a2 a4 a6 a8 a10 a12 a14 (each 16 bit)
+      const __m128i s0 = _mm_and_si128(row00, mask_even);
+      // a1 a3 a5 a7 a9 a11 a13 a15
+      const __m128i s1 = _mm_and_si128(_mm_srli_epi16(row00, 8), mask_even);
+      // a2 a4 a6 a8 a10 a12 a14 a16
+      const __m128i s2 = _mm_and_si128(row0_1, mask_even);
+      // a3 a5 a7 a9 a11 a13 a15 a17
+      const __m128i s3 = _mm_and_si128(_mm_srli_epi16(row0_1, 8), mask_even);
+      // a4 a6 a8 a10 a12 a14 a16 a18
+      const __m128i s4 = _mm_and_si128(row0_2, mask_even);
+      // a5 a7 a9 a11 a13 a15 a17 a19
+      const __m128i s5 = _mm_and_si128(_mm_srli_epi16(row0_2, 8), mask_even);
+      // a6 a8 a10 a12 a14 a16 a18 a20
+      const __m128i s6 = _mm_and_si128(row0_3, mask_even);
+      // a7 a9 a11 a13 a15 a17 a19 a21
+      const __m128i s7 = _mm_and_si128(_mm_srli_epi16(row0_3, 8), mask_even);
+
+      // a0a7 a2a9 a4a11 .... a12a19 a14a21
+      const __m128i s07 = _mm_add_epi16(s0, s7);
+      // a1a6 a3a8 a5a10 .... a13a18 a15a20
+      const __m128i s16 = _mm_add_epi16(s1, s6);
+      // a2a5 a4a7 a6a9  .... a14a17 a16a19
+      const __m128i s25 = _mm_add_epi16(s2, s5);
+      // a3a4 a5a6 a7a8  .... a15a16 a17a18
+      const __m128i s34 = _mm_add_epi16(s3, s4);
+
+      // a0a7 a1a6 a2a9 a3a8 a4a11 a5a10 a6a13 a7a12
+      const __m128i s1607_low = _mm_unpacklo_epi16(s07, s16);
+      // a2a5 a3a4 a4a7 a5a6 a6a9 a7a8 a8a11 a9a10
+      const __m128i s3425_low = _mm_unpacklo_epi16(s25, s34);
+
+      // a8a15 a9a14 a10a17 a11a16 a12a19 a13a18 a14a21 a15a20
+      const __m128i s1607_high = _mm_unpackhi_epi16(s07, s16);
+      // a10a13 a11a12 a12a15 a13a14 a14a17 a15a16 a16a19 a17a18
+      const __m128i s3425_high = _mm_unpackhi_epi16(s25, s34);
+
+      const __m128i r01_0 = _mm_madd_epi16(s3425_low, coeffs_x[1]);
+      const __m128i r01_1 = _mm_madd_epi16(s1607_low, coeffs_x[0]);
+      const __m128i r01_2 = _mm_madd_epi16(s3425_high, coeffs_x[1]);
+      const __m128i r01_3 = _mm_madd_epi16(s1607_high, coeffs_x[0]);
+
+      // Result of first 8 pixels of row0 (a0 to a7).
+      // r0_0 r0_1 r0_2 r0_3
+      __m128i r00 = _mm_add_epi32(r01_0, r01_1);
+      r00 = _mm_add_epi32(r00, round_const_bits);
+      r00 = _mm_sra_epi32(r00, round_shift_bits);
+
+      // Result of next 8 pixels of row0 (a8 to 15).
+      // r0_4 r0_5 r0_6 r0_7
+      __m128i r01 = _mm_add_epi32(r01_2, r01_3);
+      r01 = _mm_add_epi32(r01, round_const_bits);
+      r01 = _mm_sra_epi32(r01, round_shift_bits);
+
+      // r0_0 r0_1 r1_2 r0_3 r0_4 r0_5 r0_6 r0_7
+      const __m128i res_16 = _mm_packs_epi32(r00, r01);
+      const __m128i res_8 = _mm_packus_epi16(res_16, res_16);
+      __m128i res = _mm_min_epu8(res_8, clip_pixel);
+      res = _mm_max_epu8(res, zero);
+
+      // r0_0 r0_1 r1_2 r0_3 r0_4 r0_5 r0_6 r0_7
+      _mm_storel_epi64((__m128i *)&intbuf[out_idx], res);
+    }
+
+    int wd_processed = filtered_length - remain_col;
+    // When the remaining width is 2, the above code would not have taken
+    // care of padding required for (filtered_length - 4)th pixel. Hence,
+    // process that pixel again with the C code.
+    wd_processed = (remain_col == 2) ? wd_processed - 2 : wd_processed;
+    if (remain_col) {
+      const int in_idx = (in_stride * i);
+      const int out_idx = (wd_processed / 2) + width2 * i;
+
+      down2_symeven(input + in_idx, filtered_length, intbuf + out_idx,
+                    wd_processed);
+    }
+  }
+}
diff --git a/test/frame_resize_test.cc b/test/frame_resize_test.cc
index 7a4da4597..befdd490b 100644
--- a/test/frame_resize_test.cc
+++ b/test/frame_resize_test.cc
@@ -245,6 +245,13 @@ TEST_P(AV1ResizeXTest, RunTest) { RunTest(); }
 
 TEST_P(AV1ResizeXTest, DISABLED_SpeedTest) { SpeedTest(); }
 
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, AV1ResizeXTest,
+    ::testing::Combine(::testing::Values(av1_resize_horz_dir_sse2),
+                       ::testing::ValuesIn(kFrameDim)));
+#endif
+
 #if HAVE_AVX2
 INSTANTIATE_TEST_SUITE_P(
     AVX2, AV1ResizeXTest,