aom: avx2: use yy_loadu2_128 from synonyms_avx2.h

From e592429eef9acaaeca99978d69836a87fb6ebaf8 Mon Sep 17 00:00:00 2001
From: James Zern <[EMAIL REDACTED]>
Date: Thu, 20 Jun 2024 12:46:23 -0700
Subject: [PATCH] avx2: use yy_loadu2_128 from synonyms_avx2.h

This removes identical or nearly identical local implementations of the
two-row 128-bit load helper. It's unclear whether the use of
_mm_lddqu_si128() in masked_sad_intrin_avx2.c was intentional, but for
now we'll normalize on _mm_loadu_si128(); any benefit from the lddqu
variant should be minor.
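
For reference, a minimal sketch of the shared helper, assuming the
version in synonyms_avx2.h matches the copies removed below (same
(hi, lo) argument order and lane placement):

  // Requires <immintrin.h>; INLINE as defined by aom_ports.
  static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) {
    // Unaligned-load two 128-bit rows and pack them into one 256-bit
    // register: lo in lane 0, hi in lane 1.
    // Assumption: mirrors the removed xx_loadu2_mi128()/xx_loadu2_m128i();
    // the actual header definition may differ in detail.
    __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)lo));
    return _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)hi), 1);
  }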

Bug: b:300649160
Change-Id: I535b4b74855579edb647f5469a815968e00e1b28
---
 aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c | 35 +++++++++--------------
 aom_dsp/x86/avg_intrin_avx2.c             | 23 ++++++---------
 aom_dsp/x86/masked_sad_intrin_avx2.c      | 22 ++++++--------
 3 files changed, 30 insertions(+), 50 deletions(-)

diff --git a/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
index 1f382d110b..0c4c537a50 100644
--- a/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
+++ b/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
@@ -15,6 +15,7 @@
 
 #include "aom_dsp/x86/convolve.h"
 #include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
 #include "aom_ports/mem.h"
 
 #if defined(__clang__)
@@ -61,12 +62,6 @@ static INLINE void xx_storeu2_epi64(const uint8_t *output_ptr,
                    _mm256_extractf128_si256(*a, 1));
 }
 
-static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) {
-  __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo)));
-  a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1);
-  return a;
-}
-
 static INLINE void xx_store2_mi128(const uint8_t *output_ptr,
                                    const ptrdiff_t stride, const __m256i *a) {
   _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(*a));
@@ -100,7 +95,7 @@ static void aom_filter_block1d4_h4_avx2(
   dst_stride = output_pitch << 1;
   for (i = output_height; i > 1; i -= 2) {
     // load the 2 strides of source
-    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+    srcReg32b1 = yy_loadu2_128(src_ptr + src_pixels_per_line, src_ptr);
 
     // filter the source buffer
     srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
@@ -188,7 +183,7 @@ static void aom_filter_block1d4_h8_avx2(
   dst_stride = output_pitch << 1;
   for (i = output_height; i > 1; i -= 2) {
     // load the 2 strides of source
-    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+    srcReg32b1 = yy_loadu2_128(src_ptr + src_pixels_per_line, src_ptr);
 
     // filter the source buffer
     srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
@@ -295,7 +290,7 @@ static void aom_filter_block1d8_h4_avx2(
   dst_stride = output_pitch << 1;
   for (i = output_height; i > 1; i -= 2) {
     // load the 2 strides of source
-    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+    srcReg32b1 = yy_loadu2_128(src_ptr + src_pixels_per_line, src_ptr);
 
     // filter the source buffer
     srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
@@ -397,7 +392,7 @@ static void aom_filter_block1d8_h8_avx2(
   dst_stride = output_pitch << 1;
   for (i = output_height; i > 1; i -= 2) {
     // load the 2 strides of source
-    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+    srcReg32b1 = yy_loadu2_128(src_ptr + src_pixels_per_line, src_ptr);
 
     // filter the source buffer
     srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
@@ -521,7 +516,7 @@ static void aom_filter_block1d16_h4_avx2(
   dst_stride = output_pitch << 1;
   for (i = output_height; i > 1; i -= 2) {
     // load the 2 strides of source
-    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+    srcReg32b1 = yy_loadu2_128(src_ptr + src_pixels_per_line, src_ptr);
 
     // filter the source buffer
     srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
@@ -535,8 +530,7 @@ static void aom_filter_block1d16_h4_avx2(
 
     // reading 2 strides of the next 16 bytes
     // (part of it was being read by earlier read)
-    srcReg32b2 =
-        xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);
+    srcReg32b2 = yy_loadu2_128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);
 
     // filter the source buffer
     srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
@@ -644,7 +638,7 @@ static void aom_filter_block1d16_h8_avx2(
   dst_stride = output_pitch << 1;
   for (i = output_height; i > 1; i -= 2) {
     // load the 2 strides of source
-    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+    srcReg32b1 = yy_loadu2_128(src_ptr + src_pixels_per_line, src_ptr);
 
     // filter the source buffer
     srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
@@ -670,8 +664,7 @@ static void aom_filter_block1d16_h8_avx2(
 
     // reading 2 strides of the next 16 bytes
     // (part of it was being read by earlier read)
-    srcReg32b2 =
-        xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);
+    srcReg32b2 = yy_loadu2_128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);
 
     // filter the source buffer
     srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
@@ -1068,7 +1061,7 @@ static void aom_filter_block1d16_v4_avx2(
   src_stride = src_pitch << 1;
   dst_stride = out_pitch << 1;
 
-  srcReg23 = xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+  srcReg23 = yy_loadu2_128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
   srcReg4x = _mm256_castsi128_si256(
       _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
 
@@ -1172,11 +1165,9 @@ static void aom_filter_block1d16_v8_avx2(
   dst_stride = out_pitch << 1;
 
   // load 16 bytes 7 times in stride of src_pitch
-  srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pitch, src_ptr);
-  srcReg32b3 =
-      xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
-  srcReg32b5 =
-      xx_loadu2_mi128(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
+  srcReg32b1 = yy_loadu2_128(src_ptr + src_pitch, src_ptr);
+  srcReg32b3 = yy_loadu2_128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+  srcReg32b5 = yy_loadu2_128(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
   srcReg32b7 = _mm256_castsi128_si256(
       _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
 
diff --git a/aom_dsp/x86/avg_intrin_avx2.c b/aom_dsp/x86/avg_intrin_avx2.c
index 6e943b84b3..ba6de96d24 100644
--- a/aom_dsp/x86/avg_intrin_avx2.c
+++ b/aom_dsp/x86/avg_intrin_avx2.c
@@ -14,6 +14,7 @@
 #include "config/aom_dsp_rtcd.h"
 #include "aom/aom_integer.h"
 #include "aom_dsp/x86/bitdepth_conversion_avx2.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
 #include "aom_ports/mem.h"
 
 static INLINE void sign_extend_16bit_to_32bit_avx2(__m256i in, __m256i zero,
@@ -542,28 +543,22 @@ int aom_satd_lp_avx2(const int16_t *coeff, int length) {
   }
 }
 
-static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) {
-  __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo)));
-  a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1);
-  return a;
-}
-
 void aom_avg_8x8_quad_avx2(const uint8_t *s, int p, int x16_idx, int y16_idx,
                            int *avg) {
   const uint8_t *s_y0 = s + y16_idx * p + x16_idx;
   const uint8_t *s_y1 = s_y0 + 8 * p;
   __m256i sum0, sum1, s0, s1, s2, s3, u0;
   u0 = _mm256_setzero_si256();
-  s0 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1, s_y0), u0);
-  s1 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + p, s_y0 + p), u0);
-  s2 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 2 * p, s_y0 + 2 * p), u0);
-  s3 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 3 * p, s_y0 + 3 * p), u0);
+  s0 = _mm256_sad_epu8(yy_loadu2_128(s_y1, s_y0), u0);
+  s1 = _mm256_sad_epu8(yy_loadu2_128(s_y1 + p, s_y0 + p), u0);
+  s2 = _mm256_sad_epu8(yy_loadu2_128(s_y1 + 2 * p, s_y0 + 2 * p), u0);
+  s3 = _mm256_sad_epu8(yy_loadu2_128(s_y1 + 3 * p, s_y0 + 3 * p), u0);
   sum0 = _mm256_add_epi16(s0, s1);
   sum1 = _mm256_add_epi16(s2, s3);
-  s0 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 4 * p, s_y0 + 4 * p), u0);
-  s1 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 5 * p, s_y0 + 5 * p), u0);
-  s2 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 6 * p, s_y0 + 6 * p), u0);
-  s3 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 7 * p, s_y0 + 7 * p), u0);
+  s0 = _mm256_sad_epu8(yy_loadu2_128(s_y1 + 4 * p, s_y0 + 4 * p), u0);
+  s1 = _mm256_sad_epu8(yy_loadu2_128(s_y1 + 5 * p, s_y0 + 5 * p), u0);
+  s2 = _mm256_sad_epu8(yy_loadu2_128(s_y1 + 6 * p, s_y0 + 6 * p), u0);
+  s3 = _mm256_sad_epu8(yy_loadu2_128(s_y1 + 7 * p, s_y0 + 7 * p), u0);
   sum0 = _mm256_add_epi16(sum0, _mm256_add_epi16(s0, s1));
   sum1 = _mm256_add_epi16(sum1, _mm256_add_epi16(s2, s3));
   sum0 = _mm256_add_epi16(sum0, sum1);
diff --git a/aom_dsp/x86/masked_sad_intrin_avx2.c b/aom_dsp/x86/masked_sad_intrin_avx2.c
index a8097bf4a1..d157d7d625 100644
--- a/aom_dsp/x86/masked_sad_intrin_avx2.c
+++ b/aom_dsp/x86/masked_sad_intrin_avx2.c
@@ -17,6 +17,7 @@
 #include "aom_dsp/blend.h"
 #include "aom/aom_integer.h"
 #include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
 #include "aom_dsp/x86/masked_sad_intrin_ssse3.h"
 
 static INLINE unsigned int masked_sad32xh_avx2(
@@ -67,13 +68,6 @@ static INLINE unsigned int masked_sad32xh_avx2(
   return sad;
 }
 
-static INLINE __m256i xx_loadu2_m128i(const void *hi, const void *lo) {
-  __m128i a0 = _mm_lddqu_si128((const __m128i *)(lo));
-  __m128i a1 = _mm_lddqu_si128((const __m128i *)(hi));
-  __m256i a = _mm256_castsi128_si256(a0);
-  return _mm256_inserti128_si256(a, a1, 1);
-}
-
 static INLINE unsigned int masked_sad16xh_avx2(
     const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
     const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
@@ -84,10 +78,10 @@ static INLINE unsigned int masked_sad16xh_avx2(
   const __m256i round_scale =
       _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
   for (y = 0; y < height; y += 2) {
-    const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr);
-    const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr);
-    const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr);
-    const __m256i m = xx_loadu2_m128i(m_ptr + m_stride, m_ptr);
+    const __m256i src = yy_loadu2_128(src_ptr + src_stride, src_ptr);
+    const __m256i a = yy_loadu2_128(a_ptr + a_stride, a_ptr);
+    const __m256i b = yy_loadu2_128(b_ptr + b_stride, b_ptr);
+    const __m256i m = yy_loadu2_128(m_ptr + m_stride, m_ptr);
     const __m256i m_inv = _mm256_sub_epi8(mask_max, m);
 
     // Calculate 16 predicted pixels.
@@ -217,9 +211,9 @@ static INLINE unsigned int highbd_masked_sad8xh_avx2(
   const __m256i one = _mm256_set1_epi16(1);
 
   for (y = 0; y < height; y += 2) {
-    const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr);
-    const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr);
-    const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr);
+    const __m256i src = yy_loadu2_128(src_ptr + src_stride, src_ptr);
+    const __m256i a = yy_loadu2_128(a_ptr + a_stride, a_ptr);
+    const __m256i b = yy_loadu2_128(b_ptr + b_stride, b_ptr);
     // Zero-extend mask to 16 bits
     const __m256i m = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(
         _mm_loadl_epi64((const __m128i *)(m_ptr)),