aom: disflow_avx2.c: Make compatible with gcc <= 9

From 874b7ca3adae0c408b06ce3982e8c4ad432bdd98 Mon Sep 17 00:00:00 2001
From: Rachel Barker <[EMAIL REDACTED]>
Date: Tue, 2 Apr 2024 17:43:11 +0000
Subject: [PATCH] disflow_avx2.c: Make compatible with gcc <= 9

Per the linked bug report, the _mm256_loadu2_m128i() intrinsic
was only added in gcc 10. Therefore, for compatibility with gcc 9
and earlier, we must instead use our own implementation of this
intrinsic, which we call yy_loadu2_128().

Bug: aomedia:3550
Change-Id: I8a4220acaaddeb6dcdd8fd918cd386c432a56bfc
---
 aom_dsp/flow_estimation/x86/disflow_avx2.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/aom_dsp/flow_estimation/x86/disflow_avx2.c b/aom_dsp/flow_estimation/x86/disflow_avx2.c
index e210042d6..ad5a1bd7c 100644
--- a/aom_dsp/flow_estimation/x86/disflow_avx2.c
+++ b/aom_dsp/flow_estimation/x86/disflow_avx2.c
@@ -145,7 +145,7 @@ static INLINE void compute_flow_vector(const uint8_t *src, const uint8_t *ref,
     // for a total of 11 pixels. Here we load 16 pixels, but only use
     // the first 11.
     __m256i row =
-        _mm256_loadu2_m128i((__m128i *)(ref_row + stride), (__m128i *)ref_row);
+        yy_loadu2_128((__m128i *)(ref_row + stride), (__m128i *)ref_row);
 
     // Expand pixels to int16s
     // We must use unpacks here, as we have one row in each 128-bit lane
@@ -273,8 +273,8 @@ static INLINE void sobel_filter(const uint8_t *src, int src_stride, int16_t *dx,
 
   // Loop setup: Load the first two rows (of 10 input rows) and apply
   // the horizontal parts of the two filters
-  __m256i row_m1_0 = _mm256_loadu2_m128i((__m128i *)(src - 1),
-                                         (__m128i *)(src - src_stride - 1));
+  __m256i row_m1_0 =
+      yy_loadu2_128((__m128i *)(src - 1), (__m128i *)(src - src_stride - 1));
   __m256i row_m1_0_a = _mm256_unpacklo_epi8(row_m1_0, zero);
   __m256i row_m1_0_b =
       _mm256_unpacklo_epi8(_mm256_srli_si256(row_m1_0, 1), zero);
@@ -293,8 +293,8 @@ static INLINE void sobel_filter(const uint8_t *src, int src_stride, int16_t *dx,
   for (int i = 0; i < DISFLOW_PATCH_SIZE; i += 2) {
     // Load rows (i+1, i+2) and apply both horizontal filters
     const __m256i row_p1_p2 =
-        _mm256_loadu2_m128i((__m128i *)(src + (i + 2) * src_stride - 1),
-                            (__m128i *)(src + (i + 1) * src_stride - 1));
+        yy_loadu2_128((__m128i *)(src + (i + 2) * src_stride - 1),
+                      (__m128i *)(src + (i + 1) * src_stride - 1));
     const __m256i row_p1_p2_a = _mm256_unpacklo_epi8(row_p1_p2, zero);
     const __m256i row_p1_p2_b =
         _mm256_unpacklo_epi8(_mm256_srli_si256(row_p1_p2, 1), zero);