aom: synonyms_avx2.h: add yy_loadu_4x64()

From 38736a8a99432d9f8dbc2580b857d4f43ecc1774 Mon Sep 17 00:00:00 2001
From: James Zern <[EMAIL REDACTED]>
Date: Thu, 20 Jun 2024 16:55:14 -0700
Subject: [PATCH] synonyms_avx2.h: add yy_loadu_4x64()

Use this instead of _mm256_set_epi64x() to load unaligned 64-bit values.
This quiets undefined behavior sanitizer (UBSan) warnings of the form:
aom_dsp/x86/blend_a64_mask_avx2.c:913:40: runtime error: load of
  misaligned address 0x000009e7542c for type 'int64_t' (aka 'long'),
  which requires 8 byte alignment

Bug: b:300649160
Change-Id: I9be724c3461665f1800c599768a7609c0e57af69
---
 aom_dsp/x86/blend_a64_mask_avx2.c | 14 ++++++--------
 aom_dsp/x86/synonyms_avx2.h       | 16 ++++++++++++++++
 2 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/aom_dsp/x86/blend_a64_mask_avx2.c b/aom_dsp/x86/blend_a64_mask_avx2.c
index 638c378b42..2b7fe838d6 100644
--- a/aom_dsp/x86/blend_a64_mask_avx2.c
+++ b/aom_dsp/x86/blend_a64_mask_avx2.c
@@ -910,14 +910,12 @@ static INLINE void highbd_blend_a64_d16_mask_w4_avx2(
     const __m256i *round_offset, int shift, const __m256i *clip_low,
     const __m256i *clip_high, const __m256i *mask_max) {
   // Load 4x u16 pixels from each of 4 rows from each source
-  const __m256i s0 = _mm256_set_epi64x(*(int64_t *)(src0 + 3 * src0_stride),
-                                       *(int64_t *)(src0 + 2 * src0_stride),
-                                       *(int64_t *)(src0 + 1 * src0_stride),
-                                       *(int64_t *)(src0 + 0 * src0_stride));
-  const __m256i s1 = _mm256_set_epi64x(*(int64_t *)(src1 + 3 * src1_stride),
-                                       *(int64_t *)(src1 + 2 * src1_stride),
-                                       *(int64_t *)(src1 + 1 * src1_stride),
-                                       *(int64_t *)(src1 + 0 * src1_stride));
+  const __m256i s0 =
+      yy_loadu_4x64(src0 + 3 * src0_stride, src0 + 2 * src0_stride,
+                    src0 + 1 * src0_stride, src0 + 0 * src0_stride);
+  const __m256i s1 =
+      yy_loadu_4x64(src1 + 3 * src1_stride, src1 + 2 * src1_stride,
+                    src1 + 1 * src1_stride, src1 + 0 * src1_stride);
   // Generate the inverse mask
   const __m256i mask1 = _mm256_sub_epi16(*mask_max, *mask0);
 
diff --git a/aom_dsp/x86/synonyms_avx2.h b/aom_dsp/x86/synonyms_avx2.h
index 2a130ef7f6..53f5028acc 100644
--- a/aom_dsp/x86/synonyms_avx2.h
+++ b/aom_dsp/x86/synonyms_avx2.h
@@ -60,6 +60,22 @@ static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) {
   return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
 }
 
+// This behaves similarly to _mm256_set_epi64x(), but avoids undefined
+// behavior sanitizer warnings when loading values from unaligned buffers
+// using `*(int64_t *)val`.
+static INLINE __m256i yy_loadu_4x64(const void *e3, const void *e2,
+                                    const void *e1, const void *e0) {
+  __m128d v0 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i *)e0));
+  __m128d v01 = _mm_loadh_pd(v0, (const double *)e1);
+  __m128d v2 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i *)e2));
+  __m128d v23 = _mm_loadh_pd(v2, (const double *)e3);
+  // Note this can be replaced with
+  // `_mm256_castpd_si256(_mm256_set_m128d(v23, v01))` if immintrin.h contains
+  // _mm256_set_m128d() with all supported compilers. This version is used to
+  // match the behavior with yy_set_m128i().
+  return yy_set_m128i(_mm_castpd_si128(v23), _mm_castpd_si128(v01));
+}
+
 static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) {
   __m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
   __m128i mlo = _mm_loadu_si128((const __m128i *)(lo));