aom: convolve_sse2: use xx_loadl_32() for unaligned int loads

From 24bcf570a609471585ca3cb989076b9ef221f461 Mon Sep 17 00:00:00 2001
From: James Zern <[EMAIL REDACTED]>
Date: Thu, 9 May 2024 12:19:28 -0700
Subject: [PATCH] convolve_sse2: use xx_loadl_32() for unaligned int loads

This quiets some undefined behavior sanitizer (UBSan) warnings related
to unaligned loads; there are no major changes in the generated assembly
with gcc-13 (only some register changes and instruction reordering).

Change-Id: I2e8ac7f40caec56f204440a39116745e2a9a1fe2
---
 av1/common/x86/convolve_sse2.c | 39 ++++++++++++++--------------------
 1 file changed, 16 insertions(+), 23 deletions(-)

diff --git a/av1/common/x86/convolve_sse2.c b/av1/common/x86/convolve_sse2.c
index 6383567a4..4787d3f1d 100644
--- a/av1/common/x86/convolve_sse2.c
+++ b/av1/common/x86/convolve_sse2.c
@@ -16,6 +16,7 @@
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
 #include "aom_dsp/x86/convolve_common_intrin.h"
+#include "aom_dsp/x86/synonyms.h"
 #include "av1/common/convolve.h"
 
 static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
@@ -200,31 +201,23 @@ void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
     if (w <= 4) {
       __m128i s[8], src6, res, res_round, res16;
       int res_int;
-      src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 6 * src_stride));
-      s[0] = _mm_unpacklo_epi8(
-          _mm_cvtsi32_si128(*(int *)(src_ptr + 0 * src_stride)),
-          _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)));
-      s[1] = _mm_unpacklo_epi8(
-          _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)),
-          _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)));
-      s[2] = _mm_unpacklo_epi8(
-          _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)),
-          _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)));
-      s[3] = _mm_unpacklo_epi8(
-          _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)),
-          _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)));
-      s[4] = _mm_unpacklo_epi8(
-          _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)),
-          _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)));
-      s[5] = _mm_unpacklo_epi8(
-          _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)), src6);
+      src6 = xx_loadl_32(src_ptr + 6 * src_stride);
+      s[0] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 0 * src_stride),
+                               xx_loadl_32(src_ptr + 1 * src_stride));
+      s[1] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 1 * src_stride),
+                               xx_loadl_32(src_ptr + 2 * src_stride));
+      s[2] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 2 * src_stride),
+                               xx_loadl_32(src_ptr + 3 * src_stride));
+      s[3] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 3 * src_stride),
+                               xx_loadl_32(src_ptr + 4 * src_stride));
+      s[4] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 4 * src_stride),
+                               xx_loadl_32(src_ptr + 5 * src_stride));
+      s[5] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 5 * src_stride), src6);
 
       do {
-        s[6] = _mm_unpacklo_epi8(
-            src6, _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)));
-        src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 8 * src_stride));
-        s[7] = _mm_unpacklo_epi8(
-            _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)), src6);
+        s[6] = _mm_unpacklo_epi8(src6, xx_loadl_32(src_ptr + 7 * src_stride));
+        src6 = xx_loadl_32(src_ptr + 8 * src_stride);
+        s[7] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 7 * src_stride), src6);
 
         res = convolve_lo_y(s + 0, coeffs);
         res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);