SDL: audiocvt: stereo-to-mono SSE3 now uses unaligned accesses.

From 8d790b10f8eda4754c534cdae050041cb60d706b Mon Sep 17 00:00:00 2001
From: "Ryan C. Gordon" <[EMAIL REDACTED]>
Date: Tue, 27 Jul 2021 12:23:46 -0400
Subject: [PATCH] audiocvt: stereo-to-mono SSE3 now uses unaligned accesses.

On modern CPUs, there's no penalty for using the unaligned instruction on
aligned memory, but now it can vectorize unaligned data too, which even if
it's not optimal, is still going to be faster than the scalar fallback.

Fixes #4532.
---
 src/audio/SDL_audiocvt.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/src/audio/SDL_audiocvt.c b/src/audio/SDL_audiocvt.c
index 36c631a2fb..23d1906d7b 100644
--- a/src/audio/SDL_audiocvt.c
+++ b/src/audio/SDL_audiocvt.c
@@ -52,6 +52,7 @@
 static void SDLCALL
 SDL_ConvertStereoToMono_SSE3(SDL_AudioCVT * cvt, SDL_AudioFormat format)
 {
+    const __m128 divby2 = _mm_set1_ps(0.5f);
     float *dst = (float *) cvt->buf;
     const float *src = dst;
     int i = cvt->len_cvt / 8;
@@ -59,15 +60,12 @@ SDL_ConvertStereoToMono_SSE3(SDL_AudioCVT * cvt, SDL_AudioFormat format)
     LOG_DEBUG_CONVERT("stereo", "mono (using SSE3)");
     SDL_assert(format == AUDIO_F32SYS);
 
-    /* We can only do this if dst is aligned to 16 bytes; since src is the
-       same pointer and it moves by 2, it can't be forcibly aligned. */
-    if ((((size_t) dst) & 15) == 0) {
-        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
-        const __m128 divby2 = _mm_set1_ps(0.5f);
-        while (i >= 4) {   /* 4 * float32 */
-            _mm_store_ps(dst, _mm_mul_ps(_mm_hadd_ps(_mm_load_ps(src), _mm_load_ps(src+4)), divby2));
-            i -= 4; src += 8; dst += 4;
-        }
+    /* Do SSE blocks as long as we have 16 bytes available.
+       Just use unaligned loads/stores; if the memory is aligned at
+       runtime, they'll be just as fast on modern processors. */
+    while (i >= 4) {   /* 4 * float32 */
+        _mm_storeu_ps(dst, _mm_mul_ps(_mm_hadd_ps(_mm_loadu_ps(src), _mm_loadu_ps(src+4)), divby2));
+        i -= 4; src += 8; dst += 4;
     }
 
     /* Finish off any leftovers with scalar operations. */