SDL: audio: Remove 5.1->X SIMD converters, add SSE mono->stereo.

From 9f56c7cf026c51f11f53866defd1c1e9c6606c87 Mon Sep 17 00:00:00 2001
From: "Ryan C. Gordon" <[EMAIL REDACTED]>
Date: Wed, 20 Jul 2022 16:39:19 -0400
Subject: [PATCH] audio: Remove 5.1->X SIMD converters, add SSE mono->stereo.

The 5.1 SIMD versions were never updated to the new conversion algorithm,
and implementing that algorithm with SIMD took so many permutes that it
was significantly slower than just using the scalar versions.

However, mono-to-stereo is an extremely common conversion, and it's
trivial to accelerate with plain SSE, so an SSE mono-to-stereo converter was added!
---
 src/audio/SDL_audiocvt.c | 237 +++++++--------------------------------
 1 file changed, 39 insertions(+), 198 deletions(-)

diff --git a/src/audio/SDL_audiocvt.c b/src/audio/SDL_audiocvt.c
index 586f8a44e23..85faa4b0b75 100644
--- a/src/audio/SDL_audiocvt.c
+++ b/src/audio/SDL_audiocvt.c
@@ -93,68 +93,34 @@
  * 8 channels (7.1) layout: FL+FR+FC+LFE+BL+BR+SL+SR
  */
 
-
-#if 0  /* !!! FIXME: these need to be updated to match the new scalar code. */
-#if HAVE_AVX_INTRINSICS
-/* MSVC will always accept AVX intrinsics when compiling for x64 */
-#if defined(__clang__) || defined(__GNUC__)
-__attribute__((target("avx")))
-#endif
-/* Convert from 5.1 to stereo. Average left and right, distribute center, discard LFE. */
+#if HAVE_SSE3_INTRINSICS
+/* Convert from stereo to mono. Average left and right. */
 static void SDLCALL
-SDL_Convert51ToStereo_AVX(SDL_AudioCVT * cvt, SDL_AudioFormat format)
+SDL_ConvertStereoToMono_SSE3(SDL_AudioCVT * cvt, SDL_AudioFormat format)
 {
+    const __m128 divby2 = _mm_set1_ps(0.5f);
     float *dst = (float *) cvt->buf;
     const float *src = dst;
-    int i = cvt->len_cvt / (sizeof (float) * 6);
-    const float two_fifths_f = 1.0f / 2.5f;
-    const __m256 two_fifths_v = _mm256_set1_ps(two_fifths_f);
-    const __m256 half = _mm256_set1_ps(0.5f);
+    int i = cvt->len_cvt / 8;
 
-    LOG_DEBUG_CONVERT("5.1", "stereo (using AVX)");
+    LOG_DEBUG_CONVERT("stereo", "mono (using SSE3)");
     SDL_assert(format == AUDIO_F32SYS);
 
-    /* SDL's 5.1 layout: FL+FR+FC+LFE+BL+BR */
-    while (i >= 4) {
-        __m256 in0 = _mm256_loadu_ps(src + 0);  /* 0FL 0FR 0FC 0LF 0BL 0BR 1FL 1FR */
-        __m256 in1 = _mm256_loadu_ps(src + 8);  /* 1FC 1LF 1BL 1BR 2FL 2FR 2FC 2LF */
-        __m256 in2 = _mm256_loadu_ps(src + 16); /* 2BL 2BR 3FL 3FR 3FC 3LF 3BL 3BR */
-
-        /* 0FL 0FR 0FC 0LF 2FL 2FR 2FC 2LF */
-        __m256 temp0 = _mm256_blend_ps(in0, in1, 0xF0);
-        /* 1FC 1LF 1BL 1BR 3FC 3LF 3BL 3BR */
-        __m256 temp1 = _mm256_blend_ps(in1, in2, 0xF0);
-
-        /* 0FC 0FC 1FC 1FC 2FC 2FC 3FC 3FC */
-        __m256 fc_distributed = _mm256_mul_ps(half, _mm256_shuffle_ps(temp0, temp1, _MM_SHUFFLE(0, 0, 2, 2)));
-
-        /* 0FL 0FR 1BL 1BR 2FL 2FR 3BL 3BR */
-        __m256 permuted0 = _mm256_blend_ps(temp0, temp1, 0xCC);
-        /* 0BL 0BR 1FL 1FR 2BL 2BR 3FL 3FR */
-        __m256 permuted1 = _mm256_permute2f128_ps(in0, in2, 0x21);
-
-        /*   0FL 0FR 1BL 1BR 2FL 2FR 3BL 3BR */
-        /* + 0BL 0BR 1FL 1FR 2BL 2BR 3FL 3FR */
-        /* =  0L  0R  1L  1R  2L  2R  3L  3R */
-        __m256 out = _mm256_add_ps(permuted0, permuted1);
-        out = _mm256_add_ps(out, fc_distributed);
-        out = _mm256_mul_ps(out, two_fifths_v);
-
-        _mm256_storeu_ps(dst, out);
-
-        i -= 4; src += 24; dst += 8;
+    /* Do SSE blocks as long as we have 16 bytes available.
+       Just use unaligned load/stores, if the memory at runtime is
+       aligned it'll be just as fast on modern processors */
+    while (i >= 4) {   /* 4 * float32 */
+        _mm_storeu_ps(dst, _mm_mul_ps(_mm_hadd_ps(_mm_loadu_ps(src), _mm_loadu_ps(src+4)), divby2));
+        i -= 4; src += 8; dst += 4;
     }
 
-
     /* Finish off any leftovers with scalar operations. */
     while (i) {
-        const float front_center_distributed = src[2] * 0.5f;
-        dst[0] = (src[0] + front_center_distributed + src[4]) * two_fifths_f;  /* left */
-        dst[1] = (src[1] + front_center_distributed + src[5]) * two_fifths_f;  /* right */
-        i--; src += 6; dst+=2;
+        *dst = (src[0] + src[1]) * 0.5f;
+        dst++; i--; src += 2;
     }
 
-    cvt->len_cvt /= 3;
+    cvt->len_cvt /= 2;
     if (cvt->filters[++cvt->filter_index]) {
         cvt->filters[cvt->filter_index] (cvt, format);
     }
@@ -162,155 +128,38 @@ SDL_Convert51ToStereo_AVX(SDL_AudioCVT * cvt, SDL_AudioFormat format)
 #endif
 
 #if HAVE_SSE_INTRINSICS
-/* Convert from 5.1 to stereo. Average left and right, distribute center, discard LFE. */
+/* Convert from mono to stereo. Duplicate to stereo left and right. */
 static void SDLCALL
-SDL_Convert51ToStereo_SSE(SDL_AudioCVT * cvt, SDL_AudioFormat format)
+SDL_ConvertMonoToStereo_SSE(SDL_AudioCVT * cvt, SDL_AudioFormat format)
 {
-    float *dst = (float *) cvt->buf;
-    const float *src = dst;
-    int i = cvt->len_cvt / (sizeof (float) * 6);
-    const float two_fifths_f = 1.0f / 2.5f;
-    const __m128 two_fifths_v = _mm_set1_ps(two_fifths_f);
-    const __m128 half = _mm_set1_ps(0.5f);
+    float *dst = ((float *) (cvt->buf + (cvt->len_cvt * 2))) - 8;
+    const float *src = ((const float *) (cvt->buf + cvt->len_cvt)) - 4;
+    int i = cvt->len_cvt / sizeof (float);
 
-    LOG_DEBUG_CONVERT("5.1", "stereo (using SSE)");
-    SDL_assert(format == AUDIO_F32SYS);
-
-    /* SDL's 5.1 layout: FL+FR+FC+LFE+BL+BR */
-    /* Just use unaligned load/stores, if the memory at runtime is */
-    /* aligned it'll be just as fast on modern processors */
-    while (i >= 2) {
-        /* Two 5.1 samples (12 floats) fit nicely in three 128bit */
-        /* registers. Using shuffles they can be rearranged so that */
-        /* the conversion math can be vectorized. */
-        __m128 in0 = _mm_loadu_ps(src);     /* 0FL 0FR 0FC 0LF */
-        __m128 in1 = _mm_loadu_ps(src + 4); /* 0BL 0BR 1FL 1FR */
-        __m128 in2 = _mm_loadu_ps(src + 8); /* 1FC 1LF 1BL 1BR */
-
-        /* 0FC 0FC 1FC 1FC */
-        __m128 fc_distributed = _mm_mul_ps(half, _mm_shuffle_ps(in0, in2, _MM_SHUFFLE(0, 0, 2, 2)));
-
-        /* 0FL 0FR 1BL 1BR */
-        __m128 blended = _mm_shuffle_ps(in0, in2, _MM_SHUFFLE(3, 2, 1, 0));
-
-        /*   0FL 0FR 1BL 1BR */
-        /* + 0BL 0BR 1FL 1FR */
-        /* =  0L  0R  1L  1R */
-        __m128 out = _mm_add_ps(blended, in1);
-        out = _mm_add_ps(out, fc_distributed);
-        out = _mm_mul_ps(out, two_fifths_v);
-
-        _mm_storeu_ps(dst, out);
-
-        i -= 2; src += 12; dst += 4;
-    }
-
-
-    /* Finish off any leftovers with scalar operations. */
-    while (i) {
-        const float front_center_distributed = src[2] * 0.5f;
-        dst[0] = (src[0] + front_center_distributed + src[4]) * two_fifths_f;  /* left */
-        dst[1] = (src[1] + front_center_distributed + src[5]) * two_fifths_f;  /* right */
-        i--; src += 6; dst+=2;
-    }
-
-    cvt->len_cvt /= 3;
-    if (cvt->filters[++cvt->filter_index]) {
-        cvt->filters[cvt->filter_index] (cvt, format);
-    }
-}
-#endif
-
-#if HAVE_NEON_INTRINSICS
-/* Convert from 5.1 to stereo. Average left and right, distribute center, discard LFE. */
-static void SDLCALL
-SDL_Convert51ToStereo_NEON(SDL_AudioCVT * cvt, SDL_AudioFormat format)
-{
-    float *dst = (float *) cvt->buf;
-    const float *src = dst;
-    int i = cvt->len_cvt / (sizeof (float) * 6);
-    const float two_fifths_f = 1.0f / 2.5f;
-    const float32x4_t two_fifths_v = vdupq_n_f32(two_fifths_f);
-    const float32x4_t half = vdupq_n_f32(0.5f);
-
-    LOG_DEBUG_CONVERT("5.1", "stereo (using NEON)");
-    SDL_assert(format == AUDIO_F32SYS);
-
-    /* SDL's 5.1 layout: FL+FR+FC+LFE+BL+BR */
-
-    /* Just use unaligned load/stores, it's the same NEON instructions and
-       hopefully even unaligned NEON is faster than the scalar fallback. */
-    while (i >= 2) {
-        /* Two 5.1 samples (12 floats) fit nicely in three 128bit */
-        /* registers. Using shuffles they can be rearranged so that */
-        /* the conversion math can be vectorized. */
-        const float32x4_t in0 = vld1q_f32(src);     /* 0FL 0FR 0FC 0LF */
-        const float32x4_t in1 = vld1q_f32(src + 4); /* 0BL 0BR 1FL 1FR */
-        const float32x4_t in2 = vld1q_f32(src + 8); /* 1FC 1LF 1BL 1BR */
-
-        /* 0FC 0FC 1FC 1FC */
-        const float32x4_t fc_distributed = vmulq_f32(half, vcombine_f32(vdup_lane_f32(vget_high_f32(in0), 0), vdup_lane_f32(vget_low_f32(in2), 0)));
-
-        /* 0FL 0FR 1BL 1BR */
-        const float32x4_t blended = vcombine_f32(vget_low_f32(in0), vget_high_f32(in2));
-
-        /*   0FL 0FR 1BL 1BR */
-        /* + 0BL 0BR 1FL 1FR */
-        /* =  0L  0R  1L  1R */
-        float32x4_t out = vaddq_f32(blended, in1);
-        out = vaddq_f32(out, fc_distributed);
-        out = vmulq_f32(out, two_fifths_v);
-
-        vst1q_f32(dst, out);
-
-        i -= 2; src += 12; dst += 4;
-    }
-
-    /* Finish off any leftovers with scalar operations. */
-    while (i) {
-        const float front_center_distributed = src[2] * 0.5f;
-        dst[0] = (src[0] + front_center_distributed + src[4]) * two_fifths_f;  /* left */
-        dst[1] = (src[1] + front_center_distributed + src[5]) * two_fifths_f;  /* right */
-        i--; src += 6; dst+=2;
-    }
-
-    cvt->len_cvt /= 3;
-    if (cvt->filters[++cvt->filter_index]) {
-        cvt->filters[cvt->filter_index] (cvt, format);
-    }
-}
-#endif
-#endif
-
-
-#if HAVE_SSE3_INTRINSICS
-/* Convert from stereo to mono. Average left and right. */
-static void SDLCALL
-SDL_ConvertStereoToMono_SSE3(SDL_AudioCVT * cvt, SDL_AudioFormat format)
-{
-    const __m128 divby2 = _mm_set1_ps(0.5f);
-    float *dst = (float *) cvt->buf;
-    const float *src = dst;
-    int i = cvt->len_cvt / 8;
-
-    LOG_DEBUG_CONVERT("stereo", "mono (using SSE3)");
+    LOG_DEBUG_CONVERT("mono", "stereo (using SSE)");
     SDL_assert(format == AUDIO_F32SYS);
 
     /* Do SSE blocks as long as we have 16 bytes available.
        Just use unaligned load/stores, if the memory at runtime is
        aligned it'll be just as fast on modern processors */
+    /* convert backwards, since output is growing in-place. */
     while (i >= 4) {   /* 4 * float32 */
-        _mm_storeu_ps(dst, _mm_mul_ps(_mm_hadd_ps(_mm_loadu_ps(src), _mm_loadu_ps(src+4)), divby2));
-        i -= 4; src += 8; dst += 4;
+        const __m128 input = _mm_loadu_ps(src);   /* A B C D */
+        _mm_storeu_ps(dst, _mm_unpacklo_ps(input, input));  /* A A B B */
+        _mm_storeu_ps(dst+4, _mm_unpackhi_ps(input, input));  /* C C D D */
+        i -= 4; src -= 4; dst -= 8;
     }
 
     /* Finish off any leftovers with scalar operations. */
-    while (i) {
-        *dst = (src[0] + src[1]) * 0.5f;
-        dst++; i--; src += 2;
+    src += 3; dst += 6;  /* adjust for smaller buffers. */
+    while (i) {     /* convert backwards, since output is growing in-place. */
+        const float srcFC = src[0];
+        dst[1] /* FR */ = srcFC;
+        dst[0] /* FL */ = srcFC;
+        i--; src--; dst -= 2;
     }
 
-    cvt->len_cvt /= 2;
+    cvt->len_cvt *= 2;
     if (cvt->filters[++cvt->filter_index]) {
         cvt->filters[cvt->filter_index] (cvt, format);
     }
@@ -833,24 +682,16 @@ SDL_BuildAudioCVT(SDL_AudioCVT * cvt,
         return SDL_SetError("Invalid channel combination");
     } else if (channel_converter != NULL) {
         /* swap in some SIMD versions for a few of these. */
-        if (channel_converter == SDL_Convert51ToStereo) {
+        if (channel_converter == SDL_ConvertStereoToMono) {
             SDL_AudioFilter filter = NULL;
-#if 0 /* !!! FIXME: these have not been updated for the new formulas */
-            #if HAVE_AVX_INTRINSICS
-            if (!filter && SDL_HasAVX()) { filter = SDL_Convert51ToStereo_AVX; }
-            #endif
-            #if HAVE_SSE_INTRINSICS
-            if (!filter && SDL_HasSSE()) { filter = SDL_Convert51ToStereo_SSE; }
-            #endif
-            #if HAVE_NEON_INTRINSICS
-            if (!filter && SDL_HasNEON()) { filter = SDL_Convert51ToStereo_NEON; }
+            #if HAVE_SSE3_INTRINSICS
+            if (!filter && SDL_HasSSE3()) { filter = SDL_ConvertStereoToMono_SSE3; }
             #endif
-#endif
             if (filter) { channel_converter = filter; }
-        } else if (channel_converter == SDL_ConvertStereoToMono) {
+        } else if (channel_converter == SDL_ConvertMonoToStereo) {
             SDL_AudioFilter filter = NULL;
-            #if HAVE_SSE3_INTRINSICS
-            if (!filter && SDL_HasSSE3()) { filter = SDL_ConvertStereoToMono_SSE3; }
+            #if HAVE_SSE_INTRINSICS
+            if (!filter && SDL_HasSSE()) { filter = SDL_ConvertMonoToStereo_SSE; }
             #endif
             if (filter) { channel_converter = filter; }
         }