SDL: audio: first attempt at rewriting the channel converters.

From 257277903e23c3e4cb2565524b34f80b73ac20da Mon Sep 17 00:00:00 2001
From: "Ryan C. Gordon" <[EMAIL REDACTED]>
Date: Tue, 19 Jul 2022 02:16:08 -0400
Subject: [PATCH] audio: first attempt at rewriting the channel converters.

This is not ready for production use!
---
 src/audio/SDL_audio_c.h  |    2 +-
 src/audio/SDL_audiocvt.c | 1626 ++++++++++++++++++++++++++------------
 2 files changed, 1132 insertions(+), 496 deletions(-)

diff --git a/src/audio/SDL_audio_c.h b/src/audio/SDL_audio_c.h
index bfa0760de59..3f22cd614a3 100644
--- a/src/audio/SDL_audio_c.h
+++ b/src/audio/SDL_audio_c.h
@@ -25,7 +25,7 @@
 #include "../SDL_internal.h"
 
 #ifndef DEBUG_CONVERT
-#define DEBUG_CONVERT 0
+#define DEBUG_CONVERT 1
 #endif
 
 #if DEBUG_CONVERT
diff --git a/src/audio/SDL_audiocvt.c b/src/audio/SDL_audiocvt.c
index 0b95b417c75..1117420e935 100644
--- a/src/audio/SDL_audiocvt.c
+++ b/src/audio/SDL_audiocvt.c
@@ -63,61 +63,41 @@
 # endif
 #endif
 
-#if HAVE_SSE3_INTRINSICS
-/* Convert from stereo to mono. Average left and right. */
-static void SDLCALL
-SDL_ConvertStereoToMono_SSE3(SDL_AudioCVT * cvt, SDL_AudioFormat format)
-{
-    const __m128 divby2 = _mm_set1_ps(0.5f);
-    float *dst = (float *) cvt->buf;
-    const float *src = dst;
-    int i = cvt->len_cvt / 8;
-
-    LOG_DEBUG_CONVERT("stereo", "mono (using SSE3)");
-    SDL_assert(format == AUDIO_F32SYS);
-
-    /* Do SSE blocks as long as we have 16 bytes available.
-       Just use unaligned load/stores, if the memory at runtime is
-       aligned it'll be just as fast on modern processors */
-    while (i >= 4) {   /* 4 * float32 */
-        _mm_storeu_ps(dst, _mm_mul_ps(_mm_hadd_ps(_mm_loadu_ps(src), _mm_loadu_ps(src+4)), divby2));
-        i -= 4; src += 8; dst += 4;
-    }
-
-    /* Finish off any leftovers with scalar operations. */
-    while (i) {
-        *dst = (src[0] + src[1]) * 0.5f;
-        dst++; i--; src += 2;
-    }
-
-    cvt->len_cvt /= 2;
-    if (cvt->filters[++cvt->filter_index]) {
-        cvt->filters[cvt->filter_index] (cvt, format);
-    }
-}
-#endif
-
-/* Convert from stereo to mono. Average left and right. */
-static void SDLCALL
-SDL_ConvertStereoToMono(SDL_AudioCVT * cvt, SDL_AudioFormat format)
-{
-    float *dst = (float *) cvt->buf;
-    const float *src = dst;
-    int i;
-
-    LOG_DEBUG_CONVERT("stereo", "mono");
-    SDL_assert(format == AUDIO_F32SYS);
-
-    for (i = cvt->len_cvt / 8; i; --i, src += 2) {
-        *(dst++) = (src[0] + src[1]) * 0.5f;
-    }
-
-    cvt->len_cvt /= 2;
-    if (cvt->filters[++cvt->filter_index]) {
-        cvt->filters[cvt->filter_index] (cvt, format);
-    }
-}
-
+/*
+ * CHANNEL LAYOUTS AS SDL EXPECTS THEM:
+ *
+ * (Even if the platform expects something else, SDL will
+ * swizzle between the app and the platform later.)
+ *
+ * Abbreviations:
+ * - FRONT=single mono speaker
+ * - FL=front left speaker
+ * - FR=front right speaker
+ * - FC=front center speaker
+ * - BL=back left speaker
+ * - BR=back right speaker
+ * - SR=side right speaker
+ * - SL=side left speaker
+ * - BC=back center speaker
+ * - LFE=low-frequency speaker
+ *
+ * These are listed in the order they are laid out in
+ * memory, so "FL+FR" means "the front left speaker is
+ * laid out in memory first, then the front right, then
+ * it repeats for the next audio frame".
+ *
+ * 1 channel (mono) layout: FRONT
+ * 2 channels (stereo) layout: FL+FR
+ * 3 channels (2.1) layout: FL+FR+LFE
+ * 4 channels (quad) layout: FL+FR+BL+BR
+ * 5 channels (4.1) layout: FL+FR+LFE+BL+BR
+ * 6 channels (5.1) layout: FL+FR+FC+LFE+BL+BR
+ * 7 channels (6.1) layout: FL+FR+FC+LFE+BC+SL+SR
+ * 8 channels (7.1) layout: FL+FR+FC+LFE+BL+BR+SL+SR
+ */
+
+
+#if 0  /* !!! FIXME: these need to be updated to match the new scalar code. */
 #if HAVE_AVX_INTRINSICS
 /* MSVC will always accept AVX intrinsics when compiling for x64 */
 #if defined(__clang__) || defined(__GNUC__)
@@ -225,258 +205,979 @@ SDL_Convert51ToStereo_SSE(SDL_AudioCVT * cvt, SDL_AudioFormat format)
 
         _mm_storeu_ps(dst, out);
 
-        i -= 2; src += 12; dst += 4;
-    }
+        i -= 2; src += 12; dst += 4;
+    }
+
+
+    /* Finish off any leftovers with scalar operations. */
+    while (i) {
+        const float front_center_distributed = src[2] * 0.5f;
+        dst[0] = (src[0] + front_center_distributed + src[4]) * two_fifths_f;  /* left */
+        dst[1] = (src[1] + front_center_distributed + src[5]) * two_fifths_f;  /* right */
+        i--; src += 6; dst+=2;
+    }
+
+    cvt->len_cvt /= 3;
+    if (cvt->filters[++cvt->filter_index]) {
+        cvt->filters[cvt->filter_index] (cvt, format);
+    }
+}
+#endif
+
+#if HAVE_NEON_INTRINSICS
+/* Convert from 5.1 to stereo. Average left and right, distribute center, discard LFE.
+   Works in place on the float data in cvt->buf, front to back: frames are
+   consumed two at a time by the NEON loop and a trailing odd frame is
+   handled by the scalar loop. Afterwards cvt->len_cvt is divided by three
+   and the next filter in cvt->filters, if any, is called. */
+static void SDLCALL
+SDL_Convert51ToStereo_NEON(SDL_AudioCVT * cvt, SDL_AudioFormat format)
+{
+    float *dst = (float *) cvt->buf;
+    const float *src = dst;
+    int i = cvt->len_cvt / (sizeof (float) * 6);  /* number of 6-float frames */
+    const float two_fifths_f = 1.0f / 2.5f;
+    const float32x4_t two_fifths_v = vdupq_n_f32(two_fifths_f);
+    const float32x4_t half = vdupq_n_f32(0.5f);
+
+    LOG_DEBUG_CONVERT("5.1", "stereo (using NEON)");
+    SDL_assert(format == AUDIO_F32SYS);
+
+    /* SDL's 5.1 layout: FL+FR+FC+LFE+BL+BR */
+
+    /* Just use unaligned load/stores, it's the same NEON instructions and
+       hopefully even unaligned NEON is faster than the scalar fallback. */
+    while (i >= 2) {
+        /* Two 5.1 samples (12 floats) fit nicely in three 128bit */
+        /* registers. Using shuffles they can be rearranged so that */
+        /* the conversion math can be vectorized. */
+        const float32x4_t in0 = vld1q_f32(src);     /* 0FL 0FR 0FC 0LF */
+        const float32x4_t in1 = vld1q_f32(src + 4); /* 0BL 0BR 1FL 1FR */
+        const float32x4_t in2 = vld1q_f32(src + 8); /* 1FC 1LF 1BL 1BR */
+
+        /* 0FC 0FC 1FC 1FC */
+        const float32x4_t fc_distributed = vmulq_f32(half, vcombine_f32(vdup_lane_f32(vget_high_f32(in0), 0), vdup_lane_f32(vget_low_f32(in2), 0)));
+
+        /* 0FL 0FR 1BL 1BR */
+        const float32x4_t blended = vcombine_f32(vget_low_f32(in0), vget_high_f32(in2));
+
+        /*   0FL 0FR 1BL 1BR */
+        /* + 0BL 0BR 1FL 1FR */
+        /* =  0L  0R  1L  1R */
+        float32x4_t out = vaddq_f32(blended, in1);
+        out = vaddq_f32(out, fc_distributed);
+        out = vmulq_f32(out, two_fifths_v);
+
+        vst1q_f32(dst, out);
+
+        i -= 2; src += 12; dst += 4;
+    }
+
+    /* Finish off any leftovers with scalar operations. */
+    while (i) {
+        const float front_center_distributed = src[2] * 0.5f;
+        dst[0] = (src[0] + front_center_distributed + src[4]) * two_fifths_f;  /* left */
+        dst[1] = (src[1] + front_center_distributed + src[5]) * two_fifths_f;  /* right */
+        i--; src += 6; dst+=2;
+    }
+
+    cvt->len_cvt /= 3;  /* six floats per frame in, two floats per frame out */
+    if (cvt->filters[++cvt->filter_index]) {
+        cvt->filters[cvt->filter_index] (cvt, format);
+    }
+}
+#endif
+#endif
+
+
+/* Channel conversion is now mostly following this scheme, borrowed from FNA
+   (which is following the scheme of XNA)...
+   https://github.com/FNA-XNA/FAudio/blob/master/src/matrix_defaults.inl */
+
+/* CONVERT FROM MONO... */
+/* Mono duplicates to stereo and all other channels are silenced.
+   CVT_MONO_TO generates SDL_ConvertMonoTo<toname>(), which widens the mono
+   float data in cvt->buf to num_channels channels in place. Both pointers
+   start at the last frame and move towards the start of the buffer, so the
+   larger output never overwrites mono samples that have not been read yet.
+   Each sample is stored in the first two output channels and zeroingcode
+   clears the remaining ones. Afterwards cvt->len_cvt is multiplied by
+   num_channels and the next filter in cvt->filters, if any, is called. */
+
+#define CVT_MONO_TO(toname, tonamestr, num_channels, zeroingcode) \
+    static void SDLCALL SDL_ConvertMonoTo##toname(SDL_AudioCVT * cvt, SDL_AudioFormat format) { \
+        const float *src = ((const float *) (cvt->buf + cvt->len_cvt)) - 1; \
+        float *dst = ((float *) (cvt->buf + cvt->len_cvt * num_channels)) - num_channels; \
+        int i; \
+        LOG_DEBUG_CONVERT("mono", tonamestr); \
+        SDL_assert(format == AUDIO_F32SYS); \
+        SDL_assert(num_channels >= 2); \
+        for (i = cvt->len_cvt / sizeof (float); i; i--, src--, dst -= num_channels) { \
+            dst[0] = dst[1] = *src; \
+            zeroingcode; \
+        } \
+        cvt->len_cvt *= num_channels; \
+        if (cvt->filters[++cvt->filter_index]) { \
+            cvt->filters[cvt->filter_index] (cvt, format); \
+        } \
+    }
+CVT_MONO_TO(Stereo, "stereo", 2, {});
+CVT_MONO_TO(21, "2.1", 3, { dst[2] = 0.0f; });
+CVT_MONO_TO(Quad, "quad", 4, { dst[2] = dst[3] = 0.0f; });
+CVT_MONO_TO(41, "4.1", 5, { dst[2] = dst[3] = dst[4] = 0.0f; });
+CVT_MONO_TO(51, "5.1", 6, { dst[2] = dst[3] = dst[4] = dst[5] = 0.0f; });
+CVT_MONO_TO(61, "6.1", 7, { dst[2] = dst[3] = dst[4] = dst[5] = dst[6] = 0.0f; });
+CVT_MONO_TO(71, "7.1", 8, { dst[2] = dst[3] = dst[4] = dst[5] = dst[6] = dst[7] = 0.0f; });
+#undef CVT_MONO_TO
+
+
+
+/* CONVERT FROM STEREO... */
+/* Stereo duplicates to two front speakers and all other channels are silenced. */
+
+#if HAVE_SSE3_INTRINSICS
+/* Convert from stereo to mono. Average left and right.
+   Runs in place over the float data in cvt->buf, front to back: the SSE3
+   loop produces four mono samples per iteration and the scalar loop
+   finishes whatever is left. Afterwards cvt->len_cvt is halved and the
+   next filter in cvt->filters, if any, is called. */
+static void SDLCALL
+SDL_ConvertStereoToMono_SSE3(SDL_AudioCVT * cvt, SDL_AudioFormat format)
+{
+    const __m128 divby2 = _mm_set1_ps(0.5f);
+    float *dst = (float *) cvt->buf;
+    const float *src = dst;
+    int i = cvt->len_cvt / 8;  /* number of stereo frames, counted as 8 bytes each */
+
+    LOG_DEBUG_CONVERT("stereo", "mono (using SSE3)");
+    SDL_assert(format == AUDIO_F32SYS);
+
+    /* Do SSE blocks as long as we have 16 bytes available.
+       Just use unaligned load/stores, if the memory at runtime is
+       aligned it'll be just as fast on modern processors */
+    while (i >= 4) {   /* 4 * float32 */
+        _mm_storeu_ps(dst, _mm_mul_ps(_mm_hadd_ps(_mm_loadu_ps(src), _mm_loadu_ps(src+4)), divby2));
+        i -= 4; src += 8; dst += 4;
+    }
+
+    /* Finish off any leftovers with scalar operations. */
+    while (i) {
+        *dst = (src[0] + src[1]) * 0.5f;
+        dst++; i--; src += 2;
+    }
+
+    cvt->len_cvt /= 2;
+    if (cvt->filters[++cvt->filter_index]) {
+        cvt->filters[cvt->filter_index] (cvt, format);
+    }
+}
+#endif
+
+/* Convert from stereo to mono. Average left and right.
+   Scalar version. Runs in place over the float data in cvt->buf, front to
+   back, writing one float for every two read. Afterwards cvt->len_cvt is
+   halved and the next filter in cvt->filters, if any, is called. */
+static void SDLCALL
+SDL_ConvertStereoToMono(SDL_AudioCVT * cvt, SDL_AudioFormat format)
+{
+    float *dst = (float *) cvt->buf;
+    const float *src = dst;
+    int i;
+
+    LOG_DEBUG_CONVERT("stereo", "mono");
+    SDL_assert(format == AUDIO_F32SYS);
+
+    for (i = cvt->len_cvt / (sizeof (float) * 2); i; --i, src += 2) {
+        *(dst++) = (src[0] + src[1]) * 0.5f;
+    }
+
+    cvt->len_cvt /= 2;
+    if (cvt->filters[++cvt->filter_index]) {
+        cvt->filters[cvt->filter_index] (cvt, format);
+    }
+}
+
+#define CVT_STEREO_TO(toname, tonamestr, num_channels, zeroingcode) \
+    static void SDLCALL SDL_ConvertStereoTo##toname(SDL_AudioCVT * cvt, SDL_AudioFormat format) { \
+        int i; \
+        const float *src = ((const float *) (cvt->buf + cvt->len_cvt)) - 2; \
+        float *dst = ((float *) (cvt->buf + ((cvt->len_cvt / 2) * num_channels))) - num_channels; \
+        LOG_DEBUG_CONVERT("stereo", tonamestr); \
+        SDL_assert(format == AUDIO_F32SYS); \
+        SDL_assert(num_channels >= 3); \
+        for (i = cvt->len_cvt / (sizeof (float) * 2); i; --i, dst -= num_channels, src -= 2) { \
+            dst[0] = src[0]; \
+            dst[1] = src[1]; \
+            zeroingcode; \
+        } \
+        cvt->len_cvt = (cvt->len_cvt / 2) * num_channels; \
+        if (cvt->filters[++cvt->filter_index]) { \
+            cvt->filters[cvt->filter_index] (cvt, format); \
+        } \
+    }
+/* The macro above generates SDL_ConvertStereoTo<toname>(), which widens the
+   stereo float data in cvt->buf to num_channels channels in place. Both
+   pointers start at the last frame and move towards the start of the
+   buffer, so the larger output never overwrites input that has not been
+   read yet. Left and right are copied to the first two output channels and
+   zeroingcode clears the rest. Afterwards cvt->len_cvt is rescaled from 2
+   to num_channels floats per frame and the next filter, if any, is called. */
+CVT_STEREO_TO(21, "2.1", 3, { dst[2] = 0.0f; });
+CVT_STEREO_TO(Quad, "quad", 4, { dst[2] = dst[3] = 0.0f; });
+CVT_STEREO_TO(41, "4.1", 5, { dst[2] = dst[3] = dst[4] = 0.0f; });
+CVT_STEREO_TO(51, "5.1", 6, { dst[2] = dst[3] = dst[4] = dst[5] = 0.0f; });
+CVT_STEREO_TO(61, "6.1", 7, { dst[2] = dst[3] = dst[4] = dst[5] = dst[6] = 0.0f; });
+CVT_STEREO_TO(71, "7.1", 8, { dst[2] = dst[3] = dst[4] = dst[5] = dst[6] = dst[7] = 0.0f; });
+#undef CVT_STEREO_TO
+
+
+
+/* CONVERT FROM 2.1... */
+/* 2.1 duplicates to two front speakers (and LFE when available) and all other channels are silenced. */
+
+/* Convert from 2.1 to mono. Average left and right, drop LFE.
+   Runs in place over the float data in cvt->buf, front to back: every
+   3-float input frame (FL+FR+LFE) collapses to a single float, so the
+   write pointer never overtakes the read pointer. Afterwards cvt->len_cvt
+   is divided by three and the next filter in cvt->filters, if any, is
+   called. */
+static void SDLCALL
+SDL_Convert21ToMono(SDL_AudioCVT * cvt, SDL_AudioFormat format)
+{
+    float *dst = (float *) cvt->buf;
+    const float *src = dst;
+    int i;
+
+    LOG_DEBUG_CONVERT("2.1", "mono");
+    SDL_assert(format == AUDIO_F32SYS);
+
+    /* A 2.1 frame is three floats wide. Counting frames as if they were two
+       floats wide would run the loop 1.5x too long and read past the end
+       of the buffer. */
+    for (i = cvt->len_cvt / (sizeof (float) * 3); i; --i, src += 3) {
+        *(dst++) = (src[0] + src[1]) * 0.5f;
+    }
+
+    cvt->len_cvt /= 3;
+    if (cvt->filters[++cvt->filter_index]) {
+        cvt->filters[cvt->filter_index] (cvt, format);
+    }
+}
+
+/* Convert from 2.1 to stereo. Copy left and right, drop LFE.
+   The output is smaller than the input, so this walks the buffer front to
+   back; a back-to-front walk would overwrite input frames before they are
+   read. Afterwards cvt->len_cvt is rescaled from 3 to 2 floats per frame
+   and the next filter in cvt->filters, if any, is called. */
+static void SDLCALL
+SDL_Convert21ToStereo(SDL_AudioCVT * cvt, SDL_AudioFormat format)
+{
+    float *dst = (float *) cvt->buf;
+    const float *src = dst;
+    int i;
+
+    LOG_DEBUG_CONVERT("2.1", "stereo");
+    SDL_assert(format == AUDIO_F32SYS);
+
+    for (i = cvt->len_cvt / (sizeof (float) * 3); i; --i, src += 3) {
+        *(dst++) = src[0];
+        *(dst++) = src[1];
+    }
+
+    cvt->len_cvt = (cvt->len_cvt / 3) * 2;
+    if (cvt->filters[++cvt->filter_index]) {
+        cvt->filters[cvt->filter_index] (cvt, format);
+    }
+}
+
+/* CVT_21_TO generates SDL_Convert21To<toname>() for layouts wider than 2.1.
+   The data in cvt->buf is widened in place: both pointers start one past
+   the last frame and step backwards a whole frame at a time (3 floats of
+   input, num_channels floats of output), so the larger output never
+   overwrites input that has not been read yet. The three input channels
+   are loaded into lf/rf/lfe before anything is stored because the first
+   frame of input and output share the same memory. customcode fills every
+   output channel after the first two. Afterwards cvt->len_cvt is rescaled
+   from 3 to num_channels floats per frame and the next filter, if any, is
+   called. */
+#define CVT_21_TO(toname, tonamestr, num_channels, customcode) \
+    static void SDLCALL SDL_Convert21To##toname(SDL_AudioCVT * cvt, SDL_AudioFormat format) { \
+        int i; \
+        float lf, rf, lfe; \
+        const float *src = (const float *) (cvt->buf + cvt->len_cvt); \
+        float *dst = (float *) (cvt->buf + ((cvt->len_cvt / 3) * num_channels)); \
+        LOG_DEBUG_CONVERT("2.1", tonamestr); \
+        SDL_assert(format == AUDIO_F32SYS); \
+        SDL_assert(num_channels > 3); \
+        for (i = cvt->len_cvt / (sizeof (float) * 3); i; --i) { \
+            dst -= num_channels; \
+            src -= 3; \
+            lf = src[0]; \
+            rf = src[1]; \
+            lfe = src[2]; \
+            dst[0] = lf; \
+            dst[1] = rf; \
+            customcode; \
+        } \
+        cvt->len_cvt = (cvt->len_cvt / 3) * num_channels; \
+        if (cvt->filters[++cvt->filter_index]) { \
+            cvt->filters[cvt->filter_index] (cvt, format); \
+        } \
+    }
+
+CVT_21_TO(Quad, "quad", 4, { (void) lfe; dst[2] = dst[3] = 0.0f; });
+CVT_21_TO(41, "4.1", 5, { dst[2] = lfe; dst[3] = dst[4] = 0.0f; });
+CVT_21_TO(51, "5.1", 6, { dst[2] = 0.0f; dst[3] = lfe; dst[4] = dst[5] = 0.0f; });
+CVT_21_TO(61, "6.1", 7, { dst[2] = 0.0f; dst[3] = lfe; dst[4] = dst[5] = dst[6] = 0.0f; });
+CVT_21_TO(71, "7.1", 8, { dst[2] = 0.0f; dst[3] = lfe; dst[4] = dst[5] = dst[6] = dst[7] = 0.0f; });
+#undef CVT_21_TO
+
+
+/* CONVERT FROM QUAD... */
+
+/* Convert from quad to mono. Average all four channels (FL+FR+BL+BR).
+   Runs in place over the float data in cvt->buf, front to back, writing
+   one float for every four read. Afterwards cvt->len_cvt is divided by
+   four and the next filter in cvt->filters, if any, is called. */
+static void SDLCALL
+SDL_ConvertQuadToMono(SDL_AudioCVT * cvt, SDL_AudioFormat format)
+{
+    float *dst = (float *) cvt->buf;
+    const float *src = dst;
+    int i;
+
+    LOG_DEBUG_CONVERT("quad", "mono");
+    SDL_assert(format == AUDIO_F32SYS);
+
+    /* !!! FIXME: could benefit from SIMD */
+    for (i = cvt->len_cvt / (sizeof (float) * 4); i; --i, src += 4) {
+        /* A quad frame only has indices 0..3 (there is no LFE slot to
+           skip); src[4] would be the next frame's FL, or past the end of
+           the buffer on the last frame. */
+        *(dst++) = (src[0] + src[1] + src[2] + src[3]) * 0.25f;
+    }
+
+    cvt->len_cvt /= 4;
+    if (cvt->filters[++cvt->filter_index]) {
+        cvt->filters[cvt->filter_index] (cvt, format);
+    }
+}
+
+static void SDLCALL
+SDL_ConvertQuadToStereo(SDL_AudioCVT * cvt, SDL_AudioFormat format)
+{   /* Convert from quad (FL+FR+BL+BR) to stereo, in place and front to
+       back. Each output side is a weighted blend of its own front and back
+       channels plus a smaller share of the opposite back channel.
+       Afterwards cvt->len_cvt is rescaled from 4 to 2 floats per frame and
+       the next filter in cvt->filters, if any, is called. */
+    float *dst = (float *) cvt->buf;
+    const float *src = dst;
+    int i;
+
+    LOG_DEBUG_CONVERT("quad", "stereo");
+    SDL_assert(format == AUDIO_F32SYS);
+
+    /* !!! FIXME: could benefit from SIMD */
+    for (i = cvt->len_cvt / (sizeof (float) * 4); i; --i, src += 4) {
+        /* Load the whole frame first: dst aliases src on the first frame. */
+        const float fl = src[0];
+        const float fr = src[1];
+        const float bl = src[2];
+        const float br = src[3];
+        /* !!! FIXME: FNA/XNA mixes a little of the back right into the left (and back left into the right)...but this can't possibly be right, right...? */
+        *(dst++) = (fl * 0.421000004f) + (bl * 0.358999997f) + (br * 0.219999999f);
+        *(dst++) = (fr * 0.421000004f) + (br * 0.358999997f) + (bl * 0.219999999f);
+    }
+
+    cvt->len_cvt = (cvt->len_cvt / 4) * 2;
+    if (cvt->filters[++cvt->filter_index]) {
+        cvt->filters[cvt->filter_index] (cvt, format);
+    }
+}
+
+static void SDLCALL
+SDL_ConvertQuadTo21(SDL_AudioCVT * cvt, SDL_AudioFormat format)
+{   /* Convert from quad (FL+FR+BL+BR) to 2.1 (FL+FR+LFE), in place and
+       front to back. Left and right use the same blend as the quad to
+       stereo conversion; the LFE channel is written as silence.
+       Afterwards cvt->len_cvt is rescaled from 4 to 3 floats per frame and
+       the next filter in cvt->filters, if any, is called. */
+    float *dst = (float *) cvt->buf;
+    const float *src = dst;
+    int i;
+
+    LOG_DEBUG_CONVERT("quad", "2.1");
+    SDL_assert(format == AUDIO_F32SYS);
+
+    /* !!! FIXME: could benefit from SIMD */
+    for (i = cvt->len_cvt / (sizeof (float) * 4); i; --i, src += 4) {
+        /* Load the whole frame first: dst aliases src on the first frame. */
+        const float fl = src[0];
+        const float fr = src[1];
+        const float bl = src[2];
+        const float br = src[3];
+        /* !!! FIXME: FNA/XNA mixes a little of the back right into the left (and back left into the right)...but this can't possibly be right, right...? */
+        *(dst++) = (fl * 0.421000004f) + (bl * 0.358999997f) + (br * 0.219999999f);
+        *(dst++) = (fr * 0.421000004f) + (br * 0.358999997f) + (bl * 0.219999999f);
+        *(dst++) = 0.0f;  /* lfe */
+    }
+
+    cvt->len_cvt = (cvt->len_cvt / 4) * 3;
+    if (cvt->filters[++cvt->filter_index]) {
+        cvt->filters[cvt->filter_index] (cvt, format);
+    }
+}
+
+static void SDLCALL
+SDL_ConvertQuadTo41(SDL_AudioCVT * cvt, SDL_AudioFormat format)
+{   /* Convert from quad (FL+FR+BL+BR) to 4.1 (FL+FR+LFE+BL+BR) in place.
+       Output frames are larger than input frames, so both pointers start
+       at the last frame and walk backwards. Within a frame the channels
+       are stored highest index first, so that on the first frame (where
+       src and dst coincide) each input is read before its slot is
+       overwritten. Afterwards cvt->len_cvt is rescaled from 4 to 5 floats
+       per frame and the next filter, if any, is called. */
+    const float *src = ((const float *) (cvt->buf + cvt->len_cvt)) - 4;
+    float *dst = ((float *) (cvt->buf + ((cvt->len_cvt / 4) * 5))) - 5;
+    int i;
+
+    LOG_DEBUG_CONVERT("quad", "4.1");
+    SDL_assert(format == AUDIO_F32SYS);
+
+    for (i = cvt->len_cvt / (sizeof (float) * 4); i; --i, src -= 4, dst -= 5) {
+        dst[4] = src[3];
+        dst[3] = src[2];
+        dst[2] = 0.0f;  /* LFE */
+        dst[1] = src[1];
+        dst[0] = src[0];
+    }
+
+    cvt->len_cvt = (cvt->len_cvt / 4) * 5;
+    if (cvt->filters[++cvt->filter_index]) {
+        cvt->filters[cvt->filter_index] (cvt, format);
+    }
+}
+
+static void SDLCALL
+SDL_ConvertQuadTo51(SDL_AudioCVT * cvt, SDL_AudioFormat format)
+{   /* Convert from quad (FL+FR+BL+BR) to 5.1 (FL+FR+FC+LFE+BL+BR) in
+       place. Output frames are larger than input frames, so both pointers
+       start at the last frame and walk backwards; channels are stored
+       highest index first so the back channels are read before the FC/LFE
+       slots that share their memory on the first frame are zeroed.
+       Afterwards cvt->len_cvt is rescaled from 4 to 6 floats per frame and
+       the next filter, if any, is called. */
+    const float *src = ((const float *) (cvt->buf + cvt->len_cvt)) - 4;
+    float *dst = ((float *) (cvt->buf + ((cvt->len_cvt / 4) * 6))) - 6;
+    int i;
+
+    LOG_DEBUG_CONVERT("quad", "5.1");
+    SDL_assert(format == AUDIO_F32SYS);
+
+    for (i = cvt->len_cvt / (sizeof (float) * 4); i; --i, src -= 4, dst -= 6) {
+        dst[5] = src[3];
+        dst[4] = src[2];
+        dst[3] = 0.0f;  /* LFE */
+        dst[2] = 0.0f;  /* FC */
+        dst[1] = src[1];
+        dst[0] = src[0];
+    }
+
+    cvt->len_cvt = (cvt->len_cvt / 4) * 6;
+    if (cvt->filters[++cvt->filter_index]) {
+        cvt->filters[cvt->filter_index] (cvt, format);
+    }
+}
+
+static void SDLCALL
+SDL_ConvertQuadTo61(SDL_AudioCVT * cvt, SDL_AudioFormat format)
+{   /* Convert from quad (FL+FR+BL+BR) to 6.1 (FL+FR+FC+LFE+BC+SL+SR) in
+       place, walking from the last frame to the first because the output
+       is larger than the input. The back pair is attenuated into the side
+       speakers and averaged into back center, the front pair is slightly
+       attenuated, and FC and LFE are silent. Afterwards cvt->len_cvt is
+       rescaled from 4 to 7 floats per frame and the next filter, if any,
+       is called. */
+    const float *src = ((const float *) (cvt->buf + cvt->len_cvt)) - 4;
+    float *dst = ((float *) (cvt->buf + ((cvt->len_cvt / 4) * 7))) - 7;
+    int i;
+
+    LOG_DEBUG_CONVERT("quad", "6.1");
+    SDL_assert(format == AUDIO_F32SYS);
+
+    /* !!! FIXME: I'm skeptical XNA/FNA's conversion is right, here. */
+
+    for (i = cvt->len_cvt / (sizeof (float) * 4); i; --i, src -= 4, dst -= 7) {
+        /* Cache the back pair: their slots are overwritten below on the first frame. */
+        const float bl = src[2];
+        const float br = src[3];
+        dst[6] = br * 0.796000004f;
+        dst[5] = bl * 0.796000004f;
+        dst[4] = (bl + br) * 0.5f;  /* average BL+BR to BC */
+        dst[3] = 0.0f;  /* LFE */
+        dst[2] = 0.0f;  /* FC */
+        dst[1] = src[1] * 0.939999998f;
+        dst[0] = src[0] * 0.939999998f;
+    }
+
+    cvt->len_cvt = (cvt->len_cvt / 4) * 7;
+    if (cvt->filters[++cvt->filter_index]) {
+        cvt->filters[cvt->filter_index] (cvt, format);
+    }
+}
+
+static void SDLCALL
+SDL_ConvertQuadTo71(SDL_AudioCVT * cvt, SDL_AudioFormat format)
+{   /* Convert from quad (FL+FR+BL+BR) to 7.1 (FL+FR+FC+LFE+BL+BR+SL+SR)
+       in place, walking from the last frame to the first because the
+       output is larger than the input. The four quad channels are copied
+       to their 7.1 slots; FC, LFE and both side channels are silent.
+       Channels are stored highest index first so inputs are read before
+       their memory is reused on the first frame. Afterwards cvt->len_cvt
+       is rescaled from 4 to 8 floats per frame and the next filter, if
+       any, is called. */
+    const float *src = ((const float *) (cvt->buf + cvt->len_cvt)) - 4;
+    float *dst = ((float *) (cvt->buf + ((cvt->len_cvt / 4) * 8))) - 8;
+    int i;
+
+    LOG_DEBUG_CONVERT("quad", "7.1");
+    SDL_assert(format == AUDIO_F32SYS);
+
+    for (i = cvt->len_cvt / (sizeof (float) * 4); i; --i, src -= 4, dst -= 8) {
+        dst[7] = 0.0f;  /* SR */
+        dst[6] = 0.0f;  /* SL */
+        dst[5] = src[3];
+        dst[4] = src[2];
+        dst[3] = 0.0f;  /* LFE */
+        dst[2] = 0.0f;  /* FC */
+        dst[1] = src[1];
+        dst[0] = src[0];
+    }
+
+    cvt->len_cvt = (cvt->len_cvt / 4) * 8;
+    if (cvt->filters[++cvt->filter_index]) {
+        cvt->filters[cvt->filter_index] (cvt, format);
+    }
+}
+
+
+/* CONVERT FROM 4.1... */
+
+static void SDLCALL
+SDL_Convert41ToMono(SDL_AudioCVT * cvt, SDL_AudioFormat format)
+{   /* Convert from 4.1 (FL+FR+LFE+BL+BR) to mono, in place and front to
+       back. The four full-range channels are averaged; index 2, the LFE,
+       is skipped. Afterwards cvt->len_cvt is divided by five and the next
+       filter in cvt->filters, if any, is called. */
+    float *dst = (float *) cvt->buf;
+    const float *src = dst;
+    int i;
+
+    LOG_DEBUG_CONVERT("4.1", "mono");
+    SDL_assert(format == AUDIO_F32SYS);
+
+    for (i = cvt->len_cvt / (sizeof (float) * 5); i; --i, src += 5) {
+        *(dst++) = (src[0] + src[1] + src[3] + src[4]) * 0.25f;
+    }
+
+    cvt->len_cvt /= 5;
+    if (cvt->filters[++cvt->filter_index]) {
+        cvt->filters[cvt->filter_index] (cvt, format);
+    }
+}
+
+static void SDLCALL
+SDL_Convert41ToStereo(SDL_AudioCVT * cvt, SDL_AudioFormat format)
+{   /* Convert from 4.1 (FL+FR+LFE+BL+BR) to stereo, in place and front to
+       back. Each output side is a weighted blend of its own front and back
+       channels plus a smaller share of the opposite back channel; the LFE
+       (index 2) is not read. Afterwards cvt->len_cvt is rescaled from 5 to
+       2 floats per frame and the next filter, if any, is called. */
+    float *dst = (float *) cvt->buf;
+    const float *src = dst;
+    int i;
+
+    LOG_DEBUG_CONVERT("4.1", "stereo");
+    SDL_assert(format == AUDIO_F32SYS);
+
+    for (i = cvt->len_cvt / (sizeof (float) * 5); i; --i, src += 5) {
+        /* Load the frame first: dst aliases src on the first frame. */
+        const float fl = src[0];
+        const float fr = src[1];
+        const float bl = src[3];
+        const float br = src[4];
+        /* !!! FIXME: FNA/XNA mixes a little of the back right into the left (and back left into the right) and a little of the LFE...but this can't possibly be right, right...? */
+        *(dst++) = (fl * 0.374222219f) + (bl * 0.319111109f) + (br * 0.195555553f);
+        *(dst++) = (fr * 0.374222219f) + (br * 0.319111109f) + (bl * 0.195555553f);
+    }
+
+    cvt->len_cvt = (cvt->len_cvt / 5) * 2;
+    if (cvt->filters[++cvt->filter_index]) {
+        cvt->filters[cvt->filter_index] (cvt, format);
+    }
+}
+
+static void SDLCALL
+SDL_Convert41To21(SDL_AudioCVT * cvt, SDL_AudioFormat format)
+{   /* Convert from 4.1 (FL+FR+LFE+BL+BR) to 2.1 (FL+FR+LFE), in place and
+       front to back. Left and right are blended from the front and back
+       channels exactly as in the 4.1 to stereo conversion; the LFE is
+       copied through unchanged. Afterwards cvt->len_cvt is rescaled from 5
+       to 3 floats per frame and the next filter, if any, is called. */
+    float *dst = (float *) cvt->buf;
+    const float *src = dst;
+    int i;
+
+    LOG_DEBUG_CONVERT("4.1", "2.1");
+    SDL_assert(format == AUDIO_F32SYS);
+
+    for (i = cvt->len_cvt / (sizeof (float) * 5); i; --i, src += 5) {
+        /* Load the whole frame first: dst aliases src on the first frame. */
+        const float fl = src[0];
+        const float fr = src[1];
+        const float lfe = src[2];
+        const float bl = src[3];
+        const float br = src[4];
+        /* !!! FIXME: FNA/XNA mixes a little of the back right into the left (and back left into the right) and a little of the LFE...but this can't possibly be right, right...? */
+        *(dst++) = (fl * 0.374222219f) + (bl * 0.319111109f) + (br * 0.195555553f);
+        *(dst++) = (fr * 0.374222219f) + (br * 0.319111109f) + (bl * 0.195555553f);
+        *(dst++) = lfe;
+    }
+
+    cvt->len_cvt = (cvt->len_cvt / 5) * 3;
+    if (cvt->filters[++cvt->filter_index]) {
+        cvt->filters[cvt->filter_index] (cvt, format);
+    }
+}
+
+static void SDLCALL
+SDL_Convert41ToQuad(SDL_AudioCVT * cvt, SDL_AudioFormat format)
+{   /* Convert from 4.1 (FL+FR+LFE+BL+BR) to quad (FL+FR+BL+BR), in place
+       and front to back. The four full-range channels are copied and the
+       LFE (index 2) is dropped. Afterwards cvt->len_cvt is rescaled from 5
+       to 4 floats per frame and the next filter, if any, is called. */
+    float *dst = (float *) cvt->buf;
+    const float *src = dst;
+    int i;
+
+    LOG_DEBUG_CONVERT("4.1", "quad");
+    SDL_assert(format == AUDIO_F32SYS);
+
+    for (i = cvt->len_cvt / (sizeof (float) * 5); i; --i, src += 5) {
+        /* !!! FIXME: FNA/XNA mixes a little of the LFE into every channel...but this can't possibly be right, right...? I just drop the LFE and copy the channels. */
+        *(dst++) = src[0];
+        *(dst++) = src[1];
+        *(dst++) = src[3];
+        *(dst++) = src[4];
+    }
+
+    cvt->len_cvt = (cvt->len_cvt / 5) * 4;
+    if (cvt->filters[++cvt->filter_index]) {
+        cvt->filters[cvt->filter_index] (cvt, format);
+    }
+}
+
+static void SDLCALL
+SDL_Convert41To51(SDL_AudioCVT * cvt, SDL_AudioFormat format)
+{   /* Convert from 4.1 (FL+FR+LFE+BL+BR) to 5.1 (FL+FR+FC+LFE+BL+BR) in
+       place, walking from the last frame to the first because the output
+       is larger than the input. All five input channels are copied to
+       their 5.1 slots and FC is silent. Channels are stored highest index
+       first so that on the first frame, where src and dst coincide, each
+       input is read before its slot is overwritten. Afterwards
+       cvt->len_cvt is rescaled from 5 to 6 floats per frame and the next
+       filter, if any, is called. */
+    const float *src = ((const float *) (cvt->buf + cvt->len_cvt)) - 5;
+    float *dst = ((float *) (cvt->buf + ((cvt->len_cvt / 5) * 6))) - 6;
+    int i;
+
+    LOG_DEBUG_CONVERT("4.1", "5.1");
+    SDL_assert(format == AUDIO_F32SYS);
+
+    for (i = cvt->len_cvt / (sizeof (float) * 5); i; --i, src -= 5, dst -= 6) {
+        dst[5] = src[4];
+        dst[4] = src[3];
+        dst[3] = src[2];
+        dst[2] = 0.0f;  /* FC */
+        dst[1] = src[1];
+        dst[0] = src[0];
+    }
+
+    cvt->len_cvt = (cvt->len_cvt / 5) * 6;
+    if (cvt->filters[++cvt->filter_index]) {
+        cvt->filters[cvt->filter_index] (cvt, format);
+    }
+}
+
+/* Convert from 4.1 (FL+FR+LFE+BL+BR) to 6.1 (FL+FR+FC+LFE+BC+SL+SR).
+   Widens the float data in cvt->buf in place, walking from the last frame
+   to the first because the output is larger than the input. The back pair
+   is attenuated into the side speakers and averaged into back center, the
+   front pair is slightly attenuated, the LFE is copied and FC is silent.
+   Afterwards cvt->len_cvt is rescaled from 5 to 7 floats per frame and the
+   next filter in cvt->filters, if any, is called. */
+static void SDLCALL
+SDL_Convert41To61(SDL_AudioCVT * cvt, SDL_AudioFormat format)
+{
+    const float *src = ((const float *) (cvt->buf + cvt->len_cvt)) - 5;
+    /* Input frames are 5 floats wide, so the output ends at
+       (len_cvt / 5) * 7 bytes; dividing by 4 here would start the writes
+       past the real end of the output. */
+    float *dst = ((float *) (cvt->buf + ((cvt->len_cvt / 5) * 7))) - 7;
+    int i;
+
+    LOG_DEBUG_CONVERT("4.1", "6.1");
+    SDL_assert(format == AUDIO_F32SYS);
+
+    /* !!! FIXME: I'm skeptical XNA/FNA's conversion is right, here. */
+
+    for (i = cvt->len_cvt / (sizeof (float) * 5); i; --i, src -= 5, dst -= 7) {
+        /* Cache the back pair: their slots are overwritten below on the first frame. */
+        const float bl = src[3];
+        const float br = src[4];
+        dst[6] = br * 0.796000004f;
+        dst[5] = bl * 0.796000004f;
+        dst[4] = (bl + br) * 0.5f;  /* average BL+BR to BC */
+        dst[3] = src[2];
+        dst[2] = 0.0f;  /* FC */
+        dst[1] = src[1] * 0.939999998f;
+        dst[0] = src[0] * 0.939999998f;
+    }
+
+    cvt->len_cvt = (cvt->len_cvt / 5) * 7;
+    if (cvt->filters[++cvt->filter_index]) {
+        cvt->filters[cvt->filter_index] (cvt, format);
+    }
+}
+
+static void SDLCALL
+SDL_Convert41To71(SDL_AudioCVT * cvt, SDL_AudioFormat format)
+{   /* Convert from 4.1 (FL+FR+LFE+BL+BR) to 7.1
+       (FL+FR+FC+LFE+BL+BR+SL+SR) in place, walking from the last frame to
+       the first because the output is larger than the input. All five
+       input channels are copied to their 7.1 slots; FC and both side
+       channels are silent. Channels are stored highest index first so
+       inputs are read before their memory is reused on the first frame.
+       Afterwards cvt->len_cvt is rescaled from 5 to 8 floats per frame and
+       the next filter, if any, is called. */
+    const float *src = ((const float *) (cvt->buf + cvt->len_cvt)) - 5;
+    float *dst = ((float *) (cvt->buf + ((cvt->len_cvt / 5) * 8))) - 8;
+    int i;
+
+    LOG_DEBUG_CONVERT("4.1", "7.1");
+    SDL_assert(format == AUDIO_F32SYS);
+
+    for (i = cvt->len_cvt / (sizeof (float) * 5); i; --i, src -= 5, dst -= 8) {
+        dst[7] = 0.0f;  /* SR */
+        dst[6] = 0.0f;  /* SL */
+        dst[5] = src[4];
+        dst[4] = src[3];
+        dst[3] = src[2];
+        dst[2] = 0.0f;  /* FC */
+        dst[1] = src[1];
+        dst[0] = src[0];
+    }
+
+    cvt->len_cvt = (cvt->len_cvt / 5) * 8;
+    if (cvt->filters[++cvt->filter_index]) {
+        cvt->filters[cvt->filter_index] (cvt, format);
+    }
+}
+
+
+
+/* CONVERT FROM 5.1... */
+
+static void SDLCALL
+SDL_Convert51ToMono(SDL_AudioCVT * cvt, SDL_AudioFormat format)
+{   /* Convert from 5.1 (FL+FR+FC+LFE+BL+BR) to mono, in place and front to
+       back. The five full-range channels are averaged; index 3, the LFE,
+       is skipped. Afterwards cvt->len_cvt is divided by six and the next
+       filter in cvt->filters, if any, is called. */
+    float *dst = (float *) cvt->buf;
+    const float *src = dst;
+    int i;
+
+    LOG_DEBUG_CONVERT("5.1", "mono");
+    SDL_assert(format == AUDIO_F32SYS);
+
+    for (i = cvt->len_cvt / (sizeof (float) * 6); i; --i, src += 6) {
+        *(dst++) = (src[0] + src[1] + src[2] + src[4] + src[5]) * 0.200000003f;
+    }
+
+    cvt->len_cvt /= 6;
+    if (cvt->filters[++cvt->filter_index]) {
+        cvt->filters[cvt->filter_index] (cvt, format);
+    }
+}
+
+static void SDLCALL
+SDL_Convert51ToStereo(SDL_AudioCVT * cvt, SDL_AudioFormat format)
+{   /* Convert from 5.1 (FL+FR+FC+LFE+BL+BR) to stereo, in place and front
+       to back. Each output side blends its own front and back channels,
+       the front center and a smaller share of the opposite back channel;
+       the LFE (index 3) is not read. Afterwards cvt->len_cvt is rescaled
+       from 6 to 2 floats per frame and the next filter, if any, is
+       called. */
+    float *dst = (float *) cvt->buf;
+    const float *src = dst;
+    int i;
+
+    LOG_DEBUG_CONVERT("5.1", "stereo");
+    SDL_assert(format == AUDIO_F32SYS);
 
+    for (i = cvt->len_cvt / (sizeof (float) * 6); i; --i, src += 6) {
+        const float fl = src[0];
+        const float fr = src[1];
+        const float fc = src[2];
+        const float bl = src[4];
+        const float br = src[5];
+        const float extra = 0.090909094f / 4.0f;  /* this was the LFE distribution, we'll just split it between the other channels for now. */
 
-    /* Finish off any leftovers with scalar operations. */
-    while (i) {
-        const float front_center_distributed = src[2] * 0.5f;
-        dst[0] = (src[0] + front_center_distributed + src[4]) * two_fifths_f;  /* left */
-        dst[1] = (src[1] + front_center_distributed + src[5]) * two_fifths_f;  /* right */
-        i--; src += 6; dst+=2;
+        /* !!! FIXME: FNA/XNA mixes a little of the back right into the left (and back left into the right) and a little of the LFE...but this can't possibly be right, right...? */
+        *(dst++) = (fl * (0.294545442f+extra)) + (fc * (0.208181813f+extra)) + (bl * (0.251818180f+extra)) + (br * (0.154545456f+extra));
+        *(dst++) = (fr * (0.294545442f+extra)) + (fc * (0.208181813f+extra)) + (br * (0.251818180f+extra)) + (bl * (0.154545456f+extra));
     }
 
-    cvt->len_cvt /= 3;
+    cvt->len_cvt = (cvt->len_cvt / 6) * 2;
     if (cvt->filters[++cvt->filter_index]) {
         cvt->filters[cvt->filter_index] (cvt, format);
     }
 }
-#endif
 
-#if HAVE_NEON_INTRINSICS
-/* Convert from 5.1 to stereo. Average left and right, distribute center, discard LFE. */
 static void SDLCALL
-SDL_Convert51ToStereo_NEON(SDL_AudioCVT * cvt, SDL_AudioFormat format)
+SDL_Convert51To21(SDL_AudioCVT * cvt, SDL_AudioFormat format)
 {
     float *dst = (float *) cvt->buf;
     const float *src = dst;
-    int i = cvt->len_cvt / (sizeof (float) * 6);
-    const float two_fifths_f = 1.0f / 2.5f;
-    const float32x4_t two_fifths_v = vdupq_n_f32(two_fifths_f);
-    const float32x4_t half = vdupq_n_f32(0.5f);
+    int i;  /* Converts 5.1 (FL+FR+FC+LFE+BL+BR) to 2.1 (FL+FR+LFE), in
+               place and front to back: the five full-range channels are
+               blended into left and right and the LFE is copied. */
 
-    LOG_DEBUG_CONVERT("5.1", "stereo (using NEON)");
+    LOG_DEBUG_CONVERT("5.1", "2.1");
     SDL_assert(format == AUDIO_F32SYS);
 
-    /* SDL's 5.1 layout: FL+FR+FC+LFE+BL+BR */
-
-    /* Just use unaligned load/stores, it's the same NEON instructions and
-       hopefully even unaligned NEON is faster than the scalar fallback. */
-    while (i >= 2) {
-        /* Two 5.1 samples (12 floats) fit nicely in three 128bit */
-        /* registers. Using shuffles they can be rearranged so that */
-        /* the conversion math can be vectorized. */
-        const float32x4_t in0 = vld1q_f32(src);     /* 0FL 0FR 0FC 0LF */
-        const float32x4_t in1 = vld1q_f32(src + 4); /* 0BL 0BR 1FL 1FR */
-        const float32x4_t in2 = vld1q_f32(src + 8); /* 1FC 1LF 1BL 1BR */
-
-        /* 0FC 0FC 1FC 1FC */
-        const float32x4_t fc_distributed = vmulq_f32(half, vcombine_f32(vdup_lane_f32(vget_high_f32(in0), 0), vdup_lane_f32(vget_low_f32(in2), 0)));
+    for (i = cvt->len_cvt / (sizeof (float) * 6); i; --i, src += 6) {
+        const float fl = src[0];
+        const float fr = src[1];
+        const float fc = src[2];
+        const float lfe = src[3];
+        const float bl = src[4];
+        const float br = src[5];
 
-        /* 0FL 0FR 1BL 1BR */
-        const float32x4_t blended = vcombine_f32(vget_low_f32(in0), vget_high_f32(in2));
+        /* !!! FIXME: FNA/XNA mixes a little of the back right into the left (and back left into the right) and a little of the LFE...but this can't possibly be right, right...? */
+        *(dst++) = (fl * 0.324000001f) + (fc * 0.229000002f) + (bl * 0.277000010f) + (br * 0.170000002f);
+        *(dst++) = (fr * 0.324000001f) + (fc * 0.229000002f) + (br * 0.277000010f) + (bl * 0.170000002f);
+        *(dst++) = lfe;  /* LFE passes through unchanged */
+    }
 
-        /*   0FL 0FR 1BL 1BR */
-        /* + 0BL 0BR 1FL 1FR */
-        /* =  0L  0R  1L  1R */
-        float32x4_t out = vaddq_f32(blended, in1);
-        out = vaddq_f32(out, fc_distributed);
-        out = vmulq_f32(out, two_fifths_v);
+    cvt->len_cvt = (cvt->len_cvt / 6) * 3;  /* six floats per frame in, three out */
+    if (cvt->filters[++cvt->filter_index]) {
+        cvt->filters[cvt->filter_index] (cvt, format);
+    }
+}
 
-        vst1q_f32(dst, out);
+static void SDLCALL
+SDL_Convert51ToQuad(SDL_AudioCVT * cvt, SDL_AudioFormat format)
+{
+    float *dst = (float *) cvt->buf;
+    const float *src = dst;
+    int i;
 
-        i -= 2; src += 12; dst += 4;
-    }
+    LOG_DEBUG_CONVERT("5.1", "quad");
+    SDL_assert(format == AUDIO_F32SYS);
 
-    /* Finish off any leftovers with scalar operations. */
-    while (i) {
-        const float front_center_distributed = src[2] * 0.5f;
-        dst[0] = (src[0] + front_center_distributed + src[4]) * two_fifths_f;  /* left */
-        dst[1] = (src[1] + front_center_distributed + src[5]) * two_fifths_f;  /* right */
-        i--; src += 6; dst+=2;
+    for (i = cvt->len_cvt / (sizeof (float) * 6); i; --i, src += 6) {
+        const float fl = src[0];
+        const float fr = 

(Patch may be truncated, please check the link at the top of this post.)