SDL: Fixed bug #4534: NEON implementation of Convert51ToStereo (Thanks Ryan!)

From 072e3fdfc437a87832d4ff569e8e1393abe251ae Mon Sep 17 00:00:00 2001
From: Sylvain <[EMAIL REDACTED]>
Date: Thu, 14 Oct 2021 23:17:08 +0200
Subject: [PATCH] Fixed bug #4534: NEON implementation of Convert51ToStereo
 (Thanks Ryan!)

---
 src/audio/SDL_audiocvt.c | 74 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 72 insertions(+), 2 deletions(-)

diff --git a/src/audio/SDL_audiocvt.c b/src/audio/SDL_audiocvt.c
index 225ced6661..f0cb7e6e40 100644
--- a/src/audio/SDL_audiocvt.c
+++ b/src/audio/SDL_audiocvt.c
@@ -35,6 +35,10 @@
 
 #define DEBUG_AUDIOSTREAM 0
 
+#ifdef __ARM_NEON
+#define HAVE_NEON_INTRINSICS 1
+#endif
+
 #ifdef __SSE__
 #define HAVE_SSE_INTRINSICS 1
 #endif
@@ -240,6 +244,66 @@ SDL_Convert51ToStereo_SSE(SDL_AudioCVT * cvt, SDL_AudioFormat format)
 }
 #endif
 
+#if HAVE_NEON_INTRINSICS
+/* Convert from 5.1 to stereo (NEON path). Per side: (front + back + half of center) * (1 / 2.5); LFE is discarded. */
+static void SDLCALL
+SDL_Convert51ToStereo_NEON(SDL_AudioCVT * cvt, SDL_AudioFormat format)
+{
+    float *dst = (float *) cvt->buf;
+    const float *src = dst;  /* in-place: reads and writes share cvt->buf; dst advances slower than src */
+    int i = cvt->len_cvt / (sizeof (float) * 6);  /* number of 6-float sample frames to process */
+    const float two_fifths_f = 1.0f / 2.5f;
+    const float32x4_t two_fifths_v = vdupq_n_f32(two_fifths_f);
+    const float32x4_t half = vdupq_n_f32(0.5f);
+
+    LOG_DEBUG_CONVERT("5.1", "stereo (using NEON)");
+    SDL_assert(format == AUDIO_F32SYS);
+
+    /* SDL's 5.1 layout: FL+FR+FC+LFE+BL+BR */
+
+    /* Just use unaligned load/stores, it's the same NEON instructions and
+       hopefully even unaligned NEON is faster than the scalar fallback. */
+    while (i >= 2) {
+        /* Two 5.1 sample frames (12 floats) fit exactly in three 128-bit */
+        /* registers. Using shuffles they can be rearranged so that */
+        /* the conversion math can be vectorized. */
+        const float32x4_t in0 = vld1q_f32(src);     /* 0FL 0FR 0FC 0LF */
+        const float32x4_t in1 = vld1q_f32(src + 4); /* 0BL 0BR 1FL 1FR */
+        const float32x4_t in2 = vld1q_f32(src + 8); /* 1FC 1LF 1BL 1BR */
+
+        /* 0.5 * (0FC 0FC 1FC 1FC): half of each frame's center, once per output lane */
+        const float32x4_t fc_distributed = vmulq_f32(half, vcombine_f32(vdup_lane_f32(vget_high_f32(in0), 0), vdup_lane_f32(vget_low_f32(in2), 0)));
+
+        /* 0FL 0FR 1BL 1BR (low half of in0, high half of in2) */
+        const float32x4_t blended = vcombine_f32(vget_low_f32(in0), vget_high_f32(in2));
+
+        /*   0FL 0FR 1BL 1BR */
+        /* + 0BL 0BR 1FL 1FR */
+        /* =  0L  0R  1L  1R  (front + back per side, for both frames) */
+        float32x4_t out = vaddq_f32(blended, in1);
+        out = vaddq_f32(out, fc_distributed);
+        out = vmulq_f32(out, two_fifths_v);  /* same 1/2.5 scale as the scalar tail below */
+
+        vst1q_f32(dst, out);  /* all loads for these frames are done, so overwriting the buffer start is safe */
+
+        i -= 2; src += 12; dst += 4;  /* consumed two 6-float frames, produced two stereo frames */
+    }
+
+    /* Finish off any leftovers with scalar operations (same math as the vector loop). */
+    while (i) {
+        const float front_center_distributed = src[2] * 0.5f;
+        dst[0] = (src[0] + front_center_distributed + src[4]) * two_fifths_f;  /* left */
+        dst[1] = (src[1] + front_center_distributed + src[5]) * two_fifths_f;  /* right */
+        i--; src += 6; dst+=2;
+    }
+
+    cvt->len_cvt /= 3;  /* 6 floats per frame in, 2 floats per frame out */
+    if (cvt->filters[++cvt->filter_index]) {  /* advance to the next filter slot and run it if set */
+        cvt->filters[cvt->filter_index] (cvt, format);
+    }
+}
+#endif
+
 /* Convert from 5.1 to stereo. Average left and right, distribute center, discard LFE. */
 static void SDLCALL
 SDL_Convert51ToStereo(SDL_AudioCVT * cvt, SDL_AudioFormat format)
@@ -1177,6 +1241,12 @@ SDL_BuildAudioCVT(SDL_AudioCVT * cvt,
             }
             #endif
 
+            #if HAVE_NEON_INTRINSICS
+            if (!filter && SDL_HasNEON()) {
+                filter = SDL_Convert51ToStereo_NEON;
+            }
+            #endif
+
             if (!filter) {
                 filter = SDL_Convert51ToStereo;
             }
@@ -1231,7 +1301,7 @@ SDL_BuildAudioCVT(SDL_AudioCVT * cvt,
            handled by now, but let's be defensive */
       return SDL_SetError("Invalid channel combination");
     }
-    
+
     /* Do rate conversion, if necessary. Updates (cvt). */
     if (SDL_BuildAudioResampleCVT(cvt, dst_channels, src_rate, dst_rate) < 0) {
         return -1;              /* shouldn't happen, but just in case... */
@@ -1713,7 +1783,7 @@ SDL_AudioStreamPut(SDL_AudioStream *stream, const void *buf, int len)
             stream->staging_buffer_filled += len;
             return 0;
         }
- 
+
         /* Fill the staging buffer, process it, and continue */
         amount = (stream->staging_buffer_size - stream->staging_buffer_filled);
         SDL_assert(amount > 0);