SDL: audio: backport SDL3's audio datatype converters.

From 1e2f3118117a3cd6d483ea090db8d15a50f3bfdc Mon Sep 17 00:00:00 2001
From: "Ryan C. Gordon" <[EMAIL REDACTED]>
Date: Tue, 16 Jan 2024 10:01:46 -0500
Subject: [PATCH] audio: backport SDL3's audio datatype converters.

These are faster and more precise.

This did not change the Uint16 versions (as SDL3 dropped them, and honestly
no one should be using them in SDL2), nor the NEON converters (as these have
not changed in SDL3 so far).

Fixes #8786.
---
 src/audio/SDL_audiotypecvt.c | 701 +++++++++++++++++------------------
 1 file changed, 338 insertions(+), 363 deletions(-)

diff --git a/src/audio/SDL_audiotypecvt.c b/src/audio/SDL_audiotypecvt.c
index 1089fe015135..d99d2def5f09 100644
--- a/src/audio/SDL_audiotypecvt.c
+++ b/src/audio/SDL_audiotypecvt.c
@@ -62,18 +62,36 @@ SDL_AudioFilter SDL_Convert_F32_to_S32 = NULL;
 #define DIVBY128     0.0078125f
 #define DIVBY32768   0.000030517578125f
 #define DIVBY8388607 0.00000011920930376163766f
+#define DIVBY2147483648 0.0000000004656612873077392578125f /* 0x1p-31f, i.e. 1.0 / 2147483648.0 */
 
 #if NEED_SCALAR_CONVERTER_FALLBACKS
+
+/* The bit tricks below require IEEE-754 binary32 floats; this assert only verifies that float is 32 bits wide */
+SDL_COMPILE_TIME_ASSERT(float_bits, sizeof(float) == sizeof(Uint32));
+
+union float_bits {
+    Uint32 u32;
+    float f32;
+};
+
+/* All-ones (0xFFFFFFFF) when bit 31 of x is set, otherwise 0. Should optimize to a single arithmetic-shift-right */
+#define SIGNMASK(x) (Uint32)(0u - ((Uint32)(x) >> 31))
+
 static void SDLCALL SDL_Convert_S8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
 {
-    const Sint8 *src = ((const Sint8 *)(cvt->buf + cvt->len_cvt)) - 1;
-    float *dst = ((float *)(cvt->buf + cvt->len_cvt * 4)) - 1;
+    const int num_samples = cvt->len_cvt;
+    const Sint8 *src = (const Sint8 *)cvt->buf;
+    float *dst = (float *)cvt->buf;
     int i;
 
     LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32");
 
-    for (i = cvt->len_cvt; i; --i, --src, --dst) {
-        *dst = ((float)*src) * DIVBY128;
+    for (i = num_samples - 1; i >= 0; --i) {
+        /* 1) XOR flips the S8 sign bit and sets the bits of 65536.0f: a float in [65536.0, 65538.0)
+         * 2) Subtract 65537.0 to land in [-1.0, 1.0). Iterates backwards as dst and src share cvt->buf */
+        union float_bits x;
+        x.u32 = (Uint8)src[i] ^ 0x47800080u;
+        dst[i] = x.f32 - 65537.0f;
     }
 
     cvt->len_cvt *= 4;
@@ -84,14 +102,19 @@ static void SDLCALL SDL_Convert_S8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFor
 
 static void SDLCALL SDL_Convert_U8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
 {
-    const Uint8 *src = ((const Uint8 *)(cvt->buf + cvt->len_cvt)) - 1;
-    float *dst = ((float *)(cvt->buf + cvt->len_cvt * 4)) - 1;
+    const int num_samples = cvt->len_cvt;
+    const Uint8 *src = (const Uint8 *)cvt->buf;
+    float *dst = (float *)cvt->buf;
     int i;
 
     LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32");
 
-    for (i = cvt->len_cvt; i; --i, --src, --dst) {
-        *dst = (((float)*src) * DIVBY128) - 1.0f;
+    for (i = num_samples - 1; i >= 0; --i) {
+        /* 1) XOR the byte into the bits of 65536.0f (0x47800000): a float in [65536.0, 65538.0)
+         * 2) Subtract 65537.0 to land in [-1.0, 1.0). Iterates backwards as dst and src share cvt->buf */
+        union float_bits x;
+        x.u32 = src[i] ^ 0x47800000u;
+        dst[i] = x.f32 - 65537.0f;
     }
 
     cvt->len_cvt *= 4;
@@ -102,14 +125,19 @@ static void SDLCALL SDL_Convert_U8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFor
 
 static void SDLCALL SDL_Convert_S16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
 {
-    const Sint16 *src = ((const Sint16 *)(cvt->buf + cvt->len_cvt)) - 1;
-    float *dst = ((float *)(cvt->buf + cvt->len_cvt * 2)) - 1;
+    const int num_samples = cvt->len_cvt / sizeof(Sint16); /* len_cvt is a byte count, not a sample count */
+    const Sint16 *src = (const Sint16 *)cvt->buf;
+    float *dst = (float *)cvt->buf;
     int i;
 
     LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32");
 
-    for (i = cvt->len_cvt / sizeof(Sint16); i; --i, --src, --dst) {
-        *dst = ((float)*src) * DIVBY32768;
+    for (i = num_samples - 1; i >= 0; --i) {
+        /* 1) XOR flips the S16 sign bit and sets the bits of 256.0f: a float in [256.0, 258.0)
+         * 2) Subtract 257.0 to land in [-1.0, 1.0). Iterates backwards as dst and src share cvt->buf */
+        union float_bits x;
+        x.u32 = (Uint16)src[i] ^ 0x43808000u;
+        dst[i] = x.f32 - 257.0f;
     }
 
     cvt->len_cvt *= 2;
@@ -155,21 +183,26 @@ static void SDLCALL SDL_Convert_S32_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFo
 
 static void SDLCALL SDL_Convert_F32_to_S8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
 {
+    const int num_samples = cvt->len_cvt / sizeof (float);
     const float *src = (const float *)cvt->buf;
     Sint8 *dst = (Sint8 *)cvt->buf;
     int i;
 
     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8");
 
-    for (i = cvt->len_cvt / sizeof(float); i; --i, ++src, ++dst) {
-        const float sample = *src;
-        if (sample >= 1.0f) {
-            *dst = 127;
-        } else if (sample <= -1.0f) {
-            *dst = -128;
-        } else {
-            *dst = (Sint8)(sample * 127.0f);
-        }
+    for (i = 0; i < num_samples; ++i) {
+        /* 1) Add 98304.0 so the float range [-1.0, 1.0] becomes [98303.0, 98305.0]
+         * 2) Subtract the bits of 98304.0f (0x47C00000): [0x47BFFF80, 0x47C00080] becomes [-128, 128]
+         * 3) Clamp to [-128, 127] without branching: z has its sign bit set when y is out of range */
+        union float_bits x;
+        Uint32 y, z;
+        x.f32 = src[i] + 98304.0f;
+
+        y = x.u32 - 0x47C00000u;
+        z = 0x7Fu - (y ^ SIGNMASK(y));
+        y = y ^ (z & SIGNMASK(z));
+
+        dst[i] = (Sint8)(y & 0xFF);
     }
 
     cvt->len_cvt /= 4;
@@ -180,21 +213,27 @@ static void SDLCALL SDL_Convert_F32_to_S8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFor
 
 static void SDLCALL SDL_Convert_F32_to_U8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
 {
+    const int num_samples = cvt->len_cvt / sizeof (float);
     const float *src = (const float *)cvt->buf;
     Uint8 *dst = (Uint8 *)cvt->buf;
     int i;
 
     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8");
 
-    for (i = cvt->len_cvt / sizeof(float); i; --i, ++src, ++dst) {
-        const float sample = *src;
-        if (sample >= 1.0f) {
-            *dst = 255;
-        } else if (sample <= -1.0f) {
-            *dst = 0;
-        } else {
-            *dst = (Uint8)((sample + 1.0f) * 127.0f);
-        }
+    for (i = 0; i < num_samples; ++i) {
+        /* 1) Add 98304.0 so the float range [-1.0, 1.0] becomes [98303.0, 98305.0]
+         * 2) Subtract the bits of 98304.0f (0x47C00000): [0x47BFFF80, 0x47C00080] becomes [-128, 128]
+         * 3) Clamp to [-128, 127] without branching: z has its sign bit set when y is out of range
+         * 4) XOR with 0x80 to move the integer range from [-128, 127] to [0, 255] */
+        union float_bits x;
+        Uint32 y, z;
+        x.f32 = src[i] + 98304.0f;
+
+        y = x.u32 - 0x47C00000u;
+        z = 0x7Fu - (y ^ SIGNMASK(y));
+        y = (y ^ 0x80u) ^ (z & SIGNMASK(z));
+
+        dst[i] = (Uint8)(y & 0xFF);
     }
 
     cvt->len_cvt /= 4;
@@ -205,21 +244,25 @@ static void SDLCALL SDL_Convert_F32_to_U8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFor
 
 static void SDLCALL SDL_Convert_F32_to_S16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
 {
+    const int num_samples = cvt->len_cvt / sizeof (float);
     const float *src = (const float *)cvt->buf;
     Sint16 *dst = (Sint16 *)cvt->buf;
     int i;
 
     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16");
 
-    for (i = cvt->len_cvt / sizeof(float); i; --i, ++src, ++dst) {
-        const float sample = *src;
-        if (sample >= 1.0f) {
-            *dst = 32767;
-        } else if (sample <= -1.0f) {
-            *dst = -32768;
-        } else {
-            *dst = (Sint16)(sample * 32767.0f);
-        }
+    for (i = 0; i < num_samples; ++i) {
+        /* 1) Add 384.0 so the float range [-1.0, 1.0] becomes [383.0, 385.0]
+         * 2) Subtract the bits of 384.0f (0x43C00000): [0x43BF8000, 0x43C08000] becomes [-32768, 32768]
+         * 3) Clamp to [-32768, 32767] without branching: z has its sign bit set when y is out of range */
+        union float_bits x;
+        Uint32 y, z; /* declared before any statement, matching the S8/U8 converters (C89-friendly) */
+        x.f32 = src[i] + 384.0f;
+        y = x.u32 - 0x43C00000u;
+        z = 0x7FFFu - (y ^ SIGNMASK(y));
+        y = y ^ (z & SIGNMASK(z));
+
+        dst[i] = (Sint16)(y & 0xFFFF);
     }
 
     cvt->len_cvt /= 2;
@@ -255,21 +298,26 @@ static void SDLCALL SDL_Convert_F32_to_U16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFo
 
 static void SDLCALL SDL_Convert_F32_to_S32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
 {
+    const int num_samples = cvt->len_cvt / sizeof (float);
     const float *src = (const float *)cvt->buf;
     Sint32 *dst = (Sint32 *)cvt->buf;
     int i;
 
     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32");
 
-    for (i = cvt->len_cvt / sizeof(float); i; --i, ++src, ++dst) {
-        const float sample = *src;
-        if (sample >= 1.0f) {
-            *dst = 2147483647;
-        } else if (sample <= -1.0f) {
-            *dst = (Sint32)-2147483648LL;
-        } else {
-            *dst = ((Sint32)(sample * 8388607.0f)) << 8;
-        }
+    for (i = 0; i < num_samples; ++i) {
+        /* 1) Add 31 to the exponent field (+0x0F800000): [-1.0, 1.0] becomes [-2147483648.0, 2147483648.0]
+         * 2) Set values outside the [-2147483648.0, 2147483647.0] range to -2147483648.0
+         * 3) Convert the float to an integer, and fixup values outside the valid range */
+        union float_bits x;
+        Uint32 y, z; /* declared before any statement, matching the S8/U8 converters (C89-friendly) */
+        x.f32 = src[i];
+        y = x.u32 + 0x0F800000u;
+        z = y - 0xCF000000u;
+        z &= SIGNMASK(y ^ z);
+        x.u32 = y - z;
+
+        dst[i] = (Sint32)x.f32 ^ (Sint32)SIGNMASK(z);
     }
 
     if (cvt->filters[++cvt->filter_index]) {
@@ -281,60 +329,45 @@ static void SDLCALL SDL_Convert_F32_to_S32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFo
 #ifdef HAVE_SSE2_INTRINSICS
 static void SDLCALL SDL_Convert_S8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
 {
-    const Sint8 *src = ((const Sint8 *)(cvt->buf + cvt->len_cvt)) - 1;
-    float *dst = ((float *)(cvt->buf + cvt->len_cvt * 4)) - 1;
-    int i;
+    const Sint8 *src = (const Sint8 *)cvt->buf;
+    float *dst = (float *)cvt->buf;
+    int i = cvt->len_cvt;
+
+    /* 1) Flip the sign bit to convert from S8 to U8 format
+     * 2) Construct a float in the range [65536.0, 65538.0)
+     * 3) Shift the float range to [-1.0, 1.0); chunks go back to front as dst and src share cvt->buf
+     * dst[i] = i2f((src[i] ^ 0x80) | 0x47800000) - 65537.0 */
+    const __m128i zero = _mm_setzero_si128();
+    const __m128i flipper = _mm_set1_epi8(-0x80);
+    const __m128i caster = _mm_set1_epi16(0x4780 /* 0x47800000 = f2i(65536.0) */);
+    const __m128 offset = _mm_set1_ps(-65537.0);
 
     LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32 (using SSE2)");
 
-    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
-    for (i = cvt->len_cvt; i && (((size_t)(dst - 15)) & 15); --i, --src, --dst) {
-        *dst = ((float)*src) * DIVBY128;
-    }
+    while (i >= 16) {
+        i -= 16;
 
-    src -= 15;
-    dst -= 15; /* adjust to read SSE blocks from the start. */
-    SDL_assert(!i || !(((size_t)dst) & 15));
+        {
+        const __m128i bytes = _mm_xor_si128(_mm_loadu_si128((const __m128i *)&src[i]), flipper); /* i is already this chunk's first sample */
 
-    /* Make sure src is aligned too. */
-    if (!(((size_t)src) & 15)) {
-        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
-        const __m128i *mmsrc = (const __m128i *)src;
-        const __m128i zero = _mm_setzero_si128();
-        const __m128 divby128 = _mm_set1_ps(DIVBY128);
-        while (i >= 16) {                                /* 16 * 8-bit */
-            const __m128i bytes = _mm_load_si128(mmsrc); /* get 16 sint8 into an XMM register. */
-            /* treat as int16, shift left to clear every other sint16, then back right with sign-extend. Now sint16. */
-            const __m128i shorts1 = _mm_srai_epi16(_mm_slli_epi16(bytes, 8), 8);
-            /* right-shift-sign-extend gets us sint16 with the other set of values. */
-            const __m128i shorts2 = _mm_srai_epi16(bytes, 8);
-            /* unpack against zero to make these int32, shift to make them sign-extend, convert to float, multiply. Whew! */
-            const __m128 floats1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts1, zero), 16), 16)), divby128);
-            const __m128 floats2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts2, zero), 16), 16)), divby128);
-            const __m128 floats3 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts1, zero), 16), 16)), divby128);
-            const __m128 floats4 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts2, zero), 16), 16)), divby128);
-            /* Interleave back into correct order, store. */
-            _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
-            _mm_store_ps(dst + 4, _mm_unpackhi_ps(floats1, floats2));
-            _mm_store_ps(dst + 8, _mm_unpacklo_ps(floats3, floats4));
-            _mm_store_ps(dst + 12, _mm_unpackhi_ps(floats3, floats4));
-            i -= 16;
-            mmsrc--;
-            dst -= 16;
-        }
+        const __m128i shorts1 = _mm_unpacklo_epi8(bytes, zero);
+        const __m128i shorts2 = _mm_unpackhi_epi8(bytes, zero);
 
-        src = (const Sint8 *)mmsrc;
-    }
+        const __m128 floats1 = _mm_add_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(shorts1, caster)), offset);
+        const __m128 floats2 = _mm_add_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(shorts1, caster)), offset);
+        const __m128 floats3 = _mm_add_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(shorts2, caster)), offset);
+        const __m128 floats4 = _mm_add_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(shorts2, caster)), offset);
 
-    src += 15;
-    dst += 15; /* adjust for any scalar finishing. */
+        _mm_storeu_ps(&dst[i], floats1);
+        _mm_storeu_ps(&dst[i + 4], floats2);
+        _mm_storeu_ps(&dst[i + 8], floats3);
+        _mm_storeu_ps(&dst[i + 12], floats4);
+        }
+    }
 
-    /* Finish off any leftovers with scalar operations. */
     while (i) {
-        *dst = ((float)*src) * DIVBY128;
-        i--;
-        src--;
-        dst--;
+        --i;
+        _mm_store_ss(&dst[i], _mm_add_ss(_mm_castsi128_ps(_mm_cvtsi32_si128((Uint8)src[i] ^ 0x47800080u)), offset));
     }
 
     cvt->len_cvt *= 4;
@@ -345,62 +378,43 @@ static void SDLCALL SDL_Convert_S8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioForma
 
 static void SDLCALL SDL_Convert_U8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
 {
-    const Uint8 *src = ((const Uint8 *)(cvt->buf + cvt->len_cvt)) - 1;
-    float *dst = ((float *)(cvt->buf + cvt->len_cvt * 4)) - 1;
-    int i;
+    const Uint8 *src = (const Uint8 *)cvt->buf;
+    float *dst = (float *)cvt->buf;
+    int i = cvt->len_cvt;
+
+    /* 1) Construct a float in the range [65536.0, 65538.0)
+     * 2) Shift the float range to [-1.0, 1.0); chunks go back to front as dst and src share cvt->buf
+     * dst[i] = i2f(src[i] | 0x47800000) - 65537.0 */
+    const __m128i zero = _mm_setzero_si128();
+    const __m128i caster = _mm_set1_epi16(0x4780 /* 0x47800000 = f2i(65536.0) */);
+    const __m128 offset = _mm_set1_ps(-65537.0);
 
     LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32 (using SSE2)");
 
-    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
-    for (i = cvt->len_cvt; i && (((size_t)(dst - 15)) & 15); --i, --src, --dst) {
-        *dst = (((float)*src) * DIVBY128) - 1.0f;
-    }
+    while (i >= 16) {
+        i -= 16;
 
-    src -= 15;
-    dst -= 15; /* adjust to read SSE blocks from the start. */
-    SDL_assert(!i || !(((size_t)dst) & 15));
+        {
+        const __m128i bytes = _mm_loadu_si128((const __m128i *)&src[i]);
 
-    /* Make sure src is aligned too. */
-    if (!(((size_t)src) & 15)) {
-        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
-        const __m128i *mmsrc = (const __m128i *)src;
-        const __m128i zero = _mm_setzero_si128();
-        const __m128 divby128 = _mm_set1_ps(DIVBY128);
-        const __m128 minus1 = _mm_set1_ps(-1.0f);
-        while (i >= 16) {                                /* 16 * 8-bit */
-            const __m128i bytes = _mm_load_si128(mmsrc); /* get 16 uint8 into an XMM register. */
-            /* treat as int16, shift left to clear every other sint16, then back right with zero-extend. Now uint16. */
-            const __m128i shorts1 = _mm_srli_epi16(_mm_slli_epi16(bytes, 8), 8);
-            /* right-shift-zero-extend gets us uint16 with the other set of values. */
-            const __m128i shorts2 = _mm_srli_epi16(bytes, 8);
-            /* unpack against zero to make these int32, convert to float, multiply, add. Whew! */
-            /* Note that AVX2 can do floating point multiply+add in one instruction, fwiw. SSE2 cannot. */
-            const __m128 floats1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts1, zero)), divby128), minus1);
-            const __m128 floats2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts2, zero)), divby128), minus1);
-            const __m128 floats3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts1, zero)), divby128), minus1);
-            const __m128 floats4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts2, zero)), divby128), minus1);
-            /* Interleave back into correct order, store. */
-            _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
-            _mm_store_ps(dst + 4, _mm_unpackhi_ps(floats1, floats2));
-            _mm_store_ps(dst + 8, _mm_unpacklo_ps(floats3, floats4));
-            _mm_store_ps(dst + 12, _mm_unpackhi_ps(floats3, floats4));
-            i -= 16;
-            mmsrc--;
-            dst -= 16;
-        }
+        const __m128i shorts1 = _mm_unpacklo_epi8(bytes, zero);
+        const __m128i shorts2 = _mm_unpackhi_epi8(bytes, zero);
 
-        src = (const Uint8 *)mmsrc;
-    }
+        const __m128 floats1 = _mm_add_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(shorts1, caster)), offset);
+        const __m128 floats2 = _mm_add_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(shorts1, caster)), offset);
+        const __m128 floats3 = _mm_add_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(shorts2, caster)), offset);
+        const __m128 floats4 = _mm_add_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(shorts2, caster)), offset);
 
-    src += 15;
-    dst += 15; /* adjust for any scalar finishing. */
+        _mm_storeu_ps(&dst[i], floats1);
+        _mm_storeu_ps(&dst[i + 4], floats2);
+        _mm_storeu_ps(&dst[i + 8], floats3);
+        _mm_storeu_ps(&dst[i + 12], floats4);
+        }
+    }
 
-    /* Finish off any leftovers with scalar operations. */
     while (i) {
-        *dst = (((float)*src) * DIVBY128) - 1.0f;
-        i--;
-        src--;
-        dst--;
+        --i;
+        _mm_store_ss(&dst[i], _mm_add_ss(_mm_castsi128_ps(_mm_cvtsi32_si128((Uint8)src[i] ^ 0x47800000u)), offset));
     }
 
     cvt->len_cvt *= 4;
@@ -411,49 +425,42 @@ static void SDLCALL SDL_Convert_U8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioForma
 
 static void SDLCALL SDL_Convert_S16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
 {
-    const Sint16 *src = ((const Sint16 *)(cvt->buf + cvt->len_cvt)) - 1;
-    float *dst = ((float *)(cvt->buf + cvt->len_cvt * 2)) - 1;
-    int i;
+    const Sint16 *src = (const Sint16 *)cvt->buf; /* must be 16-bit: i below indexes samples, not bytes */
+    float *dst = (float *)cvt->buf;
+    int i = cvt->len_cvt / 2;
+
+    /* 1) Flip the sign bit to convert from S16 to U16 format
+     * 2) Construct a float in the range [256.0, 258.0)
+     * 3) Shift the float range to [-1.0, 1.0); chunks go back to front as dst and src share cvt->buf
+     * dst[i] = i2f((src[i] ^ 0x8000) | 0x43800000) - 257.0 */
+    const __m128i flipper = _mm_set1_epi16(-0x8000);
+    const __m128i caster = _mm_set1_epi16(0x4380 /* 0x43800000 = f2i(256.0) */);
+    const __m128 offset = _mm_set1_ps(-257.0f);
 
     LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32 (using SSE2)");
 
-    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
-    for (i = cvt->len_cvt / sizeof(Sint16); i && (((size_t)(dst - 7)) & 15); --i, --src, --dst) {
-        *dst = ((float)*src) * DIVBY32768;
-    }
+    while (i >= 16) {
+        i -= 16;
 
-    src -= 7;
-    dst -= 7; /* adjust to read SSE blocks from the start. */
-    SDL_assert(!i || !(((size_t)dst) & 15));
+        {
+        const __m128i shorts1 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)&src[i]), flipper);
+        const __m128i shorts2 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)&src[i + 8]), flipper);
 
-    /* Make sure src is aligned too. */
-    if (!(((size_t)src) & 15)) {
-        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
-        const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
-        while (i >= 8) {                                               /* 8 * 16-bit */
-            const __m128i ints = _mm_load_si128((__m128i const *)src); /* get 8 sint16 into an XMM register. */
-            /* treat as int32, shift left to clear every other sint16, then back right with sign-extend. Now sint32. */
-            const __m128i a = _mm_srai_epi32(_mm_slli_epi32(ints, 16), 16);
-            /* right-shift-sign-extend gets us sint32 with the other set of values. */
-            const __m128i b = _mm_srai_epi32(ints, 16);
-            /* Interleave these back into the right order, convert to float, multiply, store. */
-            _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768));
-            _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768));
-            i -= 8;
-            src -= 8;
-            dst -= 8;
+        const __m128 floats1 = _mm_add_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(shorts1, caster)), offset);
+        const __m128 floats2 = _mm_add_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(shorts1, caster)), offset);
+        const __m128 floats3 = _mm_add_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(shorts2, caster)), offset);
+        const __m128 floats4 = _mm_add_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(shorts2, caster)), offset);
+
+        _mm_storeu_ps(&dst[i], floats1);
+        _mm_storeu_ps(&dst[i + 4], floats2);
+        _mm_storeu_ps(&dst[i + 8], floats3);
+        _mm_storeu_ps(&dst[i + 12], floats4);
         }
     }
 
-    src += 7;
-    dst += 7; /* adjust for any scalar finishing. */
-
-    /* Finish off any leftovers with scalar operations. */
     while (i) {
-        *dst = ((float)*src) * DIVBY32768;
-        i--;
-        src--;
-        dst--;
+        --i;
+        _mm_store_ss(&dst[i], _mm_add_ss(_mm_castsi128_ps(_mm_cvtsi32_si128((Uint16)src[i] ^ 0x43808000u)), offset));
     }
 
     cvt->len_cvt *= 2;
@@ -520,38 +527,37 @@ static void SDLCALL SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioForm
 {
     const Sint32 *src = (const Sint32 *)cvt->buf;
     float *dst = (float *)cvt->buf;
-    int i;
+    int i = cvt->len_cvt / 4;
+
+    /* Scale by 2^-31 (DIVBY2147483648): dst[i] = f32(src[i]) / f32(0x80000000) */
+    const __m128 scaler = _mm_set1_ps(DIVBY2147483648);
 
     LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32 (using SSE2)");
 
-    /* Get dst aligned to 16 bytes */
-    for (i = cvt->len_cvt / sizeof(Sint32); i && (((size_t)dst) & 15); --i, ++src, ++dst) {
-        *dst = ((float)(*src >> 8)) * DIVBY8388607;
-    }
+    while (i >= 16) {
+        i -= 16;
 
-    SDL_assert(!i || !(((size_t)dst) & 15));
+        {
+        const __m128i ints1 = _mm_loadu_si128((const __m128i *)&src[i]);
+        const __m128i ints2 = _mm_loadu_si128((const __m128i *)&src[i + 4]);
+        const __m128i ints3 = _mm_loadu_si128((const __m128i *)&src[i + 8]);
+        const __m128i ints4 = _mm_loadu_si128((const __m128i *)&src[i + 12]);
 
-    /* Make sure src is aligned too. */
-    if (!(((size_t)src) & 15)) {
-        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
-        const __m128 divby8388607 = _mm_set1_ps(DIVBY8388607);
-        const __m128i *mmsrc = (const __m128i *)src;
-        while (i >= 4) { /* 4 * sint32 */
-            /* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
-            _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_load_si128(mmsrc), 8)), divby8388607));
-            i -= 4;
-            mmsrc++;
-            dst += 4;
+        const __m128 floats1 = _mm_mul_ps(_mm_cvtepi32_ps(ints1), scaler);
+        const __m128 floats2 = _mm_mul_ps(_mm_cvtepi32_ps(ints2), scaler);
+        const __m128 floats3 = _mm_mul_ps(_mm_cvtepi32_ps(ints3), scaler);
+        const __m128 floats4 = _mm_mul_ps(_mm_cvtepi32_ps(ints4), scaler);
+
+        _mm_storeu_ps(&dst[i], floats1);
+        _mm_storeu_ps(&dst[i + 4], floats2);
+        _mm_storeu_ps(&dst[i + 8], floats3);
+        _mm_storeu_ps(&dst[i + 12], floats4);
         }
-        src = (const Sint32 *)mmsrc;
     }
 
-    /* Finish off any leftovers with scalar operations. */
     while (i) {
-        *dst = ((float)(*src >> 8)) * DIVBY8388607;
-        i--;
-        src++;
-        dst++;
+        --i;
+        _mm_store_ss(&dst[i], _mm_mul_ss(_mm_cvt_si2ss(_mm_setzero_ps(), src[i]), scaler));
     }
 
     if (cvt->filters[++cvt->filter_index]) {
@@ -563,57 +569,47 @@ static void SDLCALL SDL_Convert_F32_to_S8_SSE2(SDL_AudioCVT *cvt, SDL_AudioForma
 {
     const float *src = (const float *)cvt->buf;
     Sint8 *dst = (Sint8 *)cvt->buf;
-    int i;
+    int i = cvt->len_cvt / 4;
+
+    /* 1) Add 98304.0 to shift the float range from [-1.0, 1.0] to [98303.0, 98305.0]
+     * 2) Take the lowest 16 bits as a signed short and saturate to [-128, 127] (via _mm_packs_epi16)
+     * Overflow is only handled correctly for inputs between roughly [-255.0, 255.0]
+     * dst[i] = clamp(i16(f2i(src[i] + 98304.0) & 0xFFFF), -128, 127) */
+    const __m128 offset = _mm_set1_ps(98304.0f);
+    const __m128i mask = _mm_set1_epi16(0xFF);
 
     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8 (using SSE2)");
 
-    /* Get dst aligned to 16 bytes */
-    for (i = cvt->len_cvt / sizeof(float); i && (((size_t)dst) & 15); --i, ++src, ++dst) {
-        const float sample = *src;
-        if (sample >= 1.0f) {
-            *dst = 127;
-        } else if (sample <= -1.0f) {
-            *dst = -128;
-        } else {
-            *dst = (Sint8)(sample * 127.0f);
-        }
-    }
+    while (i >= 16) {
+        const __m128 floats1 = _mm_loadu_ps(&src[0]);
+        const __m128 floats2 = _mm_loadu_ps(&src[4]);
+        const __m128 floats3 = _mm_loadu_ps(&src[8]);
+        const __m128 floats4 = _mm_loadu_ps(&src[12]);
 
-    SDL_assert(!i || !(((size_t)dst) & 15));
+        const __m128i ints1 = _mm_castps_si128(_mm_add_ps(floats1, offset));
+        const __m128i ints2 = _mm_castps_si128(_mm_add_ps(floats2, offset));
+        const __m128i ints3 = _mm_castps_si128(_mm_add_ps(floats3, offset));
+        const __m128i ints4 = _mm_castps_si128(_mm_add_ps(floats4, offset));
 
-    /* Make sure src is aligned too. */
-    if (!(((size_t)src) & 15)) {
-        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
-        const __m128 one = _mm_set1_ps(1.0f);
-        const __m128 negone = _mm_set1_ps(-1.0f);
-        const __m128 mulby127 = _mm_set1_ps(127.0f);
-        __m128i *mmdst = (__m128i *)dst;
-        while (i >= 16) {                                                                                                            /* 16 * float32 */
-            const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby127));      /* load 4 floats, clamp, convert to sint32 */
-            const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src + 4)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
-            const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src + 8)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
-            const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src + 12)), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */
-            _mm_store_si128(mmdst, _mm_packs_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4)));                   /* pack down, store out. */
-            i -= 16;
-            src += 16;
-            mmdst++;
-        }
-        dst = (Sint8 *)mmdst;
+        const __m128i shorts1 = _mm_and_si128(_mm_packs_epi16(ints1, ints2), mask);
+        const __m128i shorts2 = _mm_and_si128(_mm_packs_epi16(ints3, ints4), mask);
+
+        const __m128i bytes = _mm_packus_epi16(shorts1, shorts2);
+
+        _mm_storeu_si128((__m128i*)dst, bytes);
+
+        i -= 16;
+        src += 16;
+        dst += 16;
     }
 
-    /* Finish off any leftovers with scalar operations. */
     while (i) {
-        const float sample = *src;
-        if (sample >= 1.0f) {
-            *dst = 127;
-        } else if (sample <= -1.0f) {
-            *dst = -128;
-        } else {
-            *dst = (Sint8)(sample * 127.0f);
-        }
-        i--;
-        src++;
-        dst++;
+        const __m128i ints = _mm_castps_si128(_mm_add_ss(_mm_load_ss(src), offset));
+        *dst = (Sint8)(_mm_cvtsi128_si32(_mm_packs_epi16(ints, ints)) & 0xFF);
+
+        --i;
+        ++src;
+        ++dst;
     }
 
     cvt->len_cvt /= 4;
@@ -626,57 +622,47 @@ static void SDLCALL SDL_Convert_F32_to_U8_SSE2(SDL_AudioCVT *cvt, SDL_AudioForma
 {
     const float *src = (const float *)cvt->buf;
     Uint8 *dst = cvt->buf;
-    int i;
+    int i = cvt->len_cvt / 4;
+
+    /* 1) Shift the float range from [-1.0, 1.0] to [98304.0, 98306.0]
+     * 2) Extract the lowest 16 bits and clamp to [0, 255]
+     * Overflow is correctly handled for inputs between roughly [-254.0, 254.0]
+     * dst[i] = clamp(i16(f2i(src[i] + 98305.0) & 0xFFFF), 0, 255) */
+    const __m128 offset = _mm_set1_ps(98305.0f);
+    const __m128i mask = _mm_set1_epi16(0xFF);
 
     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8 (using SSE2)");
 
-    /* Get dst aligned to 16 bytes */
-    for (i = cvt->len_cvt / sizeof(float); i && (((size_t)dst) & 15); --i, ++src, ++dst) {
-        const float sample = *src;
-        if (sample >= 1.0f) {
-            *dst = 255;
-        } else if (sample <= -1.0f) {
-            *dst = 0;
-        } else {
-            *dst = (Uint8)((sample + 1.0f) * 127.0f);
-        }
-    }
+    while (i >= 16) {
+        const __m128 floats1 = _mm_loadu_ps(&src[0]);
+        const __m128 floats2 = _mm_loadu_ps(&src[4]);
+        const __m128 floats3 = _mm_loadu_ps(&src[8]);
+        const __m128 floats4 = _mm_loadu_ps(&src[12]);
 
-    SDL_assert(!i || !(((size_t)dst) & 15));
+        const __m128i ints1 = _mm_castps_si128(_mm_add_ps(floats1, offset));
+        const __m128i ints2 = _mm_castps_si128(_mm_add_ps(floats2, offset));
+        const __m128i ints3 = _mm_castps_si128(_mm_add_ps(floats3, offset));
+        const __m128i ints4 = _mm_castps_si128(_mm_add_ps(floats4, offset));
 
-    /* Make sure src is aligned too. */
-    if (!(((size_t)src) & 15)) {
-        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
-        const __m128 one = _mm_set1_ps(1.0f);
-        const __m128 negone = _mm_set1_ps(-1.0f);
-        const __m128 mulby127 = _mm_set1_ps(127.0f);
-        __m128i *mmdst = (__m128i *)dst;
-        while (i >= 16) {                                                                                                                             /* 16 * float32 */
-            const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), one), mulby127));      /* load 4 floats, clamp, convert to sint32 */
-            const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src + 4)), one), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
-            const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm

(Patch may be truncated, please check the link at the top of this post.)