SDL: Tidy up/optimize BlitNtoNPixelAlpha_AVX2

From 070e48f66edebdde0606e97c5022bb8d4b8aa57b Mon Sep 17 00:00:00 2001
From: Brick <[EMAIL REDACTED]>
Date: Wed, 3 Jul 2024 15:36:50 +0100
Subject: [PATCH] Tidy up/optimize BlitNtoNPixelAlpha_AVX2

---
 src/video/SDL_blit.h        |  10 ++-
 src/video/SDL_blit_A.c      |   2 +-
 src/video/SDL_blit_A_avx2.c | 171 ++++++++++--------------------------
 3 files changed, 53 insertions(+), 130 deletions(-)

diff --git a/src/video/SDL_blit.h b/src/video/SDL_blit.h
index b994f9e9bd592..387db2f2cae5a 100644
--- a/src/video/SDL_blit.h
+++ b/src/video/SDL_blit.h
@@ -501,6 +501,7 @@ extern SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface);
         dst = a << 24 | r << 16 | g << 8 | b;     \
     } while (0)
 /* Blend a single color channel or alpha value */
+/* dC = ((sC * sA) + (dC * (255 - sA))) / 255 */
 #define ALPHA_BLEND_CHANNEL(sC, dC, sA)                  \
     do {                                                 \
         Uint16 x;                                        \
@@ -510,6 +511,7 @@ extern SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface);
         dC = x >> 8;                                     \
     } while (0)
 /* Perform a division by 255 after a multiplication of two 8-bit color channels */
+/* out = (sC * dC) / 255 */
 #define MULT_DIV_255(sC, dC, out) \
     do {                          \
         Uint16 x = sC * dC;       \
@@ -524,11 +526,11 @@ extern SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface);
         ALPHA_BLEND_CHANNEL(sG, dG, A);                       \
         ALPHA_BLEND_CHANNEL(sB, dB, A);                       \
     } while (0)
-/* Blend the ARGB values of two 32-bit pixels */
-#define ALPHA_BLEND_ARGB_PIXELS(src, dst)                               \
+/* Blend two 32-bit pixels with the same format */
+#define ALPHA_BLEND_RGBA_4(src, dst, ashift)                            \
     do {                                                                \
-        Uint32 srcA = src >> 24;                                        \
-        src |= 0xFF000000;                                              \
+        Uint32 srcA = (src >> ashift) & 0xFF;                           \
+        src |= ((Uint32)0xFF) << ashift;                                \
                                                                         \
         Uint32 srcRB = src & 0x00FF00FF;                                \
         Uint32 dstRB = dst & 0x00FF00FF;                                \
diff --git a/src/video/SDL_blit_A.c b/src/video/SDL_blit_A.c
index 3b97440ac19d8..4dd67e341b54d 100644
--- a/src/video/SDL_blit_A.c
+++ b/src/video/SDL_blit_A.c
@@ -1171,7 +1171,7 @@ static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
             {
             PIXEL_TO_ARGB_PIXEL(*(Uint32 *) src, srcfmt, Pixel);
             Uint32 blended = *(Uint32 *) dst;
-            ALPHA_BLEND_ARGB_PIXELS(Pixel, blended);
+            ALPHA_BLEND_RGBA_4(Pixel, blended, 24); /* ARGB layout: alpha lives in bits 24-31 */
             *(Uint32*)dst = blended;
             src += srcbpp;
             dst += dstbpp;
diff --git a/src/video/SDL_blit_A_avx2.c b/src/video/SDL_blit_A_avx2.c
index 04b5851c2de26..d6828e2beef65 100644
--- a/src/video/SDL_blit_A_avx2.c
+++ b/src/video/SDL_blit_A_avx2.c
@@ -7,63 +7,14 @@
 #define SDL_blit_A_avx2_c
 
 #include "SDL_blit.h"
-#include "SDL_blit_A_sse4_1.h"
-
-__m256i SDL_TARGETING("avx2") GetSDL_PixelFormatAlphaSplatMask_AVX2(const SDL_PixelFormat* dstfmt) {
-    Uint8 index = dstfmt->Ashift / 8;
-    return _mm256_set_epi8(
-            index + 28, index + 28, index + 28, index + 28, index + 24, index + 24, index + 24, index + 24,
-            index + 20, index + 20, index + 20, index + 20, index + 16, index + 16, index + 16, index + 16,
-            index + 12, index + 12, index + 12, index + 12, index + 8, index + 8, index + 8, index + 8,
-            index + 4, index + 4, index + 4, index + 4, index, index, index, index);
-}
-
-__m256i SDL_TARGETING("avx2") GetSDL_PixelFormatAlphaSaturateMask_AVX2(const SDL_PixelFormat* dstfmt) {
-    const Uint8 bin = dstfmt->Ashift / 8;
-    return _mm256_set_epi8(
-            bin == 3 ? 0xFF : 0, bin == 2 ? 0xFF : 0, bin == 1 ? 0xFF : 0, bin == 0 ? 0xFF : 0,
-            bin == 3 ? 0xFF : 0, bin == 2 ? 0xFF : 0, bin == 1 ? 0xFF : 0, bin == 0 ? 0xFF : 0,
-            bin == 3 ? 0xFF : 0, bin == 2 ? 0xFF : 0, bin == 1 ? 0xFF : 0, bin == 0 ? 0xFF : 0,
-            bin == 3 ? 0xFF : 0, bin == 2 ? 0xFF : 0, bin == 1 ? 0xFF : 0, bin == 0 ? 0xFF : 0,
-            bin == 3 ? 0xFF : 0, bin == 2 ? 0xFF : 0, bin == 1 ? 0xFF : 0, bin == 0 ? 0xFF : 0,
-            bin == 3 ? 0xFF : 0, bin == 2 ? 0xFF : 0, bin == 1 ? 0xFF : 0, bin == 0 ? 0xFF : 0,
-            bin == 3 ? 0xFF : 0, bin == 2 ? 0xFF : 0, bin == 1 ? 0xFF : 0, bin == 0 ? 0xFF : 0,
-            bin == 3 ? 0xFF : 0, bin == 2 ? 0xFF : 0, bin == 1 ? 0xFF : 0, bin == 0 ? 0xFF : 0);
-}
-
-__m256i SDL_TARGETING("avx2") GetSDL_PixelFormatShuffleMask_AVX2(const SDL_PixelFormat* srcfmt,
-                                                              const SDL_PixelFormat* dstfmt) {
-    /* Calculate shuffle indices based on the source and destination SDL_PixelFormat */
-    Uint8 shuffleIndices[32];
-    Uint8 dstAshift = dstfmt->Ashift / 8;
-    Uint8 dstRshift = dstfmt->Rshift / 8;
-    Uint8 dstGshift = dstfmt->Gshift / 8;
-    Uint8 dstBshift = dstfmt->Bshift / 8;
-    for (int i = 0; i < 8; ++i) {
-        shuffleIndices[dstAshift + i * 4] = srcfmt->Ashift / 8 + i * 4;
-        shuffleIndices[dstRshift + i * 4] = srcfmt->Rshift / 8 + i * 4;
-        shuffleIndices[dstGshift + i * 4] = srcfmt->Gshift / 8 + i * 4;
-        shuffleIndices[dstBshift + i * 4] = srcfmt->Bshift / 8 + i * 4;
-    }
-
-    /* Create shuffle mask based on the calculated indices */
-    return _mm256_set_epi8(
-            shuffleIndices[31], shuffleIndices[30], shuffleIndices[29], shuffleIndices[28],
-            shuffleIndices[27], shuffleIndices[26], shuffleIndices[25], shuffleIndices[24],
-            shuffleIndices[23], shuffleIndices[22], shuffleIndices[21], shuffleIndices[20],
-            shuffleIndices[19], shuffleIndices[18], shuffleIndices[17], shuffleIndices[16],
-            shuffleIndices[15], shuffleIndices[14], shuffleIndices[13], shuffleIndices[12],
-            shuffleIndices[11], shuffleIndices[10], shuffleIndices[9], shuffleIndices[8],
-            shuffleIndices[7], shuffleIndices[6], shuffleIndices[5], shuffleIndices[4],
-            shuffleIndices[3], shuffleIndices[2], shuffleIndices[1], shuffleIndices[0]
-    );
-}
 
 /**
  * Using the AVX2 instruction set, blit sixteen pixels into eight with alpha blending
  */
-__m256i SDL_TARGETING("avx2") MixRGBA_AVX2(__m256i src, __m256i dst, const __m256i alpha_shuffle,
-                                           const __m256i alpha_saturate) {
+SDL_FORCE_INLINE __m256i SDL_TARGETING("avx2") MixRGBA_AVX2(
+    __m256i src, __m256i dst,
+    const __m256i alpha_shuffle, const __m256i alpha_saturate)
+{
     // SIMD implementation of blend_mul2.
     // dstRGB                            = (srcRGB * srcA) + (dstRGB * (1-srcA))
     // dstA   = srcA + (dstA * (1-srcA)) = (1      * srcA) + (dstA   * (1-srcA))
@@ -112,87 +63,57 @@ void SDL_TARGETING("avx2") BlitNtoNPixelAlpha_AVX2(SDL_BlitInfo *info)
     SDL_PixelFormat *srcfmt = info->src_fmt;
     SDL_PixelFormat *dstfmt = info->dst_fmt;
 
-    int chunks = width / 8;
-    SDL_bool free_format = SDL_FALSE;
-    /* Handle case when passed invalid format, assume ARGB destination */
-    if (dstfmt->Ashift == 0 && dstfmt->Ashift == dstfmt->Bshift) {
-        dstfmt = SDL_CreatePixelFormat(SDL_PIXELFORMAT_ARGB8888);
-        free_format = SDL_TRUE;
-    }
-    const __m256i shift_mask = GetSDL_PixelFormatShuffleMask_AVX2(srcfmt, dstfmt);
-    const __m256i splat_mask = GetSDL_PixelFormatAlphaSplatMask_AVX2(dstfmt);
-    const __m256i saturate_mask = GetSDL_PixelFormatAlphaSaturateMask_AVX2(dstfmt);
-    const __m128i sse4_1_shift_mask = GetSDL_PixelFormatShuffleMask_SSE4_1(srcfmt, dstfmt);
-    const __m128i sse4_1_splat_mask = GetSDL_PixelFormatAlphaSplatMask_SSE4_1(dstfmt);
-    const __m128i sse4_1_saturate_mask = GetSDL_PixelFormatAlphaSaturateMask_SSE4_1(dstfmt);
+    const __m256i mask_offsets = _mm256_set_epi8(
+        28, 28, 28, 28, 24, 24, 24, 24, 20, 20, 20, 20, 16, 16, 16, 16, 12, 12, 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0);
+
+    const __m256i shift_mask = _mm256_add_epi32(
+        _mm256_set1_epi32(
+            ((srcfmt->Rshift >> 3) << dstfmt->Rshift) |
+            ((srcfmt->Gshift >> 3) << dstfmt->Gshift) |
+            ((srcfmt->Bshift >> 3) << dstfmt->Bshift) |
+            ((srcfmt->Ashift >> 3) << dstfmt->Ashift)),
+        mask_offsets);
+
+    const __m256i splat_mask = _mm256_add_epi8(_mm256_set1_epi8(dstfmt->Ashift >> 3), mask_offsets);
+    const __m256i saturate_mask = _mm256_set1_epi32((int)dstfmt->Amask);
 
     while (height--) {
-        /* Process 8-wide chunks of source color data that may be in wrong format */
-        for (int i = 0; i < chunks; i += 1) {
-            __m256i c_src = _mm256_shuffle_epi8(_mm256_loadu_si256((__m256i *) (src + i * 32)), shift_mask);
-            /* Alpha-blend in 8-wide chunk from src into destination */
-            __m256i c_dst = _mm256_loadu_si256((__m256i*) (dst + i * 32));
-            __m256i c_mix = MixRGBA_AVX2(c_src, c_dst, splat_mask, saturate_mask);
-            _mm256_storeu_si256((__m256i*) (dst + i * 32), c_mix);
-        }
+        int i = 0;
+
+        for (; i + 8 <= width; i += 8) {
+            // Load 8 src pixels and shuffle into the dst format
+            __m256i c_src = _mm256_shuffle_epi8(_mm256_loadu_si256((__m256i *)src), shift_mask);
+
+            // Load 8 dst pixels
+            __m256i c_dst = _mm256_loadu_si256((__m256i *)dst);
 
-        /* Handle remaining pixels when width is not a multiple of 4 */
-        if (width % 8 != 0) {
-            int remaining_pixels = width % 8;
-            int offset = width - remaining_pixels;
-            if (remaining_pixels >= 4) {
-                Uint32 *src_ptr = ((Uint32*)(src + (offset * 4)));
-                Uint32 *dst_ptr = ((Uint32*)(dst + (offset * 4)));
-                __m128i c_src = _mm_loadu_si128((__m128i*)src_ptr);
-                c_src = _mm_shuffle_epi8(c_src, sse4_1_shift_mask);
-                __m128i c_dst = _mm_loadu_si128((__m128i*)dst_ptr);
-                __m128i c_mix = MixRGBA_SSE4_1(c_src, c_dst, sse4_1_splat_mask, sse4_1_saturate_mask);
-                _mm_storeu_si128((__m128i*)dst_ptr, c_mix);
-                remaining_pixels -= 4;
-                offset += 4;
-            }
-            if (remaining_pixels >= 2) {
-                Uint32 *src_ptr = ((Uint32*)(src + (offset * 4)));
-                Uint32 *dst_ptr = ((Uint32*)(dst + (offset * 4)));
-                __m128i c_src = _mm_loadu_si64(src_ptr);
-                c_src = _mm_shuffle_epi8(c_src, sse4_1_shift_mask);
-                __m128i c_dst = _mm_loadu_si64(dst_ptr);
-                __m128i c_mix = MixRGBA_SSE4_1(c_src, c_dst, sse4_1_splat_mask, sse4_1_saturate_mask);
-                _mm_storeu_si64(dst_ptr, c_mix);
-                remaining_pixels -= 2;
-                offset += 2;
-            }
-            if (remaining_pixels == 1) {
-                Uint32 *src_ptr = ((Uint32*)(src + (offset * 4)));
-                Uint32 *dst_ptr = ((Uint32*)(dst + (offset * 4)));
-                Uint32 pixel = AlignPixelToSDL_PixelFormat(*src_ptr, srcfmt, dstfmt);
-                /* Old GCC has bad or no _mm_loadu_si32 */
-                #if defined(__GNUC__) && (__GNUC__ < 11)
-                __m128i c_src = _mm_set_epi32(0, 0, 0, pixel);
-                __m128i c_dst = _mm_set_epi32(0, 0, 0, *dst_ptr);
-                #else
-                __m128i c_src = _mm_loadu_si32(&pixel);
-                __m128i c_dst = _mm_loadu_si32(dst_ptr);
-                #endif
-                __m128i mixed_pixel = MixRGBA_SSE4_1(c_src, c_dst, sse4_1_splat_mask, sse4_1_saturate_mask);
-                /* Old GCC has bad or no _mm_storeu_si32 */
-                #if defined(__GNUC__) && (__GNUC__ < 11)
-                *dst_ptr = _mm_extract_epi32(mixed_pixel, 0);
-                #else
-                _mm_storeu_si32(dst_ptr, mixed_pixel);
-                #endif
-            }
+            // Blend the pixels together and save the result
+            _mm256_storeu_si256((__m256i *)dst, MixRGBA_AVX2(c_src, c_dst, splat_mask, saturate_mask));
+
+            src += 32;
+            dst += 32;
         }
 
-        src += 4 * width;
-        dst += 4 * width;
+        for (; i < width; ++i) {
+            Uint32 src32 = *(Uint32 *)src;
+            Uint32 dst32 = *(Uint32 *)dst;
+
+            src32 = (((src32 >> srcfmt->Rshift) & 0xFF) << dstfmt->Rshift) |
+                    (((src32 >> srcfmt->Gshift) & 0xFF) << dstfmt->Gshift) |
+                    (((src32 >> srcfmt->Bshift) & 0xFF) << dstfmt->Bshift) |
+                    (((src32 >> srcfmt->Ashift) & 0xFF) << dstfmt->Ashift);
+
+            ALPHA_BLEND_RGBA_4(src32, dst32, dstfmt->Ashift);
+
+            *(Uint32 *)dst = dst32;
+
+            src += 4;
+            dst += 4;
+        }
 
         src += srcskip;
         dst += dstskip;
     }
-    if (free_format) {
-        SDL_DestroyPixelFormat(dstfmt);
-    }
 }
 
 #endif