SDL: Support arbitrary destination formats; remove buffer from AVX2

From 87958f44d090bc719df849882d241bc217de5822 Mon Sep 17 00:00:00 2001
From: Isaac Aronson <[EMAIL REDACTED]>
Date: Thu, 7 Sep 2023 21:14:00 -0500
Subject: [PATCH] Support arbitrary destination formats; remove buffer from
 AVX2

---
 src/video/SDL_blit_A_avx2.c   | 46 ++++++++++++----------
 src/video/SDL_blit_A_sse4_1.c | 73 ++++++++++++++++++++++++-----------
 src/video/SDL_blit_A_sse4_1.h |  6 ++-
 3 files changed, 79 insertions(+), 46 deletions(-)

diff --git a/src/video/SDL_blit_A_avx2.c b/src/video/SDL_blit_A_avx2.c
index 4f8d01b313ccb..66c9d01c16354 100644
--- a/src/video/SDL_blit_A_avx2.c
+++ b/src/video/SDL_blit_A_avx2.c
@@ -9,21 +9,29 @@
 #include "SDL_blit.h"
 #include "SDL_blit_A_sse4_1.h"
 
+__m256i SDL_TARGETING("avx2") GetSDL_PixelFormatAlphaMask_AVX2(SDL_PixelFormat* dstfmt) {
+    Uint8 index = dstfmt->Ashift / 4; /* alpha byte offset within a pixel once every 8-bit channel has been widened to 16 bits (Ashift / 8 * 2) */
+    /* Ashift and Bshift both 0 is treated as bad input: fall back to 6, the alpha offset the previous hard-coded mask used */
+    if (dstfmt->Ashift == dstfmt->Bshift && dstfmt->Ashift == 0) {
+        index = 6;
+    }
+    return _mm256_set_epi8(
+            -1, index + 24, -1, index + 24, -1, index + 24, -1, index + 24,
+            -1, index + 16, -1, index + 16, -1, index + 16, -1, index + 16,
+            -1, index + 8, -1, index + 8, -1, index + 8, -1, index + 8,
+            -1, index, -1, index, -1, index, -1, index);
+}
+
 /**
  * Using the AVX2 instruction set, blit eight pixels with alpha blending
  * @param src A pointer to four 32-bit pixels of ARGB format to blit into dst
  * @param dst A pointer to four 32-bit pixels of ARGB format to retain visual data for while alpha blending
  * @return A 128-bit wide vector of four alpha-blended pixels in ARGB format
  */
-__m128i SDL_TARGETING("avx2") MixRGBA_AVX2(__m128i src, __m128i dst) {
+__m128i SDL_TARGETING("avx2") MixRGBA_AVX2(__m128i src, __m128i dst, __m256i alphaMask) {
     __m256i src_color = _mm256_cvtepu8_epi16(src);
     __m256i dst_color = _mm256_cvtepu8_epi16(dst);
-    const __m256i SHUFFLE_ALPHA = _mm256_set_epi8(
-         -1, 30, -1, 30, -1, 30, -1, 30,
-         -1, 22, -1, 22, -1, 22, -1, 22,
-         -1, 14, -1, 14, -1, 14, -1, 14,
-         -1, 6, -1, 6, -1, 6, -1, 6);
-    __m256i alpha = _mm256_shuffle_epi8(src_color, SHUFFLE_ALPHA);
+    __m256i alpha = _mm256_shuffle_epi8(src_color, alphaMask);
     __m256i sub = _mm256_sub_epi16(src_color, dst_color);
     __m256i mul = _mm256_mullo_epi16(sub, alpha);
     /**
@@ -51,22 +59,20 @@ void SDL_TARGETING("avx2") BlitNtoNPixelAlpha_AVX2(SDL_BlitInfo *info)
     Uint8 *dst = info->dst;
     int dstskip = info->dst_skip;
     SDL_PixelFormat *srcfmt = info->src_fmt;
+    SDL_PixelFormat *dstfmt = info->dst_fmt;
 
     int chunks = width / 4;
-    Uint8 *buf = SDL_malloc(sizeof(Uint8) * chunks * 16);
+    const __m128i colorShiftMask = GetSDL_PixelFormatShuffleMask(srcfmt, dstfmt);
+    const __m256i alphaMask = GetSDL_PixelFormatAlphaMask_AVX2(dstfmt);
+    const __m128i sse4_1AlphaMask = GetSDL_PixelFormatAlphaMask_SSE4_1(dstfmt);
 
     while (height--) {
         /* Process 4-wide chunks of source color data that may be in wrong format */
         for (int i = 0; i < chunks; i += 1) {
-            __m128i c_src = AlignPixelToSDL_PixelFormat_x4(_mm_loadu_si128((__m128i *) (src + i * 16)), srcfmt);
-            _mm_store_si128((__m128i*)(buf + i * 16), c_src);
-        }
-
-        /* Alpha-blend in 4-wide chunk from src into destination */
-        for (int i = 0; i < chunks; i += 1) {
-            __m128i c_src = _mm_loadu_si128((__m128i*) (buf + i * 16));
+            __m128i c_src = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *) (src + i * 16)), colorShiftMask);
+            /* Alpha-blend in 4-wide chunk from src into destination */
             __m128i c_dst = _mm_loadu_si128((__m128i*) (dst + i * 16));
-            __m128i c_mix = MixRGBA_AVX2(c_src, c_dst);
+            __m128i c_mix = MixRGBA_AVX2(c_src, c_dst, alphaMask);
             _mm_storeu_si128((__m128i*) (dst + i * 16), c_mix);
         }
 
@@ -78,9 +84,9 @@ void SDL_TARGETING("avx2") BlitNtoNPixelAlpha_AVX2(SDL_BlitInfo *info)
                 Uint32 *src_ptr = ((Uint32*)(src + (offset * 4)));
                 Uint32 *dst_ptr = ((Uint32*)(dst + (offset * 4)));
                 __m128i c_src = _mm_loadu_si64(src_ptr);
-                c_src = AlignPixelToSDL_PixelFormat_x4(c_src, srcfmt);
+                c_src = _mm_shuffle_epi8(c_src, colorShiftMask);
                 __m128i c_dst = _mm_loadu_si64(dst_ptr);
-                __m128i c_mix = MixRGBA_SSE4_1(c_src, c_dst);
+                __m128i c_mix = MixRGBA_SSE4_1(c_src, c_dst, sse4_1AlphaMask);
                 _mm_storeu_si64(dst_ptr, c_mix);
                 remaining_pixels -= 2;
                 offset += 2;
@@ -97,7 +103,7 @@ void SDL_TARGETING("avx2") BlitNtoNPixelAlpha_AVX2(SDL_BlitInfo *info)
                 __m128i c_src = _mm_loadu_si32(&pixel);
                 __m128i c_dst = _mm_loadu_si32(dst_ptr);
                 #endif
-                __m128i mixed_pixel = MixRGBA_SSE4_1(c_src, c_dst);
+                __m128i mixed_pixel = MixRGBA_SSE4_1(c_src, c_dst, sse4_1AlphaMask);
                 /* Old GCC has bad or no _mm_storeu_si32 */
                 #if defined(__GNUC__) && (__GNUC__ < 11)
                 *dst_ptr = _mm_extract_epi32(mixed_pixel, 0);
@@ -113,8 +119,6 @@ void SDL_TARGETING("avx2") BlitNtoNPixelAlpha_AVX2(SDL_BlitInfo *info)
         src += srcskip;
         dst += dstskip;
     }
-    SDL_free(buf);
-
 }
 
 #endif
diff --git a/src/video/SDL_blit_A_sse4_1.c b/src/video/SDL_blit_A_sse4_1.c
index b8bf6f4902da1..5a779b01c27e2 100644
--- a/src/video/SDL_blit_A_sse4_1.c
+++ b/src/video/SDL_blit_A_sse4_1.c
@@ -9,23 +9,34 @@
 #include "SDL_blit.h"
 #include "SDL_blit_A_sse4_1.h"
 
+/**
+ * Builds the shuffle mask MixRGBA_SSE4_1 applies to its two source pixels: each pixel's alpha byte, located via dstfmt->Ashift, is repeated into four 16-bit lanes
+ */
+__m128i SDL_TARGETING("sse4.1") GetSDL_PixelFormatAlphaMask_SSE4_1(SDL_PixelFormat* dstfmt) {
+    Uint8 index = dstfmt->Ashift / 8; /* byte offset of the alpha channel inside one 32-bit pixel */
+    /* Ashift and Bshift both 0 is treated as bad input: fall back to byte 3, the alpha offset the previous hard-coded mask used */
+    if (dstfmt->Ashift == dstfmt->Bshift && dstfmt->Ashift == 0) {
+        index = 3;
+    }
+    return _mm_set_epi8(
+            -1, index + 4, -1, index + 4, -1, index + 4, -1, index + 4,
+            -1, index, -1, index, -1, index, -1, index);
+}
+
 /**
  * Using the SSE4.1 instruction set, blit four pixels with alpha blending
  * @param src A pointer to two 32-bit pixels of ARGB format to blit into dst
  * @param dst A pointer to two 32-bit pixels of ARGB format to retain visual data for while alpha blending
  * @return A 128-bit wide vector of two alpha-blended pixels in ARGB format
  */
-__m128i SDL_TARGETING("sse4.1") MixRGBA_SSE4_1(__m128i src, __m128i dst) {
+__m128i SDL_TARGETING("sse4.1") MixRGBA_SSE4_1(__m128i src, __m128i dst, __m128i alphaMask) {
     __m128i src_color = _mm_cvtepu8_epi16(src);
     __m128i dst_color = _mm_cvtepu8_epi16(dst);
     /**
      * Combines a shuffle and an _mm_cvtepu8_epi16 operation into one operation by moving the lower 8 bits of the alpha
      * channel around to create 16-bit integers.
      */
-    const __m128i SHUFFLE_ALPHA = _mm_set_epi8(
-         -1, 7, -1, 7, -1, 7, -1, 7,
-         -1, 3, -1, 3, -1, 3, -1, 3);
-    __m128i alpha = _mm_shuffle_epi8(src, SHUFFLE_ALPHA);
+    __m128i alpha = _mm_shuffle_epi8(src, alphaMask);
     __m128i sub = _mm_sub_epi16(src_color, dst_color);
     __m128i mul = _mm_mullo_epi16(sub, alpha);
     const __m128i SHUFFLE_REDUCE = _mm_set_epi8(
@@ -46,21 +57,36 @@ Uint32 AlignPixelToSDL_PixelFormat(Uint32 color, const SDL_PixelFormat* srcForma
 }
 
 /*
- * This helper function converts arbitrary pixel format data into ARGB form with a 4 pixel-wide shuffle
+ * This helper function builds an _mm_shuffle_epi8 mask that moves each channel byte of four source-format pixels to its position in the destination format
  */
-__m128i SDL_TARGETING("sse4.1") AlignPixelToSDL_PixelFormat_x4(__m128i colors, const SDL_PixelFormat* srcFormat) {
-    // Create shuffle masks based on the source SDL_PixelFormat to ARGB
-    __m128i srcShuffleMask = _mm_set_epi8(
-        srcFormat->Ashift / 8 + 12, srcFormat->Rshift / 8 + 12, srcFormat->Gshift / 8 + 12, srcFormat->Bshift / 8 + 12,
-        srcFormat->Ashift / 8 + 8, srcFormat->Rshift / 8 + 8, srcFormat->Gshift / 8 + 8, srcFormat->Bshift / 8 + 8,
-        srcFormat->Ashift / 8 + 4, srcFormat->Rshift / 8 + 4, srcFormat->Gshift / 8 + 4, srcFormat->Bshift / 8 + 4,
-        srcFormat->Ashift / 8, srcFormat->Rshift / 8, srcFormat->Gshift / 8, srcFormat->Bshift / 8
-    );
+__m128i SDL_TARGETING("sse4.1") GetSDL_PixelFormatShuffleMask(const SDL_PixelFormat* srcFormat, const SDL_PixelFormat* dstFormat) {
+    /* shuffleIndices[d] holds the source byte that lands in destination byte d; zero-filled so colliding destination shifts never leave a slot unset */
+    Uint8 shuffleIndices[16] = { 0 };
+    Uint8 dstAshift = dstFormat->Ashift / 8;
+    Uint8 dstRshift = dstFormat->Rshift / 8;
+    Uint8 dstGshift = dstFormat->Gshift / 8;
+    Uint8 dstBshift = dstFormat->Bshift / 8;
+    /* Alpha and blue both at byte 0 is treated as bad input: place alpha in byte 3 instead */
+    if (dstAshift == dstBshift && dstAshift == 0) {
+        dstAshift = 3;
+    }
+    for (int i = 0; i < 4; ++i) { /* one pass per 32-bit pixel of the 128-bit vector */
+        shuffleIndices[dstAshift + i * 4] = srcFormat->Ashift / 8 + i * 4;
+        shuffleIndices[dstRshift + i * 4] = srcFormat->Rshift / 8 + i * 4;
+        shuffleIndices[dstGshift + i * 4] = srcFormat->Gshift / 8 + i * 4;
+        shuffleIndices[dstBshift + i * 4] = srcFormat->Bshift / 8 + i * 4;
+    }
 
-    // Shuffle the colors
-    return _mm_shuffle_epi8(colors, srcShuffleMask);
+    /* Create shuffle mask based on the calculated indices */
+    return _mm_set_epi8(
+            shuffleIndices[15], shuffleIndices[14], shuffleIndices[13], shuffleIndices[12],
+            shuffleIndices[11], shuffleIndices[10], shuffleIndices[9], shuffleIndices[8],
+            shuffleIndices[7], shuffleIndices[6], shuffleIndices[5], shuffleIndices[4],
+            shuffleIndices[3], shuffleIndices[2], shuffleIndices[1], shuffleIndices[0]
+    );
 }
 
+
 void SDL_TARGETING("sse4.1") BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo* info) {
     int width = info->dst_w;
     int height = info->dst_h;
@@ -69,22 +95,25 @@ void SDL_TARGETING("sse4.1") BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo* info) {
     Uint8 *dst = info->dst;
     int dstskip = info->dst_skip;
     SDL_PixelFormat *srcfmt = info->src_fmt;
+    SDL_PixelFormat *dstfmt = info->dst_fmt;
 
     int chunks = width / 4;
     Uint8 *buffer = (Uint8*)SDL_malloc(chunks * 16 * sizeof(Uint8));
+    const __m128i colorShiftMask = GetSDL_PixelFormatShuffleMask(srcfmt, dstfmt);
+    const __m128i alphaMask = GetSDL_PixelFormatAlphaMask_SSE4_1(dstfmt);
 
     while (height--) {
         /* Process 4-wide chunks of source color data that may be in wrong format into buffer */
         for (int i = 0; i < chunks; i += 1) {
             __m128i colors = _mm_loadu_si128((__m128i*)(src + i * 16));
-            _mm_storeu_si128((__m128i*)(buffer + i * 16), AlignPixelToSDL_PixelFormat_x4(colors, srcfmt));
+            _mm_storeu_si128((__m128i*)(buffer + i * 16), _mm_shuffle_epi8(colors, colorShiftMask));
         }
 
         /* Alpha-blend in 2-wide chunks from buffer into destination */
         for (int i = 0; i < chunks * 2; i += 1) {
             __m128i c_src = _mm_loadu_si64((buffer + (i * 8)));
             __m128i c_dst = _mm_loadu_si64((dst + i * 8));
-            __m128i c_mix = MixRGBA_SSE4_1(c_src, c_dst);
+            __m128i c_mix = MixRGBA_SSE4_1(c_src, c_dst, alphaMask);
             _mm_storeu_si64(dst + i * 8, c_mix);
         }
 
@@ -96,9 +125,9 @@ void SDL_TARGETING("sse4.1") BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo* info) {
                 Uint32 *src_ptr = ((Uint32*)(src + (offset * 4)));
                 Uint32 *dst_ptr = ((Uint32*)(dst + (offset * 4)));
                 __m128i c_src = _mm_loadu_si64(src_ptr);
-                c_src = AlignPixelToSDL_PixelFormat_x4(c_src, srcfmt);
+                c_src = _mm_shuffle_epi8(c_src, colorShiftMask);
                 __m128i c_dst = _mm_loadu_si64(dst_ptr);
-                __m128i c_mix = MixRGBA_SSE4_1(c_src, c_dst);
+                __m128i c_mix = MixRGBA_SSE4_1(c_src, c_dst, alphaMask);
                 _mm_storeu_si64(dst_ptr, c_mix);
                 remaining_pixels -= 2;
                 offset += 2;
@@ -115,7 +144,7 @@ void SDL_TARGETING("sse4.1") BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo* info) {
                 __m128i c_src = _mm_loadu_si32(&pixel);
                 __m128i c_dst = _mm_loadu_si32(dst_ptr);
                 #endif
-                __m128i mixed_pixel = MixRGBA_SSE4_1(c_src, c_dst);
+                __m128i mixed_pixel = MixRGBA_SSE4_1(c_src, c_dst, alphaMask);
                 /* Old GCC has bad or no _mm_storeu_si32 */
                 #if defined(__GNUC__) && (__GNUC__ < 11)
                 *dst_ptr = _mm_extract_epi32(mixed_pixel, 0);
@@ -131,8 +160,8 @@ void SDL_TARGETING("sse4.1") BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo* info) {
         src += srcskip;
         dst += dstskip;
     }
-
+    /* Release the scratch buffer allocated before the row loop */
     SDL_free(buffer);
 }
 
 #endif
diff --git a/src/video/SDL_blit_A_sse4_1.h b/src/video/SDL_blit_A_sse4_1.h
index 2850290fd533a..ec7f8b7ae4b42 100644
--- a/src/video/SDL_blit_A_sse4_1.h
+++ b/src/video/SDL_blit_A_sse4_1.h
@@ -4,9 +4,11 @@
 #ifdef SDL_SSE4_1_INTRINSICS
 Uint32 AlignPixelToSDL_PixelFormat(Uint32 color, const SDL_PixelFormat* srcFormat);
 
-__m128i SDL_TARGETING("sse4.1") AlignPixelToSDL_PixelFormat_x4(__m128i colors, const SDL_PixelFormat* srcFormat);
+__m128i SDL_TARGETING("sse4.1") GetSDL_PixelFormatAlphaMask_SSE4_1(SDL_PixelFormat* dstfmt);
 
-__m128i SDL_TARGETING("sse4.1") MixRGBA_SSE4_1(__m128i src, __m128i dst);
+__m128i SDL_TARGETING("sse4.1") GetSDL_PixelFormatShuffleMask(const SDL_PixelFormat* srcFormat, const SDL_PixelFormat* dstFormat);
+
+__m128i SDL_TARGETING("sse4.1") MixRGBA_SSE4_1(__m128i src, __m128i dst, __m128i alphaMask);
 
 void SDL_TARGETING("sse4.1") BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo *info);