SDL: Fix BlitNtoNPixelAlpha for formats with no dst alpha

From c457dbb629ace70d928b8b7d33487dea9fc783a3 Mon Sep 17 00:00:00 2001
From: Brick <[EMAIL REDACTED]>
Date: Wed, 3 Jul 2024 16:37:26 +0100
Subject: [PATCH] Fix BlitNtoNPixelAlpha for formats with no dst alpha

---
 src/video/SDL_blit_A_avx2.c   | 113 ++++++++++++++++++----------------
 src/video/SDL_blit_A_sse4_1.c | 113 ++++++++++++++++++----------------
 2 files changed, 120 insertions(+), 106 deletions(-)

diff --git a/src/video/SDL_blit_A_avx2.c b/src/video/SDL_blit_A_avx2.c
index 96e1714da0c08..4421aed0e264d 100644
--- a/src/video/SDL_blit_A_avx2.c
+++ b/src/video/SDL_blit_A_avx2.c
@@ -6,48 +6,6 @@
 
 #include "SDL_blit.h"
 
-// Using the AVX2 instruction set, blit sixteen pixels into eight with alpha blending
-SDL_FORCE_INLINE __m256i SDL_TARGETING("avx2") MixRGBA_AVX2(
-    __m256i src, __m256i dst,
-    const __m256i alpha_shuffle, const __m256i alpha_saturate)
-{
-    // SIMD implementation of blend_mul2.
-    // dstRGB                            = (srcRGB * srcA) + (dstRGB * (1-srcA))
-    // dstA   = srcA + (dstA * (1-srcA)) = (1      * srcA) + (dstA   * (1-srcA))
-
-    // Splat the alpha into all channels for each pixel
-    __m256i srca = _mm256_shuffle_epi8(src, alpha_shuffle);
-
-    // Set the alpha channels of src to 255
-    src = _mm256_or_si256(src, alpha_saturate);
-
-    __m256i src_lo = _mm256_unpacklo_epi8(src, _mm256_setzero_si256());
-    __m256i src_hi = _mm256_unpackhi_epi8(src, _mm256_setzero_si256());
-
-    __m256i dst_lo = _mm256_unpacklo_epi8(dst, _mm256_setzero_si256());
-    __m256i dst_hi = _mm256_unpackhi_epi8(dst, _mm256_setzero_si256());
-
-    __m256i srca_lo = _mm256_unpacklo_epi8(srca, _mm256_setzero_si256());
-    __m256i srca_hi = _mm256_unpackhi_epi8(srca, _mm256_setzero_si256());
-
-    // dst = ((src - dst) * srcA) + ((dst << 8) - dst)
-    dst_lo = _mm256_add_epi16(_mm256_mullo_epi16(_mm256_sub_epi16(src_lo, dst_lo), srca_lo),
-                              _mm256_sub_epi16(_mm256_slli_epi16(dst_lo, 8), dst_lo));
-    dst_hi = _mm256_add_epi16(_mm256_mullo_epi16(_mm256_sub_epi16(src_hi, dst_hi), srca_hi),
-                              _mm256_sub_epi16(_mm256_slli_epi16(dst_hi, 8), dst_hi));
-
-    // dst += 0x1U (use 0x80 to round instead of floor)
-    dst_lo = _mm256_add_epi16(dst_lo, _mm256_set1_epi16(1));
-    dst_hi = _mm256_add_epi16(dst_hi, _mm256_set1_epi16(1));
-
-    // dst += dst >> 8
-    dst_lo = _mm256_srli_epi16(_mm256_add_epi16(dst_lo, _mm256_srli_epi16(dst_lo, 8)), 8);
-    dst_hi = _mm256_srli_epi16(_mm256_add_epi16(dst_hi, _mm256_srli_epi16(dst_hi, 8)), 8);
-
-    dst = _mm256_packus_epi16(dst_lo, dst_hi);
-    return dst;
-}
-
 void SDL_TARGETING("avx2") BlitNtoNPixelAlpha_AVX2(SDL_BlitInfo *info)
 {
     int width = info->dst_w;
@@ -59,32 +17,64 @@ void SDL_TARGETING("avx2") BlitNtoNPixelAlpha_AVX2(SDL_BlitInfo *info)
     SDL_PixelFormat *srcfmt = info->src_fmt;
     SDL_PixelFormat *dstfmt = info->dst_fmt;
 
+    // The byte offsets for the start of each pixel
     const __m256i mask_offsets = _mm256_set_epi8(
         28, 28, 28, 28, 24, 24, 24, 24, 20, 20, 20, 20, 16, 16, 16, 16, 12, 12, 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0);
 
-    const __m256i shift_mask = _mm256_add_epi32(
+    const __m256i convert_mask = _mm256_add_epi32(
         _mm256_set1_epi32(
             ((srcfmt->Rshift >> 3) << dstfmt->Rshift) |
             ((srcfmt->Gshift >> 3) << dstfmt->Gshift) |
-            ((srcfmt->Bshift >> 3) << dstfmt->Bshift) |
-            ((srcfmt->Ashift >> 3) << dstfmt->Ashift)),
+            ((srcfmt->Bshift >> 3) << dstfmt->Bshift)),
         mask_offsets);
 
-    const __m256i splat_mask = _mm256_add_epi8(_mm256_set1_epi8(dstfmt->Ashift >> 3), mask_offsets);
-    const __m256i saturate_mask = _mm256_set1_epi32((int)dstfmt->Amask);
+    const __m256i alpha_splat_mask = _mm256_add_epi8(_mm256_set1_epi8(srcfmt->Ashift >> 3), mask_offsets);
+    const __m256i alpha_fill_mask = _mm256_set1_epi32((int)dstfmt->Amask);
 
     while (height--) {
         int i = 0;
 
         for (; i + 8 <= width; i += 8) {
-            // Load 8 src pixels and shuffle into the dst format
-            __m256i c_src = _mm256_shuffle_epi8(_mm256_loadu_si256((__m256i *)src), shift_mask);
+            // Load 8 src pixels
+            __m256i src256 = _mm256_loadu_si256((__m256i *)src);
 
             // Load 8 dst pixels
-            __m256i c_dst = _mm256_loadu_si256((__m256i *)dst);
+            __m256i dst256 = _mm256_loadu_si256((__m256i *)dst);
+
+            // Extract the alpha from each pixel and splat it into all the channels
+            __m256i srcA = _mm256_shuffle_epi8(src256, alpha_splat_mask);
+
+            // Convert to dst format
+            src256 = _mm256_shuffle_epi8(src256, convert_mask);
+
+            // Set the alpha channels of src to 255
+            src256 = _mm256_or_si256(src256, alpha_fill_mask);
+
+            __m256i src_lo = _mm256_unpacklo_epi8(src256, _mm256_setzero_si256());
+            __m256i src_hi = _mm256_unpackhi_epi8(src256, _mm256_setzero_si256());
+
+            __m256i dst_lo = _mm256_unpacklo_epi8(dst256, _mm256_setzero_si256());
+            __m256i dst_hi = _mm256_unpackhi_epi8(dst256, _mm256_setzero_si256());
+
+            __m256i srca_lo = _mm256_unpacklo_epi8(srcA, _mm256_setzero_si256());
+            __m256i srca_hi = _mm256_unpackhi_epi8(srcA, _mm256_setzero_si256());
+
+            // dst = ((src - dst) * srcA) + ((dst << 8) - dst)
+            dst_lo = _mm256_add_epi16(_mm256_mullo_epi16(_mm256_sub_epi16(src_lo, dst_lo), srca_lo),
+                                      _mm256_sub_epi16(_mm256_slli_epi16(dst_lo, 8), dst_lo));
+            dst_hi = _mm256_add_epi16(_mm256_mullo_epi16(_mm256_sub_epi16(src_hi, dst_hi), srca_hi),
+                                      _mm256_sub_epi16(_mm256_slli_epi16(dst_hi, 8), dst_hi));
+
+            // dst += 0x1U (use 0x80 to round instead of floor)
+            dst_lo = _mm256_add_epi16(dst_lo, _mm256_set1_epi16(1));
+            dst_hi = _mm256_add_epi16(dst_hi, _mm256_set1_epi16(1));
+
+            // dst = (dst + (dst >> 8)) >> 8
+            dst_lo = _mm256_srli_epi16(_mm256_add_epi16(dst_lo, _mm256_srli_epi16(dst_lo, 8)), 8);
+            dst_hi = _mm256_srli_epi16(_mm256_add_epi16(dst_hi, _mm256_srli_epi16(dst_hi, 8)), 8);
 
             // Blend the pixels together and save the result
-            _mm256_storeu_si256((__m256i *)dst, MixRGBA_AVX2(c_src, c_dst, splat_mask, saturate_mask));
+            _mm256_storeu_si256((__m256i *)dst, _mm256_packus_epi16(dst_lo, dst_hi));
 
             src += 32;
             dst += 32;
@@ -94,12 +84,29 @@ void SDL_TARGETING("avx2") BlitNtoNPixelAlpha_AVX2(SDL_BlitInfo *info)
             Uint32 src32 = *(Uint32 *)src;
             Uint32 dst32 = *(Uint32 *)dst;
 
+            Uint32 srcA = (src32 >> srcfmt->Ashift) & 0xFF;
+
             src32 = (((src32 >> srcfmt->Rshift) & 0xFF) << dstfmt->Rshift) |
                     (((src32 >> srcfmt->Gshift) & 0xFF) << dstfmt->Gshift) |
                     (((src32 >> srcfmt->Bshift) & 0xFF) << dstfmt->Bshift) |
-                    (((src32 >> srcfmt->Ashift) & 0xFF) << dstfmt->Ashift);
+                    dstfmt->Amask;
+
+            Uint32 srcRB = src32 & 0x00FF00FF;
+            Uint32 dstRB = dst32 & 0x00FF00FF;
+
+            Uint32 srcGA = (src32 >> 8) & 0x00FF00FF;
+            Uint32 dstGA = (dst32 >> 8) & 0x00FF00FF;
+
+            Uint32 resRB = ((srcRB - dstRB) * srcA) + (dstRB << 8) - dstRB;
+            resRB += 0x00010001;
+            resRB += (resRB >> 8) & 0x00FF00FF;
+            resRB = (resRB >> 8) & 0x00FF00FF;
 
-            ALPHA_BLEND_RGBA_4(src32, dst32, dstfmt->Ashift);
+            Uint32 resGA = ((srcGA - dstGA) * srcA) + (dstGA << 8) - dstGA;
+            resGA += 0x00010001;
+            resGA += (resGA >> 8) & 0x00FF00FF;
+            resGA &= 0xFF00FF00;
+            dst32 = resRB | resGA;
 
             *(Uint32 *)dst = dst32;
 
diff --git a/src/video/SDL_blit_A_sse4_1.c b/src/video/SDL_blit_A_sse4_1.c
index 3e6881943196e..425f5f028194b 100644
--- a/src/video/SDL_blit_A_sse4_1.c
+++ b/src/video/SDL_blit_A_sse4_1.c
@@ -6,48 +6,6 @@
 
 #include "SDL_blit.h"
 
-// Using the SSE4.1 instruction set, blit eight pixels into four with alpha blending
-SDL_FORCE_INLINE __m128i SDL_TARGETING("sse4.1") MixRGBA_SSE4_1(
-    __m128i src, __m128i dst,
-    const __m128i alpha_shuffle, const __m128i alpha_saturate)
-{
-    // SIMD implementation of blend_mul2.
-    // dstRGB                            = (srcRGB * srcA) + (dstRGB * (1-srcA))
-    // dstA   = srcA + (dstA * (1-srcA)) = (1      * srcA) + (dstA   * (1-srcA))
-
-    // Splat the alpha into all channels for each pixel
-    __m128i srca = _mm_shuffle_epi8(src, alpha_shuffle);
-
-    // Set the alpha channels of src to 255
-    src = _mm_or_si128(src, alpha_saturate);
-
-    __m128i src_lo = _mm_unpacklo_epi8(src, _mm_setzero_si128());
-    __m128i src_hi = _mm_unpackhi_epi8(src, _mm_setzero_si128());
-
-    __m128i dst_lo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
-    __m128i dst_hi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
-
-    __m128i srca_lo = _mm_unpacklo_epi8(srca, _mm_setzero_si128());
-    __m128i srca_hi = _mm_unpackhi_epi8(srca, _mm_setzero_si128());
-
-    // dst = ((src - dst) * srcA) + ((dst << 8) - dst)
-    dst_lo = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_lo, dst_lo), srca_lo),
-                           _mm_sub_epi16(_mm_slli_epi16(dst_lo, 8), dst_lo));
-    dst_hi = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_hi, dst_hi), srca_hi),
-                           _mm_sub_epi16(_mm_slli_epi16(dst_hi, 8), dst_hi));
-
-    // dst += 0x1U (use 0x80 to round instead of floor)
-    dst_lo = _mm_add_epi16(dst_lo, _mm_set1_epi16(1));
-    dst_hi = _mm_add_epi16(dst_hi, _mm_set1_epi16(1));
-
-    // dst += dst >> 8
-    dst_lo = _mm_srli_epi16(_mm_add_epi16(dst_lo, _mm_srli_epi16(dst_lo, 8)), 8);
-    dst_hi = _mm_srli_epi16(_mm_add_epi16(dst_hi, _mm_srli_epi16(dst_hi, 8)), 8);
-
-    dst = _mm_packus_epi16(dst_lo, dst_hi);
-    return dst;
-}
-
 void SDL_TARGETING("sse4.1") BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo *info)
 {
     int width = info->dst_w;
@@ -59,32 +17,64 @@ void SDL_TARGETING("sse4.1") BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo *info)
     SDL_PixelFormat *srcfmt = info->src_fmt;
     SDL_PixelFormat *dstfmt = info->dst_fmt;
 
+    // The byte offsets for the start of each pixel
     const __m128i mask_offsets = _mm_set_epi8(
         12, 12, 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0);
 
-    const __m128i shift_mask = _mm_add_epi32(
+    const __m128i convert_mask = _mm_add_epi32(
         _mm_set1_epi32(
             ((srcfmt->Rshift >> 3) << dstfmt->Rshift) |
             ((srcfmt->Gshift >> 3) << dstfmt->Gshift) |
-            ((srcfmt->Bshift >> 3) << dstfmt->Bshift) |
-            ((srcfmt->Ashift >> 3) << dstfmt->Ashift)),
+            ((srcfmt->Bshift >> 3) << dstfmt->Bshift)),
         mask_offsets);
 
-    const __m128i splat_mask = _mm_add_epi8(_mm_set1_epi8(dstfmt->Ashift >> 3), mask_offsets);
-    const __m128i saturate_mask = _mm_set1_epi32((int)dstfmt->Amask);
+    const __m128i alpha_splat_mask = _mm_add_epi8(_mm_set1_epi8(srcfmt->Ashift >> 3), mask_offsets);
+    const __m128i alpha_fill_mask = _mm_set1_epi32((int)dstfmt->Amask);
 
     while (height--) {
         int i = 0;
 
         for (; i + 4 <= width; i += 4) {
-            // Load 4 src pixels and shuffle into the dst format
-            __m128i c_src = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)src), shift_mask);
+            // Load 4 src pixels
+            __m128i src128 = _mm_loadu_si128((__m128i *)src);
 
             // Load 4 dst pixels
-            __m128i c_dst = _mm_loadu_si128((__m128i *)dst);
+            __m128i dst128 = _mm_loadu_si128((__m128i *)dst);
+
+            // Extract the alpha from each pixel and splat it into all the channels
+            __m128i srcA = _mm_shuffle_epi8(src128, alpha_splat_mask);
+
+            // Convert to dst format
+            src128 = _mm_shuffle_epi8(src128, convert_mask);
+
+            // Set the alpha channels of src to 255
+            src128 = _mm_or_si128(src128, alpha_fill_mask);
+
+            __m128i src_lo = _mm_unpacklo_epi8(src128, _mm_setzero_si128());
+            __m128i src_hi = _mm_unpackhi_epi8(src128, _mm_setzero_si128());
+
+            __m128i dst_lo = _mm_unpacklo_epi8(dst128, _mm_setzero_si128());
+            __m128i dst_hi = _mm_unpackhi_epi8(dst128, _mm_setzero_si128());
+
+            __m128i srca_lo = _mm_unpacklo_epi8(srcA, _mm_setzero_si128());
+            __m128i srca_hi = _mm_unpackhi_epi8(srcA, _mm_setzero_si128());
+
+            // dst = ((src - dst) * srcA) + ((dst << 8) - dst)
+            dst_lo = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_lo, dst_lo), srca_lo),
+                                      _mm_sub_epi16(_mm_slli_epi16(dst_lo, 8), dst_lo));
+            dst_hi = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_hi, dst_hi), srca_hi),
+                                      _mm_sub_epi16(_mm_slli_epi16(dst_hi, 8), dst_hi));
+
+            // dst += 0x1U (use 0x80 to round instead of floor)
+            dst_lo = _mm_add_epi16(dst_lo, _mm_set1_epi16(1));
+            dst_hi = _mm_add_epi16(dst_hi, _mm_set1_epi16(1));
+
+            // dst = (dst + (dst >> 8)) >> 8
+            dst_lo = _mm_srli_epi16(_mm_add_epi16(dst_lo, _mm_srli_epi16(dst_lo, 8)), 8);
+            dst_hi = _mm_srli_epi16(_mm_add_epi16(dst_hi, _mm_srli_epi16(dst_hi, 8)), 8);
 
             // Blend the pixels together and save the result
-            _mm_storeu_si128((__m128i *)dst, MixRGBA_SSE4_1(c_src, c_dst, splat_mask, saturate_mask));
+            _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(dst_lo, dst_hi));
 
             src += 16;
             dst += 16;
@@ -94,12 +84,29 @@ void SDL_TARGETING("sse4.1") BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo *info)
             Uint32 src32 = *(Uint32 *)src;
             Uint32 dst32 = *(Uint32 *)dst;
 
+            Uint32 srcA = (src32 >> srcfmt->Ashift) & 0xFF;
+
             src32 = (((src32 >> srcfmt->Rshift) & 0xFF) << dstfmt->Rshift) |
                     (((src32 >> srcfmt->Gshift) & 0xFF) << dstfmt->Gshift) |
                     (((src32 >> srcfmt->Bshift) & 0xFF) << dstfmt->Bshift) |
-                    (((src32 >> srcfmt->Ashift) & 0xFF) << dstfmt->Ashift);
+                    dstfmt->Amask;
+
+            Uint32 srcRB = src32 & 0x00FF00FF;
+            Uint32 dstRB = dst32 & 0x00FF00FF;
+
+            Uint32 srcGA = (src32 >> 8) & 0x00FF00FF;
+            Uint32 dstGA = (dst32 >> 8) & 0x00FF00FF;
+
+            Uint32 resRB = ((srcRB - dstRB) * srcA) + (dstRB << 8) - dstRB;
+            resRB += 0x00010001;
+            resRB += (resRB >> 8) & 0x00FF00FF;
+            resRB = (resRB >> 8) & 0x00FF00FF;
 
-            ALPHA_BLEND_RGBA_4(src32, dst32, dstfmt->Ashift);
+            Uint32 resGA = ((srcGA - dstGA) * srcA) + (dstGA << 8) - dstGA;
+            resGA += 0x00010001;
+            resGA += (resGA >> 8) & 0x00FF00FF;
+            resGA &= 0xFF00FF00;
+            dst32 = resRB | resGA;
 
             *(Uint32 *)dst = dst32;