SDL: Replace BlitRGBtoRGBSurfaceAlphaMMX

From d6045551420788a1ed2340182ec03907d2f4ce0e Mon Sep 17 00:00:00 2001
From: Brick <[EMAIL REDACTED]>
Date: Sun, 7 Jul 2024 00:24:25 +0100
Subject: [PATCH] Replace BlitRGBtoRGBSurfaceAlphaMMX

---
 src/video/SDL_blit_A.c | 184 ++++++++++++-----------------------------
 1 file changed, 53 insertions(+), 131 deletions(-)

diff --git a/src/video/SDL_blit_A.c b/src/video/SDL_blit_A.c
index d8d6d80c7f9d5..d87e752478a72 100644
--- a/src/video/SDL_blit_A.c
+++ b/src/video/SDL_blit_A.c
@@ -166,158 +166,80 @@ static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
     }
 }
 
-#ifdef SDL_MMX_INTRINSICS
+#ifdef SDL_SSE2_INTRINSICS
 
-/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
-static void SDL_TARGETING("mmx") BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
+static void SDL_TARGETING("sse2") Blit888to888SurfaceAlphaSSE2(SDL_BlitInfo *info)
 {
     int width = info->dst_w;
     int height = info->dst_h;
-    Uint32 *srcp = (Uint32 *)info->src;
-    int srcskip = info->src_skip >> 2;
-    Uint32 *dstp = (Uint32 *)info->dst;
-    int dstskip = info->dst_skip >> 2;
-    Uint32 dalpha = info->dst_fmt->Amask;
-
-    __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
+    Uint8 *src = info->src;
+    int srcskip = info->src_skip;
+    Uint8 *dst = info->dst;
+    int dstskip = info->dst_skip;
+    Uint8 alpha = info->a;
 
-    hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */
-    lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */
-    dsta = _mm_set_pi32(dalpha, dalpha);          /* dst alpha mask -> dsta */
+    const __m128i alpha_fill_mask = _mm_set1_epi32((int)0xff000000);
+    const __m128i srcA = _mm_set1_epi16(alpha);
 
     while (height--) {
-        int n = width;
-        if (n & 1) {
-            Uint32 s = *srcp++;
-            Uint32 d = *dstp;
-            *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1) + (s & d & 0x00010101)) | dalpha;
-            n--;
-        }
-
-        for (n >>= 1; n > 0; --n) {
-            dst1 = *(__m64 *)dstp; /* 2 x dst -> dst1(ARGBARGB) */
-            dst2 = dst1;           /* 2 x dst -> dst2(ARGBARGB) */
-
-            src1 = *(__m64 *)srcp; /* 2 x src -> src1(ARGBARGB) */
-            src2 = src1;           /* 2 x src -> src2(ARGBARGB) */
-
-            dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */
-            src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */
-            src2 = _mm_add_pi32(src2, dst2);  /* dst2 + src2 -> src2 */
-            src2 = _mm_srli_pi32(src2, 1);    /* src2 >> 1 -> src2 */
-
-            dst1 = _mm_and_si64(dst1, src1);  /* src & dst -> dst1 */
-            dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */
-            dst1 = _mm_add_pi32(dst1, src2);  /* src2 + dst1 -> dst1 */
-            dst1 = _mm_or_si64(dst1, dsta);   /* dsta(full alpha) | dst1 -> dst1 */
-
-            *(__m64 *)dstp = dst1; /* dst1 -> 2 x dst pixels */
-            dstp += 2;
-            srcp += 2;
-        }
-
-        srcp += srcskip;
-        dstp += dstskip;
-    }
-    _mm_empty();
-}
+        int i = 0;
 
-/* fast RGB888->(A)RGB888 blending with surface alpha */
-static void SDL_TARGETING("mmx") BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
-{
-    SDL_PixelFormat *df = info->dst_fmt;
-    Uint32 chanmask;
-    unsigned alpha = info->a;
+        for (; i + 4 <= width; i += 4) {
+            // Load 4 src pixels
+            __m128i src128 = _mm_loadu_si128((__m128i *)src);
 
-    if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
-        /* only call a128 version when R,G,B occupy lower bits */
-        BlitRGBtoRGBSurfaceAlpha128MMX(info);
-    } else {
-        int width = info->dst_w;
-        int height = info->dst_h;
-        Uint32 *srcp = (Uint32 *)info->src;
-        int srcskip = info->src_skip >> 2;
-        Uint32 *dstp = (Uint32 *)info->dst;
-        int dstskip = info->dst_skip >> 2;
-        Uint32 dalpha = df->Amask;
-        Uint32 amult;
-
-        __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
-
-        mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
-        /* form the alpha mult */
-        amult = alpha | (alpha << 8);
-        amult = amult | (amult << 16);
-        chanmask =
-            (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
-        mm_alpha = _mm_set_pi32(0, amult & chanmask);   /* 0000AAAA -> mm_alpha, minus 1 chan */
-        mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
-        /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
-        dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
+            // Load 4 dst pixels
+            __m128i dst128 = _mm_loadu_si128((__m128i *)dst);
 
-        while (height--) {
-            int n = width;
-            if (n & 1) {
-                /* One Pixel Blend */
-                src2 = _mm_cvtsi32_si64(*srcp);         /* src(ARGB) -> src2 (0000ARGB) */
-                src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
+            __m128i src_lo = _mm_unpacklo_epi8(src128, _mm_setzero_si128());
+            __m128i src_hi = _mm_unpackhi_epi8(src128, _mm_setzero_si128());
 
-                dst1 = _mm_cvtsi32_si64(*dstp);         /* dst(ARGB) -> dst1 (0000ARGB) */
-                dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
+            __m128i dst_lo = _mm_unpacklo_epi8(dst128, _mm_setzero_si128());
+            __m128i dst_hi = _mm_unpackhi_epi8(dst128, _mm_setzero_si128());
 
-                src2 = _mm_sub_pi16(src2, dst1);       /* src2 - dst2 -> src2 */
-                src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
-                src2 = _mm_srli_pi16(src2, 8);         /* src2 >> 8 -> src2 */
-                dst1 = _mm_add_pi8(src2, dst1);        /* src2 + dst1 -> dst1 */
+            // dst = ((src - dst) * srcA) + ((dst << 8) - dst)
+            dst_lo = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_lo, dst_lo), srcA),
+                                      _mm_sub_epi16(_mm_slli_epi16(dst_lo, 8), dst_lo));
+            dst_hi = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_hi, dst_hi), srcA),
+                                      _mm_sub_epi16(_mm_slli_epi16(dst_hi, 8), dst_hi));
 
-                dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
-                dst1 = _mm_or_si64(dst1, dsta);       /* dsta | dst1 -> dst1 */
-                *dstp = _mm_cvtsi64_si32(dst1);       /* dst1 -> pixel */
+            // dst += 0x1U (use 0x80 to round instead of floor)
+            dst_lo = _mm_add_epi16(dst_lo, _mm_set1_epi16(1));
+            dst_hi = _mm_add_epi16(dst_hi, _mm_set1_epi16(1));
 
-                ++srcp;
-                ++dstp;
+            // dst = (dst + (dst >> 8)) >> 8
+            dst_lo = _mm_srli_epi16(_mm_add_epi16(dst_lo, _mm_srli_epi16(dst_lo, 8)), 8);
+            dst_hi = _mm_srli_epi16(_mm_add_epi16(dst_hi, _mm_srli_epi16(dst_hi, 8)), 8);
 
-                n--;
-            }
+            dst128 = _mm_packus_epi16(dst_lo, dst_hi);
 
-            for (n >>= 1; n > 0; --n) {
-                /* Two Pixels Blend */
-                src1 = *(__m64 *)srcp;                  /* 2 x src -> src1(ARGBARGB) */
-                src2 = src1;                            /* 2 x src -> src2(ARGBARGB) */
-                src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
-                src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
+            // Set the alpha channels of dst to 255
+            dst128 = _mm_or_si128(dst128, alpha_fill_mask);
 
-                dst1 = *(__m64 *)dstp;                  /* 2 x dst -> dst1(ARGBARGB) */
-                dst2 = dst1;                            /* 2 x dst -> dst2(ARGBARGB) */
-                dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
-                dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
+            _mm_storeu_si128((__m128i *)dst, dst128);
 
-                src1 = _mm_sub_pi16(src1, dst1);       /* src1 - dst1 -> src1 */
-                src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */
-                src1 = _mm_srli_pi16(src1, 8);         /* src1 >> 8 -> src1 */
-                dst1 = _mm_add_pi8(src1, dst1);        /* src1 + dst1(dst1) -> dst1 */
+            src += 16;
+            dst += 16;
+        }
 
-                src2 = _mm_sub_pi16(src2, dst2);       /* src2 - dst2 -> src2 */
-                src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
-                src2 = _mm_srli_pi16(src2, 8);         /* src2 >> 8 -> src2 */
-                dst2 = _mm_add_pi8(src2, dst2);        /* src2 + dst2(dst2) -> dst2 */
+        for (; i < width; ++i) {
+            Uint32 src32 = *(Uint32 *)src;
+            Uint32 dst32 = *(Uint32 *)dst;
 
-                dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
-                dst1 = _mm_or_si64(dst1, dsta);    /* dsta | dst1 -> dst1 */
+            FACTOR_BLEND_8888(src32, dst32, alpha);
 
-                *(__m64 *)dstp = dst1; /* dst1 -> 2 x pixel */
+            *dst = dst32 | 0xff000000;
 
-                srcp += 2;
-                dstp += 2;
-            }
-            srcp += srcskip;
-            dstp += dstskip;
+            src += 4;
+            dst += 4;
         }
-        _mm_empty();
+
+        src += srcskip;
+        dst += dstskip;
     }
 }
 
-#endif /* SDL_MMX_INTRINSICS */
+#endif
 
 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
 static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
@@ -1133,7 +1055,7 @@ static void SDL_TARGETING("sse4.1") Blit8888to8888PixelAlphaSwizzleSSE41(SDL_Bli
             dst_lo = _mm_add_epi16(dst_lo, _mm_set1_epi16(1));
             dst_hi = _mm_add_epi16(dst_hi, _mm_set1_epi16(1));
 
-            // dst += dst >> 8
+            // dst = (dst + (dst >> 8)) >> 8
             dst_lo = _mm_srli_epi16(_mm_add_epi16(dst_lo, _mm_srli_epi16(dst_lo, 8)), 8);
             dst_hi = _mm_srli_epi16(_mm_add_epi16(dst_hi, _mm_srli_epi16(dst_hi, 8)), 8);
 
@@ -1225,7 +1147,7 @@ static void SDL_TARGETING("avx2") Blit8888to8888PixelAlphaSwizzleAVX2(SDL_BlitIn
             dst_lo = _mm256_add_epi16(dst_lo, _mm256_set1_epi16(1));
             dst_hi = _mm256_add_epi16(dst_hi, _mm256_set1_epi16(1));
 
-            // dst += dst >> 8
+            // dst = (dst + (dst >> 8)) >> 8
             dst_lo = _mm256_srli_epi16(_mm256_add_epi16(dst_lo, _mm256_srli_epi16(dst_lo, 8)), 8);
             dst_hi = _mm256_srli_epi16(_mm256_add_epi16(dst_hi, _mm256_srli_epi16(dst_hi, 8)), 8);
 
@@ -1384,9 +1306,9 @@ SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface)
 
             case 4:
                 if (sf->Rmask == df->Rmask && sf->Gmask == df->Gmask && sf->Bmask == df->Bmask && sf->bytes_per_pixel == 4) {
-#ifdef SDL_MMX_INTRINSICS
-                    if (sf->Rshift % 8 == 0 && sf->Gshift % 8 == 0 && sf->Bshift % 8 == 0 && SDL_HasMMX()) {
-                        return BlitRGBtoRGBSurfaceAlphaMMX;
+#ifdef SDL_SSE2_INTRINSICS
+                    if (sf->Rshift % 8 == 0 && sf->Gshift % 8 == 0 && sf->Bshift % 8 == 0 && SDL_HasSSE2()) {
+                        return Blit888to888SurfaceAlphaSSE2;
                     }
 #endif
                     if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) {