SDL: Implement alpha blend as macro and replace inaccurate blitters

From e5bbe32641a48a2cdb7eeb8246381b375543a929 Mon Sep 17 00:00:00 2001
From: Isaac Aronson <[EMAIL REDACTED]>
Date: Wed, 13 Sep 2023 17:35:50 -0500
Subject: [PATCH] Implement alpha blend as macro and replace inaccurate
 blitters

---
 src/video/SDL_blit.h          |  27 ++++---
 src/video/SDL_blit_A.c        | 140 ++++------------------------------
 src/video/SDL_blit_A_avx2.c   |   4 +-
 src/video/SDL_blit_A_sse4_1.c |   4 +-
 4 files changed, 38 insertions(+), 137 deletions(-)

diff --git a/src/video/SDL_blit.h b/src/video/SDL_blit.h
index 90588153adbe7..04525d7ee080c 100644
--- a/src/video/SDL_blit.h
+++ b/src/video/SDL_blit.h
@@ -493,21 +493,30 @@ extern SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface);
         }                                             \
     }
 
+/* Blend a single color channel or alpha value in place, without dividing:
+ * dC = (sC * sA + dC * (255 - sA)) / 255 for 8-bit inputs; dC is expanded more than once. */
+#define ALPHA_BLEND_CHANNEL(sC, dC, sA)                                  \
+    do {                                                                 \
+        Uint16 blend_ = (((sC) - (dC)) * (sA)) + (((dC) << 8) - (dC));   \
+        blend_ += 0x1U;                                                  \
+        blend_ += blend_ >> 8;                                           \
+        (dC) = blend_ >> 8;                                              \
+    } while (0)
 /* Blend the RGB values of two pixels with an alpha value */
 #define ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB)            \
     do {                                                      \
-        dR = (Uint8)((((int)(sR - dR) * (int)A) / 255) + dR); \
-        dG = (Uint8)((((int)(sG - dG) * (int)A) / 255) + dG); \
-        dB = (Uint8)((((int)(sB - dB) * (int)A) / 255) + dB); \
+        ALPHA_BLEND_CHANNEL(sR, dR, A);                       \
+        ALPHA_BLEND_CHANNEL(sG, dG, A);                       \
+        ALPHA_BLEND_CHANNEL(sB, dB, A);                       \
     } while (0)
 
 /* Blend the RGBA values of two pixels */
-#define ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA)       \
-    do {                                                       \
-        dR = (Uint8)((((int)(sR - dR) * (int)sA) / 255) + dR); \
-        dG = (Uint8)((((int)(sG - dG) * (int)sA) / 255) + dG); \
-        dB = (Uint8)((((int)(sB - dB) * (int)sA) / 255) + dB); \
-        dA = (Uint8)((int)sA + dA - ((int)sA * dA) / 255);     \
+#define ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA) \
+    do {                                                 \
+        ALPHA_BLEND_CHANNEL(sR, dR, sA);                 \
+        ALPHA_BLEND_CHANNEL(sG, dG, sA);                 \
+        ALPHA_BLEND_CHANNEL(sB, dB, sA);                 \
+        ALPHA_BLEND_CHANNEL(255, dA, sA);                \
     } while (0)
 
 /* This is a very useful loop for optimizing blitters */
diff --git a/src/video/SDL_blit_A.c b/src/video/SDL_blit_A.c
index 7f272fd0905cc..3707ca891c964 100644
--- a/src/video/SDL_blit_A.c
+++ b/src/video/SDL_blit_A.c
@@ -460,22 +460,24 @@ static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
         int dstskip = info->dst_skip >> 2;
         Uint32 s;
         Uint32 d;
-        Uint32 s1;
-        Uint32 d1;
 
         while (height--) {
             /* *INDENT-OFF* */ /* clang-format off */
             DUFFS_LOOP4({
                 s = *srcp;
                 d = *dstp;
-                s1 = s & 0xff00ff;
-                d1 = d & 0xff00ff;
-                d1 = (d1 + ((s1 - d1) * alpha >> 8))
-                     & 0xff00ff;
-                s &= 0xff00;
-                d &= 0xff00;
-                d = (d + ((s - d) * alpha >> 8)) & 0xff00;
-                *dstp = d1 | d | 0xff000000;
+                Uint8 sR = (s >> 16) & 0xFF;
+                Uint8 sG = (s >> 8) & 0xFF;
+                Uint8 sB = s & 0xFF;
+                Uint8 dR = (d >> 16) & 0xFF;
+                Uint8 dG = (d >> 8) & 0xFF;
+                Uint8 dB = d & 0xFF;
+
+                ALPHA_BLEND_CHANNEL(sR, dR, alpha);
+                ALPHA_BLEND_CHANNEL(sG, dG, alpha);
+                ALPHA_BLEND_CHANNEL(sB, dB, alpha);
+
+                *dstp = (dR << 16) | (dG << 8) | dB | 0xFF000000;
                 ++srcp;
                 ++dstp;
             }, width);
@@ -950,97 +952,6 @@ static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
     }
 }
 
-/* fast ARGB8888->RGB565 blending with pixel alpha */
-static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
-{
-    int width = info->dst_w;
-    int height = info->dst_h;
-    Uint32 *srcp = (Uint32 *)info->src;
-    int srcskip = info->src_skip >> 2;
-    Uint16 *dstp = (Uint16 *)info->dst;
-    int dstskip = info->dst_skip >> 1;
-
-    while (height--) {
-        /* *INDENT-OFF* */ /* clang-format off */
-        DUFFS_LOOP4({
-        Uint32 s = *srcp;
-        unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
-        /* FIXME: Here we special-case opaque alpha since the
-           compositioning used (>>8 instead of /255) doesn't handle
-           it correctly. Also special-case alpha=0 for speed?
-           Benchmark this! */
-        if (alpha) {
-          if (alpha == (SDL_ALPHA_OPAQUE >> 3)) {
-            *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
-          } else {
-            Uint32 d = *dstp;
-            /*
-             * convert source and destination to G0RAB65565
-             * and blend all components at the same time
-             */
-            s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
-              + (s >> 3 & 0x1f);
-            d = (d | d << 16) & 0x07e0f81f;
-            d += (s - d) * alpha >> 5;
-            d &= 0x07e0f81f;
-            *dstp = (Uint16)(d | d >> 16);
-          }
-        }
-        srcp++;
-        dstp++;
-        }, width);
-        /* *INDENT-ON* */ /* clang-format on */
-        srcp += srcskip;
-        dstp += dstskip;
-    }
-}
-
-/* fast ARGB8888->RGB555 blending with pixel alpha */
-static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
-{
-    int width = info->dst_w;
-    int height = info->dst_h;
-    Uint32 *srcp = (Uint32 *)info->src;
-    int srcskip = info->src_skip >> 2;
-    Uint16 *dstp = (Uint16 *)info->dst;
-    int dstskip = info->dst_skip >> 1;
-
-    while (height--) {
-        /* *INDENT-OFF* */ /* clang-format off */
-        DUFFS_LOOP4({
-        unsigned alpha;
-        Uint32 s = *srcp;
-        alpha = s >> 27; /* downscale alpha to 5 bits */
-        /* FIXME: Here we special-case opaque alpha since the
-           compositioning used (>>8 instead of /255) doesn't handle
-           it correctly. Also special-case alpha=0 for speed?
-           Benchmark this! */
-        if (alpha) {
-          if (alpha == (SDL_ALPHA_OPAQUE >> 3)) {
-            *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
-          } else {
-            Uint32 d = *dstp;
-            /*
-             * convert source and destination to G0RAB65565
-             * and blend all components at the same time
-             */
-            s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
-              + (s >> 3 & 0x1f);
-            d = (d | d << 16) & 0x03e07c1f;
-            d += (s - d) * alpha >> 5;
-            d &= 0x03e07c1f;
-            *dstp = (Uint16)(d | d >> 16);
-          }
-        }
-        srcp++;
-        dstp++;
-        }, width);
-        /* *INDENT-ON* */ /* clang-format on */
-        srcp += srcskip;
-        dstp += dstskip;
-    }
-}
-
 /* General (slow) N->N blending with per-surface alpha */
 static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
 {
@@ -1119,15 +1030,6 @@ static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
     }
 }
 
-/* Accurate alpha blending with no division */
-static Uint8 AlphaBlendChannel(Uint8 sC, Uint8 dC, Uint8 sA)
-{
-    Uint16 x = ((sC - dC) * sA) + ((dC << 8) - dC);
-    x += 0x1U; // Use 0x80 to round instead of floor
-    x += x >> 8;
-    return x >> 8;
-}
-
 /* General (slow) N->N blending with pixel alpha */
 static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
 {
@@ -1141,7 +1043,7 @@ static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
     SDL_PixelFormat *dstfmt = info->dst_fmt;
     int srcbpp;
     int dstbpp;
-    int freeFormat;
+    SDL_bool freeFormat;
     Uint32 Pixel;
     unsigned sR, sG, sB, sA;
     unsigned dR, dG, dB, dA;
@@ -1149,7 +1051,7 @@ static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
     /* Set up some basic variables */
     srcbpp = srcfmt->bytes_per_pixel;
     dstbpp = dstfmt->bytes_per_pixel;
-    freeFormat = 0;
+    freeFormat = SDL_FALSE;
 
 #ifdef SDL_AVX2_INTRINSICS
     if (srcbpp == 4 && dstbpp == 4 && width >= 4 && SDL_HasAVX2()) {
@@ -1167,7 +1069,7 @@ static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
     /* Handle case where bad input sent */
     if (dstfmt->Ashift == 0 && dstfmt->Ashift == dstfmt->Bshift) {
         dstfmt = SDL_CreatePixelFormat(SDL_PIXELFORMAT_ARGB8888);
-        freeFormat = 1;
+        freeFormat = SDL_TRUE;
     }
 
     while (height--) {
@@ -1177,10 +1079,7 @@ static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
         DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
         if (sA) {
             DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
-            dR = AlphaBlendChannel(sR, dR, sA);
-            dG = AlphaBlendChannel(sG, dG, sA);
-            dB = AlphaBlendChannel(sB, dB, sA);
-            dA = AlphaBlendChannel(255, dA, sA);
+            ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
             ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
         }
         src += srcbpp;
@@ -1214,13 +1113,6 @@ SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface)
             }
 
         case 2:
-            if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000 && sf->Gmask == 0xff00 && ((sf->Rmask == 0xff && df->Rmask == 0x1f) || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
-                if (df->Gmask == 0x7e0) {
-                    return BlitARGBto565PixelAlpha;
-                } else if (df->Gmask == 0x3e0) {
-                    return BlitARGBto555PixelAlpha;
-                }
-            }
             return BlitNtoNPixelAlpha;
 
         case 4:
diff --git a/src/video/SDL_blit_A_avx2.c b/src/video/SDL_blit_A_avx2.c
index 8f4b3f3561cf4..04b5851c2de26 100644
--- a/src/video/SDL_blit_A_avx2.c
+++ b/src/video/SDL_blit_A_avx2.c
@@ -113,11 +113,11 @@ void SDL_TARGETING("avx2") BlitNtoNPixelAlpha_AVX2(SDL_BlitInfo *info)
     SDL_PixelFormat *dstfmt = info->dst_fmt;
 
     int chunks = width / 8;
-    int free_format = 0;
+    SDL_bool free_format = SDL_FALSE;
     /* Handle case when passed invalid format, assume ARGB destination */
     if (dstfmt->Ashift == 0 && dstfmt->Ashift == dstfmt->Bshift) {
         dstfmt = SDL_CreatePixelFormat(SDL_PIXELFORMAT_ARGB8888);
-        free_format = 1;
+        free_format = SDL_TRUE;
     }
     const __m256i shift_mask = GetSDL_PixelFormatShuffleMask_AVX2(srcfmt, dstfmt);
     const __m256i splat_mask = GetSDL_PixelFormatAlphaSplatMask_AVX2(dstfmt);
diff --git a/src/video/SDL_blit_A_sse4_1.c b/src/video/SDL_blit_A_sse4_1.c
index e243561d8b6b4..fae70c6cb9f76 100644
--- a/src/video/SDL_blit_A_sse4_1.c
+++ b/src/video/SDL_blit_A_sse4_1.c
@@ -126,11 +126,11 @@ void SDL_TARGETING("sse4.1") BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo* info) {
     SDL_PixelFormat *dstfmt = info->dst_fmt;
 
     const int chunks = width / 4;
-    int free_format = 0;
+    SDL_bool free_format = SDL_FALSE;
     /* Handle case when passed invalid format, assume ARGB destination */
     if (dstfmt->Ashift == 0 && dstfmt->Ashift == dstfmt->Bshift) {
         dstfmt = SDL_CreatePixelFormat(SDL_PIXELFORMAT_ARGB8888);
-        free_format = 1;
+        free_format = SDL_TRUE;
     }
     const __m128i shift_mask = GetSDL_PixelFormatShuffleMask_SSE4_1(srcfmt, dstfmt);
     const __m128i splat_mask = GetSDL_PixelFormatAlphaSplatMask_SSE4_1(dstfmt);