SDL: Implement mathematically correct scalar blitters

From f5ee88c2cfd60df3e418c576991be2002ec37caf Mon Sep 17 00:00:00 2001
From: Isaac Aronson <[EMAIL REDACTED]>
Date: Tue, 12 Sep 2023 22:28:08 -0500
Subject: [PATCH] Implement mathematically correct scalar blitters

---
 src/video/SDL_blit_A.c        | 135 ++++++----------------------------
 src/video/SDL_blit_A_avx2.c   |  10 ++-
 src/video/SDL_blit_A_sse4_1.c |  10 ++-
 test/testautomation_blit.c    |  25 +++----
 4 files changed, 46 insertions(+), 134 deletions(-)

diff --git a/src/video/SDL_blit_A.c b/src/video/SDL_blit_A.c
index 1cdbaf35909a2..ed3c941e04c46 100644
--- a/src/video/SDL_blit_A.c
+++ b/src/video/SDL_blit_A.c
@@ -546,112 +546,6 @@ static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
     }
 }
 
-/* fast ARGB888->(A)RGB888 blending with pixel alpha */
-static void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info)
-{
-    int width = info->dst_w;
-    int height = info->dst_h;
-    Uint32 *srcp = (Uint32 *)info->src;
-    int srcskip = info->src_skip >> 2;
-    Uint32 *dstp = (Uint32 *)info->dst;
-    int dstskip = info->dst_skip >> 2;
-
-    while (height--) {
-        /* *INDENT-OFF* */ /* clang-format off */
-        DUFFS_LOOP4({
-        Uint32 dalpha;
-        Uint32 d;
-        Uint32 s1;
-        Uint32 d1;
-        Uint32 s = *srcp;
-        Uint32 alpha = s >> 24;
-        /* FIXME: Here we special-case opaque alpha since the
-           compositioning used (>>8 instead of /255) doesn't handle
-           it correctly. Also special-case alpha=0 for speed?
-           Benchmark this! */
-        if (alpha) {
-          if (alpha == SDL_ALPHA_OPAQUE) {
-              *dstp = *srcp;
-          } else {
-            /*
-             * take out the middle component (green), and process
-             * the other two in parallel. One multiply less.
-             */
-            d = *dstp;
-            dalpha = d >> 24;
-            s1 = s & 0xff00ff;
-            d1 = d & 0xff00ff;
-            d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
-            s &= 0xff00;
-            d &= 0xff00;
-            d = (d + ((s - d) * alpha >> 8)) & 0xff00;
-            dalpha = alpha + (dalpha * (alpha ^ 0xFF) >> 8);
-            *dstp = d1 | d | (dalpha << 24);
-          }
-        }
-        ++srcp;
-        ++dstp;
-        }, width);
-        /* *INDENT-ON* */ /* clang-format on */
-        srcp += srcskip;
-        dstp += dstskip;
-    }
-}
-
-/* fast ARGB888->(A)BGR888 blending with pixel alpha */
-static void BlitRGBtoBGRPixelAlpha(SDL_BlitInfo *info)
-{
-    int width = info->dst_w;
-    int height = info->dst_h;
-    Uint32 *srcp = (Uint32 *)info->src;
-    int srcskip = info->src_skip >> 2;
-    Uint32 *dstp = (Uint32 *)info->dst;
-    int dstskip = info->dst_skip >> 2;
-
-    while (height--) {
-        /* *INDENT-OFF* */ /* clang-format off */
-        DUFFS_LOOP4({
-        Uint32 dalpha;
-        Uint32 d;
-        Uint32 s1;
-        Uint32 d1;
-        Uint32 s = *srcp;
-        Uint32 alpha = s >> 24;
-        /* FIXME: Here we special-case opaque alpha since the
-           compositioning used (>>8 instead of /255) doesn't handle
-           it correctly. Also special-case alpha=0 for speed?
-           Benchmark this! */
-        if (alpha) {
-          /*
-           * take out the middle component (green), and process
-           * the other two in parallel. One multiply less.
-           */
-          s1 = s & 0xff00ff;
-          s1 = (s1 >> 16) | (s1 << 16);
-          s &= 0xff00;
-
-          if (alpha == SDL_ALPHA_OPAQUE) {
-              *dstp = 0xff000000 | s | s1;
-          } else {
-            d = *dstp;
-            dalpha = d >> 24;
-            d1 = d & 0xff00ff;
-            d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
-            d &= 0xff00;
-            d = (d + ((s - d) * alpha >> 8)) & 0xff00;
-            dalpha = alpha + (dalpha * (alpha ^ 0xFF) >> 8);
-            *dstp = d1 | d | (dalpha << 24);
-          }
-        }
-        ++srcp;
-        ++dstp;
-        }, width);
-        /* *INDENT-ON* */ /* clang-format on */
-        srcp += srcskip;
-        dstp += dstskip;
-    }
-}
-
 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
 
 /* blend a single 16 bit pixel at 50% */
@@ -1285,6 +1179,15 @@ static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
     }
 }
 
+/* Accurate alpha blending with no division */
+static Uint8 AlphaBlendChannel(Uint8 sC, Uint8 dC, Uint8 sA)
+{
+    Uint16 x = ((sC - dC) * sA) + ((dC << 8) - dC);
+    x += 0x1U; // Use 0x80 to round instead of floor
+    x += x >> 8;
+    return x >> 8;
+}
+
 /* General (slow) N->N blending with pixel alpha */
 static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
 {
@@ -1298,6 +1201,7 @@ static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
     SDL_PixelFormat *dstfmt = info->dst_fmt;
     int srcbpp;
     int dstbpp;
+    int freeFormat;
     Uint32 Pixel;
     unsigned sR, sG, sB, sA;
     unsigned dR, dG, dB, dA;
@@ -1305,6 +1209,7 @@ static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
     /* Set up some basic variables */
     srcbpp = srcfmt->bytes_per_pixel;
     dstbpp = dstfmt->bytes_per_pixel;
+    freeFormat = 0;
 
 #ifdef SDL_AVX2_INTRINSICS
     if (srcbpp == 4 && dstbpp == 4 && width >= 4 && SDL_HasAVX2()) {
@@ -1319,6 +1224,11 @@ static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
 	    return;
     }
 #endif
+    /* Handle case where bad input sent */
+    if (dstfmt->Ashift == 0 && dstfmt->Ashift == dstfmt->Bshift) {
+        dstfmt = SDL_CreatePixelFormat(SDL_PIXELFORMAT_ARGB8888);
+        freeFormat = 1;
+    }
 
     while (height--) {
         /* *INDENT-OFF* */ /* clang-format off */
@@ -1327,7 +1237,10 @@ static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
         DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
         if (sA) {
             DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
-            ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
+            dR = AlphaBlendChannel(sR, dR, sA);
+            dG = AlphaBlendChannel(sG, dG, sA);
+            dB = AlphaBlendChannel(sB, dB, sA);
+            dA = AlphaBlendChannel(255, dA, sA);
             ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
         }
         src += srcbpp;
@@ -1338,6 +1251,9 @@ static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
         src += srcskip;
         dst += dstskip;
     }
+    if (freeFormat) {
+        SDL_DestroyPixelFormat(dstfmt);
+    }
 }
 
 SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface)
@@ -1406,11 +1322,6 @@ SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface)
                         return BlitRGBtoRGBPixelAlphaARMSIMD;
                     }
 #endif
-                    return BlitRGBtoRGBPixelAlpha;
-                }
-            } else if (sf->Rmask == df->Bmask && sf->Gmask == df->Gmask && sf->Bmask == df->Rmask && sf->bytes_per_pixel == 4) {
-                if (sf->Amask == 0xff000000) {
-                    return BlitRGBtoBGRPixelAlpha;
                 }
             }
             return BlitNtoNPixelAlpha;
diff --git a/src/video/SDL_blit_A_avx2.c b/src/video/SDL_blit_A_avx2.c
index ed2bfc1bfe8eb..8f4b3f3561cf4 100644
--- a/src/video/SDL_blit_A_avx2.c
+++ b/src/video/SDL_blit_A_avx2.c
@@ -89,9 +89,13 @@ __m256i SDL_TARGETING("avx2") MixRGBA_AVX2(__m256i src, __m256i dst, const __m25
     dst_hi = _mm256_add_epi16(_mm256_mullo_epi16(_mm256_sub_epi16(src_hi, dst_hi), srca_hi),
                               _mm256_sub_epi16(_mm256_slli_epi16(dst_hi, 8), dst_hi));
 
-    // dst = (dst * 0x8081) >> 23
-    dst_lo = _mm256_srli_epi16(_mm256_mulhi_epu16(dst_lo, _mm256_set1_epi16(-0x7F7F)), 7);
-    dst_hi = _mm256_srli_epi16(_mm256_mulhi_epu16(dst_hi, _mm256_set1_epi16(-0x7F7F)), 7);
+    // dst += 0x1U (use 0x80 to round instead of floor)
+    dst_lo = _mm256_add_epi16(dst_lo, _mm256_set1_epi16(1));
+    dst_hi = _mm256_add_epi16(dst_hi, _mm256_set1_epi16(1));
+
+    // dst += dst >> 8
+    dst_lo = _mm256_srli_epi16(_mm256_add_epi16(dst_lo, _mm256_srli_epi16(dst_lo, 8)), 8);
+    dst_hi = _mm256_srli_epi16(_mm256_add_epi16(dst_hi, _mm256_srli_epi16(dst_hi, 8)), 8);
 
     dst = _mm256_packus_epi16(dst_lo, dst_hi);
     return dst;
diff --git a/src/video/SDL_blit_A_sse4_1.c b/src/video/SDL_blit_A_sse4_1.c
index 34355e8c950a3..e243561d8b6b4 100644
--- a/src/video/SDL_blit_A_sse4_1.c
+++ b/src/video/SDL_blit_A_sse4_1.c
@@ -90,9 +90,13 @@ __m128i SDL_TARGETING("sse4.1") MixRGBA_SSE4_1(__m128i src, __m128i dst,
     dst_hi = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_hi, dst_hi), srca_hi),
                            _mm_sub_epi16(_mm_slli_epi16(dst_hi, 8), dst_hi));
 
-    // dst = (dst * 0x8081) >> 23
-    dst_lo = _mm_srli_epi16(_mm_mulhi_epu16(dst_lo, _mm_set1_epi16(-0x7F7F)), 7);
-    dst_hi = _mm_srli_epi16(_mm_mulhi_epu16(dst_hi, _mm_set1_epi16(-0x7F7F)), 7);
+    // dst += 0x1U (use 0x80 to round instead of floor)
+    dst_lo = _mm_add_epi16(dst_lo, _mm_set1_epi16(1));
+    dst_hi = _mm_add_epi16(dst_hi, _mm_set1_epi16(1));
+
+    // dst += dst >> 8;
+    dst_lo = _mm_srli_epi16(_mm_add_epi16(dst_lo, _mm_srli_epi16(dst_lo, 8)), 8);
+    dst_hi = _mm_srli_epi16(_mm_add_epi16(dst_hi, _mm_srli_epi16(dst_hi, 8)), 8);
 
     dst = _mm_packus_epi16(dst_lo, dst_hi);
     return dst;
diff --git a/test/testautomation_blit.c b/test/testautomation_blit.c
index 9553bbeea1262..dc745a9d8b33b 100644
--- a/test/testautomation_blit.c
+++ b/test/testautomation_blit.c
@@ -98,8 +98,7 @@ Uint32 hashSurfacePixels(SDL_Surface * surface) {
 int blit_testExampleApplicationRender(void *arg) {
     const int width = 32;
     const int height = 32;
-    const unsigned long scalar_hash = 0xf47a3f55;
-    const unsigned long x86_simd_hash = 0xe345d7a7;
+    const unsigned long correct_hash = 0xe345d7a7;
     SDL_Surface* dest_surface = SDL_CreateSurface(width, height, SDL_PIXELFORMAT_ARGB8888);
     SDL_Surface* rainbow_background = SDLTest_ImageBlendingBackground();
     SDL_Surface* gearbrain_sprite = SDLTest_ImageBlendingSprite();
@@ -109,9 +108,8 @@ int blit_testExampleApplicationRender(void *arg) {
     SDL_BlitSurface(gearbrain_sprite, NULL, dest_surface, NULL);
     // Check result
     const unsigned long hash = hashSurfacePixels(dest_surface);
-    SDLTest_AssertCheck(hash == scalar_hash || hash == x86_simd_hash,
-                        "Should render identically, expected 0x%lx (scalar) or 0x%lx (x86_simd), got 0x%lx",
-                        scalar_hash, x86_simd_hash, hash);
+    SDLTest_AssertCheck(hash == correct_hash,
+                        "Should render identically, expected hash 0x%lx, got 0x%lx", correct_hash, hash);
     // Clean up
     SDL_DestroySurface(rainbow_background);
     SDL_DestroySurface(gearbrain_sprite);
@@ -126,8 +124,7 @@ int blit_testExampleApplicationRender(void *arg) {
 int blit_testRandomToRandomSVGA(void *arg) {
     const int width = 800;
     const int height = 600;
-    const unsigned long scalar_hash = 0x1f56efad;
-    const unsigned long x86_simd_hash = 0x42140c5f;
+    const unsigned long correct_hash = 0x42140c5f;
     // Allocate random buffers
     Uint32 *dest_pixels = getNextRandomBuffer(width, height);
     Uint32 *src_pixels = getNextRandomBuffer(width, height);
@@ -138,9 +135,8 @@ int blit_testRandomToRandomSVGA(void *arg) {
     SDL_BlitSurface(src_surface, NULL, dest_surface, NULL);
     // Check result
     const unsigned long hash = hashSurfacePixels(dest_surface);
-    SDLTest_AssertCheck(hash == scalar_hash || hash == x86_simd_hash,
-                        "Should render identically, expected 0x%lx (scalar) or 0x%lx (x86_simd), got 0x%lx",
-                        scalar_hash, x86_simd_hash, hash);
+    SDLTest_AssertCheck(hash == correct_hash,
+                        "Should render identically, expected hash 0x%lx, got 0x%lx", correct_hash, hash);
     // Clean up
     SDL_DestroySurface(dest_surface);
     SDL_DestroySurface(src_surface);
@@ -157,8 +153,7 @@ int blit_testRandomToRandomSVGAMultipleIterations(void *arg) {
     const int width = 800;
     const int height = 600;
     int i;
-    const unsigned long x86_simd_hash = 0x2626be78;
-    const unsigned long scalar_hash = 0xfb2a8ee8;
+    const unsigned long correct_hash = 0x5d26be78;
     // Create blank source surface
     SDL_Surface* dest_surface = SDL_CreateSurface(width, height, SDL_PIXELFORMAT_ABGR8888);
 
@@ -180,11 +175,9 @@ int blit_testRandomToRandomSVGAMultipleIterations(void *arg) {
     // Check result
     const unsigned long hash = hashSurfacePixels(dest_surface);
     // Clean up
-    SDL_SaveBMP(dest_surface, "250k_scalar.bmp");
     SDL_DestroySurface(dest_surface);
-    SDLTest_AssertCheck(hash == scalar_hash || hash == x86_simd_hash,
-                        "Should render identically, expected 0x%lx (scalar) or 0x%lx (x86_simd), got 0x%lx",
-                        scalar_hash, x86_simd_hash, hash);
+    SDLTest_AssertCheck(hash == correct_hash,
+                        "Should render identically, expected hash 0x%lx, got 0x%lx", correct_hash, hash);
     return TEST_COMPLETED;
 }