SDL: Fixed bug #6990: fix computation of alpha in BlitRGBtoRGBPixelAlphaMMX (13ab1)

From 13ab10031790f9aec74f8501be1f34e99abf6e5c Mon Sep 17 00:00:00 2001
From: Sylvain <[EMAIL REDACTED]>
Date: Fri, 6 Jan 2023 13:27:53 +0100
Subject: [PATCH] Fixed bug #6990: fix computation of alpha in
 BlitRGBtoRGBPixelAlphaMMX backport from SDL2 branch

---
 src/video/SDL_blit_A.c | 41 +++++++++++++++++++++++++++++++++++------
 1 file changed, 35 insertions(+), 6 deletions(-)

diff --git a/src/video/SDL_blit_A.c b/src/video/SDL_blit_A.c
index d06732170bde..fcaa9a7a096d 100644
--- a/src/video/SDL_blit_A.c
+++ b/src/video/SDL_blit_A.c
@@ -332,9 +332,19 @@ static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
     Uint32 ashift = sf->Ashift;
     Uint64 multmask, multmask2;
 
-    __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
+    __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2, mm_one_alpha;
 
     mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
+    if (amask == 0xFF000000) { /* 1 in the alpha channel -> mm_one_alpha */
+        mm_one_alpha = _mm_set_pi16(1, 0, 0, 0);
+    } else if (amask == 0x00FF0000) {
+        mm_one_alpha = _mm_set_pi16(0, 1, 0, 0);
+    } else if (amask == 0x0000FF00) {
+        mm_one_alpha = _mm_set_pi16(0, 0, 1, 0);
+    } else {
+        mm_one_alpha = _mm_set_pi16(0, 0, 0, 1);
+    }
+
     multmask = 0x00FF;
     multmask <<= (ashift * 2);
     multmask2 = 0x00FF00FF00FF00FFULL;
@@ -361,14 +371,33 @@ static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
             mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask);    /* 0F0A0A0A -> mm_alpha */
             mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2);    /* 255 - mm_alpha -> mm_alpha */
 
-            /* blend */            
+            /*
+                Alpha blending is:
+                    dstRGB = (srcRGB * srcA) + (dstRGB * (1-srcA))
+                    dstA = srcA + (dstA * (1-srcA)) *
+
+                Here, 'src1' is:
+                    srcRGB * srcA
+                    srcA
+                And 'dst1' is:
+                    dstRGB * (1-srcA)
+                    dstA * (1-srcA)
+                so that *dstp is 'src1 + dst1'
+
+                src1 is computed using mullo_pi16: (X * mask) >> 8, but is approximate for srcA ((srcA * 255) >> 8).
+
+                need to a 1 to get an exact result: (srcA * 256) >> 8 == srcA
+             */
+            mm_alpha = _mm_add_pi16(mm_alpha, mm_one_alpha);
+
+            /* blend */
             src1 = _mm_mullo_pi16(src1, mm_alpha);
             src1 = _mm_srli_pi16(src1, 8);
             dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
             dst1 = _mm_srli_pi16(dst1, 8);
             dst1 = _mm_add_pi16(src1, dst1);
             dst1 = _mm_packs_pu16(dst1, mm_zero);
-            
+
             *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
         }
         ++srcp;
@@ -952,7 +981,7 @@ static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
                 dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
 
                 mm_res = dst2; /* RED -> mm_res */
-                
+
                 /* green -- process the bits in place */
                 src2 = src1;
                 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
@@ -1094,7 +1123,7 @@ static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
            compositioning used (>>8 instead of /255) doesn't handle
            it correctly. Also special-case alpha=0 for speed?
            Benchmark this! */
-        if (alpha) {   
+        if (alpha) {
           if (alpha == (SDL_ALPHA_OPAQUE >> 3)) {
             *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
           } else {
@@ -1140,7 +1169,7 @@ static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
            compositioning used (>>8 instead of /255) doesn't handle
            it correctly. Also special-case alpha=0 for speed?
            Benchmark this! */
-        if (alpha) {   
+        if (alpha) {
           if (alpha == (SDL_ALPHA_OPAQUE >> 3)) {
             *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
           } else {