SDL: Allow for more fine tuning of Duff's device routines (084db)

From 084dbb6410696c1e20f5e6f755421fc666cd1c42 Mon Sep 17 00:00:00 2001
From: Cameron Cawley <[EMAIL REDACTED]>
Date: Sat, 12 Oct 2024 17:20:32 +0100
Subject: [PATCH] Allow for more fine tuning of Duff's device routines

---
 src/video/SDL_blit.h   | 46 +++++++++++++++++++++++++++++++++---------
 src/video/SDL_blit_1.c | 16 +++++++--------
 src/video/SDL_blit_A.c | 22 ++++++++++----------
 src/video/SDL_blit_N.c | 14 ++++++-------
 4 files changed, 62 insertions(+), 36 deletions(-)

diff --git a/src/video/SDL_blit.h b/src/video/SDL_blit.h
index 731ea63753f2e..00c6dd6c345b5 100644
--- a/src/video/SDL_blit.h
+++ b/src/video/SDL_blit.h
@@ -471,6 +471,15 @@ extern SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface);
 #else
 #define USE_DUFFS_LOOP
 #endif
+
+#define DUFFS_LOOP1(pixel_copy_increment, width) \
+    { /* one pixel_copy_increment per pass */    \
+        int n;                                   \
+        for (n = width; n > 0; --n) {            \
+            pixel_copy_increment;                \
+        } /* width <= 0: body never runs */      \
+    }
+
 #ifdef USE_DUFFS_LOOP
 
 /* 8-times unrolled loop */
@@ -527,8 +536,26 @@ extern SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface);
         }                                        \
     }
 
-/* Use the 8-times version of the loop by default */
+/* 2-times unrolled loop (Duff's device); it is a do-while, so width == 0 still runs the body twice */
+#define DUFFS_LOOP2(pixel_copy_increment, width) \
+    {                                            \
+        int n = (width + 1) / 2; /* ceil(w/2) */ \
+        switch (width & 1) {                     \
+        case 0: /* even width */                 \
+            do {                                 \
+                pixel_copy_increment;            \
+                SDL_FALLTHROUGH;                 \
+            case 1: /* odd width enters here */  \
+                pixel_copy_increment;            \
+            } while (--n > 0);                   \
+        }                                        \
+    }
+
+/* Use the 4-times version of the loop by default (heavier per-pixel bodies such as the alpha blitters) */
 #define DUFFS_LOOP(pixel_copy_increment, width) \
+    DUFFS_LOOP4(pixel_copy_increment, width)
+/* Use the 8-times version of the loop for simple routines (plain copies, table lookups, mask and colorkey tests) */
+#define DUFFS_LOOP_TRIVIAL(pixel_copy_increment, width) \
     DUFFS_LOOP8(pixel_copy_increment, width)
 
 /* Special version of Duff's device for even more optimization */
@@ -562,20 +589,19 @@ extern SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface);
 
 /* Don't use Duff's device to unroll loops */
 #define DUFFS_LOOP(pixel_copy_increment, width) \
-    {                                           \
-        int n;                                  \
-        for (n = width; n > 0; --n) {           \
-            pixel_copy_increment;               \
-        }                                       \
-    }
+    DUFFS_LOOP1(pixel_copy_increment, width) /* every variant in this branch maps to the plain DUFFS_LOOP1 loop */
+#define DUFFS_LOOP_TRIVIAL(pixel_copy_increment, width) \
+    DUFFS_LOOP1(pixel_copy_increment, width)
 #define DUFFS_LOOP8(pixel_copy_increment, width) \
-    DUFFS_LOOP(pixel_copy_increment, width)
+    DUFFS_LOOP1(pixel_copy_increment, width)
 #define DUFFS_LOOP4(pixel_copy_increment, width) \
-    DUFFS_LOOP(pixel_copy_increment, width)
+    DUFFS_LOOP1(pixel_copy_increment, width)
+#define DUFFS_LOOP2(pixel_copy_increment, width) \
+    DUFFS_LOOP1(pixel_copy_increment, width)
 #define DUFFS_LOOP_124(pixel_copy_increment1,        \
                        pixel_copy_increment2,        \
                        pixel_copy_increment4, width) \
-    DUFFS_LOOP(pixel_copy_increment1, width)
+    DUFFS_LOOP1(pixel_copy_increment1, width) /* only the 1-pixel body is used; the 2- and 4-pixel bodies are dropped */
 
 #endif /* USE_DUFFS_LOOP */
 
diff --git a/src/video/SDL_blit_1.c b/src/video/SDL_blit_1.c
index 93fdb3ec1bc34..6ec6677ecd346 100644
--- a/src/video/SDL_blit_1.c
+++ b/src/video/SDL_blit_1.c
@@ -50,7 +50,7 @@ static void Blit1to1(SDL_BlitInfo *info)
     while (height--) {
 #ifdef USE_DUFFS_LOOP
         /* *INDENT-OFF* */ /* clang-format off */
-        DUFFS_LOOP(
+        DUFFS_LOOP_TRIVIAL(
             {
               *dst = map[*src];
             }
@@ -102,7 +102,7 @@ static void Blit1to2(SDL_BlitInfo *info)
 #ifdef USE_DUFFS_LOOP
     while (height--) {
         /* *INDENT-OFF* */ /* clang-format off */
-        DUFFS_LOOP(
+        DUFFS_LOOP_TRIVIAL(
         {
             *(Uint16 *)dst = map[*src++];
             dst += 2;
@@ -258,7 +258,7 @@ static void Blit1to4(SDL_BlitInfo *info)
     while (height--) {
 #ifdef USE_DUFFS_LOOP
         /* *INDENT-OFF* */ /* clang-format off */
-        DUFFS_LOOP(
+        DUFFS_LOOP_TRIVIAL(
             *dst++ = map[*src++];
         , width);
         /* *INDENT-ON* */ /* clang-format on */
@@ -299,7 +299,7 @@ static void Blit1to1Key(SDL_BlitInfo *info)
     if (palmap) {
         while (height--) {
             /* *INDENT-OFF* */ /* clang-format off */
-            DUFFS_LOOP(
+            DUFFS_LOOP_TRIVIAL(
             {
                 if ( *src != ckey ) {
                   *dst = palmap[*src];
@@ -315,7 +315,7 @@ static void Blit1to1Key(SDL_BlitInfo *info)
     } else {
         while (height--) {
             /* *INDENT-OFF* */ /* clang-format off */
-            DUFFS_LOOP(
+            DUFFS_LOOP_TRIVIAL(
             {
                 if ( *src != ckey ) {
                   *dst = *src;
@@ -347,7 +347,7 @@ static void Blit1to2Key(SDL_BlitInfo *info)
 
     while (height--) {
         /* *INDENT-OFF* */ /* clang-format off */
-        DUFFS_LOOP(
+        DUFFS_LOOP_TRIVIAL(
         {
             if ( *src != ckey ) {
                 *dstp=palmap[*src];
@@ -410,7 +410,7 @@ static void Blit1to4Key(SDL_BlitInfo *info)
 
     while (height--) {
         /* *INDENT-OFF* */ /* clang-format off */
-        DUFFS_LOOP(
+        DUFFS_LOOP_TRIVIAL(
         {
             if ( *src != ckey ) {
                 *dstp = palmap[*src];
@@ -446,7 +446,7 @@ static void Blit1toNAlpha(SDL_BlitInfo *info)
 
     while (height--) {
         /* *INDENT-OFF* */ /* clang-format off */
-        DUFFS_LOOP4(
+        DUFFS_LOOP(
         {
             sR = srcpal[*src].r;
             sG = srcpal[*src].g;
diff --git a/src/video/SDL_blit_A.c b/src/video/SDL_blit_A.c
index 96fec1b478fc8..4e3d9ed2c5d4b 100644
--- a/src/video/SDL_blit_A.c
+++ b/src/video/SDL_blit_A.c
@@ -47,7 +47,7 @@ static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
 
     while (height--) {
         /* *INDENT-OFF* */ /* clang-format off */
-        DUFFS_LOOP4(
+        DUFFS_LOOP(
         {
         DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
         dR = dstfmt->palette->colors[*dst].r;
@@ -92,7 +92,7 @@ static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
 
     while (height--) {
         /* *INDENT-OFF* */ /* clang-format off */
-        DUFFS_LOOP4(
+        DUFFS_LOOP(
         {
         DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
         dR = dstfmt->palette->colors[*dst].r;
@@ -484,7 +484,7 @@ static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
 
     while (height--) {
         /* *INDENT-OFF* */ /* clang-format off */
-        DUFFS_LOOP4({
+        DUFFS_LOOP({
             Uint32 s = *srcp++;
             Uint32 d = *dstp;
             *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
@@ -516,7 +516,7 @@ static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
 
         while (height--) {
             /* *INDENT-OFF* */ /* clang-format off */
-            DUFFS_LOOP4({
+            DUFFS_LOOP({
                 s = *srcp;
                 d = *dstp;
                 s1 = s & 0xff00ff;
@@ -1148,7 +1148,7 @@ static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
 
         while (height--) {
             /* *INDENT-OFF* */ /* clang-format off */
-            DUFFS_LOOP4({
+            DUFFS_LOOP({
                 Uint32 s = *srcp++;
                 Uint32 d = *dstp;
                 /*
@@ -1186,7 +1186,7 @@ static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
 
         while (height--) {
             /* *INDENT-OFF* */ /* clang-format off */
-            DUFFS_LOOP4({
+            DUFFS_LOOP({
                 Uint32 s = *srcp++;
                 Uint32 d = *dstp;
                 /*
@@ -1219,7 +1219,7 @@ static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
 
     while (height--) {
         /* *INDENT-OFF* */ /* clang-format off */
-        DUFFS_LOOP4({
+        DUFFS_LOOP({
         Uint32 s = *srcp;
         unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
         /* Here we special-case opaque alpha since the
@@ -1262,7 +1262,7 @@ static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
 
     while (height--) {
         /* *INDENT-OFF* */ /* clang-format off */
-        DUFFS_LOOP4({
+        DUFFS_LOOP({
         unsigned alpha;
         Uint32 s = *srcp;
         alpha = s >> 27; /* downscale alpha to 5 bits */
@@ -1315,7 +1315,7 @@ static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
     if (sA) {
         while (height--) {
             /* *INDENT-OFF* */ /* clang-format off */
-        DUFFS_LOOP4(
+        DUFFS_LOOP(
         {
         DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
         DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
@@ -1353,7 +1353,7 @@ static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
 
     while (height--) {
         /* *INDENT-OFF* */ /* clang-format off */
-        DUFFS_LOOP4(
+        DUFFS_LOOP(
         {
         RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
         if (sA && Pixel != ckey) {
@@ -1395,7 +1395,7 @@ static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
 
     while (height--) {
         /* *INDENT-OFF* */ /* clang-format off */
-        DUFFS_LOOP4(
+        DUFFS_LOOP(
         {
         DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
         if (sA) {
diff --git a/src/video/SDL_blit_N.c b/src/video/SDL_blit_N.c
index 123d0aad0395f..1636c17e3e1c3 100644
--- a/src/video/SDL_blit_N.c
+++ b/src/video/SDL_blit_N.c
@@ -2076,7 +2076,7 @@ static void Blit_RGB555_ARGB1555(SDL_BlitInfo *info)
 
     while (height--) {
         /* *INDENT-OFF* */ /* clang-format off */
-        DUFFS_LOOP(
+        DUFFS_LOOP_TRIVIAL(
         {
             *dst = *src | mask;
             ++dst;
@@ -2200,7 +2200,7 @@ static void Blit4to4MaskAlpha(SDL_BlitInfo *info)
 
         while (height--) {
             /* *INDENT-OFF* */ /* clang-format off */
-            DUFFS_LOOP(
+            DUFFS_LOOP_TRIVIAL(
             {
                 *dst = *src | mask;
                 ++dst;
@@ -2217,7 +2217,7 @@ static void Blit4to4MaskAlpha(SDL_BlitInfo *info)
 
         while (height--) {
             /* *INDENT-OFF* */ /* clang-format off */
-            DUFFS_LOOP(
+            DUFFS_LOOP_TRIVIAL(
             {
                 *dst = *src & mask;
                 ++dst;
@@ -2576,7 +2576,7 @@ static void Blit2to2Key(SDL_BlitInfo *info)
 
     while (height--) {
         /* *INDENT-OFF* */ /* clang-format off */
-        DUFFS_LOOP(
+        DUFFS_LOOP_TRIVIAL(
         {
             if ( (*srcp & rgbmask) != ckey ) {
                 *dstp = *srcp;
@@ -2622,7 +2622,7 @@ static void BlitNtoNKey(SDL_BlitInfo *info)
             Uint32 mask = ((Uint32)info->a) << dstfmt->Ashift;
             while (height--) {
                 /* *INDENT-OFF* */ /* clang-format off */
-                DUFFS_LOOP(
+                DUFFS_LOOP_TRIVIAL(
                 {
                     if ((*src32 & rgbmask) != ckey) {
                         *dst32 = *src32 | mask;
@@ -2640,7 +2640,7 @@ static void BlitNtoNKey(SDL_BlitInfo *info)
             Uint32 mask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
             while (height--) {
                 /* *INDENT-OFF* */ /* clang-format off */
-                DUFFS_LOOP(
+                DUFFS_LOOP_TRIVIAL(
                 {
                     if ((*src32 & rgbmask) != ckey) {
                         *dst32 = *src32 & mask;
@@ -2897,7 +2897,7 @@ static void BlitNtoNKeyCopyAlpha(SDL_BlitInfo *info)
             Uint32 *dst32 = (Uint32 *)dst;
             while (height--) {
                 /* *INDENT-OFF* */ /* clang-format off */
-                DUFFS_LOOP(
+                DUFFS_LOOP_TRIVIAL(
                 {
                     if ((*src32 & rgbmask) != ckey) {
                         *dst32 = *src32;