SDL: Added a slow blit function to handle large pixel formats and colorspace conversion

From 3c45544a146b0c6a0e45e4d12f52163d9431532c Mon Sep 17 00:00:00 2001
From: Sam Lantinga <[EMAIL REDACTED]>
Date: Tue, 30 Jan 2024 23:15:20 -0800
Subject: [PATCH] Added a slow blit function to handle large pixel formats and
 colorspace conversion

---
 include/SDL3/SDL_pixels.h |  36 +-
 src/video/SDL_blit.c      |  59 +--
 src/video/SDL_blit.h      |  16 +
 src/video/SDL_blit_N.c    |  30 +-
 src/video/SDL_blit_slow.c | 741 +++++++++++++++++++++++++++++---------
 src/video/SDL_blit_slow.h |   2 +-
 6 files changed, 628 insertions(+), 256 deletions(-)

diff --git a/include/SDL3/SDL_pixels.h b/include/SDL3/SDL_pixels.h
index a2a8bbb89ebd..ecbc129f6df4 100644
--- a/include/SDL3/SDL_pixels.h
+++ b/include/SDL3/SDL_pixels.h
@@ -326,58 +326,58 @@ typedef enum
                                SDL_PACKEDLAYOUT_2101010, 32, 4),
     SDL_PIXELFORMAT_RGB48 =
         SDL_DEFINE_PIXELFORMAT(SDL_PIXELTYPE_ARRAYU16, SDL_ARRAYORDER_RGB, 0,
-                               48, 3),
+                               48, 6),
     SDL_PIXELFORMAT_BGR48 =
         SDL_DEFINE_PIXELFORMAT(SDL_PIXELTYPE_ARRAYU16, SDL_ARRAYORDER_BGR, 0,
-                               48, 3),
+                               48, 6),
     SDL_PIXELFORMAT_RGBA64 =
         SDL_DEFINE_PIXELFORMAT(SDL_PIXELTYPE_ARRAYU16, SDL_ARRAYORDER_RGBA, 0,
-                               64, 4),
+                               64, 8),
     SDL_PIXELFORMAT_ARGB64 =
         SDL_DEFINE_PIXELFORMAT(SDL_PIXELTYPE_ARRAYU16, SDL_ARRAYORDER_ARGB, 0,
-                               64, 4),
+                               64, 8),
     SDL_PIXELFORMAT_BGRA64 =
         SDL_DEFINE_PIXELFORMAT(SDL_PIXELTYPE_ARRAYU16, SDL_ARRAYORDER_BGRA, 0,
-                               64, 4),
+                               64, 8),
     SDL_PIXELFORMAT_ABGR64 =
         SDL_DEFINE_PIXELFORMAT(SDL_PIXELTYPE_ARRAYU16, SDL_ARRAYORDER_ABGR, 0,
-                               64, 4),
+                               64, 8),
     SDL_PIXELFORMAT_RGB48_FLOAT =
         SDL_DEFINE_PIXELFORMAT(SDL_PIXELTYPE_ARRAYF16, SDL_ARRAYORDER_RGB, 0,
-                               48, 3),
+                               48, 6),
     SDL_PIXELFORMAT_BGR48_FLOAT =
         SDL_DEFINE_PIXELFORMAT(SDL_PIXELTYPE_ARRAYF16, SDL_ARRAYORDER_BGR, 0,
-                               48, 3),
+                               48, 6),
     SDL_PIXELFORMAT_RGBA64_FLOAT =
         SDL_DEFINE_PIXELFORMAT(SDL_PIXELTYPE_ARRAYF16, SDL_ARRAYORDER_RGBA, 0,
-                               64, 4),
+                               64, 8),
     SDL_PIXELFORMAT_ARGB64_FLOAT =
         SDL_DEFINE_PIXELFORMAT(SDL_PIXELTYPE_ARRAYF16, SDL_ARRAYORDER_ARGB, 0,
-                               64, 4),
+                               64, 8),
     SDL_PIXELFORMAT_BGRA64_FLOAT =
         SDL_DEFINE_PIXELFORMAT(SDL_PIXELTYPE_ARRAYF16, SDL_ARRAYORDER_BGRA, 0,
-                               64, 4),
+                               64, 8),
     SDL_PIXELFORMAT_ABGR64_FLOAT =
         SDL_DEFINE_PIXELFORMAT(SDL_PIXELTYPE_ARRAYF16, SDL_ARRAYORDER_ABGR, 0,
-                               64, 4),
+                               64, 8),
     SDL_PIXELFORMAT_RGB96_FLOAT =
         SDL_DEFINE_PIXELFORMAT(SDL_PIXELTYPE_ARRAYF32, SDL_ARRAYORDER_RGB, 0,
-                               96, 3),
+                               96, 12),
     SDL_PIXELFORMAT_BGR96_FLOAT =
         SDL_DEFINE_PIXELFORMAT(SDL_PIXELTYPE_ARRAYF32, SDL_ARRAYORDER_BGR, 0,
-                               96, 3),
+                               96, 12),
     SDL_PIXELFORMAT_RGBA128_FLOAT =
         SDL_DEFINE_PIXELFORMAT(SDL_PIXELTYPE_ARRAYF32, SDL_ARRAYORDER_RGBA, 0,
-                               128, 4),
+                               128, 16),
     SDL_PIXELFORMAT_ARGB128_FLOAT =
         SDL_DEFINE_PIXELFORMAT(SDL_PIXELTYPE_ARRAYF32, SDL_ARRAYORDER_ARGB, 0,
-                               128, 4),
+                               128, 16),
     SDL_PIXELFORMAT_BGRA128_FLOAT =
         SDL_DEFINE_PIXELFORMAT(SDL_PIXELTYPE_ARRAYF32, SDL_ARRAYORDER_BGRA, 0,
-                               128, 4),
+                               128, 16),
     SDL_PIXELFORMAT_ABGR128_FLOAT =
         SDL_DEFINE_PIXELFORMAT(SDL_PIXELTYPE_ARRAYF32, SDL_ARRAYORDER_ABGR, 0,
-                               128, 4),
+                               128, 16),
 
     /* Aliases for RGBA byte arrays of color data, for the current platform */
 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
diff --git a/src/video/SDL_blit.c b/src/video/SDL_blit.c
index 78820bf2dda6..07c858583a32 100644
--- a/src/video/SDL_blit.c
+++ b/src/video/SDL_blit.c
@@ -183,18 +183,13 @@ static SDL_BlitFunc SDL_ChooseBlitFunc(Uint32 src_format, Uint32 dst_format, int
 }
 #endif /* SDL_HAVE_BLIT_AUTO */
 
-static SDL_bool IsSurfaceHDR(SDL_Surface *surface)
+static SDL_Colorspace GetSurfaceColorspace(SDL_Surface *surface)
 {
     if (surface->flags & SDL_SURFACE_USES_PROPERTIES) {
         SDL_PropertiesID props = SDL_GetSurfaceProperties(surface);
-        SDL_Colorspace colorspace = SDL_GetNumberProperty(props, SDL_PROP_SURFACE_COLORSPACE_NUMBER, SDL_COLORSPACE_RGB_DEFAULT);
-        SDL_TransferCharacteristics transfer = SDL_COLORSPACETRANSFER(colorspace);
-        if (transfer == SDL_TRANSFER_CHARACTERISTICS_PQ
-            /*|| (colorspace == SDL_COLORSPACE_SCRGB && SDL_BITSPERPIXEL(surface->format->format) > 32*/) {
-            return SDL_TRUE;
-        }
+        return (SDL_Colorspace)SDL_GetNumberProperty(props, SDL_PROP_SURFACE_COLORSPACE_NUMBER, SDL_COLORSPACE_RGB_DEFAULT);
     }
-    return SDL_FALSE;
+    return SDL_COLORSPACE_RGB_DEFAULT;
 }
 
 /* Figure out which of many blit routines to set up on a surface */
@@ -203,8 +198,8 @@ int SDL_CalculateBlit(SDL_Surface *surface)
     SDL_BlitFunc blit = NULL;
     SDL_BlitMap *map = surface->map;
     SDL_Surface *dst = map->dst;
-    SDL_bool src_HDR = IsSurfaceHDR(surface);
-    SDL_bool dst_HDR = IsSurfaceHDR(dst);
+    SDL_Colorspace src_colorspace = GetSurfaceColorspace(surface);
+    SDL_Colorspace dst_colorspace = GetSurfaceColorspace(dst);
 
     /* We don't currently support blitting to < 8 bpp surfaces */
     if (dst->format->BitsPerPixel < 8) {
@@ -237,45 +232,11 @@ int SDL_CalculateBlit(SDL_Surface *surface)
 #endif
 
     /* Choose a standard blit function */
-    if (src_HDR || dst_HDR) {
-        if (src_HDR && dst_HDR) {
-            /* See if they're in the same colorspace and light level */
-            SDL_PropertiesID src_props = SDL_GetSurfaceProperties(surface);
-            SDL_PropertiesID dst_props = SDL_GetSurfaceProperties(dst);
-            if ((SDL_GetNumberProperty(src_props, SDL_PROP_SURFACE_COLORSPACE_NUMBER, SDL_COLORSPACE_RGB_DEFAULT) !=
-                 SDL_GetNumberProperty(dst_props, SDL_PROP_SURFACE_COLORSPACE_NUMBER, SDL_COLORSPACE_RGB_DEFAULT)) ||
-                (SDL_GetNumberProperty(src_props, SDL_PROP_SURFACE_MAXCLL_NUMBER, 0) !=
-                 SDL_GetNumberProperty(dst_props, SDL_PROP_SURFACE_MAXCLL_NUMBER, 0)) ||
-                (SDL_GetNumberProperty(src_props, SDL_PROP_SURFACE_MAXFALL_NUMBER, 0) !=
-                 SDL_GetNumberProperty(dst_props, SDL_PROP_SURFACE_MAXFALL_NUMBER, 0))) {
-                SDL_InvalidateMap(map);
-                return SDL_SetError("Tone mapping between HDR surfaces not supported");
-            }
-
-            /* Fall through to the normal blit calculation (is this correct?) */
-
-        } else if (dst_HDR) {
-            SDL_InvalidateMap(map);
-            return SDL_SetError("Tone mapping from an SDR to an HDR surface not supported");
-        } else {
-            /* Tone mapping from an HDR surface to SDR surface */
-            SDL_PropertiesID src_props = SDL_GetSurfaceProperties(surface);
-            SDL_Colorspace src_colorspace = (SDL_Colorspace)SDL_GetNumberProperty(src_props, SDL_PROP_SURFACE_COLORSPACE_NUMBER, SDL_COLORSPACE_SRGB);
-            SDL_ColorPrimaries src_primaries = SDL_COLORSPACEPRIMARIES(src_colorspace);
-            SDL_PropertiesID dst_props = SDL_GetSurfaceProperties(dst);
-            SDL_Colorspace dst_colorspace = (SDL_Colorspace)SDL_GetNumberProperty(dst_props, SDL_PROP_SURFACE_COLORSPACE_NUMBER, SDL_COLORSPACE_SRGB);
-            SDL_ColorPrimaries dst_primaries = SDL_COLORSPACEPRIMARIES(dst_colorspace);
-            if (SDL_GetColorPrimariesConversionMatrix(src_primaries, dst_primaries) != NULL) {
-                if (SDL_ISPIXELFORMAT_10BIT(surface->format->format)) {
-                    blit = SDL_Blit_Slow_PQtoSDR;
-                } else {
-                    SDL_InvalidateMap(map);
-                    return SDL_SetError("Surface has unknown HDR pixel format");
-                }
-            } else {
-                SDL_InvalidateMap(map);
-                return SDL_SetError("Surface has unknown HDR colorspace");
-            }
+    if (!blit) {
+        if (src_colorspace != dst_colorspace ||
+            surface->format->BytesPerPixel > 4 ||
+            dst->format->BytesPerPixel > 4) {
+            blit = SDL_Blit_Slow_Float;
         }
     }
     if (!blit) {
diff --git a/src/video/SDL_blit.h b/src/video/SDL_blit.h
index b78ecdd8a2de..285a098b8981 100644
--- a/src/video/SDL_blit.h
+++ b/src/video/SDL_blit.h
@@ -262,6 +262,14 @@ extern SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface);
         a = (a * 3) / 255;                             \
         Pixel = (a << 30) | (r << 20) | (g << 10) | b; \
     }
+#define ARGB2101010_FROM_RGBAFLOAT(Pixel, r, g, b, a)  \
+    {                                                  \
+        r = SDL_clamp(r, 0.0f, 1.0f) * 1023.0f;        \
+        g = SDL_clamp(g, 0.0f, 1.0f) * 1023.0f;        \
+        b = SDL_clamp(b, 0.0f, 1.0f) * 1023.0f;        \
+        a = SDL_clamp(a, 0.0f, 1.0f) * 3.0f;           \
+        Pixel = (((Uint32)a) << 30) | (((Uint32)r) << 20) | (((Uint32)g) << 10) | (Uint32)b; \
+    }
 #define ABGR2101010_FROM_RGBA(Pixel, r, g, b, a)       \
     {                                                  \
         r = r ? ((r << 2) | 0x3) : 0;                  \
@@ -270,6 +278,14 @@ extern SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface);
         a = (a * 3) / 255;                             \
         Pixel = (a << 30) | (b << 20) | (g << 10) | r; \
     }
+#define ABGR2101010_FROM_RGBAFLOAT(Pixel, r, g, b, a)  \
+    {                                                  \
+        r = SDL_clamp(r, 0.0f, 1.0f) * 1023.0f;        \
+        g = SDL_clamp(g, 0.0f, 1.0f) * 1023.0f;        \
+        b = SDL_clamp(b, 0.0f, 1.0f) * 1023.0f;        \
+        a = SDL_clamp(a, 0.0f, 1.0f) * 3.0f;           \
+        Pixel = (((Uint32)a) << 30) | (((Uint32)b) << 20) | (((Uint32)g) << 10) | (Uint32)r; \
+    }
 #define ASSEMBLE_RGB(buf, bpp, fmt, r, g, b)        \
     {                                               \
         switch (bpp) {                              \
diff --git a/src/video/SDL_blit_N.c b/src/video/SDL_blit_N.c
index dc2f786c137f..54dc67092106 100644
--- a/src/video/SDL_blit_N.c
+++ b/src/video/SDL_blit_N.c
@@ -3374,22 +3374,24 @@ SDL_BlitFunc SDL_CalculateBlitN(SDL_Surface *surface)
             if (dstfmt->Amask) {
                 a_need = srcfmt->Amask ? COPY_ALPHA : SET_ALPHA;
             }
-            table = normal_blit[srcfmt->BytesPerPixel - 1];
-            for (which = 0; table[which].dstbpp; ++which) {
-                if (MASKOK(srcfmt->Rmask, table[which].srcR) &&
-                    MASKOK(srcfmt->Gmask, table[which].srcG) &&
-                    MASKOK(srcfmt->Bmask, table[which].srcB) &&
-                    MASKOK(dstfmt->Rmask, table[which].dstR) &&
-                    MASKOK(dstfmt->Gmask, table[which].dstG) &&
-                    MASKOK(dstfmt->Bmask, table[which].dstB) &&
-                    dstfmt->BytesPerPixel == table[which].dstbpp &&
-                    (a_need & table[which].alpha) == a_need &&
-                    ((table[which].blit_features & GetBlitFeatures()) ==
-                     table[which].blit_features)) {
-                    break;
+            if (srcfmt->BytesPerPixel <= SDL_arraysize(normal_blit)) {
+                table = normal_blit[srcfmt->BytesPerPixel - 1];
+                for (which = 0; table[which].dstbpp; ++which) {
+                    if (MASKOK(srcfmt->Rmask, table[which].srcR) &&
+                        MASKOK(srcfmt->Gmask, table[which].srcG) &&
+                        MASKOK(srcfmt->Bmask, table[which].srcB) &&
+                        MASKOK(dstfmt->Rmask, table[which].dstR) &&
+                        MASKOK(dstfmt->Gmask, table[which].dstG) &&
+                        MASKOK(dstfmt->Bmask, table[which].dstB) &&
+                        dstfmt->BytesPerPixel == table[which].dstbpp &&
+                        (a_need & table[which].alpha) == a_need &&
+                        ((table[which].blit_features & GetBlitFeatures()) ==
+                         table[which].blit_features)) {
+                        break;
+                    }
                 }
+                blitfun = table[which].blitfunc;
             }
-            blitfun = table[which].blitfunc;
 
             if (blitfun == BlitNtoN) { /* default C fallback catch-all. Slow! */
                 if (srcfmt->format == SDL_PIXELFORMAT_ARGB2101010) {
diff --git a/src/video/SDL_blit_slow.c b/src/video/SDL_blit_slow.c
index 0928c3c4135a..ad91273fd66d 100644
--- a/src/video/SDL_blit_slow.c
+++ b/src/video/SDL_blit_slow.c
@@ -24,19 +24,24 @@
 #include "SDL_blit_slow.h"
 #include "SDL_pixels_c.h"
 
-#define FORMAT_ALPHA                0
-#define FORMAT_NO_ALPHA             (-1)
-#define FORMAT_2101010              1
-#define FORMAT_HAS_ALPHA(format)    format == 0
-#define FORMAT_HAS_NO_ALPHA(format) format < 0
-static int detect_format(SDL_PixelFormat *pf)
+typedef enum
 {
-    if (SDL_ISPIXELFORMAT_10BIT(pf->format)) {
-        return FORMAT_2101010;
+    SlowBlitPixelAccess_RGB,
+    SlowBlitPixelAccess_RGBA,
+    SlowBlitPixelAccess_10Bit,
+    SlowBlitPixelAccess_Large,
+} SlowBlitPixelAccess;
+
+static SlowBlitPixelAccess GetPixelAccessMethod(SDL_PixelFormat *pf)
+{
+    if (pf->BytesPerPixel > 4) {
+        return SlowBlitPixelAccess_Large;
+    } else if (SDL_ISPIXELFORMAT_10BIT(pf->format)) {
+        return SlowBlitPixelAccess_10Bit;
     } else if (pf->Amask) {
-        return FORMAT_ALPHA;
+        return SlowBlitPixelAccess_RGBA;
     } else {
-        return FORMAT_NO_ALPHA;
+        return SlowBlitPixelAccess_RGB;
     }
 }
 
@@ -61,13 +66,13 @@ void SDL_Blit_Slow(SDL_BlitInfo *info)
     SDL_PixelFormat *dst_fmt = info->dst_fmt;
     int srcbpp = src_fmt->BytesPerPixel;
     int dstbpp = dst_fmt->BytesPerPixel;
-    int srcfmt_val;
-    int dstfmt_val;
+    SlowBlitPixelAccess src_access;
+    SlowBlitPixelAccess dst_access;
     Uint32 rgbmask = ~src_fmt->Amask;
     Uint32 ckey = info->colorkey & rgbmask;
 
-    srcfmt_val = detect_format(src_fmt);
-    dstfmt_val = detect_format(dst_fmt);
+    src_access = GetPixelAccessMethod(src_fmt);
+    dst_access = GetPixelAccessMethod(dst_fmt);
 
     incy = ((Uint64)info->src_h << 16) / info->dst_h;
     incx = ((Uint64)info->src_w << 16) / info->dst_w;
@@ -83,13 +88,15 @@ void SDL_Blit_Slow(SDL_BlitInfo *info)
             srcx = posx >> 16;
             src = (info->src + (srcy * info->src_pitch) + (srcx * srcbpp));
 
-            if (FORMAT_HAS_ALPHA(srcfmt_val)) {
-                DISEMBLE_RGBA(src, srcbpp, src_fmt, srcpixel, srcR, srcG, srcB, srcA);
-            } else if (FORMAT_HAS_NO_ALPHA(srcfmt_val)) {
+            switch (src_access) {
+            case SlowBlitPixelAccess_RGB:
                 DISEMBLE_RGB(src, srcbpp, src_fmt, srcpixel, srcR, srcG, srcB);
                 srcA = 0xFF;
-            } else {
-                /* 10-bit pixel format */
+                break;
+            case SlowBlitPixelAccess_RGBA:
+                DISEMBLE_RGBA(src, srcbpp, src_fmt, srcpixel, srcR, srcG, srcB, srcA);
+                break;
+            case SlowBlitPixelAccess_10Bit:
                 srcpixel = *((Uint32 *)(src));
                 switch (src_fmt->format) {
                 case SDL_PIXELFORMAT_XRGB2101010:
@@ -110,6 +117,11 @@ void SDL_Blit_Slow(SDL_BlitInfo *info)
                     srcR = srcG = srcB = srcA = 0;
                     break;
                 }
+                break;
+            case SlowBlitPixelAccess_Large:
+                /* Handled in SDL_Blit_Slow_Float() */
+                srcpixel = srcR = srcG = srcB = srcA = 0;
+                break;
             }
 
             if (flags & SDL_COPY_COLORKEY) {
@@ -125,15 +137,40 @@ void SDL_Blit_Slow(SDL_BlitInfo *info)
                 }
             }
             if ((flags & (SDL_COPY_BLEND | SDL_COPY_ADD | SDL_COPY_MOD | SDL_COPY_MUL))) {
-                if (FORMAT_HAS_ALPHA(dstfmt_val)) {
-                    DISEMBLE_RGBA(dst, dstbpp, dst_fmt, dstpixel, dstR, dstG, dstB, dstA);
-                } else if (FORMAT_HAS_NO_ALPHA(dstfmt_val)) {
+                switch (dst_access) {
+                case SlowBlitPixelAccess_RGB:
                     DISEMBLE_RGB(dst, dstbpp, dst_fmt, dstpixel, dstR, dstG, dstB);
                     dstA = 0xFF;
-                } else {
-                    /* SDL_PIXELFORMAT_ARGB2101010 */
-                    dstpixel = *((Uint32 *) (dst));
-                    RGBA_FROM_ARGB2101010(dstpixel, dstR, dstG, dstB, dstA);
+                    break;
+                case SlowBlitPixelAccess_RGBA:
+                    DISEMBLE_RGBA(dst, dstbpp, dst_fmt, dstpixel, dstR, dstG, dstB, dstA);
+                    break;
+                case SlowBlitPixelAccess_10Bit:
+                    dstpixel = *((Uint32 *)(dst));
+                    switch (dst_fmt->format) {
+                    case SDL_PIXELFORMAT_XRGB2101010:
+                        RGBA_FROM_ARGB2101010(dstpixel, dstR, dstG, dstB, dstA);
+                        dstA = 0xFF;
+                        break;
+                    case SDL_PIXELFORMAT_XBGR2101010:
+                        RGBA_FROM_ABGR2101010(dstpixel, dstR, dstG, dstB, dstA);
+                        dstA = 0xFF;
+                        break;
+                    case SDL_PIXELFORMAT_ARGB2101010:
+                        RGBA_FROM_ARGB2101010(dstpixel, dstR, dstG, dstB, dstA);
+                        break;
+                    case SDL_PIXELFORMAT_ABGR2101010:
+                        RGBA_FROM_ABGR2101010(dstpixel, dstR, dstG, dstB, dstA);
+                        break;
+                    default:
+                        dstR = dstG = dstB = dstA = 0;
+                        break;
+                    }
+                    break;
+                case SlowBlitPixelAccess_Large:
+                    /* Handled in SDL_Blit_Slow_Float() */
+                    dstR = dstG = dstB = dstA = 0;
+                    break;
                 }
             } else {
                 /* don't care */
@@ -203,12 +240,16 @@ void SDL_Blit_Slow(SDL_BlitInfo *info)
                 }
                 break;
             }
-            if (FORMAT_HAS_ALPHA(dstfmt_val)) {
-                ASSEMBLE_RGBA(dst, dstbpp, dst_fmt, dstR, dstG, dstB, dstA);
-            } else if (FORMAT_HAS_NO_ALPHA(dstfmt_val)) {
+
+            switch (dst_access) {
+            case SlowBlitPixelAccess_RGB:
                 ASSEMBLE_RGB(dst, dstbpp, dst_fmt, dstR, dstG, dstB);
-            } else {
-                /* 10-bit pixel format */
+                break;
+            case SlowBlitPixelAccess_RGBA:
+                ASSEMBLE_RGBA(dst, dstbpp, dst_fmt, dstR, dstG, dstB, dstA);
+                break;
+            case SlowBlitPixelAccess_10Bit:
+            {
                 Uint32 pixel;
                 switch (dst_fmt->format) {
                 case SDL_PIXELFORMAT_XRGB2101010:
@@ -228,7 +269,13 @@ void SDL_Blit_Slow(SDL_BlitInfo *info)
                     break;
                 }
                 *(Uint32 *)dst = pixel;
+                break;
             }
+            case SlowBlitPixelAccess_Large:
+                /* Handled in SDL_Blit_Slow_Float() */
+                break;
+            }
+
             posx += incx;
             dst += dstbpp;
         }
@@ -237,69 +284,478 @@ void SDL_Blit_Slow(SDL_BlitInfo *info)
     }
 }
 
-static void MatrixMultiply(float v[3], const float *matrix)
+/* Convert from F16 to float
+ * Public domain implementation from https://gist.github.com/rygorous/2144712
+ */
+typedef union
 {
-    float out[3];
-
-    out[0] = matrix[0 * 3 + 0] * v[0] + matrix[0 * 3 + 1] * v[1] + matrix[0 * 3 + 2] * v[2];
-    out[1] = matrix[1 * 3 + 0] * v[0] + matrix[1 * 3 + 1] * v[1] + matrix[1 * 3 + 2] * v[2];
-    out[2] = matrix[2 * 3 + 0] * v[0] + matrix[2 * 3 + 1] * v[1] + matrix[2 * 3 + 2] * v[2];
-    v[0] = out[0];
-    v[1] = out[1];
-    v[2] = out[2];
+    Uint32 u;
+    float f;
+    struct
+    {
+        Uint32 Mantissa : 23;
+        Uint32 Exponent : 8;
+        Uint32 Sign : 1;
+    } x;
+} FP32;
+
+typedef union
+{
+    Uint16 u;
+    struct
+    {
+        Uint16 Mantissa : 10;
+        Uint16 Exponent : 5;
+        Uint16 Sign : 1;
+    } x;
+} FP16;
+
+static float half_to_float(Uint16 unValue)
+{
+    static const FP32 magic = { (254 - 15) << 23 };      /* float bits of 2^112: rebiases a half exponent (bias 15) to float (bias 127) */
+    static const FP32 was_infnan = { (127 + 16) << 23 }; /* float bits of 2^16: anything this large came from a half Inf/NaN exponent */
+    FP16 h;
+    FP32 o;
+
+    h.u = unValue;
+    o.u = (h.u & 0x7fff) << 13;     /* exponent/mantissa bits, moved into float position */
+    o.f *= magic.f;                 /* exponent adjust */
+    if (o.f >= was_infnan.f)        /* make sure Inf/NaN survive */
+        o.u |= 255 << 23;
+    o.u |= (h.u & 0x8000) << 16;    /* sign bit */
+    return o.f;
+}
 
-static float PQtoNits(float pq)
+/* Convert from float to F16
+ * Public domain implementation from https://stackoverflow.com/questions/76799117/how-to-convert-a-float-to-a-half-type-and-the-other-way-around-in-c
+ */
+static Uint16 float_to_half(float a)
+{
+    Uint32 ia;
+    Uint16 ir;
+
+    SDL_memcpy(&ia, &a, sizeof(ia)); /* copy the raw float bits into an integer */
+
+    ir = (ia >> 16) & 0x8000; /* carry the sign bit over (bit 31 -> bit 15) */
+    if ((ia & 0x7f800000) == 0x7f800000) {
+        if ((ia & 0x7fffffff) == 0x7f800000) {
+            ir |= 0x7c00; /* infinity */
+        } else {
+            ir |= 0x7e00 | ((ia >> (24 - 11)) & 0x1ff); /* NaN, quietened */
+        }
+    } else if ((ia & 0x7f800000) >= 0x33000000) { /* smaller magnitudes skip this branch and stay signed zero */
+        int shift = (int)((ia >> 23) & 0xff) - 127;
+        if (shift > 15) {
+            ir |= 0x7c00; /* infinity */
+        } else {
+            ia = (ia & 0x007fffff) | 0x00800000; /* extract mantissa, restore the implicit leading 1 */
+            if (shift < -14) { /* denormal */
+                ir |= ia >> (-1 - shift);
+                ia = ia << (32 - (-1 - shift)); /* keep the discarded bits for rounding */
+            } else { /* normal */
+                ir |= ia >> (24 - 11);
+                ia = ia << (32 - (24 - 11)); /* keep the discarded bits for rounding */
+                ir = ir + ((14 + shift) << 10);
+            }
+            /* IEEE-754 round to nearest, ties to even */
+            if ((ia > 0x80000000) || ((ia == 0x80000000) && (ir & 1))) {
+                ir++;
+            }
+        }
+    }
+    return ir;
+}
+
+static float scRGBtoNits(float v)
+{
+    return v * 80.0f; /* a linear value of 1.0 is treated as 80 nits */
+}
+
+static float scRGBfromNits(float v)
+{
+    return v / 80.0f; /* inverse of scRGBtoNits(): 80 nits maps back to 1.0 */
+}
+
+static float sRGBtoNits(float v)
+{
+    if (v <= 0.04045f) {
+        v = (v / 12.92f); /* linear segment near black */
+    } else {
+        v = SDL_powf((v + 0.055f) / 1.055f, 2.4f); /* power-law segment */
+    }
+    return scRGBtoNits(v); /* v is now linear; scale it to nits */
+}
+
+static float sRGBfromNits(float v)
+{
+    v = scRGBfromNits(v); /* nits back to a linear value where 80 nits == 1.0 */
+
+    if (v <= 0.0031308f) {
+        v = (v * 12.92f); /* linear segment near black */
+    } else {
+        v = (SDL_powf(v, 1.0f / 2.4f) * 1.055f - 0.055f); /* power-law segment */
+    }
+    return v;
+}
+
+static float PQtoNits(float v)
 {
     const float c1 = 0.8359375f;
     const float c2 = 18.8515625f;
     const float c3 = 18.6875f;
-
     const float oo_m1 = 1.0f / 0.1593017578125f;
     const float oo_m2 = 1.0f / 78.84375f;
 
-    float num = SDL_max(SDL_powf(pq, oo_m2) - c1, 0.0f);
-    float den = c2 - c3 * SDL_powf(pq, oo_m2);
-
+    float num = SDL_max(SDL_powf(v, oo_m2) - c1, 0.0f);
+    float den = c2 - c3 * SDL_powf(v, oo_m2);
     return 10000.0f * SDL_powf(num / den, oo_m1);
 }
 
+static float PQfromNits(float v) /* inverse of PQtoNits(): nits -> PQ-encoded value */
+{
+    const float c1 = 0.8359375f;
+    const float c2 = 18.8515625f;
+    const float c3 = 18.6875f;
+    const float m1 = 0.1593017578125f;
+    const float m2 = 78.84375f;
+
+    float y = SDL_clamp(v / 10000.0f, 0.0f, 1.0f); /* normalize against the 10000-nit scale used by PQtoNits() */
+    float num = c1 + c2 * SDL_powf(y, m1);         /* SDL_powf, not libc pow(): stays in float, matches PQtoNits() */
+    float den = 1.0f + c3 * SDL_powf(y, m1);
+    return SDL_powf(num / den, m2);
+}
 
-/* This isn't really a tone mapping algorithm but it generally works well for HDR -> SDR display */
-static void PQtoSDR(const float *color_primaries_matrix, float floatR, float floatG, float floatB, Uint32 *outR, Uint32 *outG, Uint32 *outB)
+static void ReadFloatPixel(Uint8 *pixels, SlowBlitPixelAccess access, SDL_PixelFormat *fmt, SDL_Colorspace colorspace,
+                           float *outR, float *outG, float *outB, float *outA)
 {
-    float v[3];
-    int i;
+    Uint32 pixel;
+    Uint32 R, G, B, A;
+    float fR = 0.0f, fG = 0.0f, fB = 0.0f, fA = 0.0f;
+    float v[4];
+
+    switch (access) {
+    case SlowBlitPixelAccess_RGB:
+        DISEMBLE_RGB(pixels, fmt->BytesPerPixel, fmt, pixel, R, G, B);
+        fR = (float)R / 255.0f;
+        fG = (float)G / 255.0f;
+        fB = (float)B / 255.0f;
+        fA = 1.0f;
+        break;
+    case SlowBlitPixelAccess_RGBA:
+        DISEMBLE_RGBA(pixels, fmt->BytesPerPixel, fmt, pixel, R, G, B, A);
+        fR = (float)R / 255.0f;
+        fG = (float)G / 255.0f;
+        fB = (float)B / 255.0f;
+        fA = (float)A / 255.0f;
+        break;
+    case SlowBlitPixelAccess_10Bit:
+        pixel = *((Uint32 *)pixels);
+        switch (fmt->format) {
+        case SDL_PIXELFORMAT_XRGB2101010:
+            RGBAFLOAT_FROM_ARGB2101010(pixel, fR, fG, fB, fA);
+            fA = 1.0f;
+            break;
+        case SDL_PIXELFORMAT_XBGR2101010:
+            RGBAFLOAT_FROM_ABGR2101010(pixel, fR, fG, fB, fA);
+            fA = 1.0f;
+            break;
+        case SDL_PIXELFORMAT_ARGB2101010:
+            RGBAFLOAT_FROM_ARGB2101010(pixel, fR, fG, fB, fA);
+            break;
+        case SDL_PIXELFORMAT_ABGR2101010:
+            RGBAFLOAT_FROM_ABGR2101010(pixel, fR, fG, fB, fA);
+            break;
+        default:
+            fR = fG = fB = fA = 0.0f;
+            break;
+        }
+        break;
+    case SlowBlitPixelAccess_Large:
+        switch (SDL_PIXELTYPE(fmt->format)) {
+        case SDL_PIXELTYPE_ARRAYU16:
+            v[0] = (float)(((Uint16 *)pixels)[0]) / SDL_MAX_UINT16;
+            v[1] = (float)(((Uint16 *)pixels)[1]) / SDL_MAX_UINT16;
+            v[2] = (float)(((Uint16 *)pixels)[2]) / SDL_MAX_UINT16;
+            if (fmt->BytesPerPixel == 8) {
+                v[3] = (float)(((Uint16 *)pixels)[3]) / SDL_MAX_UINT16;
+            } else {
+                v[3] = 1.0f;
+            }
+            break;
+        case SDL_PIXELTYPE_ARRAYF16:
+            v[0] = half_to_float(((Uint16 *)pixels)[0]);
+            v[1] = half_to_float(((Uint16 *)pixels)[1]);
+            v[2] = half_to_float(((Uint16 *)pixels)[2]);
+            if (fmt->BytesPerPixel == 8) {
+                v[3] = half_to_float(((Uint16 *)pixels)[3]);
+            } else {
+                v[3] = 1.0f;
+            }
+            break;
+        case SDL_PIXELTYPE_ARRAYF32:
+            v[0] = ((float *)pixels)[0];
+            v[1] = ((float *)pixels)[1];
+            v[2] = ((float *)pixels)[2];
+            if (fmt->BytesPerPixel == 16) {
+                v[3] = ((float *)pixels)[3];
+            } else {
+                v[3] = 1.0f;
+            }
+            break;
+        default:
+            /* Unknown array type */
+            v[0] = v[1] = v[2] = v[3] = 0.0f;
+            break;
+        }
+        switch (SDL_PIXELORDER(fmt->format)) {
+        case SDL_ARRAYORDER_RGB:
+            fR = v[0];
+            fG = v[1];
+            fB = v[2];
+            fA = 1.0f;
+            break;
+        case SDL_ARRAYORDER_RGBA:
+            fR = v[0];
+            fG = v[1];
+            fB = v[2];
+            fA = v[3];
+            break;
+        case SDL_ARRAYORDER_ARGB:
+            fA = v[0];
+            fR = v[1];
+            fG = v[2];
+            fB = v[3];
+            break;
+        case SDL_ARRAYORDER_BGR:
+            fB = v[0];
+            fG = v[1];
+            fR = v[2];
+            fA = 1.0f;
+            break;
+        case SDL_ARRAYORDER_BGRA:
+            fB = v[0];
+            fG = v[1];
+            fR = v[2];
+            fA = v[3];
+            break;
+        case SDL_ARRAYORDER_ABGR:
+            fA = v[0];
+            fB = v[1];
+            fG = v[2];
+            fR = v[3];
+            break;
+        default:
+            /* Unknown array order */
+            fA = fR = fG = fB = 0.0f;
+            break;
+        }
+        break;
+    }
 
-    v[0] = PQtoNits(floatR);
-    v[1] = PQtoNits(floatG);
-    v[2] = PQtoNits(floatB);
+    /* Convert to nits so src and dst are guaranteed to be linear and in the same units */
+    switch (SDL_COLORSPACETRANSFER(colorspace)) {
+    case SDL_TRANSFER_CHARACTERISTICS_SRGB:
+        fR = sRGBtoNits(fR);
+        fG = sRGBtoNits(fG);
+        fB = sRGBtoNits(fB);
+        break;
+    case SDL_TRANSFER_CHARACTERISTICS_PQ:
+        fR = PQtoNits(fR);
+        fG = PQtoNits(fG);
+        fB = PQtoNits(fB);
+        break;
+    case SDL_TRANSFER_CHARACTERISTICS_LINEAR:
+        /* Assuming scRGB for now */
+        fR = scRGBtoNits(fR);
+        fG = scRGBtoNits(fG);
+        fB = scRGBtoNits(fB);
+        break;
+    default:
+        /* Unknown, leave it alone */
+        break;
+    }
+
+    *outR = fR;
+    *outG = fG;
+    *outB = fB;
+    *outA = fA;
+}
 
-    MatrixMultiply(v, color_primaries_matrix);
+static void WriteFloatPixel(Uint8 *pixels, SlowBlitPixelAccess access, SDL_PixelFormat *fmt, SDL_Colorspace colorspace,
+                            float fR, float fG, float fB, float fA)
+{
+    Uint32 R, G, B, A;
+    float v[4];
+
+    /* We converted to nits so src and dst are guaranteed to be linear and in the same units */
+    switch (SDL_COLORSPACETRANSFER(colorspace)) {
+    case SDL_TRANSFER_CHARACTERISTICS_SRGB:
+        fR = sRGBfromNits(fR);
+        fG = sRGBfromNits(fG);
+        fB = sRGBfromNits(fB);
+        break;
+    case SDL_TRANSFER_CHARACTERISTICS_PQ:
+        fR = PQfromNits(fR);
+        fG = PQfromNits(fG);
+        fB = PQfromNits(fB);
+        break;
+    case SDL_TRANSFER_CHARACTERISTICS_LINEAR:
+        /* Assuming scRGB for now */
+        fR = scRGBfromNits(fR);
+        fG = scRGBfromNits(fG);
+        fB = scRGBfromNits(fB);
+        break;
+    default:
+        /* Unknown, leave it alone */
+        break;
+    }
 
-    for (i = 0; i < SDL_arraysize(v); ++i) {
-        v[i] /= 400.0f;
-        v[i] = SDL_clamp(v[i], 0.0f, 1.0f);
-        v[i] = SDL_powf(v[i], 1.0f / 2.2f);
+    switch (access) {
+    case SlowBlitPixelAccess_RGB:
+        R = (Uint8)(SDL_clamp(fR, 0.0f, 1.0f) * 255.0f);
+        G = (Uint8)(SDL_clamp(fG, 0.0f, 1.0f) * 255.0f);
+        B = (Uint8)(SDL_clamp(fB, 0.0f, 1.0f) * 255.0f);
+        ASSEMBLE_RGB(pixels, fmt->BytesPerPixel, fmt, R, G, B);
+        break;
+    case SlowBlitPixelAccess_RGBA:
+        R = (Uint8)(SDL_clamp(fR, 0.0f, 1.0f) * 255.0f);
+        G = (Uint8)(SDL_clamp(fG, 0.0f, 1.0f) * 255.0f);
+        B = (Uint8)(SDL_clamp(fB, 0.0f, 1.0f) * 255.0f);
+        A = (Uint8)(SDL_clamp(fA, 0.0f, 1.0f) * 255.0f);
+        ASSEMBLE_RGBA(pixels, fmt->BytesPerPixel, fmt, R, G, B, A);
+        break;
+    case SlowBlitPixelAccess_10Bit:
+    {
+        Uint32 pixel;
+        switch (fmt->format) {
+        case SDL_PIXELFORMAT_XRGB2101010:
+            fA =

(Patch may be truncated, please check the link at the top of this post.)