SDL: loongarch: add Blit8888to8888PixelAlphaSwizzleLSX opt

From 0b1eb4c84152188e48024cafc220af308d3a8694 Mon Sep 17 00:00:00 2001
From: yuanhecai <[EMAIL REDACTED]>
Date: Tue, 28 Oct 2025 17:59:19 +0800
Subject: [PATCH] loongarch: add Blit8888to8888PixelAlphaSwizzleLSX opt

---
 CMakeLists.txt            |  10 +++-
 include/SDL3/SDL_intrin.h |   6 ++-
 src/video/SDL_blit_A.c    | 102 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 115 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 71d5e3b72cdfc..39cd099d69959 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -932,7 +932,15 @@ if(SDL_ASSEMBLY)
       cmake_pop_check_state()
 
       if(COMPILER_SUPPORTS_LSX AND HAVE_LSXINTRIN_H)
-        set_property(SOURCE "${SDL3_SOURCE_DIR}/src/video/yuv2rgb/yuv_rgb_lsx.c" APPEND PROPERTY COMPILE_OPTIONS "-mlsx")
+        set_property(SOURCE
+            "${SDL3_SOURCE_DIR}/src/video/yuv2rgb/yuv_rgb_lsx.c"
+            "${SDL3_SOURCE_DIR}/src/video/SDL_blit_A.c"
+            APPEND PROPERTY COMPILE_OPTIONS "-mlsx")
+
+        set_property(SOURCE
+            "${SDL3_SOURCE_DIR}/src/video/yuv2rgb/yuv_rgb_lsx.c"
+            "${SDL3_SOURCE_DIR}/src/video/SDL_blit_A.c"
+            PROPERTY SKIP_PRECOMPILE_HEADERS 1)
         set(HAVE_LSX TRUE)
       endif()
     endif()
diff --git a/include/SDL3/SDL_intrin.h b/include/SDL3/SDL_intrin.h
index ab2dfeeee5846..802c1953b0713 100644
--- a/include/SDL3/SDL_intrin.h
+++ b/include/SDL3/SDL_intrin.h
@@ -281,12 +281,14 @@ _m_prefetch(void *__P)
  * \sa SDL_TARGETING
  */
 #define SDL_HAS_TARGET_ATTRIBS
-
+#elif defined(__loongarch64) && defined(__GNUC__) && (__GNUC__ >= 15)
+/* LoongArch requires GCC 15+ for target attribute support */
+# define SDL_HAS_TARGET_ATTRIBS
 #elif defined(__clang__) && defined(__has_attribute)
 # if __has_attribute(target)
 # define SDL_HAS_TARGET_ATTRIBS
 # endif
-#elif defined(__GNUC__) && (__GNUC__ + (__GNUC_MINOR__ >= 9) > 4) /* gcc >= 4.9 */
+#elif defined(__GNUC__) && !defined(__loongarch64) && (__GNUC__ + (__GNUC_MINOR__ >= 9) > 4) /* gcc >= 4.9 */
 # define SDL_HAS_TARGET_ATTRIBS
 #elif defined(__ICC) && __ICC >= 1600
 # define SDL_HAS_TARGET_ATTRIBS
diff --git a/src/video/SDL_blit_A.c b/src/video/SDL_blit_A.c
index fc5ad1d737002..965b0b2d21f47 100644
--- a/src/video/SDL_blit_A.c
+++ b/src/video/SDL_blit_A.c
@@ -242,6 +242,103 @@ static void SDL_TARGETING("sse2") Blit888to888SurfaceAlphaSSE2(SDL_BlitInfo *inf
 
 #endif
 
+#ifdef SDL_LSX_INTRINSICS
+// LSX per-pixel alpha blend of a 32-bit 8888 src onto a 32-bit 8888 dst whose channel order may differ: 4 pixels per vector step, scalar tail for the rest.
+static void SDL_TARGETING("lsx") Blit8888to8888PixelAlphaSwizzleLSX(SDL_BlitInfo *info)
+{
+    int width = info->dst_w;
+    int height = info->dst_h;
+    Uint8 *src = info->src;
+    int srcskip = info->src_skip; // added to src after each row
+    Uint8 *dst = info->dst;
+    int dstskip = info->dst_skip; // added to dst after each row
+    const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
+    const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;
+    bool fill_alpha = !dstfmt->Amask; // dst format has no alpha mask: OR dstAmask into every written pixel
+    Uint32 dstAmask, dstAshift;
+    const Uint8 offsets[] = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
+    // Helper is defined elsewhere; only dstAmask is used below (presumably non-zero even when dstfmt->Amask is 0 -- confirm)
+    SDL_Get8888AlphaMaskAndShift(dstfmt, &dstAmask, &dstAshift);
+    // Loop-invariant constants for the blend arithmetic in the vector loop
+    const __m128i const_0xff00 = __lsx_vreplgr2vr_h(0xff00); // xor mask that inverts the high byte of each 16-bit lane
+    const __m128i const_128 = __lsx_vreplgr2vr_b((Uint8)128); // bias subtracted from src/dst bytes before the multiplies
+    const __m128i const_32641 = __lsx_vreplgr2vr_h(32641); // 32641 == 1 + 128*255
+    const __m128i const_257 = __lsx_vreplgr2vr_h(257); // multiplier for the (x * 257) >> 16 step
+
+    // The byte offsets for the start of each pixel
+    const __m128i mask_offsets = __lsx_vld(offsets, 0);
+    // Shuffle indices for format conversion: the byte at each dst channel position holds the src byte index of that channel
+    const __m128i convert_mask = __lsx_vadd_w(
+        __lsx_vreplgr2vr_w(
+            ((srcfmt->Rshift >> 3) << dstfmt->Rshift) |
+            ((srcfmt->Gshift >> 3) << dstfmt->Gshift) |
+            ((srcfmt->Bshift >> 3) << dstfmt->Bshift)),
+        mask_offsets);
+    // alpha_splat_mask: index of each pixel's src alpha byte, repeated for all 4 bytes of that pixel
+    const __m128i alpha_splat_mask = __lsx_vadd_b(__lsx_vreplgr2vr_b(srcfmt->Ashift >> 3), mask_offsets);
+    const __m128i alpha_fill_mask = __lsx_vreplgr2vr_w((int)dstAmask); // dstAmask replicated into every pixel
+
+    while (height--) { // one row per iteration
+        int i = 0;
+        // Vector path: 4 pixels (16 bytes) per iteration
+        for (; i + 4 <= width; i += 4) {
+            __m128i src128 = __lsx_vld(src, 0);
+            __m128i dst128 = __lsx_vld(dst, 0);
+            // Splat each pixel's alpha across its 4 bytes, then reorder the src channels into dst layout
+            __m128i srcA = __lsx_vshuf_b(src128, src128, alpha_splat_mask);
+            src128 = __lsx_vshuf_b(src128, src128, convert_mask);
+            // Set every dstAmask bit in the converted src pixels
+            src128 = __lsx_vor_v(src128, alpha_fill_mask);
+            // Duplicate each alpha byte into both bytes of a 16-bit lane (low and high halves of the vector)
+            __m128i srca_lo = __lsx_vilvl_b(srcA, srcA);
+            __m128i srca_hi = __lsx_vilvh_b(srcA, srcA);
+            // Turn the high byte of each lane into 255 - srcA (x ^ 0xff == 255 - x for a byte)
+            srca_lo = __lsx_vxor_v(srca_lo, const_0xff00);
+            srca_hi = __lsx_vxor_v(srca_hi, const_0xff00);
+            // Subtract 128 from every byte so the values fit the signed operand of the unsigned*signed multiplies below
+            src128 = __lsx_vsub_b(src128, const_128);
+            dst128 = __lsx_vsub_b(dst128, const_128);
+            // Per channel: srcA*(src-128) + (255-srcA)*(dst-128), as even + odd widening products (assumes LSX interleave puts src in even bytes -- confirm)
+            __m128i tmp = __lsx_vilvl_b(dst128, src128);
+            __m128i dst_lo = __lsx_vsadd_h(__lsx_vmulwev_h_bu_b(srca_lo, tmp), __lsx_vmulwod_h_bu_b(srca_lo, tmp));
+            tmp = __lsx_vilvh_b(dst128, src128);
+            __m128i dst_hi = __lsx_vsadd_h(__lsx_vmulwev_h_bu_b(srca_hi, tmp), __lsx_vmulwod_h_bu_b(srca_hi, tmp));
+            // Add 1 + 128*255: undoes the -128 bias (srcA + (255-srcA) == 255) and adds 1 before the divide step
+            dst_lo = __lsx_vadd_h(dst_lo, const_32641);
+            dst_hi = __lsx_vadd_h(dst_hi, const_32641);
+            // Keep the high 16 bits of x*257, i.e. (x * 257) >> 16, as the divide-by-255 step
+            dst_lo = __lsx_vmuh_hu(dst_lo, const_257);
+            dst_hi = __lsx_vmuh_hu(dst_hi, const_257);
+            // Narrow both halves back to bytes (shift amount 0) into one vector, then store the 4 blended pixels
+            dst128 = __lsx_vssrarni_bu_h(dst_hi, dst_lo, 0);
+            if (fill_alpha) {
+                dst128 = __lsx_vor_v(dst128, alpha_fill_mask);
+            }
+            __lsx_vst(dst128, dst, 0);
+
+            src += 16;
+            dst += 16;
+        }
+        // Scalar tail: the remaining (fewer than 4) pixels of the row, blended via ALPHA_BLEND_SWIZZLE_8888
+        for (; i < width; ++i) {
+            Uint32 src32 = *(Uint32 *)src;
+            Uint32 dst32 = *(Uint32 *)dst;
+            ALPHA_BLEND_SWIZZLE_8888(src32, dst32, srcfmt, dstfmt);
+            if (fill_alpha) {
+                dst32 |= dstAmask;
+            }
+            *(Uint32 *)dst = dst32;
+            src += 4;
+            dst += 4;
+        }
+        // Advance both pointers past the per-row skip to the next row
+        src += srcskip;
+        dst += dstskip;
+    }
+}
+
+#endif
+
 // fast RGB888->(A)RGB888 blending with surface alpha=128 special case
 static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
 {
@@ -1402,6 +1499,11 @@ SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface)
                     return Blit8888to8888PixelAlphaSwizzleSSE41;
                 }
 #endif
+#ifdef SDL_LSX_INTRINSICS
+                if (SDL_HasLSX()) {
+                    return Blit8888to8888PixelAlphaSwizzleLSX;
+                }
+#endif
 #if defined(SDL_NEON_INTRINSICS) && (__ARM_ARCH >= 8)
                 // To prevent "unused function" compiler warnings/errors
                 (void)Blit8888to8888PixelAlpha;