From 0f175891a608108a458890be6194358e0234f09f Mon Sep 17 00:00:00 2001
From: Gabriel Wang <[EMAIL REDACTED]>
Date: Thu, 14 May 2026 14:37:46 +0800
Subject: [PATCH] Add SVE2 SIMD Alpha-Blending Blitter (#15504)
SVE/SVE2 is a new SIMD extension for AArch64. Compared to NEON, SVE/SVE2 brings the following benefits that are good for SDL projects:
- Lane predication: we don't have to treat the tail part of a stride separately when the width is not a multiple of the hardware vector size
- Although the performance is almost no different from NEON when the hardware vector size is 128 bits, when the hardware provides a longer vector size (e.g. 256, 512, ... 2048 bits), we can enjoy a large performance gain without modifying the source code or recompiling the library.
The functional correctness is validated in a dedicated [qemu project](https://github.com/GorgonMeducer/aarch64_qemu_mac_template/tree/SDL-SVE2-Acceleration-Validation).
The performance was tested on a [Radxa Orion 6 N](https://radxa.com/products/orion/o6n/), which provides 4x A720 and 4x A520 processors. Since its vector size is 128 bits, the same as NEON, the performance is almost the same as (and no worse than) the NEON acceleration.
---
Android.mk | 1 +
CMakeLists.txt | 36 +
include/SDL3/SDL_cpuinfo.h | 12 +
include/SDL3/SDL_intrin.h | 28 +
include/build_config/SDL_build_config.h.cmake | 1 +
include/build_config/SDL_build_config_ios.h | 3 +
src/cpuinfo/SDL_cpuinfo.c | 33 +
src/dynapi/SDL_dynapi.exports | 1 +
src/dynapi/SDL_dynapi.sym | 1 +
src/dynapi/SDL_dynapi_overrides.h | 1 +
src/dynapi/SDL_dynapi_procs.h | 1 +
src/video/SDL_blit_A.c | 28 +
src/video/SDL_blit_N.c | 21 +
src/video/arm/SDL_sve2_blit_A.c | 89 +
src/video/arm/SDL_sve2_blit_A.h | 37 +
src/video/arm/SDL_sve2_blit_N.c | 64 +
src/video/arm/SDL_sve2_blit_N.h | 35 +
src/video/arm/SDL_sve2_extension.h | 1142 ++++++++
src/video/arm/SDL_sve2_swizzle.h | 2375 +++++++++++++++++
src/video/arm/SDL_sve2_util.h | 206 ++
test/testplatform.c | 1 +
21 files changed, 4116 insertions(+)
create mode 100644 src/video/arm/SDL_sve2_blit_A.c
create mode 100644 src/video/arm/SDL_sve2_blit_A.h
create mode 100644 src/video/arm/SDL_sve2_blit_N.c
create mode 100644 src/video/arm/SDL_sve2_blit_N.h
create mode 100644 src/video/arm/SDL_sve2_extension.h
create mode 100644 src/video/arm/SDL_sve2_swizzle.h
create mode 100644 src/video/arm/SDL_sve2_util.h
diff --git a/Android.mk b/Android.mk
index 2e3b11483c75e..d53bf403b1611 100644
--- a/Android.mk
+++ b/Android.mk
@@ -84,6 +84,7 @@ LOCAL_SRC_FILES := \
$(wildcard $(LOCAL_PATH)/src/tray/*.c) \
$(wildcard $(LOCAL_PATH)/src/video/*.c) \
$(wildcard $(LOCAL_PATH)/src/video/android/*.c) \
+ $(wildcard $(LOCAL_PATH)/src/video/arm/*.c) \
$(wildcard $(LOCAL_PATH)/src/video/yuv2rgb/*.c))
LOCAL_CFLAGS += -DGL_GLEXT_PROTOTYPES
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e591a011c1a72..851e11add9e85 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -314,6 +314,7 @@ dep_option(SDL_SSE4_2 "Use SSE4.2 assembly routines" ON "SDL_ASSEMB
dep_option(SDL_MMX "Use MMX assembly routines" ON "SDL_ASSEMBLY;SDL_CPU_X86 OR SDL_CPU_X64" OFF)
dep_option(SDL_ALTIVEC "Use Altivec assembly routines" ON "SDL_ASSEMBLY;SDL_CPU_POWERPC32 OR SDL_CPU_POWERPC64" OFF)
dep_option(SDL_ARMNEON "Use NEON assembly routines" ON "SDL_ASSEMBLY;SDL_CPU_ARM32 OR SDL_CPU_ARM64" OFF)
+dep_option(SDL_ARMSVE2 "Use SVE2 assembly routines" ON "SDL_ASSEMBLY;SDL_CPU_ARM64" OFF)
dep_option(SDL_LSX "Use LSX assembly routines" ON "SDL_ASSEMBLY;SDL_CPU_LOONGARCH64" OFF)
dep_option(SDL_LASX "Use LASX assembly routines" ON "SDL_ASSEMBLY;SDL_CPU_LOONGARCH64" OFF)
@@ -939,6 +940,37 @@ if(SDL_ASSEMBLY)
endif()
endif()
+ if(SDL_ARMSVE2) # probe the compiler only when the SDL_ARMSVE2 option is enabled
+ cmake_push_check_state() # keep the CMAKE_REQUIRED_FLAGS change below local to this probe
+ string(APPEND CMAKE_REQUIRED_FLAGS " -march=armv8-a+sve2")
+ check_arm_source_compiles([==[
+ #include <arm_sve.h>
+ svuint32_t sve2_test(svuint32_t a, svuint32_t b) {
+ return svadd_u32_x(svptrue_b32(), a, b);
+ }
+ int main(int argc, char *argv[]) {
+ sve2_test(svdup_u32(0), svdup_u32(0));
+ return 0;
+ }]==] COMPILER_SUPPORTS_ARMSVE2)
+ if(COMPILER_SUPPORTS_ARMSVE2)
+ set(HAVE_ARMSVE2 TRUE)
+ endif()
+ cmake_pop_check_state()
+
+ if(HAVE_ARMSVE2) # add the SVE2 blitter sources only when the probe above compiled
+ sdl_sources(
+ "${SDL3_SOURCE_DIR}/src/video/arm/SDL_sve2_blit_A.c"
+ "${SDL3_SOURCE_DIR}/src/video/arm/SDL_sve2_blit_N.c"
+ )
+ set_source_files_properties(
+ "${SDL3_SOURCE_DIR}/src/video/arm/SDL_sve2_blit_A.c"
+ "${SDL3_SOURCE_DIR}/src/video/arm/SDL_sve2_blit_N.c"
+ PROPERTIES
+ SKIP_PRECOMPILE_HEADERS ON # these two files are built without the precompiled header
+ )
+ endif()
+ endif()
+
if(USE_GCC OR USE_CLANG)
# TODO: Those all seem to be quite GCC specific - needs to be
# reworked for better compiler support
@@ -1055,6 +1087,10 @@ if(NOT HAVE_ARMNEON)
set(SDL_DISABLE_NEON 1)
endif()
+if(NOT HAVE_ARMSVE2)
+ set(SDL_DISABLE_SVE2 1)
+endif()
+
set(SDL_DISABLE_ALLOCA 0)
check_include_file("alloca.h" "HAVE_ALLOCA_H")
if(MSVC)
diff --git a/include/SDL3/SDL_cpuinfo.h b/include/SDL3/SDL_cpuinfo.h
index 5669c2373d9f4..765cadf287bc1 100644
--- a/include/SDL3/SDL_cpuinfo.h
+++ b/include/SDL3/SDL_cpuinfo.h
@@ -281,6 +281,18 @@ extern SDL_DECLSPEC bool SDLCALL SDL_HasARMSIMD(void);
*/
extern SDL_DECLSPEC bool SDLCALL SDL_HasNEON(void);
+/**
+ * Determine whether the CPU has SVE2 (Scalable Vector Extension 2).
+ *
+ * This is only relevant on ARM64 Linux and Android. On other platforms it
+ * always returns false.
+ *
+ * \returns true if the CPU has SVE2, false otherwise.
+ *
+ * \since This function is available since SDL 3.6.0.
+ */
+extern SDL_DECLSPEC bool SDLCALL SDL_HasSVE2(void);
+
/**
* Determine whether the CPU has LSX (LOONGARCH SIMD) features.
*
diff --git a/include/SDL3/SDL_intrin.h b/include/SDL3/SDL_intrin.h
index a2e968080cf6e..ecd8192941b18 100644
--- a/include/SDL3/SDL_intrin.h
+++ b/include/SDL3/SDL_intrin.h
@@ -85,6 +85,16 @@
*/
#define SDL_NEON_INTRINSICS 1
+/**
+ * Defined if (and only if) the compiler supports ARM SVE2 intrinsics.
+ *
+ * If this macro is defined, SDL will have already included `<arm_sve.h>`
+ * as appropriate.
+ *
+ * \since This macro is available since SDL 3.6.0.
+ */
+#define SDL_SVE2_INTRINSICS 1
+
/**
* Defined if (and only if) the compiler supports PowerPC Altivec intrinsics.
*
@@ -237,6 +247,10 @@ _m_prefetch(void *__P)
# define SDL_NEON_INTRINSICS 1
# include <arm_neon.h>
#endif
+#if defined(__ARM_FEATURE_SVE2) && !defined(SDL_DISABLE_SVE2)
+# define SDL_SVE2_INTRINSICS 1
+# include <arm_sve.h>
+#endif
#else
/* altivec.h redefining bool causes a number of problems, see bugs 3993 and 4392, so you need to explicitly define SDL_ENABLE_ALTIVEC to have it included. */
@@ -265,6 +279,20 @@ _m_prefetch(void *__P)
# endif
# endif
#endif
+#ifndef SDL_DISABLE_SVE2
+#  if defined(SDL_PLATFORM_WINDOWS)
+/* Visual Studio doesn't define __ARM_ARCH, but _M_ARM (if set, always 7), and _M_ARM64 (if set, always 1). */
+#    if defined (_M_ARM64) && 0 /* Only remove this 0 once MSVC officially releases SVE2 support. */
+#      define SDL_SVE2_INTRINSICS 1
+#      include <arm_sve.h>
+#      define __ARM_FEATURE_SVE2 1 /* Set __ARM_FEATURE_SVE2 so that it can be used elsewhere, at compile time */
+#      define __ARM_ARCH 8
+#    endif
+#  elif defined(__aarch64__) && !defined(SDL_PLATFORM_MACOS) /* <arm_sve.h> is AArch64-only, so never include it on other targets; Apple has no AArch64 device supporting SVE2 */
+#    define SDL_SVE2_INTRINSICS 1
+#    include <arm_sve.h>
+#  endif
+#endif
#endif /* compiler version */
#ifdef SDL_WIKI_DOCUMENTATION_SECTION
diff --git a/include/build_config/SDL_build_config.h.cmake b/include/build_config/SDL_build_config.h.cmake
index e7d0b34f42b3d..2e0cdc21b4a6a 100644
--- a/include/build_config/SDL_build_config.h.cmake
+++ b/include/build_config/SDL_build_config.h.cmake
@@ -625,6 +625,7 @@ typedef unsigned int uintptr_t;
#cmakedefine SDL_DISABLE_LSX 1
#cmakedefine SDL_DISABLE_LASX 1
#cmakedefine SDL_DISABLE_NEON 1
+#cmakedefine SDL_DISABLE_SVE2 1
#ifdef SDL_PLATFORM_PRIVATE
#include "SDL_end_config_private.h"
diff --git a/include/build_config/SDL_build_config_ios.h b/include/build_config/SDL_build_config_ios.h
index 308270b5a05da..56f17f8b8ff65 100644
--- a/include/build_config/SDL_build_config_ios.h
+++ b/include/build_config/SDL_build_config_ios.h
@@ -226,4 +226,7 @@
/* Enable tray subsystem */
#define SDL_TRAY_DUMMY 1
+/* Disable ARM SVE2 intrinsics until we confirm they're available on all Apple mobile and TV hardware */
+#define SDL_DISABLE_SVE2 1
+
#endif /* SDL_build_config_ios_h_ */
diff --git a/src/cpuinfo/SDL_cpuinfo.c b/src/cpuinfo/SDL_cpuinfo.c
index 966a5ae79ac5c..19daae4421342 100644
--- a/src/cpuinfo/SDL_cpuinfo.c
+++ b/src/cpuinfo/SDL_cpuinfo.c
@@ -109,6 +109,7 @@
#define CPU_HAS_ARM_SIMD (1 << 11)
#define CPU_HAS_LSX (1 << 12)
#define CPU_HAS_LASX (1 << 13)
+#define CPU_HAS_SVE2 (1 << 14)
#define CPU_CFG2 0x2
#define CPU_CFG2_LSX (1 << 6)
@@ -514,6 +515,27 @@ static int CPU_haveNEON(void)
#endif
}
+#ifndef AT_HWCAP2
+#define AT_HWCAP2 26
+#endif
+#ifndef HWCAP_SVE
+#define HWCAP_SVE (1 << 22)
+#endif
+#ifndef HWCAP2_SVE2
+#define HWCAP2_SVE2 (1 << 1)
+#endif
+
+static int CPU_haveSVE2(void)
+{
+#if defined(__aarch64__) && \
+    ((defined(SDL_PLATFORM_LINUX) && defined(HAVE_GETAUXVAL)) || defined(SDL_PLATFORM_ANDROID))
+    /* Report SVE2 only when the auxiliary vector advertises both the base SVE bit and the SVE2 bit. */
+    return ((getauxval(AT_HWCAP) & HWCAP_SVE) == HWCAP_SVE) && ((getauxval(AT_HWCAP2) & HWCAP2_SVE2) == HWCAP2_SVE2);
+#else
+    return 0;
+#endif
+}
+
static int CPU_readCPUCFG(void)
{
uint32_t cfg2 = 0;
@@ -960,6 +982,8 @@ static Uint32 SDLCALL SDL_CPUFeatureMaskFromHint(void)
spot_mask = CPU_HAS_LSX;
} else if (ref_string_equals("lasx", spot, end)) {
spot_mask = CPU_HAS_LASX;
+ } else if (ref_string_equals("sve2", spot, end)) {
+ spot_mask = CPU_HAS_SVE2;
} else {
// Ignore unknown/incorrect cpu feature(s)
continue;
@@ -1036,6 +1060,10 @@ static Uint32 SDL_GetCPUFeatures(void)
SDL_CPUFeatures |= CPU_HAS_LASX;
SDL_SIMDAlignment = SDL_max(SDL_SIMDAlignment, 32);
}
+ if (CPU_haveSVE2()) {
+ SDL_CPUFeatures |= CPU_HAS_SVE2;
+ SDL_SIMDAlignment = SDL_max(SDL_SIMDAlignment, 16);
+ }
SDL_CPUFeatures &= SDL_CPUFeatureMaskFromHint();
}
return SDL_CPUFeatures;
@@ -1117,6 +1145,11 @@ bool SDL_HasLASX(void)
return CPU_FEATURE_AVAILABLE(CPU_HAS_LASX);
}
+bool SDL_HasSVE2(void)
+{
+ return CPU_FEATURE_AVAILABLE(CPU_HAS_SVE2);
+}
+
static int SDL_SystemRAM = 0;
int SDL_GetSystemRAM(void)
diff --git a/src/dynapi/SDL_dynapi.exports b/src/dynapi/SDL_dynapi.exports
index 67600f2b7bdb6..32e9fbff861c1 100644
--- a/src/dynapi/SDL_dynapi.exports
+++ b/src/dynapi/SDL_dynapi.exports
@@ -1287,3 +1287,4 @@ _SDL_GDKResumeRenderer
_SDL_IsPhone
_SDL_LoadJPG_IO
_SDL_LoadJPG
+_SDL_HasSVE2
diff --git a/src/dynapi/SDL_dynapi.sym b/src/dynapi/SDL_dynapi.sym
index 3fdc470a33730..ca1a1c97d940e 100644
--- a/src/dynapi/SDL_dynapi.sym
+++ b/src/dynapi/SDL_dynapi.sym
@@ -1288,6 +1288,7 @@ SDL3_0.0.0 {
SDL_IsPhone;
SDL_LoadJPG_IO;
SDL_LoadJPG;
+ SDL_HasSVE2;
# extra symbols go here (don't modify this line)
local: *;
};
diff --git a/src/dynapi/SDL_dynapi_overrides.h b/src/dynapi/SDL_dynapi_overrides.h
index 7b88affdc65c7..677768ff2f107 100644
--- a/src/dynapi/SDL_dynapi_overrides.h
+++ b/src/dynapi/SDL_dynapi_overrides.h
@@ -1314,3 +1314,4 @@
#define SDL_IsPhone SDL_IsPhone_REAL
#define SDL_LoadJPG_IO SDL_LoadJPG_IO_REAL
#define SDL_LoadJPG SDL_LoadJPG_REAL
+#define SDL_HasSVE2 SDL_HasSVE2_REAL
diff --git a/src/dynapi/SDL_dynapi_procs.h b/src/dynapi/SDL_dynapi_procs.h
index 24a5afad988e4..99899b346e9a6 100644
--- a/src/dynapi/SDL_dynapi_procs.h
+++ b/src/dynapi/SDL_dynapi_procs.h
@@ -1322,3 +1322,4 @@ SDL_DYNAPI_PROC(void,SDL_GDKResumeRenderer,(SDL_Renderer *a),(a),)
SDL_DYNAPI_PROC(bool,SDL_IsPhone,(void),(),return)
SDL_DYNAPI_PROC(SDL_Surface*,SDL_LoadJPG_IO,(SDL_IOStream *a,bool b),(a,b),return)
SDL_DYNAPI_PROC(SDL_Surface*,SDL_LoadJPG,(const char *a),(a),return)
+SDL_DYNAPI_PROC(bool,SDL_HasSVE2,(void),(),return)
diff --git a/src/video/SDL_blit_A.c b/src/video/SDL_blit_A.c
index f7a997f3b0d94..0dcd25d885ef9 100644
--- a/src/video/SDL_blit_A.c
+++ b/src/video/SDL_blit_A.c
@@ -25,6 +25,10 @@
#include "SDL_pixels_c.h"
#include "SDL_surface_c.h"
+#if defined(SDL_SVE2_INTRINSICS) && (__ARM_ARCH >= 8) && (defined(__aarch64__) || defined(_M_ARM64))
+#include "./arm/SDL_sve2_blit_A.h"
+#endif
+
// Functions to perform alpha blended blitting
// N->1 blending with per-surface alpha
@@ -1477,6 +1481,17 @@ SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface)
}
case 2:
+#if defined(SDL_SVE2_INTRINSICS) && (__ARM_ARCH >= 8) && (defined(__aarch64__) || defined(_M_ARM64))
+ if (SDL_HasSVE2()) {
+ if (sf->bytes_per_pixel == 4 &&
+ df->bytes_per_pixel == 2 &&
+ df->Rmask == 0x0000F800 &&
+ df->Gmask == 0x000007E0 &&
+ df->Bmask == 0x0000001F) {
+ return Blit8888to565PixelAlphaSwizzleSVE2;
+ }
+ }
+#endif
if (sf->bytes_per_pixel == 4 && sf->Amask == 0xff000000 && sf->Gmask == 0xff00 && ((sf->Rmask == 0xff && df->Rmask == 0x1f) || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
if (df->Gmask == 0x7e0) {
return BlitARGBto565PixelAlpha;
@@ -1504,6 +1519,19 @@ SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface)
return Blit8888to8888PixelAlphaSwizzleLSX;
}
#endif
+#if defined(SDL_SVE2_INTRINSICS) && (__ARM_ARCH >= 8) && (defined(__aarch64__) || defined(_M_ARM64))
+ if (SDL_HasSVE2()
+ /* NEON is faster than SVE2 when vector size is 128bit */
+ #if defined(SDL_NEON_INTRINSICS)
+ && SDL_GetSVEVectorSize() > 128
+ #endif
+ ) {
+ // To prevent "unused function" compiler warnings/errors
+ (void)Blit8888to8888PixelAlpha;
+ (void)Blit8888to8888PixelAlphaSwizzle;
+ return Blit8888to8888PixelAlphaSwizzleSVE2;
+ }
+#endif
#if defined(SDL_NEON_INTRINSICS) && (__ARM_ARCH >= 8) && (defined(__aarch64__) || defined(_M_ARM64))
// To prevent "unused function" compiler warnings/errors
(void)Blit8888to8888PixelAlpha;
diff --git a/src/video/SDL_blit_N.c b/src/video/SDL_blit_N.c
index 204c1addbd2ad..b014d4233a1b6 100644
--- a/src/video/SDL_blit_N.c
+++ b/src/video/SDL_blit_N.c
@@ -26,6 +26,10 @@
#include "SDL_surface_c.h"
#include "SDL_blit_copy.h"
+#if defined(SDL_SVE2_INTRINSICS) && (__ARM_ARCH >= 8) && (defined(__aarch64__) || defined(_M_ARM64))
+#include "./arm/SDL_sve2_blit_N.h"
+#endif
+
// General optimized routines that write char by char
#define HAVE_FAST_WRITE_INT8 1
@@ -3117,10 +3121,27 @@ SDL_BlitFunc SDL_CalculateBlitN(SDL_Surface *surface)
return Blit8888to8888PixelSwizzleSSE41;
}
#endif
+#if defined(SDL_SVE2_INTRINSICS) && (__ARM_ARCH >= 8) && (defined(__aarch64__) || defined(_M_ARM64))
+ if (SDL_HasSVE2()) {
+ return Blit8888to8888PixelSwizzleSVE2;
+ }
+#endif
#if defined(SDL_NEON_INTRINSICS) && (__ARM_ARCH >= 8) && (defined(__aarch64__) || defined(_M_ARM64))
return Blit8888to8888PixelSwizzleNEON;
#endif
}
+#if defined(SDL_SVE2_INTRINSICS) && (__ARM_ARCH >= 8) && (defined(__aarch64__) || defined(_M_ARM64))
+ if (SDL_HasSVE2()) {
+ /* RGBA8888/ARGB8888/XRGB8888 -> RGB565 */
+ if (srcfmt->bytes_per_pixel == 4 &&
+ dstfmt->bytes_per_pixel == 2 &&
+ dstfmt->Rmask == 0x0000F800 &&
+ dstfmt->Gmask == 0x000007E0 &&
+ dstfmt->Bmask == 0x0000001F) {
+ return Blit8888to565PixelSwizzleSVE2;
+ }
+ }
+#endif
blitfun = NULL;
if (dstfmt->bits_per_pixel > 8) {
diff --git a/src/video/arm/SDL_sve2_blit_A.c b/src/video/arm/SDL_sve2_blit_A.c
new file mode 100644
index 0000000000000..be029bcc70031
--- /dev/null
+++ b/src/video/arm/SDL_sve2_blit_A.c
@@ -0,0 +1,89 @@
+/*
+ Simple DirectMedia Layer
+ Copyright (C) 1997-2026 Sam Lantinga <slouken@libsdl.org>
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "SDL_sve2_blit_A.h"
+#include <assert.h>
+
+#ifdef SDL_SVE2_INTRINSICS
+
+#undef sdl_sve_rgb32_blend_op_fill_alpha
+#define sdl_sve_rgb32_blend_op_fill_alpha(ma_alpha_chn_idx) \
+    do { /* one statement (no dangling else); uses sve_src_chn_idx, sve_source_u16, sve_source_u16x4 and sve_target_u16 from the expansion site */ \
+        if (sve_src_chn_idx == (ma_alpha_chn_idx)) { \
+            /* current channel is the alpha channel: fill the target lanes with 0xFF */ \
+            sve_target_u16 = svdup_u16(0xFF); \
+        } else { \
+            svuint16_t vMask = svget4(sve_source_u16x4, (ma_alpha_chn_idx)); /* source alpha lanes used as the blend mask */ \
+            sve_target_u16 = sdl_sve_chn_blend_with_mask(sve_source_u16, sve_target_u16, vMask); \
+        } \
+    } while (0)
+
+#undef sdl_sve_rgb32_blend_op_copy_alpha
+#define sdl_sve_rgb32_blend_op_copy_alpha(ma_alpha_chn_idx) \
+    do { /* one statement (no dangling else); uses sve_src_chn_idx, sve_source_u16, sve_source_u16x4 and sve_target_u16 from the expansion site */ \
+        svuint16_t vMask = svget4(sve_source_u16x4, (ma_alpha_chn_idx)); /* source alpha lanes, the blend mask for both branches */ \
+        if (sve_src_chn_idx == (ma_alpha_chn_idx)) { \
+            /* current channel is the alpha channel: blend a constant 0xFF instead of the source value */ \
+            sve_target_u16 = sdl_sve_chn_blend_with_mask(svdup_u16(0xFF), \
+                                                         sve_target_u16, vMask); \
+        } else { \
+            sve_target_u16 = sdl_sve_chn_blend_with_mask(sve_source_u16, \
+                                                         sve_target_u16, vMask); \
+        } \
+    } while (0)
+
+#undef sdl_sve_rgb32_blend_to_rgb565_op
+#define sdl_sve_rgb32_blend_to_rgb565_op(ma_alpha_chn_idx) \
+ do { \
+ svuint16_t vMask = svget4(sve_source_u16x4, (ma_alpha_chn_idx)); \
+ sve_target_u16 = sdl_sve_chn_blend_with_mask(sve_source_u16, \
+ sve_target_u16, \
+ vMask); \
+ } while (0)
+
+#include "SDL_sve2_swizzle.h"
+
+/*-----------------------------------------------------------------------------*
+ * Swizzle Blend with Alpha *
+ *-----------------------------------------------------------------------------*/
+SDL_TARGETING("arch=armv8-a+sve2")
+void Blit8888to8888PixelAlphaSwizzleSVE2(SDL_BlitInfo *info)
+{
+    /* This blitter requires a source alpha channel; checked with SDL_assert so the file needs no libc assert(). */
+    const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
+    SDL_assert(0 != srcfmt->Amask);
+    (void)srcfmt; /* keeps builds with assertions disabled warning-free */
+    sdl_sve_8888_to_8888_swizzle_dispatcher(info);
+}
+
+SDL_TARGETING("arch=armv8-a+sve2")
+void Blit8888to565PixelAlphaSwizzleSVE2(SDL_BlitInfo *info)
+{
+ sdl_sve_rgb32_to_rgb565_swizzle_dispatcher(info);
+}
+
+SDL_TARGETING("arch=armv8-a+sve2")
+size_t SDL_GetSVEVectorSize(void)
+{
+    return svcntb() * 8; /* bytes per SVE vector, converted to bits */
+}
+
+#endif /* SDL_SVE2_INTRINSICS */
\ No newline at end of file
diff --git a/src/video/arm/SDL_sve2_blit_A.h b/src/video/arm/SDL_sve2_blit_A.h
new file mode 100644
index 0000000000000..2a7e2b8149859
--- /dev/null
+++ b/src/video/arm/SDL_sve2_blit_A.h
@@ -0,0 +1,37 @@
+/*
+ Simple DirectMedia Layer
+ Copyright (C) 1997-2026 Sam Lantinga <slouken@libsdl.org>
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef SDL_sve2_blit_A_h_
+#define SDL_sve2_blit_A_h_
+
+#include "../../SDL_internal.h"
+#include "../SDL_blit.h"
+
+#ifdef SDL_SVE2_INTRINSICS
+
+void Blit8888to8888PixelAlphaSwizzleSVE2(SDL_BlitInfo *info);
+void Blit8888to565PixelAlphaSwizzleSVE2(SDL_BlitInfo *info);
+
+size_t SDL_GetSVEVectorSize(void);
+
+#endif /* SDL_SVE2_INTRINSICS */
+
+#endif /* SDL_sve2_blit_A_h_ */
\ No newline at end of file
diff --git a/src/video/arm/SDL_sve2_blit_N.c b/src/video/arm/SDL_sve2_blit_N.c
new file mode 100644
index 0000000000000..c6ae97e53b341
--- /dev/null
+++ b/src/video/arm/SDL_sve2_blit_N.c
@@ -0,0 +1,64 @@
+/*
+ Simple DirectMedia Layer
+ Copyright (C) 1997-2026 Sam Lantinga <slouken@libsdl.org>
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "SDL_sve2_blit_N.h"
+#include <assert.h>
+
+#ifdef SDL_SVE2_INTRINSICS
+
+#undef sdl_sve_rgb32_blend_op_fill_alpha
+#define sdl_sve_rgb32_blend_op_fill_alpha(ma_alpha_chn_idx) \
+ do { \
+ if (sve_src_chn_idx == (ma_alpha_chn_idx)) { \
+ /* fill alpha */ \
+ sve_target_u16 = svdup_u16(0xFF); \
+ } else { \
+ sve_target_u16 = sve_source_u16; \
+ } \
+ } while (0)
+
+#undef sdl_sve_rgb32_blend_op_copy_alpha
+#define sdl_sve_rgb32_blend_op_copy_alpha(ma_alpha_chn_idx) \
+ do { \
+ sve_target_u16 = sve_source_u16; \
+ } while (0)
+
+#undef sdl_sve_rgb32_blend_to_rgb565_op
+#define sdl_sve_rgb32_blend_to_rgb565_op(ma_alpha_chn_idx) \
+ do { \
+ sve_target_u16 = sve_source_u16; \
+ } while (0)
+
+#include "SDL_sve2_swizzle.h"
+
+SDL_TARGETING("arch=armv8-a+sve2")
+void Blit8888to8888PixelSwizzleSVE2(SDL_BlitInfo *info)
+{
+ sdl_sve_8888_to_8888_swizzle_dispatcher(info);
+}
+
+SDL_TARGETING("arch=armv8-a+sve2")
+void Blit8888to565PixelSwizzleSVE2(SDL_BlitInfo *info)
+{
+ sdl_sve_rgb32_to_rgb565_swizzle_dispatcher(info);
+}
+
+#endif /* SDL_SVE2_INTRINSICS */
\ No newline at end of file
diff --git a/src/video/arm/SDL_sve2_blit_N.h b/src/video/arm/SDL_sve2_blit_N.h
new file mode 100644
index 0000000000000..3868de0dbb475
--- /dev/null
+++ b/src/video/arm/SDL_sve2_blit_N.h
@@ -0,0 +1,35 @@
+/*
+ Simple DirectMedia Layer
+ Copyright (C) 1997-2026 Sam Lantinga <slouken@libsdl.org>
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef SDL_sve2_blit_N_h_
+#define SDL_sve2_blit_N_h_
+
+#include "../../SDL_internal.h"
+#include "../SDL_blit.h"
+
+#ifdef SDL_SVE2_INTRINSICS
+
+void Blit8888to8888PixelSwizzleSVE2(SDL_BlitInfo *info);
+void Blit8888to565PixelSwizzleSVE2(SDL_BlitInfo *info);
+
+#endif /* SDL_SVE2_INTRINSICS */
+
+#endif /* SDL_sve2_blit_N_h_ */
\ No newline at end of file
diff --git a/src/video/arm/SDL_sve2_extension.h b/src/video/arm/SDL_sve2_extension.h
new file mode 100644
index 0000000000000..2f5a74a12bb59
--- /dev/null
+++ b/src/video/arm/SDL_sve2_extension.h
@@ -0,0 +1,1142 @@
+/*
+ Simple DirectMedia Layer
+ Copyright (C) 1997-2026 Sam Lantinga <slouken@libsdl.org>
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+*/
+
+#if !defined(SDL_SVE2_EXTENSION_H) //&& (defined(__ARM_FEATURE_SVE2) && __ARM_FEATURE_SVE2)
+#define SDL_SVE2_EXTENSION_H
+
+#include "SDL_sve2_util.h"
+#include <arm_sve.h>
+#include <stdint.h>
+
+/*!
+ * \brief a wrapper for __attribute__((nonnull))
+ */
+#ifndef ARM_NONNULL
+#define ARM_NONNULL(...) __attribute__((nonnull(__VA_ARGS__)))
+#endif
+
+/*
+ * Number of elements of each width in one SVE vector, queried at run time.
+ * svcntb_pat(SV_ALL) gives the count of 8-bit elements; the wider variants
+ * divide that by the element size in bytes.
+ */
+#define svlenu8() svcntb_pat(SV_ALL)
+#define svlenu16() (svcntb_pat(SV_ALL) / sizeof(uint16_t))
+#define svlenu32() (svcntb_pat(SV_ALL) / sizeof(uint32_t))
+#define svlenu64() (svcntb_pat(SV_ALL) / sizeof(uint64_t))
+
+/* Signed spellings: identical counts, aliased to the unsigned macros above. */
+#define svlens8() svlenu8()
+#define svlens16() svlenu16()
+#define svlens32() svlenu32()
+#define svlens64() svlenu64()
+
+/*
+ * Loop header: write a statement or braced body directly after the macro.
+ *  - The outer `for` runs exactly once; it exists only to declare the
+ *    predicate variable `ma_pred_name` for the body.
+ *  - Before every pass, the GNU statement expression recomputes the predicate
+ *    with svwhilelt_b8(counter, ma_stride_size) and continues while
+ *    counter < ma_stride_size.
+ *  - The counter advances by `sve_iteration_advance` (svlenu32() * 4), a name
+ *    that is also visible to the body.
+ * NOTE(review): both bounds are cast to int32_t for svwhilelt_b8, so a size
+ * above INT32_MAX would be truncated - confirm callers never pass one.
+ * NOTE(review): the unit of ma_stride_size (pixels vs. bytes) is set by the
+ * callers, which are not visible here - confirm.
+ * NOTE(review): SVE_SAFE_NAME is not defined here; presumably it comes from
+ * SDL_sve2_util.h.
+ */
+#define sdl_sve_stride_loop_accc8888(ma_stride_size, ma_pred_name) \
+ for (svbool_t ma_pred_name, *pTemp = &ma_pred_name; \
+ pTemp != NULL; \
+ pTemp = NULL) \
+ for (size_t SVE_SAFE_NAME(n) = 0, \
+ sve_iteration_advance = svlenu32() * 4; \
+ ({ \
+ ma_pred_name = svwhilelt_b8((int32_t)SVE_SAFE_NAME(n), \
+ (int32_t)(ma_stride_size)); \
+ SVE_SAFE_NAME(n) < (ma_stride_size); \
+ }); \
+ SVE_SAFE_NAME(n) += sve_iteration_advance)
+
+/* 32-bit RGB strides reuse the accc8888 loop header unchanged. */
+#define sdl_sve_stride_loop_rgb32(ma_stride_size, ma_pred_name) \
+ sdl_sve_stride_loop_accc8888(ma_stride_size, ma_pred_name)
+
+/*
+ * Loop header for 16-bit elements: write a statement or braced body directly
+ * after the macro.
+ *  - The outer `for` runs exactly once; it exists only to declare the
+ *    predicate variable `ma_pred_name` for the body.
+ *  - Before every pass, the GNU statement expression recomputes the predicate
+ *    with svwhilelt_b16(counter, ma_stride_size) and continues while
+ *    counter < ma_stride_size.
+ *  - The counter advances by `sve_iteration_advance` (svlenu16(), the number
+ *    of 16-bit lanes per vector), a name that is also visible to the body.
+ * NOTE(review): both bounds are cast to int32_t for svwhilelt_b16, so a size
+ * above INT32_MAX would be truncated - confirm callers never pass one.
+ */
+#define sdl_sve_stride_loop_rgb16(ma_stride_size, ma_pred_name) \
+ for (svbool_t ma_pred_name, *pTemp = &ma_pred_name; \
+ pTemp != NULL; \
+ pTemp = NULL) \
+ for (size_t SVE_SAFE_NAME(n) = 0, \
+ sve_iteration_advance = svlenu16(); \
+ ({ \
+ ma_pred_name = svwhilelt_b16((int32_t)SVE_SAFE_NAME(n), \
+ (int32_t)(ma_stride_size)); \
+ SVE_SAFE_NAME(n) < (ma_stride_size); \
+ }); \
+ SVE_SAFE_NAME(n) += sve_iteration_advance)
+
+#define sdl_sve_pixel_ccc_foreach_chn(ma_source_u16x3, \
+ ma_target_u16x3, \
+ ...) \
+ do { \
+ svuint16x3_t sve_source_u16x3 = ma_source_u16x3; \
+ (void)sve_source_u16x3; \
+ do { \
+ const uint8_t sve_src_chn_idx = 0; \
+ (void)sve_src_chn_idx; \
+ svuint16_t sve_source_u16 = svget3((ma_source_u16x3), 0); \
+ svuint16_t sve_target_u16 = svget3((ma_target_u16x3), 0); \
+ (void)sve_source_u16; \
+ (void)sve_target_u16; \
+
(Patch may be truncated, please check the link at the top of this post.)