From f2dba2626ec2c02d5d37ccdbc7b1f03293321d61 Mon Sep 17 00:00:00 2001
From: Gabriel Wang <[EMAIL REDACTED]>
Date: Tue, 26 May 2026 22:57:44 +0800
Subject: [PATCH] SVE2: Improves SVE2 8888 swizzling performance and important
fixes (#15662)
* SVE2 was actually disabled in fdfbbce, this issue is fixed
- The macro __ARM_FEATURE_SVE is only defined when the compilation target enables SVE, e.g. -march=armv8-a+sve2
* Improves 8888 alpha-blending performance
- On in-order AArch64 processors, e.g. Cortex-A520, SVE2 is now faster than NEON with a 128-bit vector width
- On out-of-order processors, NEON is still faster than SVE2 (we could improve this in the future); the performance is improved from 3.0 to 3.6.
* The 8888 -> RGB565 performance is also improved (from 7.4 to 9.3)
---
CMakeLists.txt | 5 +++
include/SDL3/SDL_intrin.h | 23 +++++++++-----
src/video/arm/SDL_sve2_extension.h | 50 +++++++++++++++++++++---------
src/video/arm/SDL_sve2_swizzle.h | 6 ++++
4 files changed, 62 insertions(+), 22 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e777e1537e7ad..26ac91a465049 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -934,6 +934,10 @@ if(SDL_ASSEMBLY)
return 0;
}]==] COMPILER_SUPPORTS_ARMSVE2)
if(COMPILER_SUPPORTS_ARMSVE2)
+ # IMPORTANT: As not all AArch64 processors support SVE2, we only
+ # attach the following compilation option to SVE
+ # dedicated source files.
+ set(SVE2_MARCH_FLAG "-march=armv8-a+sve2")
set(HAVE_ARMSVE2 TRUE)
endif()
cmake_pop_check_state()
@@ -947,6 +951,7 @@ if(SDL_ASSEMBLY)
"${SDL3_SOURCE_DIR}/src/video/arm/SDL_sve2_blit_A.c"
"${SDL3_SOURCE_DIR}/src/video/arm/SDL_sve2_blit_N.c"
PROPERTIES
+ COMPILE_FLAGS "${SVE2_MARCH_FLAG}"
SKIP_PRECOMPILE_HEADERS ON
)
endif()
diff --git a/include/SDL3/SDL_intrin.h b/include/SDL3/SDL_intrin.h
index 289a356b7c1cc..449eb445557bb 100644
--- a/include/SDL3/SDL_intrin.h
+++ b/include/SDL3/SDL_intrin.h
@@ -88,8 +88,12 @@
/**
* Defined if (and only if) the compiler supports ARM SVE2 intrinsics.
*
- * If this macro is defined, SDL will have already included `<arm_sve.h>` as
- * appropriate.
+ * If this macro is defined, `<arm_sve.h>` (providing SVE intrinsics) will
+ * only be included if the target architecture supports SVE
+ * (`__ARM_FEATURE_SVE` feature macro).
+ * Some toolchains do not support `SDL_TARGETING("arch=armv8-a+sve2")`, so
+ * for best portability you need to write all SVE code in a separate
+ * translation unit and add appropriate compile flags.
*
* \since This macro is available since SDL 3.6.0.
*/
@@ -247,9 +251,11 @@ _m_prefetch(void *__P)
# define SDL_NEON_INTRINSICS 1
# include <arm_neon.h>
#endif
-#if defined(__ARM_FEATURE_SVE2) && !defined(SDL_DISABLE_SVE2)
+#if !defined(SDL_DISABLE_SVE2)
# define SDL_SVE2_INTRINSICS 1
-# include <arm_sve.h>
+# if defined(__ARM_FEATURE_SVE)
+# include <arm_sve.h>
+# endif
#endif
#else
@@ -284,16 +290,19 @@ _m_prefetch(void *__P)
/* Visual Studio doesn't define __ARM_ARCH, but _M_ARM (if set, always 7), and _M_ARM64 (if set, always 1). */
# if defined (_M_ARM64) && 0 /* Please only remove this 0 when MSVC releasing support for SVE2 officially. */
# define SDL_SVE2_INTRINSICS 1
-# include <arm_sve.h>
# define __ARM_FEATURE_SVE2 1 /* Set __ARM_FEATURE_SVE2 so that it can be used elsewhere, at compile time */
+# define __ARM_FEATURE_SVE 1 /* Set __ARM_FEATURE_SVE so that it can be used elsewhere, at compile time */
# define __ARM_ARCH 8
+# include <arm_sve.h>
# endif
# elif defined(SDL_PLATFORM_APPLE)
/* Apple has no AArch64 device supporting SVE2 */
# elif defined(__ARM_ARCH) && (__ARM_ARCH >= 8) && (defined(__aarch64__) || defined(_M_ARM64)) && \
- defined(__has_include) && __has_include(<arm_sve.h>) && defined(__ARM_FEATURE_SVE)
+ defined(__has_include) && __has_include(<arm_sve.h>)
# define SDL_SVE2_INTRINSICS 1
-# include <arm_sve.h>
+# if defined(__ARM_FEATURE_SVE)
+# include <arm_sve.h>
+# endif
# endif
#endif
#endif /* compiler version */
diff --git a/src/video/arm/SDL_sve2_extension.h b/src/video/arm/SDL_sve2_extension.h
index b9db084bba413..c18dace028ae2 100644
--- a/src/video/arm/SDL_sve2_extension.h
+++ b/src/video/arm/SDL_sve2_extension.h
@@ -19,10 +19,33 @@
3. This notice may not be removed or altered from any source distribution.
*/
+/*
+ * IMPORTANT: Please do NOT include this header file directly or indirectly
+ * outside the src/video/arm folder.
+ *
+ */
+
#if !defined(SDL_SVE2_EXTENSION_H) //&& (defined(__ARM_FEATURE_SVE2) && __ARM_FEATURE_SVE2)
#define SDL_SVE2_EXTENSION_H
#include "SDL_sve2_util.h"
+
+/*
+ * NOTE: Some Android builds do not attach '-march=armv8-a+sve2' to
+ * SDL_sve2_*.c, and hence the macro __ARM_FEATURE_SVE is not
+ * defined by the compiler. This might not be a problem, as
+ * SDL_TARGETING("arch=armv8-a+sve2") enables the feature for
+ * individual functions, until some compiler version provides
+ * an arm_sve.h that raises errors when __ARM_FEATURE_SVE is
+ * not defined. Although it should be avoided, as a
+ * workaround, we have to define __ARM_FEATURE_SVE here as
+ * an ugly hack.
+ */
+#ifdef SDL_PLATFORM_ANDROID
+#ifndef __ARM_FEATURE_SVE
+#define __ARM_FEATURE_SVE 1
+#endif
+#endif
#include <arm_sve.h>
#include <stdint.h>
@@ -907,7 +930,8 @@ static inline svuint16_t sdl_sve_chn_blend_with_mask(svuint16_t vSource,
svuint16_t vMask)
{
// vTarget = vSource * vMask + vTarget * (255 - vMask);
- svuint16_t vTemp0 = svmul_u16_m(svptrue_b16(), vSource, vMask);
+ svuint16_t vTemp0 = svdup_u16(1);
+ vTemp0 = svmla_u16_m(svptrue_b16(), vTemp0, vSource, vMask);
vTemp0 = svmla_u16_m(svptrue_b16(),
vTemp0,
vTarget,
@@ -915,17 +939,13 @@ static inline svuint16_t sdl_sve_chn_blend_with_mask(svuint16_t vSource,
svdup_u16(255),
vMask));
- vTemp0 = svadd_n_u16_m(svptrue_b16(), vTemp0, 1);
-
- svuint16_t vTemp1 = svlsr_n_u16_m(svptrue_b16(), vTemp0, 8);
/* x += x >> 8 */
- vTemp0 = svadd_u16_m(svptrue_b16(),
- vTemp0,
- vTemp1);
-
- return svlsr_n_u16_m(svptrue_b16(), vTemp0, 8); // vTarget >> 8;
+ return svreinterpret_u16_u8(
+ svaddhnb_u16(vTemp0,
+ svlsr_n_u16_m(svptrue_b16(),
+ vTemp0,
+ 8)));
}
-
/*! \note the Element range of vMask is [0, 0xFF]
*/
SDL_TARGETING("arch=armv8-a+sve2")
@@ -968,15 +988,15 @@ static inline svuint16_t sdl_sve_chn_blend_with_opacity(svuint16_t vSource,
*/
SDL_TARGETING("arch=armv8-a+sve2")
static inline svuint16_t sdl_sve_chn_blend_with_opacity_fast(svuint16_t vSource,
- svuint16_t vTarget,
- uint16_t hwOpacity)
+ svuint16_t vTarget,
+ uint16_t hwOpacity)
{
// vTarget = vSource * vMask + vTarget * (255 - vMask);
svuint16_t vTemp0 = svmul_n_u16_m(svptrue_b16(), vSource, hwOpacity);
vTemp0 = svmla_n_u16_m(svptrue_b16(),
- vTemp0,
- vTarget,
- 256 - hwOpacity);
+ vTemp0,
+ vTarget,
+ 256 - hwOpacity);
return svlsr_n_u16_m(svptrue_b16(), vTemp0, 8); // vTarget >> 8;
}
diff --git a/src/video/arm/SDL_sve2_swizzle.h b/src/video/arm/SDL_sve2_swizzle.h
index a2d6f978d2f70..032e9d6796c4b 100644
--- a/src/video/arm/SDL_sve2_swizzle.h
+++ b/src/video/arm/SDL_sve2_swizzle.h
@@ -19,6 +19,12 @@
3. This notice may not be removed or altered from any source distribution.
*/
+/*
+ * IMPORTANT: Please do NOT include this header file directly or indirectly
+ * outside the src/video/arm folder.
+ *
+ */
+
#if !defined(SD_SVE2_SWIZZLE_H) //&& (defined(__ARM_FEATURE_SVE2) && __ARM_FEATURE_SVE2)
#define SD_SVE2_SWIZZLE_H