From 69de6964e58d287919048e9ac0da41c206fd80bd Mon Sep 17 00:00:00 2001
From: Ozkan Sezer <[EMAIL REDACTED]>
Date: Fri, 17 Mar 2023 20:57:40 +0300
Subject: [PATCH] use target attributes of gcc / clang for SIMD code.
---
CMakeLists.txt | 5 -
include/SDL3/SDL_intrin.h | 24 +-
src/SDL_internal.h | 23 +-
src/audio/SDL_audiocvt.c | 4 +-
src/audio/SDL_audiotypecvt.c | 16 +-
src/video/SDL_blit_A.c | 10 +-
src/video/SDL_blit_copy.c | 21 +-
src/video/SDL_fillrect.c | 4 +-
src/video/SDL_stretch.c | 6 +-
src/video/SDL_yuv.c | 560 +++++++++++++++++++++------
src/video/yuv2rgb/yuv_rgb.c | 4 +-
src/video/yuv2rgb/yuv_rgb_sse_func.h | 2 +-
12 files changed, 510 insertions(+), 169 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ccc0ee23ece7..13af6c4d7a8b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -735,7 +735,6 @@ if(SDL_ASSEMBLY)
cmake_pop_check_state()
if(CPU_SUPPORTS_AVX)
set(HAVE_AVX TRUE)
- target_compile_options(sdl-build-options INTERFACE "-mavx")
endif()
endif()
@@ -760,7 +759,6 @@ if(SDL_ASSEMBLY)
cmake_pop_check_state()
if(CPU_SUPPORTS_MMX)
set(HAVE_MMX TRUE)
- target_compile_options(sdl-build-options INTERFACE "-mmmx")
endif()
endif()
@@ -785,7 +783,6 @@ if(SDL_ASSEMBLY)
cmake_pop_check_state()
if(CPU_SUPPORTS_SSE)
set(HAVE_SSE ON)
- target_compile_options(sdl-build-options INTERFACE "-msse")
endif()
endif()
@@ -810,7 +807,6 @@ if(SDL_ASSEMBLY)
cmake_pop_check_state()
if(CPU_SUPPORTS_SSE2)
set(HAVE_SSE2 TRUE)
- target_compile_options(sdl-build-options INTERFACE "-msse2")
endif()
endif()
@@ -835,7 +831,6 @@ if(SDL_ASSEMBLY)
cmake_pop_check_state()
if(CPU_SUPPORTS_SSE3)
set(HAVE_SSE3 TRUE)
- target_compile_options(sdl-build-options INTERFACE "-msse3")
endif()
endif()
diff --git a/include/SDL3/SDL_intrin.h b/include/SDL3/SDL_intrin.h
index 6fedc18b2ab5..adec1346dc7a 100644
--- a/include/SDL3/SDL_intrin.h
+++ b/include/SDL3/SDL_intrin.h
@@ -93,25 +93,45 @@ _m_prefetch(void *__P)
#endif
#endif /* compiler version */
+/* Per-function target attributes (gcc >= 4.9, clang) let SIMD code be built
+ * without passing -mmmx, -msse, -mavx and friends on the command line.
+ * Only advertised on x86: SDL_HAS_TARGET_ATTRIBS also gates the inclusion of
+ * the x86-only intrinsics headers further down in this header. */
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+# if defined(__clang__) && defined(__has_attribute)
+#  if __has_attribute(target)
+#   define SDL_HAS_TARGET_ATTRIBS
+#  endif
+# elif defined(__GNUC__) && (__GNUC__ + (__GNUC_MINOR__ >= 9) > 4) /* gcc >= 4.9 */
+#  define SDL_HAS_TARGET_ATTRIBS
+# endif
+#endif /* x86 */
+
+#ifdef SDL_HAS_TARGET_ATTRIBS
+# define SDL_TARGETING(x) __attribute__((target(x)))
+#else
+# define SDL_TARGETING(x)
+#endif
+
+
#if defined(__loongarch_sx) && !defined(SDL_DISABLE_LSX)
#include <lsxintrin.h>
#endif
#if defined(__loongarch_asx) && !defined(SDL_DISABLE_LASX)
#include <lasxintrin.h>
#endif
-#if defined(__AVX__) && !defined(SDL_DISABLE_AVX)
+#if (defined(__AVX__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_AVX)
#include <immintrin.h>
#endif
-#if defined(__MMX__) && !defined(SDL_DISABLE_MMX)
+#if (defined(__MMX__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_MMX)
#include <mmintrin.h>
#endif
-#if defined(__SSE__) && !defined(SDL_DISABLE_SSE)
+#if (defined(__SSE__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_SSE)
#include <xmmintrin.h>
#endif
-#if defined(__SSE2__) && !defined(SDL_DISABLE_SSE2)
+#if (defined(__SSE2__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_SSE2)
#include <emmintrin.h>
#endif
-#if defined(__SSE3__) && !defined(SDL_DISABLE_SSE3)
+#if (defined(__SSE3__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_SSE3)
#include <pmmintrin.h>
#endif
diff --git a/src/SDL_internal.h b/src/SDL_internal.h
index 89c6f224e036..fd65cf8643be 100644
--- a/src/SDL_internal.h
+++ b/src/SDL_internal.h
@@ -194,23 +194,23 @@
#define HAVE_NEON_INTRINSICS 1
#endif
-#if defined(__MMX__) && !defined(SDL_DISABLE_MMX)
+#if (defined(__MMX__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_MMX)
#define HAVE_MMX_INTRINSICS 1
#endif
-#if defined(__SSE__) && !defined(SDL_DISABLE_SSE)
+#if (defined(__SSE__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_SSE)
#define HAVE_SSE_INTRINSICS 1
#endif
-#if defined(__SSE2__) && !defined(SDL_DISABLE_SSE2)
+#if (defined(__SSE2__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_SSE2)
#define HAVE_SSE2_INTRINSICS 1
#endif
-#if defined(__SSE3__) && !defined(SDL_DISABLE_SSE3)
+#if (defined(__SSE3__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_SSE3)
#define HAVE_SSE3_INTRINSICS 1
#endif
-#if defined(__AVX__) && !defined(SDL_DISABLE_AVX)
+#if (defined(__AVX__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_AVX)
#define HAVE_AVX_INTRINSICS 1
#endif
@@ -222,19 +222,6 @@
#define HAVE_LASX_INTRINSICS 1
#endif
-#if defined __clang__
-#if (!__has_attribute(target))
-#undef HAVE_AVX_INTRINSICS
-#endif
-#if (defined(_MSC_VER) || defined(__SCE__)) && !defined(__AVX__)
-#undef HAVE_AVX_INTRINSICS
-#endif
-#elif defined __GNUC__
-#if (__GNUC__ < 4) || (__GNUC__ == 4 && __GNUC_MINOR__ < 9)
-#undef HAVE_AVX_INTRINSICS
-#endif
-#endif
-
#define SDL_MAIN_NOIMPL /* don't drag in header-only implementation of SDL_main */
#include <SDL3/SDL_main.h>
diff --git a/src/audio/SDL_audiocvt.c b/src/audio/SDL_audiocvt.c
index c29c80358077..2eb74f8540c3 100644
--- a/src/audio/SDL_audiocvt.c
+++ b/src/audio/SDL_audiocvt.c
@@ -146,7 +146,7 @@ static int SDL_ConvertAudio(SDL_AudioCVT * cvt);
#if HAVE_SSE3_INTRINSICS
/* Convert from stereo to mono. Average left and right. */
-static void SDLCALL SDL_ConvertStereoToMono_SSE3(SDL_AudioCVT *cvt, SDL_AudioFormat format)
+static void SDLCALL SDL_TARGETING("sse3") SDL_ConvertStereoToMono_SSE3(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
const __m128 divby2 = _mm_set1_ps(0.5f);
float *dst = (float *)cvt->buf;
@@ -183,7 +183,7 @@ static void SDLCALL SDL_ConvertStereoToMono_SSE3(SDL_AudioCVT *cvt, SDL_AudioFor
#if HAVE_SSE_INTRINSICS
/* Convert from mono to stereo. Duplicate to stereo left and right. */
-static void SDLCALL SDL_ConvertMonoToStereo_SSE(SDL_AudioCVT *cvt, SDL_AudioFormat format)
+static void SDLCALL SDL_TARGETING("sse") SDL_ConvertMonoToStereo_SSE(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
float *dst = ((float *)(cvt->buf + (cvt->len_cvt * 2))) - 8;
const float *src = ((const float *)(cvt->buf + cvt->len_cvt)) - 4;
diff --git a/src/audio/SDL_audiotypecvt.c b/src/audio/SDL_audiotypecvt.c
index 86ab19507f29..e94a890dc413 100644
--- a/src/audio/SDL_audiotypecvt.c
+++ b/src/audio/SDL_audiotypecvt.c
@@ -225,7 +225,7 @@ static void SDLCALL SDL_Convert_F32_to_S32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFo
#endif
#if HAVE_SSE2_INTRINSICS
-static void SDLCALL SDL_Convert_S8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
+static void SDLCALL SDL_TARGETING("sse2") SDL_Convert_S8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
const Sint8 *src = ((const Sint8 *)(cvt->buf + cvt->len_cvt)) - 1;
float *dst = ((float *)(cvt->buf + cvt->len_cvt * 4)) - 1;
@@ -289,7 +289,7 @@ static void SDLCALL SDL_Convert_S8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioForma
}
}
-static void SDLCALL SDL_Convert_U8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
+static void SDLCALL SDL_TARGETING("sse2") SDL_Convert_U8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
const Uint8 *src = ((const Uint8 *)(cvt->buf + cvt->len_cvt)) - 1;
float *dst = ((float *)(cvt->buf + cvt->len_cvt * 4)) - 1;
@@ -355,7 +355,7 @@ static void SDLCALL SDL_Convert_U8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioForma
}
}
-static void SDLCALL SDL_Convert_S16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
+static void SDLCALL SDL_TARGETING("sse2") SDL_Convert_S16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
const Sint16 *src = ((const Sint16 *)(cvt->buf + cvt->len_cvt)) - 1;
float *dst = ((float *)(cvt->buf + cvt->len_cvt * 2)) - 1;
@@ -408,7 +408,7 @@ static void SDLCALL SDL_Convert_S16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioForm
}
}
-static void SDLCALL SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
+static void SDLCALL SDL_TARGETING("sse2") SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
const Sint32 *src = (const Sint32 *)cvt->buf;
float *dst = (float *)cvt->buf;
@@ -451,7 +451,7 @@ static void SDLCALL SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioForm
}
}
-static void SDLCALL SDL_Convert_F32_to_S8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
+static void SDLCALL SDL_TARGETING("sse2") SDL_Convert_F32_to_S8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
const float *src = (const float *)cvt->buf;
Sint8 *dst = (Sint8 *)cvt->buf;
@@ -514,7 +514,7 @@ static void SDLCALL SDL_Convert_F32_to_S8_SSE2(SDL_AudioCVT *cvt, SDL_AudioForma
}
}
-static void SDLCALL SDL_Convert_F32_to_U8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
+static void SDLCALL SDL_TARGETING("sse2") SDL_Convert_F32_to_U8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
const float *src = (const float *)cvt->buf;
Uint8 *dst = cvt->buf;
@@ -577,7 +577,7 @@ static void SDLCALL SDL_Convert_F32_to_U8_SSE2(SDL_AudioCVT *cvt, SDL_AudioForma
}
}
-static void SDLCALL SDL_Convert_F32_to_S16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
+static void SDLCALL SDL_TARGETING("sse2") SDL_Convert_F32_to_S16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
const float *src = (const float *)cvt->buf;
Sint16 *dst = (Sint16 *)cvt->buf;
@@ -638,7 +638,7 @@ static void SDLCALL SDL_Convert_F32_to_S16_SSE2(SDL_AudioCVT *cvt, SDL_AudioForm
}
}
-static void SDLCALL SDL_Convert_F32_to_S32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
+static void SDLCALL SDL_TARGETING("sse2") SDL_Convert_F32_to_S32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
const float *src = (const float *)cvt->buf;
Sint32 *dst = (Sint32 *)cvt->buf;
diff --git a/src/video/SDL_blit_A.c b/src/video/SDL_blit_A.c
index b0e20743a056..a9324a3ebf85 100644
--- a/src/video/SDL_blit_A.c
+++ b/src/video/SDL_blit_A.c
@@ -169,7 +169,7 @@ static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
#if HAVE_MMX_INTRINSICS
/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
-static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
+static void SDL_TARGETING("mmx") BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
{
int width = info->dst_w;
int height = info->dst_h;
@@ -223,7 +223,7 @@ static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
}
/* fast RGB888->(A)RGB888 blending with surface alpha */
-static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
+static void SDL_TARGETING("mmx") BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
{
SDL_PixelFormat *df = info->dst_fmt;
Uint32 chanmask;
@@ -318,7 +318,7 @@ static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
}
/* fast ARGB888->(A)RGB888 blending with pixel alpha */
-static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
+static void SDL_TARGETING("mmx") BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
{
int width = info->dst_w;
int height = info->dst_h;
@@ -753,7 +753,7 @@ static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
#if HAVE_MMX_INTRINSICS
/* fast RGB565->RGB565 blending with surface alpha */
-static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
+static void SDL_TARGETING("mmx") Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
{
unsigned alpha = info->a;
if (alpha == 128) {
@@ -889,7 +889,7 @@ static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
}
/* fast RGB555->RGB555 blending with surface alpha */
-static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
+static void SDL_TARGETING("mmx") Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
{
unsigned alpha = info->a;
if (alpha == 128) {
diff --git a/src/video/SDL_blit_copy.c b/src/video/SDL_blit_copy.c
index 45536b4b2f7b..cc8cd22040c5 100644
--- a/src/video/SDL_blit_copy.c
+++ b/src/video/SDL_blit_copy.c
@@ -25,7 +25,7 @@
#if HAVE_SSE_INTRINSICS
/* This assumes 16-byte aligned src and dst */
-static SDL_INLINE void SDL_memcpySSE(Uint8 *dst, const Uint8 *src, int len)
+static SDL_INLINE void SDL_TARGETING("sse") SDL_memcpySSE(Uint8 *dst, const Uint8 *src, int len)
{
int i;
@@ -54,7 +54,7 @@ static SDL_INLINE void SDL_memcpySSE(Uint8 *dst, const Uint8 *src, int len)
#ifdef _MSC_VER
#pragma warning(disable : 4799)
#endif
-static SDL_INLINE void SDL_memcpyMMX(Uint8 *dst, const Uint8 *src, int len)
+static SDL_INLINE void SDL_TARGETING("mmx") SDL_memcpyMMX(Uint8 *dst, const Uint8 *src, int len)
{
const int remain = (len & 63);
int i;
@@ -81,6 +81,16 @@ static SDL_INLINE void SDL_memcpyMMX(Uint8 *dst, const Uint8 *src, int len)
SDL_memcpy(dst + skip, src + skip, remain);
}
}
+
+static SDL_INLINE void SDL_TARGETING("mmx") SDL_BlitCopyMMX(Uint8 *dst, const Uint8 *src, const int dstskip, const int srcskip, const int w, int h)
+{
+ while (h--) {
+ SDL_memcpyMMX(dst, src, w);
+ src += srcskip;
+ dst += dstskip;
+ }
+ _mm_empty();
+}
#endif /* HAVE_MMX_INTRINSICS */
void SDL_BlitCopy(SDL_BlitInfo *info)
@@ -137,12 +147,8 @@ void SDL_BlitCopy(SDL_BlitInfo *info)
#if HAVE_MMX_INTRINSICS
if (SDL_HasMMX() && !(srcskip & 7) && !(dstskip & 7)) {
- while (h--) {
- SDL_memcpyMMX(dst, src, w);
- src += srcskip;
- dst += dstskip;
- }
- _mm_empty();
+ /* Row loop and _mm_empty() live in SDL_BlitCopyMMX so they are compiled with the mmx target; args are (dst, src, dstskip, srcskip, w, h). */
+ SDL_BlitCopyMMX(dst, src, dstskip, srcskip, w, h);
return;
}
#endif
diff --git a/src/video/SDL_fillrect.c b/src/video/SDL_fillrect.c
index 595cf7c32a71..59c8128e9d01 100644
--- a/src/video/SDL_fillrect.c
+++ b/src/video/SDL_fillrect.c
@@ -55,7 +55,7 @@
#define SSE_END
#define DEFINE_SSE_FILLRECT(bpp, type) \
-static void SDL_FillSurfaceRect##bpp##SSE(Uint8 *pixels, int pitch, Uint32 color, int w, int h) \
+static void SDL_TARGETING("sse") SDL_FillSurfaceRect##bpp##SSE(Uint8 *pixels, int pitch, Uint32 color, int w, int h) \
{ \
int i, n; \
Uint8 *p = NULL; \
@@ -92,7 +92,7 @@ static void SDL_FillSurfaceRect##bpp##SSE(Uint8 *pixels, int pitch, Uint32 color
SSE_END; \
}
-static void SDL_FillSurfaceRect1SSE(Uint8 *pixels, int pitch, Uint32 color, int w, int h)
+static void SDL_TARGETING("sse") SDL_FillSurfaceRect1SSE(Uint8 *pixels, int pitch, Uint32 color, int w, int h)
{
int i, n;
diff --git a/src/video/SDL_stretch.c b/src/video/SDL_stretch.c
index 36eb391a0c03..4b4091470443 100644
--- a/src/video/SDL_stretch.c
+++ b/src/video/SDL_stretch.c
@@ -349,7 +349,7 @@ static int scale_mat(const Uint32 *src, int src_w, int src_h, int src_pitch,
#if defined(HAVE_SSE2_INTRINSICS)
#if 0
-static void printf_128(const char *str, __m128i var)
+static void SDL_TARGETING("sse2") printf_128(const char *str, __m128i var)
{
uint16_t *val = (uint16_t*) &var;
printf(" * %s: %04x %04x %04x %04x _ %04x %04x %04x %04x\n",
@@ -367,7 +367,7 @@ static SDL_INLINE int hasSSE2(void)
return val;
}
-static SDL_INLINE void INTERPOL_BILINEAR_SSE(const Uint32 *s0, const Uint32 *s1, int frac_w, __m128i v_frac_h0, __m128i v_frac_h1, Uint32 *dst, __m128i zero)
+static SDL_INLINE void SDL_TARGETING("sse2") INTERPOL_BILINEAR_SSE(const Uint32 *s0, const Uint32 *s1, int frac_w, __m128i v_frac_h0, __m128i v_frac_h1, Uint32 *dst, __m128i zero)
{
__m128i x_00_01, x_10_11; /* Pixels in 4*uint8 in row */
__m128i v_frac_w0, k0, l0, d0, e0;
@@ -404,7 +404,7 @@ static SDL_INLINE void INTERPOL_BILINEAR_SSE(const Uint32 *s0, const Uint32 *s1,
*dst = _mm_cvtsi128_si32(e0);
}
-static int scale_mat_SSE(const Uint32 *src, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
+static int SDL_TARGETING("sse2") scale_mat_SSE(const Uint32 *src, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
BILINEAR___START
diff --git a/src/video/SDL_yuv.c b/src/video/SDL_yuv.c
index 06a7fa708b06..7c4d6f67bd68 100644
--- a/src/video/SDL_yuv.c
+++ b/src/video/SDL_yuv.c
@@ -303,14 +303,14 @@ static int GetYUVPlanes(int width, int height, Uint32 format, const void *yuv, i
return 0;
}
-static SDL_bool yuv_rgb_sse(
+#if HAVE_SSE2_INTRINSICS
+static SDL_bool SDL_TARGETING("sse2") yuv_rgb_sse(
Uint32 src_format, Uint32 dst_format,
Uint32 width, Uint32 height,
const Uint8 *y, const Uint8 *u, const Uint8 *v, Uint32 y_stride, Uint32 uv_stride,
Uint8 *rgb, Uint32 rgb_stride,
YCbCrType yuv_type)
{
-#if HAVE_SSE2_INTRINSICS
if (!SDL_HasSSE2()) {
return SDL_FALSE;
}
@@ -408,10 +408,21 @@ static SDL_bool yuv_rgb_sse(
break;
}
}
-#endif
return SDL_FALSE;
}
+#else
+static SDL_bool yuv_rgb_sse(
+ Uint32 src_format, Uint32 dst_format,
+ Uint32 width, Uint32 height,
+ const Uint8 *y, const Uint8 *u, const Uint8 *v, Uint32 y_stride, Uint32 uv_stride,
+ Uint8 *rgb, Uint32 rgb_stride,
+ YCbCrType yuv_type)
+{
+ return SDL_FALSE;
+}
+#endif
+#if HAVE_LSX_INTRINSICS
static SDL_bool yuv_rgb_lsx(
Uint32 src_format, Uint32 dst_format,
Uint32 width, Uint32 height,
@@ -419,7 +430,6 @@ static SDL_bool yuv_rgb_lsx(
Uint8 *rgb, Uint32 rgb_stride,
YCbCrType yuv_type)
{
-#if HAVE_LSX_INTRINSICS
if (!SDL_HasLSX()) {
return SDL_FALSE;
}
@@ -450,9 +460,19 @@ static SDL_bool yuv_rgb_lsx(
break;
}
}
-#endif
return SDL_FALSE;
}
+#else
+static SDL_bool yuv_rgb_lsx(
+ Uint32 src_format, Uint32 dst_format,
+ Uint32 width, Uint32 height,
+ const Uint8 *y, const Uint8 *u, const Uint8 *v, Uint32 y_stride, Uint32 uv_stride,
+ Uint8 *rgb, Uint32 rgb_stride,
+ YCbCrType yuv_type)
+{
+ return SDL_FALSE;
+}
+#endif
static SDL_bool yuv_rgb_std(
Uint32 src_format, Uint32 dst_format,
@@ -1102,7 +1122,8 @@ static int SDL_ConvertPixels_SwapUVPlanes(int width, int height, const void *src
return 0;
}
-static int SDL_ConvertPixels_PackUVPlanes_to_NV(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch, SDL_bool reverseUV)
+#if HAVE_SSE2_INTRINSICS
+static int SDL_TARGETING("sse2") SDL_ConvertPixels_PackUVPlanes_to_NV_SSE2(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch, SDL_bool reverseUV)
{
int x, y;
const int UVwidth = (width + 1) / 2;
@@ -1114,9 +1135,6 @@ static int SDL_ConvertPixels_PackUVPlanes_to_NV(int width, int height, const voi
const Uint8 *src1, *src2;
Uint8 *dstUV;
Uint8 *tmp = NULL;
-#if HAVE_SSE2_INTRINSICS
- const SDL_bool use_SSE2 = SDL_HasSSE2();
-#endif
/* Skip the Y plane */
src = (const Uint8 *)src + height * src_pitch;
@@ -1144,22 +1162,80 @@ static int SDL_ConvertPixels_PackUVPlanes_to_NV(int width, int height, const voi
y = UVheight;
while (y--) {
x = UVwidth;
-#if HAVE_SSE2_INTRINSICS
- if (use_SSE2) {
- while (x >= 16) {
- __m128i u = _mm_loadu_si128((__m128i *)src1);
- __m128i v = _mm_loadu_si128((__m128i *)src2);
- __m128i uv1 = _mm_unpacklo_epi8(u, v);
- __m128i uv2 = _mm_unpackhi_epi8(u, v);
- _mm_storeu_si128((__m128i *)dstUV, uv1);
- _mm_storeu_si128((__m128i *)(dstUV + 16), uv2);
- src1 += 16;
- src2 += 16;
- dstUV += 32;
- x -= 16;
- }
+ while (x >= 16) {
+ __m128i u = _mm_loadu_si128((__m128i *)src1);
+ __m128i v = _mm_loadu_si128((__m128i *)src2);
+ __m128i uv1 = _mm_unpacklo_epi8(u, v);
+ __m128i uv2 = _mm_unpackhi_epi8(u, v);
+ _mm_storeu_si128((__m128i *)dstUV, uv1);
+ _mm_storeu_si128((__m128i *)(dstUV + 16), uv2);
+ src1 += 16;
+ src2 += 16;
+ dstUV += 32;
+ x -= 16;
+ }
+ while (x--) {
+ *dstUV++ = *src1++;
+ *dstUV++ = *src2++;
}
+ src1 += srcUVPitchLeft;
+ src2 += srcUVPitchLeft;
+ dstUV += dstUVPitchLeft;
+ }
+
+ if (tmp) {
+ SDL_free(tmp);
+ }
+ return 0;
+}
#endif
+
+/* Use the SSE2 routine when it was compiled in and the CPU has SSE2; otherwise run the scalar loop. */
+static int SDL_ConvertPixels_PackUVPlanes_to_NV(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch, SDL_bool reverseUV)
+{
+#if HAVE_SSE2_INTRINSICS
+ if (SDL_HasSSE2()) {
+ return SDL_ConvertPixels_PackUVPlanes_to_NV_SSE2(width, height, src, src_pitch, dst, dst_pitch, reverseUV);
+ } else
+#endif
+ {
+ int x, y;
+ const int UVwidth = (width + 1) / 2;
+ const int UVheight = (height + 1) / 2;
+ const int srcUVPitch = ((src_pitch + 1) / 2);
+ const int srcUVPitchLeft = srcUVPitch - UVwidth;
+ const int dstUVPitch = ((dst_pitch + 1) / 2) * 2;
+ const int dstUVPitchLeft = dstUVPitch - UVwidth * 2;
+ const Uint8 *src1, *src2;
+ Uint8 *dstUV;
+ Uint8 *tmp = NULL;
+
+ /* Skip the Y plane */
+ src = (const Uint8 *)src + height * src_pitch;
+ dst = (Uint8 *)dst + height * dst_pitch;
+
+ if (src == dst) {
+ /* Need to make a copy of the buffer so we don't clobber it while converting */
+ tmp = (Uint8 *)SDL_malloc((size_t)2 * UVheight * srcUVPitch);
+ if (tmp == NULL) {
+ return SDL_OutOfMemory();
+ }
+ SDL_memcpy(tmp, src, (size_t)2 * UVheight * srcUVPitch);
+ src = tmp;
+ }
+
+ if (reverseUV) {
+ src2 = (const Uint8 *)src;
+ src1 = src2 + UVheight * srcUVPitch;
+ } else {
+ src1 = (const Uint8 *)src;
+ src2 = src1 + UVheight * srcUVPitch;
+ }
+ dstUV = (Uint8 *)dst;
+
+ y = UVheight;
+ while (y--) {
+ x = UVwidth;
while (x--) {
*dstUV++ = *src1++;
*dstUV++ = *src2++;
@@ -1173,9 +1245,11 @@ static int SDL_ConvertPixels_PackUVPlanes_to_NV(int width, int height, const voi
SDL_free(tmp);
}
return 0;
+ }
}
-static int SDL_ConvertPixels_SplitNV_to_UVPlanes(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch, SDL_bool reverseUV)
+#if HAVE_SSE2_INTRINSICS
+static int SDL_TARGETING("sse2") SDL_ConvertPixels_SplitNV_to_UVPlanes_SSE2(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch, SDL_bool reverseUV)
{
int x, y;
const int UVwidth = (width + 1) / 2;
@@ -1187,9 +1261,6 @@ static int SDL_ConvertPixels_SplitNV_to_UVPlanes(int width, int height, const vo
const Uint8 *srcUV;
Uint8 *dst1, *dst2;
Uint8 *tmp = NULL;
-#if HAVE_SSE2_INTRINSICS
- const SDL_bool use_SSE2 = SDL_HasSSE2();
-#endif
/* Skip the Y plane */
src = (const Uint8 *)src + height * src_pitch;
@@ -1216,28 +1287,86 @@ static int SDL_ConvertPixels_SplitNV_to_UVPlanes(int width, int height, const vo
y = UVheight;
while (y--) {
+ __m128i mask = _mm_set1_epi16(0x00FF);
x = UVwidth;
-#if HAVE_SSE2_INTRINSICS
- if (use_SSE2) {
- __m128i mask = _mm_set1_epi16(0x00FF);
- while (x >= 16) {
- __m128i uv1 = _mm_loadu_si128((__m128i *)srcUV);
- __m128i uv2 = _mm_loadu_si128((__m128i *)(srcUV + 16));
- __m128i u1 = _mm_and_si128(uv1, mask);
- __m128i u2 = _mm_and_si128(uv2, mask);
- __m128i u = _mm_packus_epi16(u1, u2);
- __m128i v1 = _mm_srli_epi16(uv1, 8);
- __m128i v2 = _mm_srli_epi16(uv2, 8);
- __m128i v = _mm_packus_epi16(v1, v2);
- _mm_storeu_si128((__m128i *)dst1, u);
- _mm_storeu_si128((__m128i *)dst2, v);
- srcUV += 32;
- dst1 += 16;
- dst2 += 16;
- x -= 16;
- }
+ while (x >= 16) {
+ __m128i uv1 = _mm_loadu_si128((__m128i *)srcUV);
+ __m128i uv2 = _mm_loadu_si128((__m128i *)(srcUV + 16));
+ __m128i u1 = _mm_and_si128(uv1, mask);
+ __m128i u2 = _mm_and_si128(uv2, mask);
+ __m128i u = _mm_packus_epi16(u1, u2);
+ __m128i v1 = _mm_srli_epi16(uv1, 8);
+ __m128i v2 = _mm_srli_epi16(uv2, 8);
+ __m128i v = _mm_packus_epi16(v1, v2);
+ _mm_storeu_si128((__m128i *)dst1, u);
+ _mm_storeu_si128((__m128i *)dst2, v);
+ srcUV += 32;
+ dst1 += 16;
+ dst2 += 16;
+ x -= 16;
+ }
+ while (x--) {
+ *dst1++ = *srcUV++;
+ *dst2++ = *srcUV++;
}
+ srcUV += srcUVPitchLeft;
+ dst1 += dstUVPitchLeft;
+ dst2 += dstUVPitchLeft;
+ }
+
+ if (tmp) {
+ SDL_free(tmp);
+ }
+ return 0;
+}
#endif
+
+/* Use the SSE2 routine when it was compiled in and the CPU has SSE2; otherwise run the scalar loop. */
+static int SDL_ConvertPixels_SplitNV_to_UVPlanes(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch, SDL_bool reverseUV)
+{
+#if HAVE_SSE2_INTRINSICS
+ if (SDL_HasSSE2()) {
+ return SDL_ConvertPixels_SplitNV_to_UVPlanes_SSE2(width, height, src, src_pitch, dst, dst_pitch, reverseUV);
+ } else
+#endif
+ {
+ int x, y;
+ const int UVwidth = (width + 1) / 2;
+ const int UVheight = (height + 1) / 2;
+ const int srcUVPitch = ((src_pitch + 1) / 2) * 2;
+ const int srcUVPitchLeft = srcUVPitch - UVwidth * 2;
+ const int dstUVPitch = ((dst_pitch + 1) / 2);
+ const int dstUVPitchLeft = dstUVPitch - UVwidth;
+ const Uint8 *srcUV;
+ Uint8 *dst1, *dst2;
+ Uint8 *tmp = NULL;
+
+ /* Skip the Y plane */
+ src = (const Uint8 *)src + height * src_pitch;
+ dst = (Uint8 *)dst + height * dst_pitch;
+
+ if (src == dst) {
+ /* Need to make a copy of the buffer so we don't clobber it while converting */
+ tmp = (Uint8 *)SDL_malloc((size_t)UVheight * srcUVPitch);
+ if (tmp == NULL) {
+ return SDL_OutOfMemory();
+ }
+ SDL_memcpy(tmp, src, (size_t)UVheight * srcUVPitch);
+ src = tmp;
+ }
+
+ if (reverseUV) {
+ dst2 = (Uint8 *)dst;
+ dst1 = dst2 + UVheight * dstUVPitch;
+ } else {
+ dst1 = (Uint8 *)dst;
+ dst2 = dst1 + UVheight * dstUVPitch;
+ }
+ srcUV = (const Uint8 *)src;
+
+ y = UVheight;
+ while (y--) {
+ x = UVwidth;
while (x--) {
*dst1++ = *srcUV++;
*dst2++ = *srcUV++;
@@ -1251,9 +1376,11 @@ static int SDL_ConvertPixels_SplitNV_to_UVPlanes(int width, int height, const vo
SDL_free(tmp);
}
return 0;
+ }
}
-static int SDL_ConvertPixels_SwapNV(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch)
+#if HAVE_SSE2_INTRINSICS
+static int SDL_TARGETING("sse2") SDL_ConvertPixels_SwapNV_SSE2(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch)
{
int x, y;
const int UVwidth = (width + 1) / 2;
@@ -1264,9 +1391,6 @@ static int SDL_ConvertPixels_SwapNV(int width, int height, const void *src, int
const int dstUVPitchLeft = (dstUVPitch - UVwidth * 2) / sizeof(Uint16);
const Uint16 *srcUV;
Uint16 *dstUV;
-#if HAVE_SSE2_INTRINSICS
- const SDL_bool use_SSE2 = SDL_HasSSE2();
-#endif
/* Skip the Y plane */
src = (const Uint8 *)src + height * src_pitch;
@@ -1277,20 +1401,54 @@ static int SDL_ConvertPixels_SwapNV(int width, int height, const void *src, int
y = UVheight;
while (y--) {
x = UVwidth;
-#if HAVE_SSE2_INTRINSICS
- if (use_SSE2) {
- while (x >= 8) {
- __m128i uv = _mm_loadu_si128((__m128i *)srcUV);
- __m128i v = _mm_slli_epi16(uv, 8);
- __m128i u = _mm_srli_epi16(uv, 8);
- __m128i vu = _mm_or_si128(v, u);
- _mm_storeu_si128((__m128i *)dstUV, vu);
- srcUV += 8;
- dstUV += 8;
- x -= 8;
- }
+ while (x >= 8) {
+ __m128i uv = _mm_loadu_si128((__m128i *)srcUV);
+ __m128i v = _mm_slli_epi16(uv, 8);
+ __m128i u = _mm_srli_epi16(uv, 8);
+ __m128i vu = _mm_or_si128(v, u);
+ _mm_storeu_si128((__m128i *)dstUV, vu);
+ srcUV += 8;
+ dstUV += 8;
+ x -= 8;
}
+ while (x--) {
+ *dstUV++ = SDL_Swap16(*srcUV++);
+ }
+ srcUV += srcUVPitchLeft;
+ dstUV += dstUVPitchLeft;
+ }
+ return 0;
+}
#endif
+
+/* Use the SSE2 routine when it was compiled in and the CPU has SSE2; otherwise run the scalar loop. */
+static int SDL_ConvertPixels_SwapNV(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch)
+{
+#if HAVE_SSE2_INTRINSICS
+ if (SDL_HasSSE2()) {
+ return SDL_ConvertPixels_SwapNV_SSE2(width, height, src, src_pitch, dst, dst_pitch);
+ } else
+#endif
+ {
+ int x, y;
+ const int UVwidth = (width + 1) / 2;
+ const int UVheight = (height + 1) / 2;
+ const int srcUVPitch = ((src_pitch + 1) / 2) * 2;
+ const int srcUVPitchLeft = (srcUVPitch - UVwidth * 2) / sizeof(Uint16);
+ const int dstUVPitch = ((dst_pitch + 1) / 2) * 2;
+ const int dstUVPitchLeft = (dstUVPitch - UVwidth * 2) / sizeof(Uint16);
+ const Uint16 *srcUV;
+ Uint16 *dstUV;
+
+ /* Skip the Y plane */
+ src = (const Uint8 *)src + height * src_pitch;
+ dst = (Uint8 *)dst + height * dst_pitch;
+
+ srcUV = (const Uint16 *)src;
+ dstUV = (Uint16 *)dst;
+ y = UVheight;
+ while (y--) {
+ x = UVwidth;
while (x--) {
*dstUV++ = SDL_Swap16(*srcUV++);
}
@@ -1298,6 +1452,7 @@ static int SDL_ConvertPixels_SwapNV(int width, int height, const void *src, int
dstUV += dstUVPitchLeft;
}
return 0;
+ }
}
static int SDL_ConvertPixels_Planar2x2_to_Planar2x2(int width, int height,
@@ -1389,9 +1544,42 @@ static int SDL_ConvertPixels_Planar2x2_to_Planar2x2(int width, int height,
x -= 4; \
}
-#endif
+static int SDL_TARGETING("sse2") SDL_ConvertPixels_YUY2_to_UYVY_SSE2(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch)
+{
+ int x, y;
+ const int YUVwidth = (width + 1) / 2;
+ const int srcYUVPitchLeft = (src_pitch - YUVwidth * 4);
+ const int dstYUVPitchLeft = (dst_pitch - YUVwidth * 4);
+ const Uint8 *srcYUV = (const Uint8 *)src;
+ Uint8 *dstYUV = (Uint8 *)dst;
-static int SDL_ConvertPixels_YUY2_to_UYVY(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch)
+ y = height;
+ x = YUVwidth;
+ while (y--) {
+ PACKED4_TO_PACKED4_ROW_SSE2(_MM_SHUFFLE(2, 3, 0, 1));
+ while (x--) {
+ Uint8 Y1, U, Y2, V;
+
+ Y1 = srcYUV[0];
+ U = srcYUV[1];
+ Y2 = srcYUV[2];
+ V = srcYUV[3];
+ srcYUV += 4;
+
+ dstYUV[0] = U;
+ dstYUV[1] = Y1;
+ dstYUV[2] = V;
+ dstYUV[3] = Y2;
+ dstYUV += 4;
+ }
+ srcYUV += srcYUVPitchLeft;
+ dstYUV += dstYUVPitchLeft;
+ x = YUVwidth;
+ }
+ return 0;
+}
+
+static int SDL_TARGETING("sse2") SDL_ConvertPixels_YUY2_to_YVYU_SSE2(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch)
{
int x, y;
const int YUVwidth = (width + 1) / 2;
@@ -1399,18 +1587,189 @@ static int SDL_ConvertPixels_YUY2_to_UYVY(int width, int height, const void *src
const int dstYUVPitchLeft = (dst_pitch - YUVwidth * 4);
const Uint8 *srcYUV = (const Uint8 *)src;
Uint8 *dstYUV = (Uint8 *)dst;
-#if HAVE_SSE2_INTRINSICS
(Patch may be truncated, please check the link at the top of this post.)