SDL: use target attributes of gcc / clang for SIMD code.

From 69de6964e58d287919048e9ac0da41c206fd80bd Mon Sep 17 00:00:00 2001
From: Ozkan Sezer <[EMAIL REDACTED]>
Date: Fri, 17 Mar 2023 20:57:40 +0300
Subject: [PATCH] use target attributes of gcc / clang for SIMD code.

---
 CMakeLists.txt                       |   5 -
 include/SDL3/SDL_intrin.h            |  24 +-
 src/SDL_internal.h                   |  23 +-
 src/audio/SDL_audiocvt.c             |   4 +-
 src/audio/SDL_audiotypecvt.c         |  16 +-
 src/video/SDL_blit_A.c               |  10 +-
 src/video/SDL_blit_copy.c            |  21 +-
 src/video/SDL_fillrect.c             |   4 +-
 src/video/SDL_stretch.c              |   6 +-
 src/video/SDL_yuv.c                  | 560 +++++++++++++++++++++------
 src/video/yuv2rgb/yuv_rgb.c          |   4 +-
 src/video/yuv2rgb/yuv_rgb_sse_func.h |   2 +-
 12 files changed, 510 insertions(+), 169 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ccc0ee23ece7..13af6c4d7a8b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -735,7 +735,6 @@ if(SDL_ASSEMBLY)
       cmake_pop_check_state()
       if(CPU_SUPPORTS_AVX)
         set(HAVE_AVX TRUE)
-        target_compile_options(sdl-build-options INTERFACE "-mavx")
       endif()
     endif()
 
@@ -760,7 +759,6 @@ if(SDL_ASSEMBLY)
       cmake_pop_check_state()
       if(CPU_SUPPORTS_MMX)
         set(HAVE_MMX TRUE)
-        target_compile_options(sdl-build-options INTERFACE "-mmmx")
       endif()
     endif()
 
@@ -785,7 +783,6 @@ if(SDL_ASSEMBLY)
       cmake_pop_check_state()
       if(CPU_SUPPORTS_SSE)
         set(HAVE_SSE ON)
-        target_compile_options(sdl-build-options INTERFACE "-msse")
       endif()
     endif()
 
@@ -810,7 +807,6 @@ if(SDL_ASSEMBLY)
       cmake_pop_check_state()
       if(CPU_SUPPORTS_SSE2)
         set(HAVE_SSE2 TRUE)
-        target_compile_options(sdl-build-options INTERFACE "-msse2")
       endif()
     endif()
 
@@ -835,7 +831,6 @@ if(SDL_ASSEMBLY)
       cmake_pop_check_state()
       if(CPU_SUPPORTS_SSE3)
         set(HAVE_SSE3 TRUE)
-        target_compile_options(sdl-build-options INTERFACE "-msse3")
       endif()
     endif()
 
diff --git a/include/SDL3/SDL_intrin.h b/include/SDL3/SDL_intrin.h
index 6fedc18b2ab5..adec1346dc7a 100644
--- a/include/SDL3/SDL_intrin.h
+++ b/include/SDL3/SDL_intrin.h
@@ -93,25 +93,39 @@ _m_prefetch(void *__P)
 #endif
 #endif /* compiler version */
 
+#if defined(__clang__) && defined(__has_attribute) /* clang: probe for the attribute itself */
+# if __has_attribute(target)
+# define SDL_HAS_TARGET_ATTRIBS
+# endif
+#elif defined(__GNUC__) && (__GNUC__ + (__GNUC_MINOR__ >= 9) > 4) /* gcc >= 4.9 */
+# define SDL_HAS_TARGET_ATTRIBS
+#endif
+
+#ifdef SDL_HAS_TARGET_ATTRIBS
+# define SDL_TARGETING(x) __attribute__((target(x))) /* per-function target, e.g. SDL_TARGETING("sse2") */
+#else
+# define SDL_TARGETING(x) /* expands to nothing when SDL_HAS_TARGET_ATTRIBS is not defined */
+#endif
+
 #if defined(__loongarch_sx) && !defined(SDL_DISABLE_LSX)
 #include <lsxintrin.h>
 #endif
 #if defined(__loongarch_asx) && !defined(SDL_DISABLE_LASX)
 #include <lasxintrin.h>
 #endif
-#if defined(__AVX__) && !defined(SDL_DISABLE_AVX)
+#if (defined(__AVX__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_AVX)
 #include <immintrin.h>
 #endif
-#if defined(__MMX__) && !defined(SDL_DISABLE_MMX)
+#if (defined(__MMX__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_MMX)
 #include <mmintrin.h>
 #endif
-#if defined(__SSE__) && !defined(SDL_DISABLE_SSE)
+#if (defined(__SSE__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_SSE)
 #include <xmmintrin.h>
 #endif
-#if defined(__SSE2__) && !defined(SDL_DISABLE_SSE2)
+#if (defined(__SSE2__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_SSE2)
 #include <emmintrin.h>
 #endif
-#if defined(__SSE3__) && !defined(SDL_DISABLE_SSE3)
+#if (defined(__SSE3__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_SSE3)
 #include <pmmintrin.h>
 #endif
 
diff --git a/src/SDL_internal.h b/src/SDL_internal.h
index 89c6f224e036..fd65cf8643be 100644
--- a/src/SDL_internal.h
+++ b/src/SDL_internal.h
@@ -194,23 +194,23 @@
 #define HAVE_NEON_INTRINSICS 1
 #endif
 
-#if defined(__MMX__) && !defined(SDL_DISABLE_MMX)
+#if (defined(__MMX__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_MMX)
 #define HAVE_MMX_INTRINSICS 1
 #endif
 
-#if defined(__SSE__) && !defined(SDL_DISABLE_SSE)
+#if (defined(__SSE__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_SSE)
 #define HAVE_SSE_INTRINSICS 1
 #endif
 
-#if defined(__SSE2__) && !defined(SDL_DISABLE_SSE2)
+#if (defined(__SSE2__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_SSE2)
 #define HAVE_SSE2_INTRINSICS 1
 #endif
 
-#if defined(__SSE3__) && !defined(SDL_DISABLE_SSE3)
+#if (defined(__SSE3__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_SSE3)
 #define HAVE_SSE3_INTRINSICS 1
 #endif
 
-#if defined(__AVX__) && !defined(SDL_DISABLE_AVX)
+#if (defined(__AVX__) || defined(SDL_HAS_TARGET_ATTRIBS)) && !defined(SDL_DISABLE_AVX)
 #define HAVE_AVX_INTRINSICS 1
 #endif
 
@@ -222,19 +222,6 @@
 #define HAVE_LASX_INTRINSICS 1
 #endif
 
-#if defined __clang__
-#if (!__has_attribute(target))
-#undef HAVE_AVX_INTRINSICS
-#endif
-#if (defined(_MSC_VER) || defined(__SCE__)) && !defined(__AVX__)
-#undef HAVE_AVX_INTRINSICS
-#endif
-#elif defined __GNUC__
-#if (__GNUC__ < 4) || (__GNUC__ == 4 && __GNUC_MINOR__ < 9)
-#undef HAVE_AVX_INTRINSICS
-#endif
-#endif
-
 
 #define SDL_MAIN_NOIMPL /* don't drag in header-only implementation of SDL_main */
 #include <SDL3/SDL_main.h>
diff --git a/src/audio/SDL_audiocvt.c b/src/audio/SDL_audiocvt.c
index c29c80358077..2eb74f8540c3 100644
--- a/src/audio/SDL_audiocvt.c
+++ b/src/audio/SDL_audiocvt.c
@@ -146,7 +146,7 @@ static int SDL_ConvertAudio(SDL_AudioCVT * cvt);
 
 #if HAVE_SSE3_INTRINSICS
 /* Convert from stereo to mono. Average left and right. */
-static void SDLCALL SDL_ConvertStereoToMono_SSE3(SDL_AudioCVT *cvt, SDL_AudioFormat format)
+static void SDLCALL SDL_TARGETING("sse3") SDL_ConvertStereoToMono_SSE3(SDL_AudioCVT *cvt, SDL_AudioFormat format)
 {
     const __m128 divby2 = _mm_set1_ps(0.5f);
     float *dst = (float *)cvt->buf;
@@ -183,7 +183,7 @@ static void SDLCALL SDL_ConvertStereoToMono_SSE3(SDL_AudioCVT *cvt, SDL_AudioFor
 
 #if HAVE_SSE_INTRINSICS
 /* Convert from mono to stereo. Duplicate to stereo left and right. */
-static void SDLCALL SDL_ConvertMonoToStereo_SSE(SDL_AudioCVT *cvt, SDL_AudioFormat format)
+static void SDLCALL SDL_TARGETING("sse") SDL_ConvertMonoToStereo_SSE(SDL_AudioCVT *cvt, SDL_AudioFormat format)
 {
     float *dst = ((float *)(cvt->buf + (cvt->len_cvt * 2))) - 8;
     const float *src = ((const float *)(cvt->buf + cvt->len_cvt)) - 4;
diff --git a/src/audio/SDL_audiotypecvt.c b/src/audio/SDL_audiotypecvt.c
index 86ab19507f29..e94a890dc413 100644
--- a/src/audio/SDL_audiotypecvt.c
+++ b/src/audio/SDL_audiotypecvt.c
@@ -225,7 +225,7 @@ static void SDLCALL SDL_Convert_F32_to_S32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFo
 #endif
 
 #if HAVE_SSE2_INTRINSICS
-static void SDLCALL SDL_Convert_S8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
+static void SDLCALL SDL_TARGETING("sse2") SDL_Convert_S8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
 {
     const Sint8 *src = ((const Sint8 *)(cvt->buf + cvt->len_cvt)) - 1;
     float *dst = ((float *)(cvt->buf + cvt->len_cvt * 4)) - 1;
@@ -289,7 +289,7 @@ static void SDLCALL SDL_Convert_S8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioForma
     }
 }
 
-static void SDLCALL SDL_Convert_U8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
+static void SDLCALL SDL_TARGETING("sse2") SDL_Convert_U8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
 {
     const Uint8 *src = ((const Uint8 *)(cvt->buf + cvt->len_cvt)) - 1;
     float *dst = ((float *)(cvt->buf + cvt->len_cvt * 4)) - 1;
@@ -355,7 +355,7 @@ static void SDLCALL SDL_Convert_U8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioForma
     }
 }
 
-static void SDLCALL SDL_Convert_S16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
+static void SDLCALL SDL_TARGETING("sse2") SDL_Convert_S16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
 {
     const Sint16 *src = ((const Sint16 *)(cvt->buf + cvt->len_cvt)) - 1;
     float *dst = ((float *)(cvt->buf + cvt->len_cvt * 2)) - 1;
@@ -408,7 +408,7 @@ static void SDLCALL SDL_Convert_S16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioForm
     }
 }
 
-static void SDLCALL SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
+static void SDLCALL SDL_TARGETING("sse2") SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
 {
     const Sint32 *src = (const Sint32 *)cvt->buf;
     float *dst = (float *)cvt->buf;
@@ -451,7 +451,7 @@ static void SDLCALL SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioForm
     }
 }
 
-static void SDLCALL SDL_Convert_F32_to_S8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
+static void SDLCALL SDL_TARGETING("sse2") SDL_Convert_F32_to_S8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
 {
     const float *src = (const float *)cvt->buf;
     Sint8 *dst = (Sint8 *)cvt->buf;
@@ -514,7 +514,7 @@ static void SDLCALL SDL_Convert_F32_to_S8_SSE2(SDL_AudioCVT *cvt, SDL_AudioForma
     }
 }
 
-static void SDLCALL SDL_Convert_F32_to_U8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
+static void SDLCALL SDL_TARGETING("sse2") SDL_Convert_F32_to_U8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
 {
     const float *src = (const float *)cvt->buf;
     Uint8 *dst = cvt->buf;
@@ -577,7 +577,7 @@ static void SDLCALL SDL_Convert_F32_to_U8_SSE2(SDL_AudioCVT *cvt, SDL_AudioForma
     }
 }
 
-static void SDLCALL SDL_Convert_F32_to_S16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
+static void SDLCALL SDL_TARGETING("sse2") SDL_Convert_F32_to_S16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
 {
     const float *src = (const float *)cvt->buf;
     Sint16 *dst = (Sint16 *)cvt->buf;
@@ -638,7 +638,7 @@ static void SDLCALL SDL_Convert_F32_to_S16_SSE2(SDL_AudioCVT *cvt, SDL_AudioForm
     }
 }
 
-static void SDLCALL SDL_Convert_F32_to_S32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
+static void SDLCALL SDL_TARGETING("sse2") SDL_Convert_F32_to_S32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
 {
     const float *src = (const float *)cvt->buf;
     Sint32 *dst = (Sint32 *)cvt->buf;
diff --git a/src/video/SDL_blit_A.c b/src/video/SDL_blit_A.c
index b0e20743a056..a9324a3ebf85 100644
--- a/src/video/SDL_blit_A.c
+++ b/src/video/SDL_blit_A.c
@@ -169,7 +169,7 @@ static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
 #if HAVE_MMX_INTRINSICS
 
 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
-static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
+static void SDL_TARGETING("mmx") BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
 {
     int width = info->dst_w;
     int height = info->dst_h;
@@ -223,7 +223,7 @@ static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
 }
 
 /* fast RGB888->(A)RGB888 blending with surface alpha */
-static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
+static void SDL_TARGETING("mmx") BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
 {
     SDL_PixelFormat *df = info->dst_fmt;
     Uint32 chanmask;
@@ -318,7 +318,7 @@ static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
 }
 
 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
-static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
+static void SDL_TARGETING("mmx") BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
 {
     int width = info->dst_w;
     int height = info->dst_h;
@@ -753,7 +753,7 @@ static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
 #if HAVE_MMX_INTRINSICS
 
 /* fast RGB565->RGB565 blending with surface alpha */
-static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
+static void SDL_TARGETING("mmx") Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
 {
     unsigned alpha = info->a;
     if (alpha == 128) {
@@ -889,7 +889,7 @@ static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
 }
 
 /* fast RGB555->RGB555 blending with surface alpha */
-static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
+static void SDL_TARGETING("mmx") Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
 {
     unsigned alpha = info->a;
     if (alpha == 128) {
diff --git a/src/video/SDL_blit_copy.c b/src/video/SDL_blit_copy.c
index 45536b4b2f7b..cc8cd22040c5 100644
--- a/src/video/SDL_blit_copy.c
+++ b/src/video/SDL_blit_copy.c
@@ -25,7 +25,7 @@
 
 #if HAVE_SSE_INTRINSICS
 /* This assumes 16-byte aligned src and dst */
-static SDL_INLINE void SDL_memcpySSE(Uint8 *dst, const Uint8 *src, int len)
+static SDL_INLINE void SDL_TARGETING("sse") SDL_memcpySSE(Uint8 *dst, const Uint8 *src, int len)
 {
     int i;
 
@@ -54,7 +54,7 @@ static SDL_INLINE void SDL_memcpySSE(Uint8 *dst, const Uint8 *src, int len)
 #ifdef _MSC_VER
 #pragma warning(disable : 4799)
 #endif
-static SDL_INLINE void SDL_memcpyMMX(Uint8 *dst, const Uint8 *src, int len)
+static SDL_INLINE void SDL_TARGETING("mmx") SDL_memcpyMMX(Uint8 *dst, const Uint8 *src, int len)
 {
     const int remain = (len & 63);
     int i;
@@ -81,6 +81,16 @@ static SDL_INLINE void SDL_memcpyMMX(Uint8 *dst, const Uint8 *src, int len)
         SDL_memcpy(dst + skip, src + skip, remain);
     }
 }
+
+static SDL_INLINE void SDL_TARGETING("mmx") SDL_BlitCopyMMX(Uint8 *dst, const Uint8 *src, const int dstskip, const int srcskip, const int w, int h)
+{ /* NOTE: parameter order is dst, src, dstskip, srcskip, w, h */
+    while (h--) { /* one SDL_memcpyMMX() call of w bytes per row, h rows in total */
+        SDL_memcpyMMX(dst, src, w);
+        src += srcskip;
+        dst += dstskip;
+    }
+    _mm_empty(); /* issued once, after every row has been copied */
+}
 #endif /* HAVE_MMX_INTRINSICS */
 
 void SDL_BlitCopy(SDL_BlitInfo *info)
@@ -137,12 +147,7 @@ void SDL_BlitCopy(SDL_BlitInfo *info)
 
 #if HAVE_MMX_INTRINSICS
     if (SDL_HasMMX() && !(srcskip & 7) && !(dstskip & 7)) {
-        while (h--) {
-            SDL_memcpyMMX(dst, src, w);
-            src += srcskip;
-            dst += dstskip;
-        }
-        _mm_empty();
+        SDL_BlitCopyMMX(dst, src, dstskip, srcskip, w, h); /* skips first, then w, h: the helper's parameter order */
         return;
     }
 #endif
diff --git a/src/video/SDL_fillrect.c b/src/video/SDL_fillrect.c
index 595cf7c32a71..59c8128e9d01 100644
--- a/src/video/SDL_fillrect.c
+++ b/src/video/SDL_fillrect.c
@@ -55,7 +55,7 @@
 #define SSE_END
 
 #define DEFINE_SSE_FILLRECT(bpp, type) \
-static void SDL_FillSurfaceRect##bpp##SSE(Uint8 *pixels, int pitch, Uint32 color, int w, int h) \
+static void SDL_TARGETING("sse") SDL_FillSurfaceRect##bpp##SSE(Uint8 *pixels, int pitch, Uint32 color, int w, int h) \
 { \
     int i, n; \
     Uint8 *p = NULL; \
@@ -92,7 +92,7 @@ static void SDL_FillSurfaceRect##bpp##SSE(Uint8 *pixels, int pitch, Uint32 color
     SSE_END; \
 }
 
-static void SDL_FillSurfaceRect1SSE(Uint8 *pixels, int pitch, Uint32 color, int w, int h)
+static void SDL_TARGETING("sse") SDL_FillSurfaceRect1SSE(Uint8 *pixels, int pitch, Uint32 color, int w, int h)
 {
     int i, n;
 
diff --git a/src/video/SDL_stretch.c b/src/video/SDL_stretch.c
index 36eb391a0c03..4b4091470443 100644
--- a/src/video/SDL_stretch.c
+++ b/src/video/SDL_stretch.c
@@ -349,7 +349,7 @@ static int scale_mat(const Uint32 *src, int src_w, int src_h, int src_pitch,
 #if defined(HAVE_SSE2_INTRINSICS)
 
 #if 0
-static void printf_128(const char *str, __m128i var)
+static void SDL_TARGETING("sse2") printf_128(const char *str, __m128i var)
 {
     uint16_t *val = (uint16_t*) &var;
     printf(" *   %s: %04x %04x %04x %04x _ %04x %04x %04x %04x\n",
@@ -367,7 +367,7 @@ static SDL_INLINE int hasSSE2(void)
     return val;
 }
 
-static SDL_INLINE void INTERPOL_BILINEAR_SSE(const Uint32 *s0, const Uint32 *s1, int frac_w, __m128i v_frac_h0, __m128i v_frac_h1, Uint32 *dst, __m128i zero)
+static SDL_INLINE void SDL_TARGETING("sse2") INTERPOL_BILINEAR_SSE(const Uint32 *s0, const Uint32 *s1, int frac_w, __m128i v_frac_h0, __m128i v_frac_h1, Uint32 *dst, __m128i zero)
 {
     __m128i x_00_01, x_10_11; /* Pixels in 4*uint8 in row */
     __m128i v_frac_w0, k0, l0, d0, e0;
@@ -404,7 +404,7 @@ static SDL_INLINE void INTERPOL_BILINEAR_SSE(const Uint32 *s0, const Uint32 *s1,
     *dst = _mm_cvtsi128_si32(e0);
 }
 
-static int scale_mat_SSE(const Uint32 *src, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
+static int SDL_TARGETING("sse2") scale_mat_SSE(const Uint32 *src, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
 {
     BILINEAR___START
 
diff --git a/src/video/SDL_yuv.c b/src/video/SDL_yuv.c
index 06a7fa708b06..7c4d6f67bd68 100644
--- a/src/video/SDL_yuv.c
+++ b/src/video/SDL_yuv.c
@@ -303,14 +303,14 @@ static int GetYUVPlanes(int width, int height, Uint32 format, const void *yuv, i
     return 0;
 }
 
-static SDL_bool yuv_rgb_sse(
+#if HAVE_SSE2_INTRINSICS
+static SDL_bool SDL_TARGETING("sse2") yuv_rgb_sse(
     Uint32 src_format, Uint32 dst_format,
     Uint32 width, Uint32 height,
     const Uint8 *y, const Uint8 *u, const Uint8 *v, Uint32 y_stride, Uint32 uv_stride,
     Uint8 *rgb, Uint32 rgb_stride,
     YCbCrType yuv_type)
 {
-#if HAVE_SSE2_INTRINSICS
     if (!SDL_HasSSE2()) {
         return SDL_FALSE;
     }
@@ -408,10 +408,21 @@ static SDL_bool yuv_rgb_sse(
             break;
         }
     }
-#endif
     return SDL_FALSE;
 }
+#else /* !HAVE_SSE2_INTRINSICS */
+static SDL_bool yuv_rgb_sse(
+    Uint32 src_format, Uint32 dst_format,
+    Uint32 width, Uint32 height,
+    const Uint8 *y, const Uint8 *u, const Uint8 *v, Uint32 y_stride, Uint32 uv_stride,
+    Uint8 *rgb, Uint32 rgb_stride,
+    YCbCrType yuv_type)
+{
+    return SDL_FALSE; /* stub: same result the SSE2 build gives when SDL_HasSSE2() fails */
+}
+#endif /* HAVE_SSE2_INTRINSICS */
 
+#if HAVE_LSX_INTRINSICS
 static SDL_bool yuv_rgb_lsx(
     Uint32 src_format, Uint32 dst_format,
     Uint32 width, Uint32 height,
@@ -419,7 +430,6 @@ static SDL_bool yuv_rgb_lsx(
     Uint8 *rgb, Uint32 rgb_stride,
     YCbCrType yuv_type)
 {
-#if HAVE_LSX_INTRINSICS
     if (!SDL_HasLSX()) {
         return SDL_FALSE;
     }
@@ -450,9 +460,19 @@ static SDL_bool yuv_rgb_lsx(
             break;
         }
     }
-#endif
     return SDL_FALSE;
 }
+#else /* !HAVE_LSX_INTRINSICS */
+static SDL_bool yuv_rgb_lsx(
+    Uint32 src_format, Uint32 dst_format,
+    Uint32 width, Uint32 height,
+    const Uint8 *y, const Uint8 *u, const Uint8 *v, Uint32 y_stride, Uint32 uv_stride,
+    Uint8 *rgb, Uint32 rgb_stride,
+    YCbCrType yuv_type)
+{
+    return SDL_FALSE; /* stub: same result the LSX build gives when SDL_HasLSX() fails */
+}
+#endif /* HAVE_LSX_INTRINSICS */
 
 static SDL_bool yuv_rgb_std(
     Uint32 src_format, Uint32 dst_format,
@@ -1102,7 +1122,8 @@ static int SDL_ConvertPixels_SwapUVPlanes(int width, int height, const void *src
     return 0;
 }
 
-static int SDL_ConvertPixels_PackUVPlanes_to_NV(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch, SDL_bool reverseUV)
+#if HAVE_SSE2_INTRINSICS
+static int SDL_TARGETING("sse2") SDL_ConvertPixels_PackUVPlanes_to_NV_SSE2(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch, SDL_bool reverseUV)
 {
     int x, y;
     const int UVwidth = (width + 1) / 2;
@@ -1114,9 +1135,6 @@ static int SDL_ConvertPixels_PackUVPlanes_to_NV(int width, int height, const voi
     const Uint8 *src1, *src2;
     Uint8 *dstUV;
     Uint8 *tmp = NULL;
-#if HAVE_SSE2_INTRINSICS
-    const SDL_bool use_SSE2 = SDL_HasSSE2();
-#endif
 
     /* Skip the Y plane */
     src = (const Uint8 *)src + height * src_pitch;
@@ -1144,22 +1162,79 @@ static int SDL_ConvertPixels_PackUVPlanes_to_NV(int width, int height, const voi
     y = UVheight;
     while (y--) {
         x = UVwidth;
-#if HAVE_SSE2_INTRINSICS
-        if (use_SSE2) {
-            while (x >= 16) {
-                __m128i u = _mm_loadu_si128((__m128i *)src1);
-                __m128i v = _mm_loadu_si128((__m128i *)src2);
-                __m128i uv1 = _mm_unpacklo_epi8(u, v);
-                __m128i uv2 = _mm_unpackhi_epi8(u, v);
-                _mm_storeu_si128((__m128i *)dstUV, uv1);
-                _mm_storeu_si128((__m128i *)(dstUV + 16), uv2);
-                src1 += 16;
-                src2 += 16;
-                dstUV += 32;
-                x -= 16;
-            }
+        while (x >= 16) {
+            __m128i u = _mm_loadu_si128((__m128i *)src1);
+            __m128i v = _mm_loadu_si128((__m128i *)src2);
+            __m128i uv1 = _mm_unpacklo_epi8(u, v);
+            __m128i uv2 = _mm_unpackhi_epi8(u, v);
+            _mm_storeu_si128((__m128i *)dstUV, uv1);
+            _mm_storeu_si128((__m128i *)(dstUV + 16), uv2);
+            src1 += 16;
+            src2 += 16;
+            dstUV += 32;
+            x -= 16;
+        }
+        while (x--) {
+            *dstUV++ = *src1++;
+            *dstUV++ = *src2++;
         }
+        src1 += srcUVPitchLeft;
+        src2 += srcUVPitchLeft;
+        dstUV += dstUVPitchLeft;
+    }
+
+    if (tmp) {
+        SDL_free(tmp);
+    }
+    return 0;
+}
 #endif
+
+static int SDL_ConvertPixels_PackUVPlanes_to_NV(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch, SDL_bool reverseUV)
+{
+#if HAVE_SSE2_INTRINSICS /* the _SSE2 variant is only compiled under this same guard */
+  if (SDL_HasSSE2()) {
+    return SDL_ConvertPixels_PackUVPlanes_to_NV_SSE2(width, height, src, src_pitch, dst, dst_pitch, reverseUV);
+  }
+#endif
+  {
+    int x, y;
+    const int UVwidth = (width + 1) / 2;
+    const int UVheight = (height + 1) / 2;
+    const int srcUVPitch = ((src_pitch + 1) / 2);
+    const int srcUVPitchLeft = srcUVPitch - UVwidth;
+    const int dstUVPitch = ((dst_pitch + 1) / 2) * 2;
+    const int dstUVPitchLeft = dstUVPitch - UVwidth * 2;
+    const Uint8 *src1, *src2;
+    Uint8 *dstUV;
+    Uint8 *tmp = NULL;
+
+    /* Skip the Y plane */
+    src = (const Uint8 *)src + height * src_pitch;
+    dst = (Uint8 *)dst + height * dst_pitch;
+
+    if (src == dst) {
+        /* Need to make a copy of the buffer so we don't clobber it while converting */
+        tmp = (Uint8 *)SDL_malloc((size_t)2 * UVheight * srcUVPitch);
+        if (tmp == NULL) {
+            return SDL_OutOfMemory();
+        }
+        SDL_memcpy(tmp, src, (size_t)2 * UVheight * srcUVPitch);
+        src = tmp;
+    }
+
+    if (reverseUV) {
+        src2 = (const Uint8 *)src;
+        src1 = src2 + UVheight * srcUVPitch;
+    } else {
+        src1 = (const Uint8 *)src;
+        src2 = src1 + UVheight * srcUVPitch;
+    }
+    dstUV = (Uint8 *)dst;
+
+    y = UVheight;
+    while (y--) {
+        x = UVwidth;
         while (x--) {
             *dstUV++ = *src1++;
             *dstUV++ = *src2++;
@@ -1173,9 +1245,11 @@ static int SDL_ConvertPixels_PackUVPlanes_to_NV(int width, int height, const voi
         SDL_free(tmp);
     }
     return 0;
+  }
 }
 
-static int SDL_ConvertPixels_SplitNV_to_UVPlanes(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch, SDL_bool reverseUV)
+#if HAVE_SSE2_INTRINSICS
+static int SDL_TARGETING("sse2") SDL_ConvertPixels_SplitNV_to_UVPlanes_SSE2(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch, SDL_bool reverseUV)
 {
     int x, y;
     const int UVwidth = (width + 1) / 2;
@@ -1187,9 +1261,6 @@ static int SDL_ConvertPixels_SplitNV_to_UVPlanes(int width, int height, const vo
     const Uint8 *srcUV;
     Uint8 *dst1, *dst2;
     Uint8 *tmp = NULL;
-#if HAVE_SSE2_INTRINSICS
-    const SDL_bool use_SSE2 = SDL_HasSSE2();
-#endif
 
     /* Skip the Y plane */
     src = (const Uint8 *)src + height * src_pitch;
@@ -1216,28 +1287,85 @@ static int SDL_ConvertPixels_SplitNV_to_UVPlanes(int width, int height, const vo
 
     y = UVheight;
     while (y--) {
+        __m128i mask = _mm_set1_epi16(0x00FF);
         x = UVwidth;
-#if HAVE_SSE2_INTRINSICS
-        if (use_SSE2) {
-            __m128i mask = _mm_set1_epi16(0x00FF);
-            while (x >= 16) {
-                __m128i uv1 = _mm_loadu_si128((__m128i *)srcUV);
-                __m128i uv2 = _mm_loadu_si128((__m128i *)(srcUV + 16));
-                __m128i u1 = _mm_and_si128(uv1, mask);
-                __m128i u2 = _mm_and_si128(uv2, mask);
-                __m128i u = _mm_packus_epi16(u1, u2);
-                __m128i v1 = _mm_srli_epi16(uv1, 8);
-                __m128i v2 = _mm_srli_epi16(uv2, 8);
-                __m128i v = _mm_packus_epi16(v1, v2);
-                _mm_storeu_si128((__m128i *)dst1, u);
-                _mm_storeu_si128((__m128i *)dst2, v);
-                srcUV += 32;
-                dst1 += 16;
-                dst2 += 16;
-                x -= 16;
-            }
+        while (x >= 16) {
+            __m128i uv1 = _mm_loadu_si128((__m128i *)srcUV);
+            __m128i uv2 = _mm_loadu_si128((__m128i *)(srcUV + 16));
+            __m128i u1 = _mm_and_si128(uv1, mask);
+            __m128i u2 = _mm_and_si128(uv2, mask);
+            __m128i u = _mm_packus_epi16(u1, u2);
+            __m128i v1 = _mm_srli_epi16(uv1, 8);
+            __m128i v2 = _mm_srli_epi16(uv2, 8);
+            __m128i v = _mm_packus_epi16(v1, v2);
+            _mm_storeu_si128((__m128i *)dst1, u);
+            _mm_storeu_si128((__m128i *)dst2, v);
+            srcUV += 32;
+            dst1 += 16;
+            dst2 += 16;
+            x -= 16;
+        }
+        while (x--) {
+            *dst1++ = *srcUV++;
+            *dst2++ = *srcUV++;
         }
+        srcUV += srcUVPitchLeft;
+        dst1 += dstUVPitchLeft;
+        dst2 += dstUVPitchLeft;
+    }
+
+    if (tmp) {
+        SDL_free(tmp);
+    }
+    return 0;
+}
 #endif
+
+static int SDL_ConvertPixels_SplitNV_to_UVPlanes(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch, SDL_bool reverseUV)
+{
+#if HAVE_SSE2_INTRINSICS /* the _SSE2 variant is only compiled under this same guard */
+  if (SDL_HasSSE2()) {
+    return SDL_ConvertPixels_SplitNV_to_UVPlanes_SSE2(width, height, src, src_pitch, dst, dst_pitch, reverseUV);
+  }
+#endif
+  {
+    int x, y;
+    const int UVwidth = (width + 1) / 2;
+    const int UVheight = (height + 1) / 2;
+    const int srcUVPitch = ((src_pitch + 1) / 2) * 2;
+    const int srcUVPitchLeft = srcUVPitch - UVwidth * 2;
+    const int dstUVPitch = ((dst_pitch + 1) / 2);
+    const int dstUVPitchLeft = dstUVPitch - UVwidth;
+    const Uint8 *srcUV;
+    Uint8 *dst1, *dst2;
+    Uint8 *tmp = NULL;
+
+    /* Skip the Y plane */
+    src = (const Uint8 *)src + height * src_pitch;
+    dst = (Uint8 *)dst + height * dst_pitch;
+
+    if (src == dst) {
+        /* Need to make a copy of the buffer so we don't clobber it while converting */
+        tmp = (Uint8 *)SDL_malloc((size_t)UVheight * srcUVPitch);
+        if (tmp == NULL) {
+            return SDL_OutOfMemory();
+        }
+        SDL_memcpy(tmp, src, (size_t)UVheight * srcUVPitch);
+        src = tmp;
+    }
+
+    if (reverseUV) {
+        dst2 = (Uint8 *)dst;
+        dst1 = dst2 + UVheight * dstUVPitch;
+    } else {
+        dst1 = (Uint8 *)dst;
+        dst2 = dst1 + UVheight * dstUVPitch;
+    }
+    srcUV = (const Uint8 *)src;
+
+    y = UVheight;
+    while (y--) {
+        x = UVwidth;
         while (x--) {
             *dst1++ = *srcUV++;
             *dst2++ = *srcUV++;
@@ -1251,9 +1376,11 @@ static int SDL_ConvertPixels_SplitNV_to_UVPlanes(int width, int height, const vo
         SDL_free(tmp);
     }
     return 0;
+  }
 }
 
-static int SDL_ConvertPixels_SwapNV(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch)
+#if HAVE_SSE2_INTRINSICS
+static int SDL_TARGETING("sse2") SDL_ConvertPixels_SwapNV_SSE2(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch)
 {
     int x, y;
     const int UVwidth = (width + 1) / 2;
@@ -1264,9 +1391,6 @@ static int SDL_ConvertPixels_SwapNV(int width, int height, const void *src, int
     const int dstUVPitchLeft = (dstUVPitch - UVwidth * 2) / sizeof(Uint16);
     const Uint16 *srcUV;
     Uint16 *dstUV;
-#if HAVE_SSE2_INTRINSICS
-    const SDL_bool use_SSE2 = SDL_HasSSE2();
-#endif
 
     /* Skip the Y plane */
     src = (const Uint8 *)src + height * src_pitch;
@@ -1277,20 +1401,53 @@ static int SDL_ConvertPixels_SwapNV(int width, int height, const void *src, int
     y = UVheight;
     while (y--) {
         x = UVwidth;
-#if HAVE_SSE2_INTRINSICS
-        if (use_SSE2) {
-            while (x >= 8) {
-                __m128i uv = _mm_loadu_si128((__m128i *)srcUV);
-                __m128i v = _mm_slli_epi16(uv, 8);
-                __m128i u = _mm_srli_epi16(uv, 8);
-                __m128i vu = _mm_or_si128(v, u);
-                _mm_storeu_si128((__m128i *)dstUV, vu);
-                srcUV += 8;
-                dstUV += 8;
-                x -= 8;
-            }
+        while (x >= 8) {
+            __m128i uv = _mm_loadu_si128((__m128i *)srcUV);
+            __m128i v = _mm_slli_epi16(uv, 8);
+            __m128i u = _mm_srli_epi16(uv, 8);
+            __m128i vu = _mm_or_si128(v, u);
+            _mm_storeu_si128((__m128i *)dstUV, vu);
+            srcUV += 8;
+            dstUV += 8;
+            x -= 8;
         }
+        while (x--) {
+            *dstUV++ = SDL_Swap16(*srcUV++);
+        }
+        srcUV += srcUVPitchLeft;
+        dstUV += dstUVPitchLeft;
+    }
+    return 0;
+}
 #endif
+
+static int SDL_ConvertPixels_SwapNV(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch)
+{
+#if HAVE_SSE2_INTRINSICS /* the _SSE2 variant is only compiled under this same guard */
+  if (SDL_HasSSE2()) {
+    return SDL_ConvertPixels_SwapNV_SSE2(width, height, src, src_pitch, dst, dst_pitch);
+  }
+#endif
+  {
+    int x, y;
+    const int UVwidth = (width + 1) / 2;
+    const int UVheight = (height + 1) / 2;
+    const int srcUVPitch = ((src_pitch + 1) / 2) * 2;
+    const int srcUVPitchLeft = (srcUVPitch - UVwidth * 2) / sizeof(Uint16);
+    const int dstUVPitch = ((dst_pitch + 1) / 2) * 2;
+    const int dstUVPitchLeft = (dstUVPitch - UVwidth * 2) / sizeof(Uint16);
+    const Uint16 *srcUV;
+    Uint16 *dstUV;
+
+    /* Skip the Y plane */
+    src = (const Uint8 *)src + height * src_pitch;
+    dst = (Uint8 *)dst + height * dst_pitch;
+
+    srcUV = (const Uint16 *)src;
+    dstUV = (Uint16 *)dst;
+    y = UVheight;
+    while (y--) {
+        x = UVwidth;
         while (x--) {
             *dstUV++ = SDL_Swap16(*srcUV++);
         }
@@ -1298,6 +1452,7 @@ static int SDL_ConvertPixels_SwapNV(int width, int height, const void *src, int
         dstUV += dstUVPitchLeft;
     }
     return 0;
+  }
 }
 
 static int SDL_ConvertPixels_Planar2x2_to_Planar2x2(int width, int height,
@@ -1389,9 +1544,42 @@ static int SDL_ConvertPixels_Planar2x2_to_Planar2x2(int width, int height,
         x -= 4;                                                   \
     }
 
-#endif
+static int SDL_TARGETING("sse2") SDL_ConvertPixels_YUY2_to_UYVY_SSE2(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch)
+{
+    int x, y;
+    const int YUVwidth = (width + 1) / 2; /* number of 4-byte groups per row, rounded up for odd widths */
+    const int srcYUVPitchLeft = (src_pitch - YUVwidth * 4); /* bytes left in a source row after the groups */
+    const int dstYUVPitchLeft = (dst_pitch - YUVwidth * 4); /* same, for the destination row */
+    const Uint8 *srcYUV = (const Uint8 *)src;
+    Uint8 *dstYUV = (Uint8 *)dst;
 
-static int SDL_ConvertPixels_YUY2_to_UYVY(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch)
+    y = height;
+    x = YUVwidth;
+    while (y--) {
+        PACKED4_TO_PACKED4_ROW_SSE2(_MM_SHUFFLE(2, 3, 0, 1)); /* vector part; macro (defined above) appears to consume x in steps of 4 - confirm */
+        while (x--) { /* remaining groups: reorder Y1 U Y2 V -> U Y1 V Y2 */
+            Uint8 Y1, U, Y2, V;
+
+            Y1 = srcYUV[0];
+            U = srcYUV[1];
+            Y2 = srcYUV[2];
+            V = srcYUV[3];
+            srcYUV += 4;
+
+            dstYUV[0] = U;
+            dstYUV[1] = Y1;
+            dstYUV[2] = V;
+            dstYUV[3] = Y2;
+            dstYUV += 4;
+        }
+        srcYUV += srcYUVPitchLeft;
+        dstYUV += dstYUVPitchLeft;
+        x = YUVwidth; /* reset the group counter for the next row */
+    }
+    return 0;
+}
+
+static int SDL_TARGETING("sse2") SDL_ConvertPixels_YUY2_to_YVYU_SSE2(int width, int height, const void *src, int src_pitch, void *dst, int dst_pitch)
 {
     int x, y;
     const int YUVwidth = (width + 1) / 2;
@@ -1399,18 +1587,189 @@ static int SDL_ConvertPixels_YUY2_to_UYVY(int width, int height, const void *src
     const int dstYUVPitchLeft = (dst_pitch - YUVwidth * 4);
     const Uint8 *srcYUV = (const Uint8 *)src;
     Uint8 *dstYUV = (Uint8 *)dst;
-#if HAVE_SSE2_INTRINSICS

(Patch may be truncated, please check the link at the top of this post.)