SDL: video: move standard c, sse and lsx implementations of yuv2rgb to its own source (472a4)

From 472a415c8fa0a77c26765ce0bee0ca2330f7d179 Mon Sep 17 00:00:00 2001
From: Anonymous Maarten <[EMAIL REDACTED]>
Date: Fri, 19 Jan 2024 00:01:36 +0100
Subject: [PATCH] video: move standard c, sse and lsx implementations of
 yuv2rgb to its own source

---
 Makefile.os2                                  |   5 +-
 Makefile.w32                                  |   5 +-
 VisualC-GDK/SDL/SDL.vcxproj                   |  10 +-
 VisualC-WinRT/SDL-UWP.vcxproj                 |  12 +-
 VisualC/SDL/SDL.vcxproj                       |  10 +-
 src/video/yuv2rgb/yuv_rgb.h                   | 401 +----------------
 src/video/yuv2rgb/yuv_rgb_common.h            |  13 +
 src/video/yuv2rgb/yuv_rgb_internal.h          |  74 ++++
 src/video/yuv2rgb/yuv_rgb_lsx.c               |  44 ++
 src/video/yuv2rgb/yuv_rgb_lsx.h               | 407 +++++++++++++++++
 .../yuv2rgb/{yuv_rgb.c => yuv_rgb_sse.c}      | 412 +++---------------
 src/video/yuv2rgb/yuv_rgb_sse.h               | 266 +++++++++++
 src/video/yuv2rgb/yuv_rgb_std.c               | 179 ++++++++
 src/video/yuv2rgb/yuv_rgb_std.h               | 131 ++++++
 14 files changed, 1230 insertions(+), 739 deletions(-)
 create mode 100644 src/video/yuv2rgb/yuv_rgb_common.h
 create mode 100644 src/video/yuv2rgb/yuv_rgb_internal.h
 create mode 100644 src/video/yuv2rgb/yuv_rgb_lsx.c
 create mode 100644 src/video/yuv2rgb/yuv_rgb_lsx.h
 rename src/video/yuv2rgb/{yuv_rgb.c => yuv_rgb_sse.c} (53%)
 create mode 100644 src/video/yuv2rgb/yuv_rgb_sse.h
 create mode 100644 src/video/yuv2rgb/yuv_rgb_std.c
 create mode 100644 src/video/yuv2rgb/yuv_rgb_std.h

diff --git a/Makefile.os2 b/Makefile.os2
index a063d683e8fe..c6ab898b003e 100644
--- a/Makefile.os2
+++ b/Makefile.os2
@@ -81,7 +81,7 @@ SRCS+= SDL_events.c SDL_quit.c SDL_keyboard.c SDL_mouse.c SDL_windowevents.c &
        SDL_clipboardevents.c SDL_dropevents.c SDL_displayevents.c SDL_gesture.c &
        SDL_sensor.c SDL_touch.c
 SRCS+= SDL_haptic.c SDL_hidapi.c SDL_gamecontroller.c SDL_joystick.c controller_type.c
-SRCS+= SDL_render.c yuv_rgb.c SDL_yuv.c SDL_yuv_sw.c SDL_blendfillrect.c &
+SRCS+= SDL_render.c yuv_rgb_lsx.c yuv_rgb_sse.c yuv_rgb_std.c SDL_yuv.c SDL_yuv_sw.c SDL_blendfillrect.c &
        SDL_blendline.c SDL_blendpoint.c SDL_drawline.c SDL_drawpoint.c &
        SDL_render_sw.c SDL_rotate.c SDL_triangle.c
 SRCS+= SDL_blit.c SDL_blit_0.c SDL_blit_1.c SDL_blit_A.c SDL_blit_auto.c &
@@ -158,6 +158,9 @@ SDL_hidapi.obj: SDL_hidapi.c
     wcc386 $(CFLAGS_DLL) -za99 -fo=$^@ $<
 !endif
 
+yuv_rgb_sse.obj: yuv_rgb_sse.c
+    wcc386 $(CFLAGS_DLL) -wcd=202 -fo=$^@ $<
+
 $(LIBICONV_LIB):  "src/core/os2/iconv2.lbc"
     @echo * Creating: $@
     wlib -q -b -n -c -pa -s -t -zld -ii -io $@ @$<
diff --git a/Makefile.w32 b/Makefile.w32
index 24efceb023b2..9567f4abf698 100644
--- a/Makefile.w32
+++ b/Makefile.w32
@@ -60,7 +60,7 @@ SRCS+= SDL_events.c SDL_quit.c SDL_keyboard.c SDL_mouse.c SDL_windowevents.c &
        SDL_clipboardevents.c SDL_dropevents.c SDL_displayevents.c SDL_gesture.c &
        SDL_sensor.c SDL_touch.c
 SRCS+= SDL_haptic.c SDL_hidapi.c SDL_gamecontroller.c SDL_joystick.c controller_type.c
-SRCS+= SDL_render.c yuv_rgb.c SDL_yuv.c SDL_yuv_sw.c SDL_blendfillrect.c &
+SRCS+= SDL_render.c  yuv_rgb_lsx.c yuv_rgb_sse.c yuv_rgb_std.c SDL_yuv.c SDL_yuv_sw.c SDL_blendfillrect.c &
        SDL_blendline.c SDL_blendpoint.c SDL_drawline.c SDL_drawpoint.c &
        SDL_render_sw.c SDL_rotate.c SDL_triangle.c
 SRCS+= SDL_blit.c SDL_blit_0.c SDL_blit_1.c SDL_blit_A.c SDL_blit_auto.c &
@@ -147,6 +147,9 @@ SDL_RLEaccel.obj: SDL_RLEaccel.c
 SDL_malloc.obj: SDL_malloc.c
     wcc386 $(CFLAGS_DLL) -wcd=201 -fo=$^@ $<
 
+yuv_rgb_sse.obj: yuv_rgb_sse.c
+    wcc386 $(CFLAGS_DLL) -wcd=202 -fo=$^@ $<
+
 # SDL2libm
 MSRCS= e_atan2.c e_exp.c e_fmod.c e_log10.c e_log.c e_pow.c e_rem_pio2.c e_sqrt.c &
        k_cos.c k_rem_pio2.c k_sin.c k_tan.c &
diff --git a/VisualC-GDK/SDL/SDL.vcxproj b/VisualC-GDK/SDL/SDL.vcxproj
index 6aee7ccb9b64..ed071aa9da4c 100644
--- a/VisualC-GDK/SDL/SDL.vcxproj
+++ b/VisualC-GDK/SDL/SDL.vcxproj
@@ -520,7 +520,13 @@
     <ClInclude Include="..\..\src\video\windows\SDL_windowswindow.h" />
     <ClInclude Include="..\..\src\video\windows\wmmsg.h" />
     <ClInclude Include="..\..\src\video\yuv2rgb\yuv_rgb.h" />
+    <ClInclude Include="..\..\src\video\yuv2rgb\yuv_rgb_common.h" />
+    <ClInclude Include="..\..\src\video\yuv2rgb\yuv_rgb_internal.h" />
+    <ClInclude Include="..\..\src\video\yuv2rgb\yuv_rgb_lsx.h" />
+    <ClInclude Include="..\..\src\video\yuv2rgb\yuv_rgb_lsx_func.h" />
+    <ClInclude Include="..\..\src\video\yuv2rgb\yuv_rgb_sse.h" />
     <ClInclude Include="..\..\src\video\yuv2rgb\yuv_rgb_sse_func.h" />
+    <ClInclude Include="..\..\src\video\yuv2rgb\yuv_rgb_std.h" />
     <ClInclude Include="..\..\src\video\yuv2rgb\yuv_rgb_std_func.h" />
   </ItemGroup>
   <ItemGroup>
@@ -755,7 +761,9 @@
     <ClCompile Include="..\..\src\video\windows\SDL_windowsvideo.c" />
     <ClCompile Include="..\..\src\video\windows\SDL_windowsvulkan.c" />
     <ClCompile Include="..\..\src\video\windows\SDL_windowswindow.c" />
-    <ClCompile Include="..\..\src\video\yuv2rgb\yuv_rgb.c" />
+    <ClCompile Include="..\..\src\video\yuv2rgb\yuv_rgb_lsx.c" />
+    <ClCompile Include="..\..\src\video\yuv2rgb\yuv_rgb_sse.c" />
+    <ClCompile Include="..\..\src\video\yuv2rgb\yuv_rgb_std.c" />
   </ItemGroup>
   <ItemGroup>
     <ResourceCompile Include="..\..\src\main\windows\version.rc" />
diff --git a/VisualC-WinRT/SDL-UWP.vcxproj b/VisualC-WinRT/SDL-UWP.vcxproj
index 20d4e63687e3..696505f35139 100644
--- a/VisualC-WinRT/SDL-UWP.vcxproj
+++ b/VisualC-WinRT/SDL-UWP.vcxproj
@@ -185,6 +185,14 @@
     <ClInclude Include="..\src\video\winrt\SDL_winrtopengles.h" />
     <ClInclude Include="..\src\video\winrt\SDL_winrtvideo_cpp.h" />
     <ClInclude Include="..\src\video\yuv2rgb\yuv_rgb.h" />
+    <ClInclude Include="..\src\video\yuv2rgb\yuv_rgb_common.h" />
+    <ClInclude Include="..\src\video\yuv2rgb\yuv_rgb_internal.h" />
+    <ClInclude Include="..\src\video\yuv2rgb\yuv_rgb_lsx.h" />
+    <ClInclude Include="..\src\video\yuv2rgb\yuv_rgb_lsx_func.h" />
+    <ClInclude Include="..\src\video\yuv2rgb\yuv_rgb_sse.h" />
+    <ClInclude Include="..\src\video\yuv2rgb\yuv_rgb_sse_func.h" />
+    <ClInclude Include="..\src\video\yuv2rgb\yuv_rgb_std.h" />
+    <ClInclude Include="..\src\video\yuv2rgb\yuv_rgb_std_func.h" />
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="..\src\atomic\SDL_atomic.c" />
@@ -346,7 +354,9 @@
     <ClCompile Include="..\src\video\winrt\SDL_winrtvideo.cpp">
       <CompileAsWinRT>true</CompileAsWinRT>
     </ClCompile>
-    <ClCompile Include="..\src\video\yuv2rgb\yuv_rgb.c" />
+    <ClCompile Include="..\src\video\yuv2rgb\yuv_rgb_lsx.c" />
+    <ClCompile Include="..\src\video\yuv2rgb\yuv_rgb_sse.c" />
+    <ClCompile Include="..\src\video\yuv2rgb\yuv_rgb_std.c" />
   </ItemGroup>
   <PropertyGroup Label="Globals">
     <ProjectGuid>{89e9b32e-a86a-47c3-a948-d2b1622925ce}</ProjectGuid>
diff --git a/VisualC/SDL/SDL.vcxproj b/VisualC/SDL/SDL.vcxproj
index 24f332939e25..d52b5c058e94 100644
--- a/VisualC/SDL/SDL.vcxproj
+++ b/VisualC/SDL/SDL.vcxproj
@@ -439,7 +439,13 @@
     <ClInclude Include="..\..\src\video\windows\SDL_windowswindow.h" />
     <ClInclude Include="..\..\src\video\windows\wmmsg.h" />
     <ClInclude Include="..\..\src\video\yuv2rgb\yuv_rgb.h" />
+    <ClInclude Include="..\..\src\video\yuv2rgb\yuv_rgb_common.h" />
+    <ClInclude Include="..\..\src\video\yuv2rgb\yuv_rgb_internal.h" />
+    <ClInclude Include="..\..\src\video\yuv2rgb\yuv_rgb_lsx.h" />
+    <ClInclude Include="..\..\src\video\yuv2rgb\yuv_rgb_lsx_func.h" />
+    <ClInclude Include="..\..\src\video\yuv2rgb\yuv_rgb_sse.h" />
     <ClInclude Include="..\..\src\video\yuv2rgb\yuv_rgb_sse_func.h" />
+    <ClInclude Include="..\..\src\video\yuv2rgb\yuv_rgb_std.h" />
     <ClInclude Include="..\..\src\video\yuv2rgb\yuv_rgb_std_func.h" />
   </ItemGroup>
   <ItemGroup>
@@ -624,7 +630,9 @@
     <ClCompile Include="..\..\src\video\windows\SDL_windowsvideo.c" />
     <ClCompile Include="..\..\src\video\windows\SDL_windowsvulkan.c" />
     <ClCompile Include="..\..\src\video\windows\SDL_windowswindow.c" />
-    <ClCompile Include="..\..\src\video\yuv2rgb\yuv_rgb.c" />
+    <ClCompile Include="..\..\src\video\yuv2rgb\yuv_rgb_lsx.c" />
+    <ClCompile Include="..\..\src\video\yuv2rgb\yuv_rgb_sse.c" />
+    <ClCompile Include="..\..\src\video\yuv2rgb\yuv_rgb_std.c" />
   </ItemGroup>
   <ItemGroup>
     <ResourceCompile Include="..\..\src\main\windows\version.rc" />
diff --git a/src/video/yuv2rgb/yuv_rgb.h b/src/video/yuv2rgb/yuv_rgb.h
index 5668c0fc419d..c3593168f848 100644
--- a/src/video/yuv2rgb/yuv_rgb.h
+++ b/src/video/yuv2rgb/yuv_rgb.h
@@ -1,412 +1,33 @@
+#ifndef YUV_RGB_H_
+#define YUV_RGB_H_
+
 // Copyright 2016 Adrien Descamps
 // Distributed under BSD 3-Clause License
 
 // Provide optimized functions to convert images from 8bits yuv420 to rgb24 format
 
-// There are a few slightly different variations of the YCbCr color space with different parameters that 
+// There are a few slightly different variations of the YCbCr color space with different parameters that
 // change the conversion matrix.
 // The three most common YCbCr color space, defined by BT.601, BT.709 and JPEG standard are implemented here.
 // See the respective standards for details
 // The matrix values used are derived from http://www.equasys.de/colorconversion.html
 
 // YUV420 is stored as three separate channels, with U and V (Cb and Cr) subsampled by a 2 factor
-// For conversion from yuv to rgb, no interpolation is done, and the same UV value are used for 4 rgb pixels. This 
+// For conversion from yuv to rgb, no interpolation is done, and the same UV value are used for 4 rgb pixels. This
 // is suboptimal for image quality, but by far the fastest method.
 
 // For all methods, width and height should be even, if not, the last row/column of the result image won't be affected.
 // For sse methods, if the width if not divisable by 32, the last (width%32) pixels of each line won't be affected.
 
-#include "SDL_stdinc.h"
 /*#include <stdint.h>*/
 
-typedef enum
-{
-	YCBCR_JPEG,
-	YCBCR_601,
-	YCBCR_709
-} YCbCrType;
-
 // yuv to rgb, standard c implementation
-void yuv420_rgb565_std(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuv420_rgb24_std(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuv420_rgba_std(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuv420_bgra_std(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuv420_argb_std(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuv420_abgr_std(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuv422_rgb565_std(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuv422_rgb24_std(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuv422_rgba_std(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuv422_bgra_std(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuv422_argb_std(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuv422_abgr_std(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuvnv12_rgb565_std(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuvnv12_rgb24_std(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuvnv12_rgba_std(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuvnv12_bgra_std(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuvnv12_argb_std(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuvnv12_abgr_std(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-// yuv to rgb, sse implementation
-// pointers must be 16 byte aligned, and strides must be divisable by 16
-void yuv420_rgb565_sse(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuv420_rgb24_sse(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuv420_rgba_sse(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuv420_bgra_sse(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuv420_argb_sse(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuv420_abgr_sse(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuv422_rgb565_sse(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuv422_rgb24_sse(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuv422_rgba_sse(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuv422_bgra_sse(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuv422_argb_sse(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuv422_abgr_sse(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuvnv12_rgb565_sse(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuvnv12_rgb24_sse(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuvnv12_rgba_sse(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuvnv12_bgra_sse(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuvnv12_argb_sse(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuvnv12_abgr_sse(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-// yuv to rgb, sse implementation
-// pointers do not need to be 16 byte aligned
-void yuv420_rgb565_sseu(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuv420_rgb24_sseu(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuv420_rgba_sseu(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuv420_bgra_sseu(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuv420_argb_sseu(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuv420_abgr_sseu(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuv422_rgb565_sseu(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuv422_rgb24_sseu(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuv422_rgba_sseu(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuv422_bgra_sseu(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuv422_argb_sseu(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuv422_abgr_sseu(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuvnv12_rgb565_sseu(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuvnv12_rgb24_sseu(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuvnv12_rgba_sseu(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuvnv12_bgra_sseu(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuvnv12_argb_sseu(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-void yuvnv12_abgr_sseu(
-	uint32_t width, uint32_t height, 
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	uint8_t *rgb, uint32_t rgb_stride, 
-	YCbCrType yuv_type);
-
-
-// rgb to yuv, standard c implementation
-void rgb24_yuv420_std(
-	uint32_t width, uint32_t height, 
-	const uint8_t *rgb, uint32_t rgb_stride, 
-	uint8_t *y, uint8_t *u, uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	YCbCrType yuv_type);
-
-// rgb to yuv, sse implementation
-// pointers must be 16 byte aligned, and strides must be divisible by 16
-void rgb24_yuv420_sse(
-	uint32_t width, uint32_t height, 
-	const uint8_t *rgb, uint32_t rgb_stride, 
-	uint8_t *y, uint8_t *u, uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	YCbCrType yuv_type);
-
-// rgb to yuv, sse implementation
-// pointers do not need to be 16 byte aligned
-void rgb24_yuv420_sseu(
-	uint32_t width, uint32_t height, 
-	const uint8_t *rgb, uint32_t rgb_stride, 
-	uint8_t *y, uint8_t *u, uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
-	YCbCrType yuv_type);
-
-
-//yuv420 to bgra, lsx implementation
-void yuv420_rgb24_lsx(
-	uint32_t width, uint32_t height,
-	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
-	uint8_t *rgb, uint32_t rgb_stride,
-	YCbCrType yuv_type);
-
-void yuv420_rgba_lsx(
-	uint32_t width, uint32_t height,
-	const uint8_t *y, const uint8_t *v, const uint8_t *u, uint32_t y_stride, uint32_t uv_stride,
-	uint8_t *rgb, uint32_t rgb_stride,
-	YCbCrType yuv_type);
+#include "yuv_rgb_std.h"
 
-void yuv420_bgra_lsx(
-	uint32_t width, uint32_t height,
-	const uint8_t *y, const uint8_t *v, const uint8_t *u, uint32_t y_stride, uint32_t uv_stride,
-	uint8_t *rgb, uint32_t rgb_stride,
-	YCbCrType yuv_type);
+// yuv to rgb, sse2 implementation
+#include "yuv_rgb_sse.h"
 
-void yuv420_argb_lsx(
-	uint32_t width, uint32_t height,
-	const uint8_t *y, const uint8_t *v, const uint8_t *u, uint32_t y_stride, uint32_t uv_stride,
-	uint8_t *rgb, uint32_t rgb_stride,
-	YCbCrType yuv_type);
+// yuv to rgb, lsx implementation
+#include "yuv_rgb_lsx.h"
 
-void yuv420_abgr_lsx(
-	uint32_t width, uint32_t height,
-	const uint8_t *y, const uint8_t *v, const uint8_t *u, uint32_t y_stride, uint32_t uv_stride,
-	uint8_t *rgb, uint32_t rgb_stride,
-	YCbCrType yuv_type);
+#endif /* YUV_RGB_H_ */
diff --git a/src/video/yuv2rgb/yuv_rgb_common.h b/src/video/yuv2rgb/yuv_rgb_common.h
new file mode 100644
index 000000000000..ae787ed5f27a
--- /dev/null
+++ b/src/video/yuv2rgb/yuv_rgb_common.h
@@ -0,0 +1,13 @@
+#ifndef YUV_RGB_COMMON_H_
+#define YUV_RGB_COMMON_H_
+// Copyright 2016 Adrien Descamps
+// Distributed under BSD 3-Clause License
+
+typedef enum
+{
+    YCBCR_JPEG,
+    YCBCR_601,
+    YCBCR_709
+} YCbCrType;
+
+#endif /* YUV_RGB_COMMON_H_ */
diff --git a/src/video/yuv2rgb/yuv_rgb_internal.h b/src/video/yuv2rgb/yuv_rgb_internal.h
new file mode 100644
index 000000000000..cad978b5face
--- /dev/null
+++ b/src/video/yuv2rgb/yuv_rgb_internal.h
@@ -0,0 +1,74 @@
+// Copyright 2016 Adrien Descamps
+// Distributed under BSD 3-Clause License
+
+#define PRECISION 6
+#define PRECISION_FACTOR (1<<PRECISION)
+
+typedef struct
+{
+    uint8_t y_shift;      // offset added to the Y result (see the equation right below)
+    int16_t matrix[3][3]; // RGB->YUV coefficients; the equation below divides them by PRECISION_FACTOR
+} RGB2YUVParam;
+// |Y|   |y_shift|                        |matrix[0][0] matrix[0][1] matrix[0][2]|   |R|
+// |U| = |  128  | + 1/PRECISION_FACTOR * |matrix[1][0] matrix[1][1] matrix[1][2]| * |G|
+// |V|   |  128  |                        |matrix[2][0] matrix[2][1] matrix[2][2]|   |B|
+
+typedef struct
+{
+    uint8_t y_shift;    // offset subtracted from Y before the matrix is applied
+    int16_t y_factor;   // weight of (Y - y_shift) in R, G and B
+    int16_t v_r_factor; // weight of (V - 128) in R
+    int16_t u_g_factor; // weight of (U - 128) in G
+    int16_t v_g_factor; // weight of (V - 128) in G
+    int16_t u_b_factor; // weight of (U - 128) in B
+} YUV2RGBParam;
+// |R|                        |y_factor      0       v_r_factor|   |Y-y_shift|
+// |G| = 1/PRECISION_FACTOR * |y_factor  u_g_factor  v_g_factor| * |  U-128  |
+// |B|                        |y_factor  u_b_factor      0     |   |  V-128  |
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 26451)
+#endif
+
+#define V(value) ((int16_t)(((value)*PRECISION_FACTOR)+0.5)) // fixed-point encode: scale by PRECISION_FACTOR, add 0.5 so the truncating cast rounds non-negative inputs to nearest; argument and result are parenthesized so expression arguments expand safely
+
+// for ITU-T T.871, values can be found in section 7
+// for ITU-R BT.601-7 values are derived from equations in sections 2.5.1-2.5.3, assuming RGB is encoded using full range ([0-1]<->[0-255])
+// for ITU-R BT.709-6 values are derived from equations in sections 3.2-3.4, assuming RGB is encoded using full range ([0-1]<->[0-255])
+// all values are rounded to the fourth decimal
+
+static const YUV2RGBParam YUV2RGB[3] = { // fixed-point YUV->RGB coefficients; entry order matches the YCbCrType enumerators (JPEG, 601, 709) - presumably indexed by yuv_type, TODO confirm at call sites
+    // ITU-T T.871 (JPEG)
+    {/*.y_shift=*/ 0, /*.y_factor=*/ V(1.0), /*.v_r_factor=*/ V(1.402), /*.u_g_factor=*/ -V(0.3441), /*.v_g_factor=*/ -V(0.7141), /*.u_b_factor=*/ V(1.772)},
+    // ITU-R BT.601-7
+    {/*.y_shift=*/ 16, /*.y_factor=*/ V(1.1644), /*.v_r_factor=*/ V(1.596), /*.u_g_factor=*/ -V(0.3918), /*.v_g_factor=*/ -V(0.813), /*.u_b_factor=*/ V(2.0172)},
+    // ITU-R BT.709-6
+    {/*.y_shift=*/ 16, /*.y_factor=*/ V(1.1644), /*.v_r_factor=*/ V(1.7927), /*.u_g_factor=*/ -V(0.2132), /*.v_g_factor=*/ -V(0.5329), /*.u_b_factor=*/ V(2.1124)}
+};
+
+static const RGB2YUVParam RGB2YUV[3] = { // fixed-point RGB->YUV matrices; entry order matches the YCbCrType enumerators (JPEG, 601, 709) - presumably indexed by yuv_type, TODO confirm at call sites
+    // ITU-T T.871 (JPEG)
+    {/*.y_shift=*/ 0, /*.matrix=*/ {{V(0.299), V(0.587), V(0.114)}, {-V(0.1687), -V(0.3313), V(0.5)}, {V(0.5), -V(0.4187), -V(0.0813)}}},
+    // ITU-R BT.601-7
+    {/*.y_shift=*/ 16, /*.matrix=*/ {{V(0.2568), V(0.5041), V(0.0979)}, {-V(0.1482), -V(0.291), V(0.4392)}, {V(0.4392), -V(0.3678), -V(0.0714)}}},
+    // ITU-R BT.709-6
+    {/*.y_shift=*/ 16, /*.matrix=*/ {{V(0.1826), V(0.6142), V(0.062)}, {-V(0.1006), -V(0.3386), V(0.4392)}, {V(0.4392), -V(0.3989), -V(0.0403)}}}
+};
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+/* The various layouts of YUV data we support */
+#define YUV_FORMAT_420	1
+#define YUV_FORMAT_422	2
+#define YUV_FORMAT_NV12	3
+
+/* The various formats of RGB pixel that we support */
+#define RGB_FORMAT_RGB565	1
+#define RGB_FORMAT_RGB24	2
+#define RGB_FORMAT_RGBA		3
+#define RGB_FORMAT_BGRA		4
+#define RGB_FORMAT_ARGB		5
+#define RGB_FORMAT_ABGR		6
diff --git a/src/video/yuv2rgb/yuv_rgb_lsx.c b/src/video/yuv2rgb/yuv_rgb_lsx.c
new file mode 100644
index 000000000000..8d84dd142aa1
--- /dev/null
+++ b/src/video/yuv2rgb/yuv_rgb_lsx.c
@@ -0,0 +1,44 @@
+// Copyright 2016 Adrien Descamps
+// Distributed under BSD 3-Clause License
+#include "../../SDL_internal.h"
+
+#if SDL_HAVE_YUV
+#include "yuv_rgb.h"
+
+#include "SDL_cpuinfo.h"
+
+#ifdef __loongarch_sx
+
+#define LSX_FUNCTION_NAME	yuv420_rgb24_lsx
+#define STD_FUNCTION_NAME	yuv420_rgb24_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_RGB24
+#include "yuv_rgb_lsx_func.h"
+
+#define LSX_FUNCTION_NAME	yuv420_rgba_lsx
+#define STD_FUNCTION_NAME	yuv420_rgba_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_RGBA
+#include "yuv_rgb_lsx_func.h"
+
+#define LSX_FUNCTION_NAME	yuv420_bgra_lsx
+#define STD_FUNCTION_NAME	yuv420_bgra_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_BGRA
+#include "yuv_rgb_lsx_func.h"
+
+#define LSX_FUNCTION_NAME	yuv420_argb_lsx
+#define STD_FUNCTION_NAME	yuv420_argb_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_ARGB
+#include "yuv_rgb_lsx_func.h"
+
+#define LSX_FUNCTION_NAME	yuv420_abgr_lsx
+#define STD_FUNCTION_NAME	yuv420_abgr_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_ABGR
+#include "yuv_rgb_lsx_func.h"
+
+#endif  //__loongarch_sx
+
+#endif /* SDL_HAVE_YUV */
diff --git a/src/video/yuv2rgb/yuv_rgb_lsx.h b/src/video/yuv2rgb/yuv_rgb_lsx.h
new file mode 100644
index 000000000000..bcffd95c6ea9
--- /dev/null
+++ b/src/video/yuv2rgb/yuv_rgb_lsx.h
@@ -0,0 +1,407 @@
+// Copyright 2016 Adrien Descamps
+// Distributed under BSD 3-Clause License
+
+// Provide optimized functions to convert images from 8bits yuv420 to rgb24 format
+
+// There are a few slightly different variations of the YCbCr color space with different parameters that
+// change the conversion matrix.
+// The three most common YCbCr color spaces, defined by the BT.601, BT.709 and JPEG standards, are implemented here.
+// See the respective standards for details
+// The matrix values used are derived from http://www.equasys.de/colorconversion.html
+
+// YUV420 is stored as three separate channels, with U and V (Cb and Cr) subsampled by a factor of 2
+// For conversion from yuv to rgb, no interpolation is done, and the same UV values are used for 4 rgb pixels. This
+// is suboptimal for image quality, but by far the fastest method.
+
+// For all methods, width and height should be even, if not, the last row/column of the result image won't be affected.
+// For sse methods, if the width is not divisible by 32, the last (width%32) pixels of each line won't be affected.
+
+/*#include <stdint.h>*/
+#include "yuv_rgb_common.h"
+
+#include "SDL_stdinc.h"
+
+// yuv to rgb, standard c implementation
+void yuv420_rgb565_std(
+    uint32_t width, uint32_t height,
+    const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+    uint8_t *rgb, uint32_t rgb_stride,
+    YCbCrType yuv_type);
+
+void yuv420_rgb24_std(
+    uint32_t width, uint32_t height,
+    const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+    uint8_t *rgb, uint32_t rgb_stride,
+    YCbCrType yuv_type);
+
+void yuv420_rgba_std(
+    uint32_t width, uint32_t height,
+    const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+    uint8_t *rgb, uint32_t rgb_stride,
+    YCbCrType yuv_type);
+
+void yuv420_bgra_std(
+    uint3

(Patch may be truncated, please check the link at the top of this post.)