SDL: Add optimiztion function with LSX in LoongArch

From 50d8642fa3da0a72d286359cc7f89b0a445b02d5 Mon Sep 17 00:00:00 2001
From: yuanhecai <[EMAIL REDACTED]>
Date: Mon, 6 Jun 2022 20:18:38 +0800
Subject: [PATCH] Add optimiztion function with LSX in LoongArch

1. yuv420_rgb24_lsx
1. yuv420_rgba_lsx
2. yuv420_bgra_lsx
3. yuv420_argb_lsx
4. yuv420_abgr_lsx
---
 src/video/SDL_yuv.c                  |  46 ++++
 src/video/yuv2rgb/yuv_rgb.c          |  34 +++
 src/video/yuv2rgb/yuv_rgb.h          |  31 +++
 src/video/yuv2rgb/yuv_rgb_lsx_func.h | 372 +++++++++++++++++++++++++++
 4 files changed, 483 insertions(+)
 create mode 100644 src/video/yuv2rgb/yuv_rgb_lsx_func.h

diff --git a/src/video/SDL_yuv.c b/src/video/SDL_yuv.c
index 72ed99d181d..50395ff488e 100644
--- a/src/video/SDL_yuv.c
+++ b/src/video/SDL_yuv.c
@@ -290,6 +290,48 @@ static SDL_bool yuv_rgb_sse(
     return SDL_FALSE;
 }
 
+static SDL_bool yuv_rgb_lsx(
+    Uint32 src_format, Uint32 dst_format,
+    Uint32 width, Uint32 height,
+    const Uint8 *y, const Uint8 *u, const Uint8 *v, Uint32 y_stride, Uint32 uv_stride,
+    Uint8 *rgb, Uint32 rgb_stride,
+    YCbCrType yuv_type)
+{
+#ifdef __loongarch_sx
+    if (!SDL_HasLSX()) {
+        return SDL_FALSE;
+    }
+    if (src_format == SDL_PIXELFORMAT_YV12 ||
+        src_format == SDL_PIXELFORMAT_IYUV) {
+
+        switch (dst_format) {
+        case SDL_PIXELFORMAT_RGB24:
+            yuv420_rgb24_lsx(width, height, y, u, v, y_stride, uv_stride, rgb, rgb_stride, yuv_type);
+            return SDL_TRUE;
+        case SDL_PIXELFORMAT_RGBX8888:
+        case SDL_PIXELFORMAT_RGBA8888:
+            yuv420_rgba_lsx(width, height, y, u, v, y_stride, uv_stride, rgb, rgb_stride, yuv_type);
+            return SDL_TRUE;
+        case SDL_PIXELFORMAT_BGRX8888:
+        case SDL_PIXELFORMAT_BGRA8888:
+            yuv420_bgra_lsx(width, height, y, u, v, y_stride, uv_stride, rgb, rgb_stride, yuv_type);
+            return SDL_TRUE;
+        case SDL_PIXELFORMAT_RGB888:
+        case SDL_PIXELFORMAT_ARGB8888:
+            yuv420_argb_lsx(width, height, y, u, v, y_stride, uv_stride, rgb, rgb_stride, yuv_type);
+            return SDL_TRUE;
+        case SDL_PIXELFORMAT_BGR888:
+        case SDL_PIXELFORMAT_ABGR8888:
+            yuv420_abgr_lsx(width, height, y, u, v, y_stride, uv_stride, rgb, rgb_stride, yuv_type);
+            return SDL_TRUE;
+        default:
+            break;
+        }
+    }
+#endif
+    return SDL_FALSE;
+}
+
 static SDL_bool yuv_rgb_std(
     Uint32 src_format, Uint32 dst_format,
     Uint32 width, Uint32 height, 
@@ -417,6 +459,10 @@ SDL_ConvertPixels_YUV_to_RGB(int width, int height,
         return 0;
     }
 
+    if (yuv_rgb_lsx(src_format, dst_format, width, height, y, u, v, y_stride, uv_stride, (Uint8*)dst, dst_pitch, yuv_type)) {
+        return 0;
+    }
+
     if (yuv_rgb_std(src_format, dst_format, width, height, y, u, v, y_stride, uv_stride, (Uint8*)dst, dst_pitch, yuv_type)) {
         return 0;
     }
diff --git a/src/video/yuv2rgb/yuv_rgb.c b/src/video/yuv2rgb/yuv_rgb.c
index 7908b8c4895..c7415aa7213 100644
--- a/src/video/yuv2rgb/yuv_rgb.c
+++ b/src/video/yuv2rgb/yuv_rgb.c
@@ -687,4 +687,38 @@ void rgb24_yuv420_sseu(uint32_t width, uint32_t height,
 
 #endif //__SSE2__
 
+#ifdef __loongarch_sx
+
+#define LSX_FUNCTION_NAME	yuv420_rgb24_lsx
+#define STD_FUNCTION_NAME	yuv420_rgb24_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_RGB24
+#include "yuv_rgb_lsx_func.h"
+
+#define LSX_FUNCTION_NAME	yuv420_rgba_lsx
+#define STD_FUNCTION_NAME	yuv420_rgba_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_RGBA
+#include "yuv_rgb_lsx_func.h"
+
+#define LSX_FUNCTION_NAME	yuv420_bgra_lsx
+#define STD_FUNCTION_NAME	yuv420_bgra_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_BGRA
+#include "yuv_rgb_lsx_func.h"
+
+#define LSX_FUNCTION_NAME	yuv420_argb_lsx
+#define STD_FUNCTION_NAME	yuv420_argb_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_ARGB
+#include "yuv_rgb_lsx_func.h"
+
+#define LSX_FUNCTION_NAME	yuv420_abgr_lsx
+#define STD_FUNCTION_NAME	yuv420_abgr_std
+#define YUV_FORMAT			YUV_FORMAT_420
+#define RGB_FORMAT			RGB_FORMAT_ABGR
+#include "yuv_rgb_lsx_func.h"
+
+#endif  //__loongarch_sx
+
 #endif /* SDL_HAVE_YUV */
diff --git a/src/video/yuv2rgb/yuv_rgb.h b/src/video/yuv2rgb/yuv_rgb.h
index 81d97ebdcda..5668c0fc419 100644
--- a/src/video/yuv2rgb/yuv_rgb.h
+++ b/src/video/yuv2rgb/yuv_rgb.h
@@ -379,3 +379,34 @@ void rgb24_yuv420_sseu(
 	uint8_t *y, uint8_t *u, uint8_t *v, uint32_t y_stride, uint32_t uv_stride, 
 	YCbCrType yuv_type);
 
+
+//yuv420 to bgra, lsx implementation
+void yuv420_rgb24_lsx(
+	uint32_t width, uint32_t height,
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+	uint8_t *rgb, uint32_t rgb_stride,
+	YCbCrType yuv_type);
+
+void yuv420_rgba_lsx(
+	uint32_t width, uint32_t height,
+	const uint8_t *y, const uint8_t *v, const uint8_t *u, uint32_t y_stride, uint32_t uv_stride,
+	uint8_t *rgb, uint32_t rgb_stride,
+	YCbCrType yuv_type);
+
+void yuv420_bgra_lsx(
+	uint32_t width, uint32_t height,
+	const uint8_t *y, const uint8_t *v, const uint8_t *u, uint32_t y_stride, uint32_t uv_stride,
+	uint8_t *rgb, uint32_t rgb_stride,
+	YCbCrType yuv_type);
+
+void yuv420_argb_lsx(
+	uint32_t width, uint32_t height,
+	const uint8_t *y, const uint8_t *v, const uint8_t *u, uint32_t y_stride, uint32_t uv_stride,
+	uint8_t *rgb, uint32_t rgb_stride,
+	YCbCrType yuv_type);
+
+void yuv420_abgr_lsx(
+	uint32_t width, uint32_t height,
+	const uint8_t *y, const uint8_t *v, const uint8_t *u, uint32_t y_stride, uint32_t uv_stride,
+	uint8_t *rgb, uint32_t rgb_stride,
+	YCbCrType yuv_type);
diff --git a/src/video/yuv2rgb/yuv_rgb_lsx_func.h b/src/video/yuv2rgb/yuv_rgb_lsx_func.h
new file mode 100644
index 00000000000..89d582a6a4f
--- /dev/null
+++ b/src/video/yuv2rgb/yuv_rgb_lsx_func.h
@@ -0,0 +1,372 @@
+// Copyright 2016 Adrien Descamps
+// // Distributed under BSD 3-Clause License
+
+#include <lsxintrin.h>
+
+#if YUV_FORMAT == YUV_FORMAT_420
+
+#define READ_Y(y_ptr)                                     \
+    y = __lsx_vld(y_ptr, 0);                              \
+
+#define READ_UV                                           \
+    u_temp = __lsx_vld(u_ptr, 0);                         \
+    v_temp = __lsx_vld(v_ptr, 0);                         \
+
+#else
+#error READ_UV unimplemented
+#endif
+
+#define PACK_RGBA_32(R1, R2, G1, G2, B1, B2, A1, A2, RGB1, RGB2,       \
+                     RGB3, RGB4, RGB5, RGB6, RGB7, RGB8)               \
+{                                       \
+    __m128i ab_l, ab_h, gr_l, gr_h;     \
+    ab_l = __lsx_vilvl_b(B1, A1);       \
+    ab_h = __lsx_vilvh_b(B1, A1);       \
+    gr_l = __lsx_vilvl_b(R1, G1);       \
+    gr_h = __lsx_vilvh_b(R1, G1);       \
+    RGB1 = __lsx_vilvl_h(gr_l, ab_l);   \
+    RGB2 = __lsx_vilvh_h(gr_l, ab_l);   \
+    RGB3 = __lsx_vilvl_h(gr_h, ab_h);   \
+    RGB4 = __lsx_vilvh_h(gr_h, ab_h);   \
+    ab_l = __lsx_vilvl_b(B2, A2);       \
+    ab_h = __lsx_vilvh_b(B2, A2);       \
+    gr_l = __lsx_vilvl_b(R2, G2);       \
+    gr_h = __lsx_vilvh_b(R2, G2);       \
+    RGB5 = __lsx_vilvl_h(gr_l, ab_l);   \
+    RGB6 = __lsx_vilvh_h(gr_l, ab_l);   \
+    RGB7 = __lsx_vilvl_h(gr_h, ab_h);   \
+    RGB8 = __lsx_vilvh_h(gr_h, ab_h);   \
+}
+
+#define PACK_RGB24_32_STEP(R, G, B, RGB1, RGB2, RGB3)        \
+    RGB1 = __lsx_vilvl_b(G, R);                              \
+    RGB1 = __lsx_vshuf_b(B, RGB1, mask1);                    \
+    RGB2 = __lsx_vshuf_b(B, G, mask2);                       \
+    RGB2 = __lsx_vshuf_b(R, RGB2, mask3);                    \
+    RGB3 = __lsx_vshuf_b(R, B, mask4);                       \
+    RGB3 = __lsx_vshuf_b(G, RGB3, mask5);                    \
+
+#define PACK_RGB24_32(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6)  \
+    PACK_RGB24_32_STEP(R1, G1, B1, RGB1, RGB2, RGB3);                              \
+    PACK_RGB24_32_STEP(R2, G2, B2, RGB4, RGB5, RGB6);                              \
+
+#if RGB_FORMAT == RGB_FORMAT_RGB24
+
+#define PACK_PIXEL                                                             \
+    __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6;                          \
+    __m128i rgb_7, rgb_8, rgb_9, rgb_10, rgb_11, rgb_12;                       \
+    PACK_RGB24_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12,              \
+                  rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6)                    \
+    PACK_RGB24_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22,              \
+                  rgb_7, rgb_8, rgb_9, rgb_10, rgb_11, rgb_12)                 \
+
+#elif RGB_FORMAT == RGB_FORMAT_RGBA
+
+#define PACK_PIXEL                                                              \
+    __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8;             \
+    __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16;      \
+    __m128i a = __lsx_vldi(0xFF);                                               \
+    PACK_RGBA_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, a, a,          \
+                 rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8)        \
+    PACK_RGBA_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, a, a,          \
+                 rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \
+
+#elif RGB_FORMAT == RGB_FORMAT_BGRA
+
+#define PACK_PIXEL                                                              \
+    __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8;             \
+    __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16;      \
+    __m128i a = __lsx_vldi(0xFF);                                               \
+    PACK_RGBA_32(b_8_11, b_8_12, g_8_11, g_8_12, r_8_11, r_8_12, a, a,          \
+                 rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8)        \
+    PACK_RGBA_32(b_8_21, b_8_22, g_8_21, g_8_22, r_8_21, r_8_22, a, a,          \
+                 rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \
+
+#elif RGB_FORMAT == RGB_FORMAT_ARGB
+
+#define PACK_PIXEL                                                              \
+    __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8;             \
+    __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16;      \
+    __m128i a = __lsx_vldi(0xFF);                                               \
+    PACK_RGBA_32(a, a, r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12,          \
+                 rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8)        \
+    PACK_RGBA_32(a, a, r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22,          \
+                 rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \
+
+#elif RGB_FORMAT == RGB_FORMAT_ABGR
+
+#define PACK_PIXEL                                                              \
+    __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8;             \
+    __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16;      \
+    __m128i a = __lsx_vldi(0xFF);                                               \
+    PACK_RGBA_32(a, a, b_8_11, b_8_12, g_8_11, g_8_12, r_8_11, r_8_12,          \
+                 rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8)        \
+    PACK_RGBA_32(a, a, b_8_21, b_8_22, g_8_21, g_8_22, r_8_21, r_8_22,          \
+                 rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \
+
+#else
+#error PACK_PIXEL unimplemented
+#endif
+
+#define LSX_ST_UB2(in0, in1, pdst, stride)                      \
+{                                                               \
+    __lsx_vst(in0, pdst, 0);                                    \
+    __lsx_vst(in1, pdst + stride, 0);                           \
+}
+
+#if RGB_FORMAT == RGB_FORMAT_RGB24                              \
+
+#define SAVE_LINE1                                              \
+    LSX_ST_UB2(rgb_1, rgb_2, rgb_ptr1, 16);                     \
+    LSX_ST_UB2(rgb_3, rgb_4, rgb_ptr1 + 32, 16);                \
+    LSX_ST_UB2(rgb_5, rgb_6, rgb_ptr1 + 64, 16);                \
+
+#define SAVE_LINE2                                              \
+    LSX_ST_UB2(rgb_7, rgb_8, rgb_ptr2, 16);                     \
+    LSX_ST_UB2(rgb_9, rgb_10, rgb_ptr2 + 32, 16);               \
+    LSX_ST_UB2(rgb_11, rgb_12, rgb_ptr2 + 64, 16);              \
+
+#elif RGB_FORMAT == RGB_FORMAT_RGBA || RGB_FORMAT == RGB_FORMAT_BGRA ||  \
+    RGB_FORMAT == RGB_FORMAT_ARGB || RGB_FORMAT == RGB_FORMAT_ABGR       \
+
+#define SAVE_LINE1                                              \
+    LSX_ST_UB2(rgb_1, rgb_2, rgb_ptr1, 16);                     \
+    LSX_ST_UB2(rgb_3, rgb_4, rgb_ptr1 + 32, 16);                \
+    LSX_ST_UB2(rgb_5, rgb_6, rgb_ptr1 + 64, 16);                \
+    LSX_ST_UB2(rgb_7, rgb_8, rgb_ptr1 + 96, 16);                \
+
+#define SAVE_LINE2                                              \
+    LSX_ST_UB2(rgb_9,  rgb_10, rgb_ptr2, 16);                   \
+    LSX_ST_UB2(rgb_11, rgb_12, rgb_ptr2 + 32, 16);              \
+    LSX_ST_UB2(rgb_13, rgb_14, rgb_ptr2 + 64, 16);              \
+    LSX_ST_UB2(rgb_15, rgb_16, rgb_ptr2 + 96, 16);              \
+
+#else
+#error SAVE_LINE unimplemented
+#endif
+
+// = u*vr g=u*ug+v*vg b=u*ub
+#define UV2RGB_16(U, V, R1, G1, B1, R2, G2, B2)     \
+    r_temp = __lsx_vmul_h(V, v2r);                  \
+    g_temp = __lsx_vmul_h(U, u2g);                  \
+    g_temp = __lsx_vmadd_h(g_temp, V, v2g);         \
+    b_temp = __lsx_vmul_h(U, u2b);                  \
+    R1     = __lsx_vilvl_h(r_temp, r_temp);         \
+    G1     = __lsx_vilvl_h(g_temp, g_temp);         \
+    B1     = __lsx_vilvl_h(b_temp, b_temp);         \
+    R2     = __lsx_vilvh_h(r_temp, r_temp);         \
+    G2     = __lsx_vilvh_h(g_temp, g_temp);         \
+    B2     = __lsx_vilvh_h(b_temp, b_temp);         \
+
+// Y=(Y-shift)*shift R=(Y+R)>>6,G=(Y+G)>>6,B=(B+Y)>>6
+#define ADD_Y2RGB_16(Y1, Y2, R1, G1, B1, R2, G2, B2)        \
+    Y1 = __lsx_vsub_h(Y1, shift);                           \
+    Y2 = __lsx_vsub_h(Y2, shift);                           \
+    Y1 = __lsx_vmul_h(Y1, yf);                              \
+    Y2 = __lsx_vmul_h(Y2, yf);                              \
+    R1 = __lsx_vadd_h(R1, Y1);                              \
+    G1 = __lsx_vadd_h(G1, Y1);                              \
+    B1 = __lsx_vadd_h(B1, Y1);                              \
+    R2 = __lsx_vadd_h(R2, Y2);                              \
+    G2 = __lsx_vadd_h(G2, Y2);                              \
+    B2 = __lsx_vadd_h(B2, Y2);                              \
+    R1 = __lsx_vsrai_h(R1, PRECISION);                      \
+    G1 = __lsx_vsrai_h(G1, PRECISION);                      \
+    B1 = __lsx_vsrai_h(B1, PRECISION);                      \
+    R2 = __lsx_vsrai_h(R2, PRECISION);                      \
+    G2 = __lsx_vsrai_h(G2, PRECISION);                      \
+    B2 = __lsx_vsrai_h(B2, PRECISION);                      \
+
+#define CLIP(in0, in1, in2, in3, in4, in5)       \
+{                                                \
+    in0 = __lsx_vmaxi_h(in0, 0);                 \
+    in1 = __lsx_vmaxi_h(in1, 0);                 \
+    in2 = __lsx_vmaxi_h(in2, 0);                 \
+    in3 = __lsx_vmaxi_h(in3, 0);                 \
+    in4 = __lsx_vmaxi_h(in4, 0);                 \
+    in5 = __lsx_vmaxi_h(in5, 0);                 \
+    in0 = __lsx_vsat_hu(in0, 7);                 \
+    in1 = __lsx_vsat_hu(in1, 7);                 \
+    in2 = __lsx_vsat_hu(in2, 7);                 \
+    in3 = __lsx_vsat_hu(in3, 7);                 \
+    in4 = __lsx_vsat_hu(in4, 7);                 \
+    in5 = __lsx_vsat_hu(in5, 7);                 \
+}
+
+#define YUV2RGB_32                                            \
+    __m128i y, u_temp, v_temp;                                \
+    __m128i r_8_11, g_8_11, b_8_11, r_8_21, g_8_21, b_8_21;   \
+    __m128i r_8_12, g_8_12, b_8_12, r_8_22, g_8_22, b_8_22;   \
+    __m128i u, v, r_temp, g_temp, b_temp;                     \
+    __m128i r_1, g_1, b_1, r_2, g_2, b_2;                     \
+    __m128i y_1, y_2;                                         \
+    __m128i r_uv_1, g_uv_1, b_uv_1, r_uv_2, g_uv_2, b_uv_2;   \
+                                                              \
+    READ_UV                                                   \
+                                                              \
+    /* process first 16 pixels of first line */               \
+    u = __lsx_vilvl_b(zero, u_temp);                          \
+    v = __lsx_vilvl_b(zero, v_temp);                          \
+    u = __lsx_vsub_h(u, bias);                                \
+    v = __lsx_vsub_h(v, bias);                                \
+    UV2RGB_16(u, v, r_1, g_1, b_1, r_2, g_2, b_2);            \
+    r_uv_1 = r_1; g_uv_1 = g_1; b_uv_1 = b_1;                 \
+    r_uv_2 = r_2; g_uv_2 = g_2; b_uv_2 = b_2;                 \
+    READ_Y(y_ptr1)                                            \
+    y_1 = __lsx_vilvl_b(zero, y);                             \
+    y_2 = __lsx_vilvh_b(zero, y);                             \
+    ADD_Y2RGB_16(y_1, y_2, r_1, g_1, b_1, r_2, g_2, b_2)      \
+    CLIP(r_1, g_1, b_1, r_2, g_2, b_2);                       \
+    r_8_11 = __lsx_vpickev_b(r_2, r_1);                       \
+    g_8_11 = __lsx_vpickev_b(g_2, g_1);                       \
+    b_8_11 = __lsx_vpickev_b(b_2, b_1);                       \
+                                                              \
+    /* process first 16 pixels of second line */              \
+    r_1 = r_uv_1; g_1 = g_uv_1; b_1 = b_uv_1;                 \
+    r_2 = r_uv_2; g_2 = g_uv_2; b_2 = b_uv_2;                 \
+                                                              \
+    READ_Y(y_ptr2)                                            \
+    y_1 = __lsx_vilvl_b(zero, y);                             \
+    y_2 = __lsx_vilvh_b(zero, y);                             \
+    ADD_Y2RGB_16(y_1, y_2, r_1, g_1, b_1, r_2, g_2, b_2)      \
+    CLIP(r_1, g_1, b_1, r_2, g_2, b_2);                       \
+    r_8_21 = __lsx_vpickev_b(r_2, r_1);                       \
+    g_8_21 = __lsx_vpickev_b(g_2, g_1);                       \
+    b_8_21 = __lsx_vpickev_b(b_2, b_1);                       \
+                                                              \
+    /* process last 16 pixels of first line */                \
+    u = __lsx_vilvh_b(zero, u_temp);                          \
+    v = __lsx_vilvh_b(zero, v_temp);                          \
+    u = __lsx_vsub_h(u, bias);                                \
+    v = __lsx_vsub_h(v, bias);                                \
+    UV2RGB_16(u, v, r_1, g_1, b_1, r_2, g_2, b_2);            \
+    r_uv_1 = r_1; g_uv_1 = g_1; b_uv_1 = b_1;                 \
+    r_uv_2 = r_2; g_uv_2 = g_2; b_uv_2 = b_2;                 \
+    READ_Y(y_ptr1 + 16 * y_pixel_stride)                      \
+    y_1 = __lsx_vilvl_b(zero, y);                             \
+    y_2 = __lsx_vilvh_b(zero, y);                             \
+    ADD_Y2RGB_16(y_1, y_2, r_1, g_1, b_1, r_2, g_2, b_2)      \
+    CLIP(r_1, g_1, b_1, r_2, g_2, b_2);                       \
+    r_8_12 = __lsx_vpickev_b(r_2, r_1);                       \
+    g_8_12 = __lsx_vpickev_b(g_2, g_1);                       \
+    b_8_12 = __lsx_vpickev_b(b_2, b_1);                       \
+                                                              \
+   /* process last 16 pixels of second line */                \
+    r_1 = r_uv_1; g_1 = g_uv_1; b_1 = b_uv_1;                 \
+    r_2 = r_uv_2; g_2 = g_uv_2; b_2 = b_uv_2;                 \
+                                                              \
+    READ_Y(y_ptr2 + 16 * y_pixel_stride)                      \
+    y_1 = __lsx_vilvl_b(zero, y);                             \
+    y_2 = __lsx_vilvh_b(zero, y);                             \
+    ADD_Y2RGB_16(y_1, y_2, r_1, g_1, b_1, r_2, g_2, b_2)      \
+    CLIP(r_1, g_1, b_1, r_2, g_2, b_2);                       \
+    r_8_22 = __lsx_vpickev_b(r_2, r_1);                       \
+    g_8_22 = __lsx_vpickev_b(g_2, g_1);                       \
+    b_8_22 = __lsx_vpickev_b(b_2, b_1);                       \
+                                                              \
+
+void LSX_FUNCTION_NAME(uint32_t width, uint32_t height, const uint8_t *Y,
+                       const uint8_t *U, const uint8_t *V, uint32_t Y_stride,
+                       uint32_t UV_stride, uint8_t *RGB, uint32_t RGB_stride,
+                       YCbCrType yuv_type)
+{
+    const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]);
+#if YUV_FORMAT == YUV_FORMAT_420
+    const int y_pixel_stride = 1;
+    const int uv_pixel_stride = 1;
+    const int uv_x_sample_interval = 2;
+    const int uv_y_sample_interval = 2;
+#endif
+
+#if RGB_FORMAT == RGB_FORMAT_RGB565
+    const int rgb_pixel_stride = 2;
+#elif RGB_FORMAT == RGB_FORMAT_RGB24
+    const int rgb_pixel_stride = 3;
+    __m128i mask1 = {0x0504110302100100, 0x0A14090813070612};
+    __m128i mask2 = {0x1808170716061505, 0x00000000000A1909};
+    __m128i mask3 = {0x0504170302160100, 0x0A1A090819070618};
+    __m128i mask4 = {0x1E0D1D0C1C0B1B0A, 0x00000000000F1F0E};
+    __m128i mask5 = {0x05041C03021B0100, 0x0A1F09081E07061D};
+#elif RGB_FORMAT == RGB_FORMAT_RGBA || RGB_FORMAT_BGRA || \
+    RGB_FORMAT == RGB_FORMAT_ARGB || RGB_FORMAT_ABGR
+    const int rgb_pixel_stride = 4;
+#else
+#error Unknown RGB pixel size
+#endif
+
+    uint32_t xpos, ypos;
+    __m128i v2r   = __lsx_vreplgr2vr_h(param->v_r_factor);
+    __m128i v2g   = __lsx_vreplgr2vr_h(param->v_g_factor);
+    __m128i u2g   = __lsx_vreplgr2vr_h(param->u_g_factor);
+    __m128i u2b   = __lsx_vreplgr2vr_h(param->u_b_factor);
+    __m128i bias  = __lsx_vreplgr2vr_h(128);
+    __m128i shift = __lsx_vreplgr2vr_h(param->y_shift);
+    __m128i yf    = __lsx_vreplgr2vr_h(param->y_factor);
+    __m128i zero  = __lsx_vldi(0);
+
+    if (width >= 32) {
+        for (ypos = 0; ypos < (height - (uv_y_sample_interval - 1)); ypos += uv_y_sample_interval) {
+            const uint8_t *y_ptr1 = Y + ypos * Y_stride,
+                          *y_ptr2 = Y + (ypos + 1) * Y_stride,
+                          *u_ptr  = U + (ypos/uv_y_sample_interval) * UV_stride,
+                          *v_ptr  = V + (ypos/uv_y_sample_interval) * UV_stride;
+            uint8_t *rgb_ptr1 = RGB + ypos * RGB_stride,
+                    *rgb_ptr2 = RGB + (ypos + 1) * RGB_stride;
+
+            for (xpos = 0; xpos < (width - 31); xpos += 32){
+                YUV2RGB_32
+                {
+                    PACK_PIXEL
+                    SAVE_LINE1
+                    if (uv_y_sample_interval > 1)
+                    {
+                        SAVE_LINE2
+                    }
+                }
+                y_ptr1   += 32 * y_pixel_stride;
+                y_ptr2   += 32 * y_pixel_stride;
+                u_ptr    += 32 * uv_pixel_stride/uv_x_sample_interval;
+                v_ptr    += 32 * uv_pixel_stride/uv_x_sample_interval;
+                rgb_ptr1 += 32 * rgb_pixel_stride;
+                rgb_ptr2 += 32 * rgb_pixel_stride;
+            }
+        }
+        if (uv_y_sample_interval == 2 && ypos == (height - 1)) {
+            const uint8_t *y_ptr = Y + ypos * Y_stride,
+                          *u_ptr = U + (ypos/uv_y_sample_interval) * UV_stride,
+                          *v_ptr = V + (ypos/uv_y_sample_interval) * UV_stride;
+            uint8_t *rgb_ptr = RGB + ypos * RGB_stride;
+            STD_FUNCTION_NAME(width, 1, y_ptr, u_ptr, v_ptr, Y_stride, UV_stride, rgb_ptr, RGB_stride, yuv_type);
+        }
+    }
+    {
+        int converted = (width & ~31);
+        if (converted != width)
+        {
+            const uint8_t *y_ptr = Y + converted * y_pixel_stride,
+                          *u_ptr = U + converted * uv_pixel_stride / uv_x_sample_interval,
+                          *v_ptr = V + converted * uv_pixel_stride / uv_x_sample_interval;
+            uint8_t *rgb_ptr = RGB + converted * rgb_pixel_stride;
+
+            STD_FUNCTION_NAME(width-converted, height, y_ptr, u_ptr, v_ptr, Y_stride, UV_stride, rgb_ptr, RGB_stride, yuv_type);
+        }
+    }
+}
+
+#undef LSX_FUNCTION_NAME
+#undef STD_FUNCTION_NAME
+#undef YUV_FORMAT
+#undef RGB_FORMAT
+#undef LSX_ALIGNED
+#undef LSX_ST_UB2
+#undef UV2RGB_16
+#undef ADD_Y2RGB_16
+#undef PACK_RGB24_32_STEP
+#undef PACK_RGB24_32
+#undef PACK_PIXEL
+#undef PACK_RGBA_32
+#undef SAVE_LINE1
+#undef SAVE_LINE2
+#undef READ_Y
+#undef READ_UV
+#undef YUV2RGB_32