From 8d65942b9ce51097a1e0d1afbc09f4861935ce75 Mon Sep 17 00:00:00 2001
From: Mārtiņš Možeiko <martins.mozeiko@gmail.com>
Date: Tue, 27 Aug 2024 22:31:14 -0700
Subject: [PATCH] Faster SSE4.1 and AVX2 SIMD blitters

---
src/video/SDL_blit_A.c | 70 ++++++++++++++++++++++--------------------
 1 file changed, 36 insertions(+), 34 deletions(-)

diff --git a/src/video/SDL_blit_A.c b/src/video/SDL_blit_A.c
index d67ce8778281a..db13e74cc42df 100644
--- a/src/video/SDL_blit_A.c
+++ b/src/video/SDL_blit_A.c
@@ -1036,28 +1036,29 @@ static void SDL_TARGETING("sse4.1") Blit8888to8888PixelAlphaSwizzleSSE41(SDL_Bli
// Set the alpha channels of src to 255
src128 = _mm_or_si128(src128, alpha_fill_mask);
- __m128i src_lo = _mm_unpacklo_epi8(src128, _mm_setzero_si128());
- __m128i src_hi = _mm_unpackhi_epi8(src128, _mm_setzero_si128());
+ // Duplicate each 8-bit alpha value into both bytes of 16-bit lanes
+ __m128i srca_lo = _mm_unpacklo_epi8(srcA, srcA);
+ __m128i srca_hi = _mm_unpackhi_epi8(srcA, srcA);
- __m128i dst_lo = _mm_unpacklo_epi8(dst128, _mm_setzero_si128());
- __m128i dst_hi = _mm_unpackhi_epi8(dst128, _mm_setzero_si128());
+ // Calculate 255-srcA in every second 8-bit lane (255-srcA = srcA^0xff)
+ srca_lo = _mm_xor_si128(srca_lo, _mm_set1_epi16(0xff00));
+ srca_hi = _mm_xor_si128(srca_hi, _mm_set1_epi16(0xff00));
- __m128i srca_lo = _mm_unpacklo_epi8(srcA, _mm_setzero_si128());
- __m128i srca_hi = _mm_unpackhi_epi8(srcA, _mm_setzero_si128());
+ // maddubs expects second argument to be signed, so subtract 128
+ src128 = _mm_sub_epi8(src128, _mm_set1_epi8((char)128));
+ dst128 = _mm_sub_epi8(dst128, _mm_set1_epi8((char)128));
- // dst = ((src - dst) * srcA) + ((dst << 8) - dst)
- dst_lo = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_lo, dst_lo), srca_lo),
- _mm_sub_epi16(_mm_slli_epi16(dst_lo, 8), dst_lo));
- dst_hi = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_hi, dst_hi), srca_hi),
- _mm_sub_epi16(_mm_slli_epi16(dst_hi, 8), dst_hi));
+ // dst = srcA*(src-128) + (255-srcA)*(dst-128) = srcA*src + (255-srcA)*dst - 128*255
+ __m128i dst_lo = _mm_maddubs_epi16(srca_lo, _mm_unpacklo_epi8(src128, dst128));
+ __m128i dst_hi = _mm_maddubs_epi16(srca_hi, _mm_unpackhi_epi8(src128, dst128));
- // dst += 0x1U (use 0x80 to round instead of floor)
- dst_lo = _mm_add_epi16(dst_lo, _mm_set1_epi16(1));
- dst_hi = _mm_add_epi16(dst_hi, _mm_set1_epi16(1));
+ // dst += 0x1U (use 0x80 to round instead of floor) + 128*255 (to undo the -128 bias applied before maddubs)
+ dst_lo = _mm_add_epi16(dst_lo, _mm_set1_epi16(1 + 128*255));
+ dst_hi = _mm_add_epi16(dst_hi, _mm_set1_epi16(1 + 128*255));
- // dst = (dst + (dst >> 8)) >> 8
- dst_lo = _mm_srli_epi16(_mm_add_epi16(dst_lo, _mm_srli_epi16(dst_lo, 8)), 8);
- dst_hi = _mm_srli_epi16(_mm_add_epi16(dst_hi, _mm_srli_epi16(dst_hi, 8)), 8);
+ // dst = (dst + (dst >> 8)) >> 8 = (dst * 257) >> 16
+ dst_lo = _mm_mulhi_epu16(dst_lo, _mm_set1_epi16(257));
+ dst_hi = _mm_mulhi_epu16(dst_hi, _mm_set1_epi16(257));
// Blend the pixels together and save the result
_mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(dst_lo, dst_hi));
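The arithmetic above packs a lot into maddubs, so here is a minimal scalar sketch that mirrors each step of the 16-bit lane math and checks it against plain floor((srcA*src + (255-srcA)*dst) / 255) over all 8-bit inputs. The helper names (blend_reference, blend_maddubs) are hypothetical, not part of the patch:

/* Hypothetical verification harness, not part of the patch. */
#include <assert.h>
#include <stdint.h>

/* Reference blend for one channel: floor division by 255. */
static uint8_t blend_reference(uint8_t src, uint8_t dst, uint8_t a)
{
    return (uint8_t)((a * src + (255 - a) * dst) / 255);
}

/* The same steps the SIMD code performs in each 16-bit lane. */
static uint8_t blend_maddubs(uint8_t src, uint8_t dst, uint8_t a)
{
    int s = src - 128;                 /* bias into signed range, as before maddubs */
    int d = dst - 128;
    int x = a * s + (255 - a) * d;     /* what maddubs leaves in the lane */
    x += 1 + 128 * 255;                /* +1 for floor, +128*255 undoes the bias */
    return (uint8_t)((x * 257) >> 16); /* same as (x + (x >> 8)) >> 8 for 16-bit x */
}

int main(void)
{
    for (int a = 0; a < 256; a++)
        for (int s = 0; s < 256; s++)
            for (int d = 0; d < 256; d++)
                assert(blend_maddubs((uint8_t)s, (uint8_t)d, (uint8_t)a) ==
                       blend_reference((uint8_t)s, (uint8_t)d, (uint8_t)a));
    return 0;
}

Keeping the +1 (rather than 0x80) preserves the floor-division rounding of the code being replaced.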
@@ -1128,28 +1129,29 @@ static void SDL_TARGETING("avx2") Blit8888to8888PixelAlphaSwizzleAVX2(SDL_BlitIn
// Set the alpha channels of src to 255
src256 = _mm256_or_si256(src256, alpha_fill_mask);
- __m256i src_lo = _mm256_unpacklo_epi8(src256, _mm256_setzero_si256());
- __m256i src_hi = _mm256_unpackhi_epi8(src256, _mm256_setzero_si256());
+ // Duplicate each 8-bit alpha value into both bytes of 16-bit lanes
+ __m256i alpha_lo = _mm256_unpacklo_epi8(srcA, srcA);
+ __m256i alpha_hi = _mm256_unpackhi_epi8(srcA, srcA);
- __m256i dst_lo = _mm256_unpacklo_epi8(dst256, _mm256_setzero_si256());
- __m256i dst_hi = _mm256_unpackhi_epi8(dst256, _mm256_setzero_si256());
+ // Calculate 255-srcA in every second 8-bit lane (255-srcA = srcA^0xff)
+ alpha_lo = _mm256_xor_si256(alpha_lo, _mm256_set1_epi16(0xff00));
+ alpha_hi = _mm256_xor_si256(alpha_hi, _mm256_set1_epi16(0xff00));
- __m256i srca_lo = _mm256_unpacklo_epi8(srcA, _mm256_setzero_si256());
- __m256i srca_hi = _mm256_unpackhi_epi8(srcA, _mm256_setzero_si256());
+ // maddubs expects second argument to be signed, so subtract 128
+ src256 = _mm256_sub_epi8(src256, _mm256_set1_epi8((char)128));
+ dst256 = _mm256_sub_epi8(dst256, _mm256_set1_epi8((char)128));
- // dst = ((src - dst) * srcA) + ((dst << 8) - dst)
- dst_lo = _mm256_add_epi16(_mm256_mullo_epi16(_mm256_sub_epi16(src_lo, dst_lo), srca_lo),
- _mm256_sub_epi16(_mm256_slli_epi16(dst_lo, 8), dst_lo));
- dst_hi = _mm256_add_epi16(_mm256_mullo_epi16(_mm256_sub_epi16(src_hi, dst_hi), srca_hi),
- _mm256_sub_epi16(_mm256_slli_epi16(dst_hi, 8), dst_hi));
+ // dst = srcA*(src-128) + (255-srcA)*(dst-128) = srcA*src + (255-srcA)*dst - 128*255
+ __m256i dst_lo = _mm256_maddubs_epi16(alpha_lo, _mm256_unpacklo_epi8(src256, dst256));
+ __m256i dst_hi = _mm256_maddubs_epi16(alpha_hi, _mm256_unpackhi_epi8(src256, dst256));
- // dst += 0x1U (use 0x80 to round instead of floor)
- dst_lo = _mm256_add_epi16(dst_lo, _mm256_set1_epi16(1));
- dst_hi = _mm256_add_epi16(dst_hi, _mm256_set1_epi16(1));
+ // dst += 0x1U (use 0x80 to round instead of floor) + 128*255 (to undo the -128 bias applied before maddubs)
+ dst_lo = _mm256_add_epi16(dst_lo, _mm256_set1_epi16(1 + 128*255));
+ dst_hi = _mm256_add_epi16(dst_hi, _mm256_set1_epi16(1 + 128*255));
- // dst = (dst + (dst >> 8)) >> 8
- dst_lo = _mm256_srli_epi16(_mm256_add_epi16(dst_lo, _mm256_srli_epi16(dst_lo, 8)), 8);
- dst_hi = _mm256_srli_epi16(_mm256_add_epi16(dst_hi, _mm256_srli_epi16(dst_hi, 8)), 8);
+ // dst = (dst + (dst >> 8)) >> 8 = (dst * 257) >> 16
+ dst_lo = _mm256_mulhi_epu16(dst_lo, _mm256_set1_epi16(257));
+ dst_hi = _mm256_mulhi_epu16(dst_hi, _mm256_set1_epi16(257));
// Blend the pixels together and save the result
_mm256_storeu_si256((__m256i *)dst, _mm256_packus_epi16(dst_lo, dst_hi));
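For reference, the alpha-lane setup is identical in both hunks: unpacking srcA with itself duplicates each alpha byte into both halves of a 16-bit lane, and the XOR with 0xff00 rewrites the high byte as 255-srcA (valid because 255-a == a^0xff for any 8-bit a). A minimal sketch of that identity on a single lane, not part of the patch:

/* Hypothetical sketch of one alpha lane, not part of the patch. */
#include <assert.h>
#include <stdint.h>

int main(void)
{
    for (int a = 0; a < 256; a++) {
        uint16_t lane = (uint16_t)((a << 8) | a); /* unpack a with itself */
        lane ^= 0xff00;                           /* flip the high byte   */
        assert((lane & 0xff) == a);               /* low byte:  srcA      */
        assert((lane >> 8) == 255 - a);           /* high byte: 255-srcA  */
    }
    return 0;
}

With the alpha lane holding [srcA | 255-srcA] and the interleaved pixels holding [src | dst], one maddubs yields srcA*src + (255-srcA)*dst per lane. The only AVX2-specific wrinkle is that unpack and packus operate within each 128-bit half; that cancels out here because the same interleave order is used on both ends.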