Two small blitting patches

  1. An obvious elimination of two branches in the inner RLE loop for x86
    by introducing SDL_memcpy4(). I don't have an x86 handy, so please test
    (running Maelstrom in 32bpp should be enough).
  2. The promised optimised handling of per-surface alpha=128. Until now
    this had only been optimised for colourkeyed RLE blits; this patch adds
    it for rectangular blits (888, 565 and 555).

None of this is rocket science, but I wrote an automatic blit tester to
verify the second part, so it has a good chance of being correct for a change.
The basic idea is to grow that into a set of tests that we can run whenever
something changes.

CETERUM CENSEO, DUFFS_LOOP ESSE DELENDAM

Index: src/video/SDL_RLEaccel.c
===================================================================
RCS file: /cvs/SDL/src/video/SDL_RLEaccel.c,v
retrieving revision 1.3.2.22
diff -u -r1.3.2.22 SDL_RLEaccel.c
--- src/video/SDL_RLEaccel.c 2001/02/17 17:52:09 1.3.2.22
+++ src/video/SDL_RLEaccel.c 2001/04/05 13:06:55
@@ -109,12 +109,21 @@
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif

+#define PIXEL_COPY(to, from, len, bpp)
+do { \

  • if(bpp == 4) { \
  • SDL_memcpy4(to, from, (unsigned)(len)); \
  • } else { \
  • SDL_memcpy(to, from, (unsigned)(len) * (bpp)); \
  • }
    +} while(0)

/*

  • Various colorkey blit methods, for opaque and per-surface alpha
    */

#define OPAQUE_BLIT(to, from, length, bpp, alpha) \

  • SDL_memcpy(to, from, (unsigned)(length * bpp))
  • PIXEL_COPY(to, from, length, bpp)

/*

  • For 32bpp pixels on the form 0x00rrggbb:
    @@ -657,9 +666,9 @@
    if(crun > right - cofs)
    crun = right - cofs;
    if(crun > 0) \
  •   	SDL_memcpy(dstbuf + cofs * sizeof(Ptype),	  \
    
  •   	PIXEL_COPY(dstbuf + cofs * sizeof(Ptype),	  \
      		   srcbuf + (cofs - ofs) * sizeof(Ptype), \
    
  •   		   (unsigned)crun * sizeof(Ptype));	  \
    
  •   		   (unsigned)crun, sizeof(Ptype));	  \
          srcbuf += run * sizeof(Ptype);			  \
          ofs += run;						  \
      } else if(!ofs)						  \
    

@@ -816,8 +825,8 @@
run = ((Ctype *)srcbuf)[1];
srcbuf += 2 * sizeof(Ctype);
if(run) { \

  •   	SDL_memcpy(dstbuf + ofs * sizeof(Ptype), srcbuf, \
    
  •   		   run * sizeof(Ptype));		 \
    
  •   	PIXEL_COPY(dstbuf + ofs * sizeof(Ptype), srcbuf, \
    
  •   		   run, sizeof(Ptype));			 \
      	srcbuf += run * sizeof(Ptype);			 \
      	ofs += run;					 \
          } else if(!ofs)					 \
    

Index: src/video/SDL_blit_A.c
===================================================================
RCS file: /cvs/SDL/src/video/SDL_blit_A.c,v
retrieving revision 1.3.2.9
diff -u -r1.3.2.9 SDL_blit_A.c
--- src/video/SDL_blit_A.c 2001/03/04 17:36:19 1.3.2.9
+++ src/video/SDL_blit_A.c 2001/04/05 13:06:55
@@ -195,8 +195,8 @@
}
}

-/* fast RGB888->(A)RGB888 blending with surface alpha */
-static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
+/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
+static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
{
int width = info->d_width;
int height = info->d_height;
@@ -204,32 +204,58 @@
int srcskip = info->s_skip >> 2;
Uint32 *dstp = (Uint32 *)info->d_pixels;
int dstskip = info->d_skip >> 2;

  • SDL_PixelFormat *srcfmt = info->src;

  • unsigned alpha = srcfmt->alpha;

    while(height--) {
    DUFFS_LOOP4({

  •   Uint32 s;
    
  •   Uint32 d;
    
  •   Uint32 s1;
    
  •   Uint32 d1;
    
  •   s = *srcp;
    
  •   d = *dstp;
    
  •   s1 = s & 0xff00ff;
    
  •   d1 = d & 0xff00ff;
    
  •   d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
    
  •   s &= 0xff00;
    
  •   d &= 0xff00;
    
  •   d = (d + ((s - d) * alpha >> 8)) & 0xff00;
    
  •   *dstp = d1 | d | 0xff000000;
    
  •   ++srcp;
    
  •   ++dstp;
    
  •       Uint32 s = *srcp++;
    
  •       Uint32 d = *dstp;
    
  •       *dstp++ = (((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
    
  •   	    + (s & d & 0x00010101);
      }, width);
      srcp += srcskip;
      dstp += dstskip;
    
    }
    }

+/* fast RGB888->(A)RGB888 blending with surface alpha */
+static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
+{

  • unsigned alpha = info->src->alpha;
  • if(alpha == 128) {
  •   BlitRGBtoRGBSurfaceAlpha128(info);
    
  • } else {
  •   int width = info->d_width;
    
  •   int height = info->d_height;
    
  •   Uint32 *srcp = (Uint32 *)info->s_pixels;
    
  •   int srcskip = info->s_skip >> 2;
    
  •   Uint32 *dstp = (Uint32 *)info->d_pixels;
    
  •   int dstskip = info->d_skip >> 2;
    
  •   while(height--) {
    
  •   	DUFFS_LOOP4({
    
  •   		Uint32 s;
    
  •   		Uint32 d;
    
  •   		Uint32 s1;
    
  •   		Uint32 d1;
    
  •   		s = *srcp;
    
  •   		d = *dstp;
    
  •   		s1 = s & 0xff00ff;
    
  •   		d1 = d & 0xff00ff;
    
  •   		d1 = (d1 + ((s1 - d1) * alpha >> 8))
    
  •   		     & 0xff00ff;
    
  •   		s &= 0xff00;
    
  •   		d &= 0xff00;
    
  •   		d = (d + ((s - d) * alpha >> 8)) & 0xff00;
    
  •   		*dstp = d1 | d | 0xff000000;
    
  •   		++srcp;
    
  •   		++dstp;
    
  •   	}, width);
    
  •   	srcp += srcskip;
    
  •   	dstp += dstskip;
    
  •   }
    
  • }
    +}

/* fast ARGB888->(A)RGB888 blending with pixel alpha */
static void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info)
{
@@ -277,8 +303,18 @@
}
}

-/* fast RGB565->RGB565 blending with surface alpha */
-static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
+/* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
+
+/* blend a single 16 bit pixel at 50% */
+#define BLEND16_50(d, s, mask) \

  • ((((s & mask) + (d & mask)) >> 1) + (s & d & (~mask & 0xffff)))

+/* blend two 16 bit pixels at 50% */
+#define BLEND2x16_50(d, s, mask) \

  • (((s & (mask | mask << 16)) >> 1) + ((d & (mask | mask << 16)) >> 1) \
    • (s & d & (~(mask | mask << 16))))

+static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
{
int width = info->d_width;
int height = info->d_height;
@@ -286,56 +322,163 @@
int srcskip = info->s_skip >> 1;
Uint16 *dstp = (Uint16 *)info->d_pixels;
int dstskip = info->d_skip >> 1;

  • unsigned alpha = info->src->alpha >> 3; /* downscale alpha to 5 bits */

    while(height--) {

  •   DUFFS_LOOP4({
    
  •   Uint32 s = *srcp++;
    
  •   Uint32 d = *dstp;
    
  •   /*
    
  •    * shift out the middle component (green) to the high 16
    
  •    * bits, and process all three RGB components at the same
    
  •    * time.
    
  •    */
    
  •   s = (s | s << 16) & 0x07e0f81f;
    
  •   d = (d | d << 16) & 0x07e0f81f;
    
  •   d += (s - d) * alpha >> 5;
    
  •   d &= 0x07e0f81f;
    
  •   *dstp++ = d | d >> 16;
    
  •   }, width);
    
  •   srcp += srcskip;
    
  •   dstp += dstskip;
    
  •   if(((unsigned long)srcp ^ (unsigned long)dstp) & 2) {
    
  •   	/*
    
  •   	 * Source and destination not aligned, pipeline it.
    
  •   	 * This is mostly a win for big blits but no loss for
    
  •   	 * small ones
    
  •   	 */
    
  •   	Uint32 prev_sw;
    
  •   	int w = width;
    
  •   	/* handle odd destination */
    
  •   	if((unsigned long)dstp & 2) {
    
  •   		Uint16 d = *dstp, s = *srcp;
    
  •   		*dstp = BLEND16_50(d, s, mask);
    
  •   		dstp++;
    
  •   		srcp++;
    
  •   		w--;
    
  •   	}
    
  •   	srcp++;	/* srcp is now 32-bit aligned */
    
  •   	/* bootstrap pipeline with first halfword */
    
  •   	prev_sw = ((Uint32 *)srcp)[-1];
    
  •   	while(w > 1) {
    
  •   		Uint32 sw, dw, s;
    
  •   		sw = *(Uint32 *)srcp;
    
  •   		dw = *(Uint32 *)dstp;
    
  •   		if(SDL_BYTEORDER == SDL_BIG_ENDIAN)
    
  •   			s = (prev_sw << 16) + (sw >> 16);
    
  •   		else
    
  •   			s = (prev_sw >> 16) + (sw << 16);
    
  •   		prev_sw = sw;
    
  •   		*(Uint32 *)dstp = BLEND2x16_50(dw, s, mask);
    
  •   		dstp += 2;
    
  •   		srcp += 2;
    
  •   		w -= 2;
    
  •   	}
    
  •   	/* final pixel if any */
    
  •   	if(w) {
    
  •   		Uint16 d = *dstp, s;
    
  •   		if(SDL_BYTEORDER == SDL_BIG_ENDIAN)
    
  •   			s = prev_sw;
    
  •   		else
    
  •   			s = prev_sw >> 16;
    
  •   		*dstp = BLEND16_50(d, s, mask);
    
  •   		srcp++;
    
  •   		dstp++;
    
  •   	}
    
  •   	srcp += srcskip - 1;
    
  •   	dstp += dstskip;
    
  •   } else {
    
  •   	/* source and destination are aligned */
    
  •   	int w = width;
    
  •   	/* first odd pixel? */
    
  •   	if((unsigned long)srcp & 2) {
    
  •   		Uint16 d = *dstp, s = *srcp;
    
  •   		*dstp = BLEND16_50(d, s, mask);
    
  •   		srcp++;
    
  •   		dstp++;
    
  •   		w--;
    
  •   	}
    
  •   	/* srcp and dstp are now 32-bit aligned */
    
  •   	while(w > 1) {
    
  •   		Uint32 sw = *(Uint32 *)srcp;
    
  •   		Uint32 dw = *(Uint32 *)dstp;
    
  •   		*(Uint32 *)dstp = BLEND2x16_50(dw, sw, mask);
    
  •   		srcp += 2;
    
  •   		dstp += 2;
    
  •   		w -= 2;
    
  •   	}
    
  •   	/* last odd pixel? */
    
  •   	if(w) {
    
  •   		Uint16 d = *dstp, s = *srcp;
    
  •   		*dstp = BLEND16_50(d, s, mask);
    
  •   		srcp++;
    
  •   		dstp++;
    
  •   	}
    
  •   	srcp += srcskip;
    
  •   	dstp += dstskip;
    
  •   }
    
  • }
    +}

+/* fast RGB565->RGB565 blending with surface alpha */
+static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
+{

  • unsigned alpha = info->src->alpha;
  • if(alpha == 128) {
  •   Blit16to16SurfaceAlpha128(info, 0xf7de);
    
  • } else {
  •   int width = info->d_width;
    
  •   int height = info->d_height;
    
  •   Uint16 *srcp = (Uint16 *)info->s_pixels;
    
  •   int srcskip = info->s_skip >> 1;
    
  •   Uint16 *dstp = (Uint16 *)info->d_pixels;
    
  •   int dstskip = info->d_skip >> 1;
    
  •   alpha >>= 3;	/* downscale alpha to 5 bits */
    
  •   while(height--) {
    
  •   	DUFFS_LOOP4({
    
  •   		Uint32 s = *srcp++;
    
  •   		Uint32 d = *dstp;
    
  •   		/*
    
  •   		 * shift out the middle component (green) to
    
  •   		 * the high 16 bits, and process all three RGB
    
  •   		 * components at the same time.
    
  •   		 */
    
  •   		s = (s | s << 16) & 0x07e0f81f;
    
  •   		d = (d | d << 16) & 0x07e0f81f;
    
  •   		d += (s - d) * alpha >> 5;
    
  •   		d &= 0x07e0f81f;
    
  •   		*dstp++ = d | d >> 16;
    
  •   	}, width);
    
  •   	srcp += srcskip;
    
  •   	dstp += dstskip;
    
  •   }
    
    }
    }

/* fast RGB555->RGB555 blending with surface alpha */
static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
{

  • int width = info->d_width;
  • int height = info->d_height;
  • Uint16 *srcp = (Uint16 *)info->s_pixels;
  • int srcskip = info->s_skip >> 1;
  • Uint16 *dstp = (Uint16 *)info->d_pixels;
  • int dstskip = info->d_skip >> 1;
  • unsigned alpha = info->src->alpha >> 3; /* downscale alpha to 5 bits */
  • unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
  • if(alpha == 128) {
  •   Blit16to16SurfaceAlpha128(info, 0xfbde);
    
  • } else {
  •   int width = info->d_width;
    
  •   int height = info->d_height;
    
  •   Uint16 *srcp = (Uint16 *)info->s_pixels;
    
  •   int srcskip = info->s_skip >> 1;
    
  •   Uint16 *dstp = (Uint16 *)info->d_pixels;
    
  •   int dstskip = info->d_skip >> 1;
    
  •   alpha >>= 3;		/* downscale alpha to 5 bits */
    
  • while(height--) {
  •   DUFFS_LOOP4({
    
  •   Uint32 s = *srcp++;
    
  •   Uint32 d = *dstp;
    
  •   /*
    
  •    * shift out the middle component (green) to the high 16
    
  •    * bits, and process all three RGB components at the same
    
  •    * time.
    
  •    */
    
  •   s = (s | s << 16) & 0x03e07c1f;
    
  •   d = (d | d << 16) & 0x03e07c1f;
    
  •   d += (s - d) * alpha >> 5;
    
  •   d &= 0x03e07c1f;
    
  •   *dstp++ = d | d >> 16;
    
  •   }, width);
    
  •   srcp += srcskip;
    
  •   dstp += dstskip;
    
  •   while(height--) {
    
  •   	DUFFS_LOOP4({
    
  •   		Uint32 s = *srcp++;
    
  •   		Uint32 d = *dstp;
    
  •   		/*
    
  •   		 * shift out the middle component (green) to
    
  •   		 * the high 16 bits, and process all three RGB
    
  •   		 * components at the same time.
    
  •   		 */
    
  •   		s = (s | s << 16) & 0x03e07c1f;
    
  •   		d = (d | d << 16) & 0x03e07c1f;
    
  •   		d += (s - d) * alpha >> 5;
    
  •   		d &= 0x03e07c1f;
    
  •   		*dstp++ = d | d >> 16;
    
  •   	}, width);
    
  •   	srcp += srcskip;
    
  •   	dstp += dstskip;
    
  •   }
    
    }
    }

Index: src/video/SDL_memops.h
===================================================================
RCS file: /cvs/SDL/src/video/Attic/SDL_memops.h,v
retrieving revision 1.1.2.16
diff -u -r1.1.2.16 SDL_memops.h
--- src/video/SDL_memops.h 2001/02/22 01:14:05 1.1.2.16
+++ src/video/SDL_memops.h 2001/04/05 13:06:55
@@ -53,6 +53,17 @@
: "memory" );
} while(0)

+#define SDL_memcpy4(dst, src, len)
+do {

  • int ecx, edi, esi;
  • asm volatile (
  •   "cld\n\t"
    
  •   "rep ; movsl"
    
  •   : "=&c" (ecx), "=&D" (edi), "=&S" (esi)
    
  •   : "0" ((unsigned)(len)), "1" (dst), "2" (src)
    
  •   : "memory" );
    

+} while(0)
+
#define SDL_revcpy(dst, src, len)
do {
int u0, u1, u2;
@@ -104,9 +115,15 @@
#ifndef SDL_memcpy
#define SDL_memcpy(dst, src, len) memcpy(dst, src, len)
#endif
+
+#ifndef SDL_memcpy4
+#define SDL_memcpy4(dst, src, len) memcpy(dst, src, (len) << 2)
+#endif
+
#ifndef SDL_revcpy
#define SDL_revcpy(dst, src, len) memmove(dst, src, len)
#endif
+
#ifndef SDL_memset4
#define SDL_memset4(dst, val, len)
do { \