FillRect patch

Here’s an improvement to SDL_FillRect(), mainly for PowerPC but other
platforms should see some improvement for 16bpp as well. CC:ed to the list
so that PPC owners can flame me. Many thanks to Fingolfin for his heroic
testing efforts despite my silly typos.

Index: SDL_surface.c
===================================================================
RCS file: /cvs/SDL/src/video/SDL_surface.c,v
retrieving revision 1.8.2.25
diff -u -r1.8.2.25 SDL_surface.c
--- SDL_surface.c 2000/12/04 21:53:58 1.8.2.25
+++ SDL_surface.c 2001/01/27 00:20:50
@@ -474,92 +474,123 @@
dstrect->xdst->format->BytesPerPixel;
if ( dst->format->palette || (color == 0) ) {
x = dstrect->w
dst->format->BytesPerPixel;
-#ifdef SDL_memset4
if ( !color && !((long)row&3) && !(x&3) && !(dst->pitch&3) ) {

  •   	int n = x >> 2;
      	for ( y=dstrect->h; y; --y ) {
    
  •   		SDL_memset4(row, 0, x);
    
  •   		SDL_memset4(row, 0, n);
      		row += dst->pitch;
      	}
    
  •   } else
    

-#endif

  •   for ( y=dstrect->h; y; --y ) {
    

-#ifdef powerpc /* SIGBUS when using memset() ?? */

  •   	if ( color ) {
    
  •   		Uint8 *rowp = row;
    
  •   		int left = x;
    
  •   		while ( left-- ) { *rowp++ = color; }
    
  •   	} else {
    
  •   		Uint8 *rowp = row;
    
  •   		int left = x;
    
  •   		while ( left-- ) { *rowp++ = 0; }
    
  •   } else {
    

+#ifdef powerpc

  •   	/*
    
  •   	 * memset() on PPC (both glibc and codewarrior) uses
    
  •   	 * the dcbz (Data Cache Block Zero) instruction, which
    
  •   	 * causes an alignment exception if the destination is
    
  •   	 * uncachable, so only use it on software surfaces
    
  •   	 */
    
  •   	if((dst->flags & SDL_HWSURFACE) == SDL_HWSURFACE) {
    
  •   		if(dstrect->w >= 8) {
    
  •   			/*
    
  •   			 * 64-bit stores are probably most
    
  •   			 * efficient to uncached video memory
    
  •   			 */
    
  •   			double fill;
    
  •   			memset(&fill, color, 8);
    
  •   			for(y = dstrect->h; y; y--) {
    
  •   				Uint8 *d = row;
    
  •   				unsigned n = x;
    
  •   				unsigned nn;
    
  •   				Uint8 c = color;
    
  •   				double f = fill;
    
  •   				while((unsigned long)d
    
  •   				      & (sizeof(double) - 1)) {
    
  •   					*d++ = c;
    
  •   					n--;
    
  •   				}
    
  •   				nn = n / (sizeof(double) * 4);
    
  •   				while(nn) {
    
  •   					((double *)d)[0] = f;
    
  •   					((double *)d)[1] = f;
    
  •   					((double *)d)[2] = f;
    
  •   					((double *)d)[3] = f;
    
  •   					d += 4*sizeof(double);
    
  •   					nn--;
    
  •   				}
    
  •   				n &= ~(sizeof(double) * 4 - 1);
    
  •   				nn = n / sizeof(double);
    
  •   				while(nn) {
    
  •   					*(double *)d = f;
    
  •   					d += sizeof(double);
    
  •   					nn--;
    
  •   				}
    
  •   				n &= ~(sizeof(double) - 1);
    
  •   				while(n) {
    
  •   					*d++ = c;
    
  •   					n--;
    
  •   				}
    
  •   				row += dst->pitch;
    
  •   			}
    
  •   		} else {
    
  •   			/* narrow boxes */
    
  •   			for(y = dstrect->h; y; y--) {
    
  •   				Uint8 *d = row;
    
  •   				Uint8 c = color;
    
  •   				int n = x;
    
  •   				while(n) {
    
  •   					*d++ = c;
    
  •   					n--;
    
  •   				}
    
  •   				row += dst->pitch;
    
  •   			}
    
  •   		}
    
  •   	} else
    

+#endif /* powerpc */

  •   	{
    
  •   		for(y = dstrect->h; y; y--) {
    
  •   			memset(row, color, x);
    
  •   			row += dst->pitch;
    
  •   		}
      	}
    

-#else

  •   	memset(row, color, x);
    

-#endif

  •   	row += dst->pitch;
      }
    
    } else {
    switch (dst->format->BytesPerPixel) {
  •       case 2: {
    
  •   	Uint16 *pixels;
    
  •   case 2:
      	for ( y=dstrect->h; y; --y ) {
    
  •   		pixels = (Uint16 *)row;
    
  •   		for ( x=dstrect->w/4; x; --x ) {
    
  •   			*pixels++ = color;
    
  •   			*pixels++ = color;
    
  •   			*pixels++ = color;
    
  •   			*pixels++ = color;
    
  •   		}
    
  •   		switch (dstrect->w%4) {
    
  •   			case 3:
    
  •   				*pixels++ = color;
    
  •   			case 2:
    
  •   				*pixels++ = color;
    
  •   			case 1:
    
  •   				*pixels++ = color;
    
  •   		Uint16 *pixels = (Uint16 *)row;
    
  •   		Uint16 c = color;
    
  •   		Uint32 cc = (Uint32)c << 16 | c;
    
  •   		int n = dstrect->w;
    
  •   		if((unsigned long)pixels & 3) {
    
  •   			*pixels++ = c;
    
  •   			n--;
      		}
    
  •   		if(n >> 1)
    
  •   			SDL_memset4(pixels, cc, n >> 1);
    
  •   		if(n & 1)
    
  •   			pixels[n - 1] = c;
      		row += dst->pitch;
      	}
    
  •       }
    
  •       break;
    
  •   	break;
    
  •       case 3: {
    
  •   	Uint8 *pixels;
    
  •   case 3:
      	if(SDL_BYTEORDER == SDL_BIG_ENDIAN)
    
  •   	    color <<= 8;
    
  •   		color <<= 8;
      	for ( y=dstrect->h; y; --y ) {
    
  •   		pixels = row;
    
  •   		Uint8 *pixels = row;
      		for ( x=dstrect->w; x; --x ) {
      			memcpy(pixels, &color, 3);
      			pixels += 3;
      		}
      		row += dst->pitch;
      	}
    
  •       }
    
  •       break;
    
  •   	break;
    
  •       case 4: {
    
  •   	Uint32 *pixels;
    
  •   	for ( y=dstrect->h; y; --y ) {
    
  •   		pixels = (Uint32 *)row;
    
  •   		for ( x=dstrect->w/4; x; --x ) {
    
  •   			*pixels++ = color;
    
  •   			*pixels++ = color;
    
  •   			*pixels++ = color;
    
  •   			*pixels++ = color;
    
  •   		}
    
  •   		switch (dstrect->w%4) {
    
  •   			case 3:
    
  •   				*pixels++ = color;
    
  •   			case 2:
    
  •   				*pixels++ = color;
    
  •   			case 1:
    
  •   				*pixels++ = color;
    
  •   		}
    
  •   case 4:
    
  •   	for(y = dstrect->h; y; --y) {
    
  •   		SDL_memset4(row, color, dstrect->w);
      		row += dst->pitch;
      	}
    
  •       }
    
  •       break;
    
  •   	break;
      }
    
    }
    SDL_UnlockSurface(dst);
    Index: SDL_memops.h
    ===================================================================
    RCS file: /cvs/SDL/src/video/Attic/SDL_memops.h,v
    retrieving revision 1.1.2.12
    diff -u -r1.1.2.12 SDL_memops.h
    --- SDL_memops.h 2000/04/03 22:35:38 1.1.2.12
    +++ SDL_memops.h 2001/01/27 00:20:50
    @@ -35,65 +35,68 @@
    #if defined(GNUC) && defined(i386)
    /* Thanks to Brennan "Bas" Underwood, for the inspiration. :)
    */
    -#define SDL_memcpy(dst, src, len)
    -{ int u0, u1, u2; \
  • asm volatile ( \
  •   "cld\n\t"						\
    
  •   "rep ; movsl\n\t"					\
    
  •   "testb $2,%b4\n\t"					\
    
  •   "je 1f\n\t"						\
    
  •   "movsw\n"						\
    
  •   "1:\ttestb $1,%b4\n\t"					\
    
  •   "je 2f\n\t"						\
    
  •   "movsb\n"						\
    
  •   "2:"							\
    
  •   : "=&c" (u0), "=&D" (u1), "=&S" (u2)			\
    
  •   : "0" ((len)/4), "q" (len), "1" (dst),"2" (src)		\
    
  •   : "memory" );						\
    

-}
+#define SDL_memcpy(dst, src, len)
+do{ \

  • Uint32 u0, u1, u2; \
  • asm volatile ( \
  •   "cld\n\t"						  \
    
  •   "rep ; movsl\n\t"					  \
    
  •   "testb $2,%b4\n\t"					  \
    
  •   "je 1f\n\t"						  \
    
  •   "movsw\n"						  \
    
  •   "1:\ttestb $1,%b4\n\t"					  \
    
  •   "je 2f\n\t"						  \
    
  •   "movsb\n"						  \
    
  •   "2:"							  \
    
  •   : "=&c" (u0), "=&D" (u1), "=&S" (u2)			  \
    
  •   : "0" ((unsigned)(len)/4), "q" (len), "1" (dst),"2" (src) \
    
  •   : "memory" );						  \
    

+}while(0)

-#define SDL_revcpy(dst, src, len)
-{ int u0, u1, u2; \

  • char *dstp = (char *)(dst); \
  • char *srcp = (char *)(src); \
  • int n = (len); \
  • if ( n >= 4 ) { \
  • asm volatile ( \
  •   "std\n\t"						\
    
  •   "rep ; movsl\n\t"					\
    
  •   : "=&c" (u0), "=&D" (u1), "=&S" (u2)			\
    
  •   : "0" (n/4),						\
    
  •     "1" (dstp+(n-4)), "2" (srcp+(n-4))			\
    
  •   : "memory" );						\
    
  • } \
  • switch (n%4) { \
  •   case 3: dstp[2] = srcp[2];				\
    
  •   case 2: dstp[1] = srcp[1];				\
    
  •   case 1: dstp[0] = srcp[0];				\
    
  •   	break;						\
    
  •   default:						\
    
  •   	break;						\
    
  • }
    -}
    +#define SDL_revcpy(dst, src, len)
    +do { \
  • int u0, u1, u2; \
  • char *dstp = (char *)(dst); \
  • char *srcp = (char *)(src); \
  • int n = (len); \
  • if ( n >= 4 ) { \
  • asm volatile ( \
  •   "std\n\t"				\
    
  •   "rep ; movsl\n\t"			\
    
  •   : "=&c" (u0), "=&D" (u1), "=&S" (u2)	\
    
  •   : "0" (n >> 2),				\
    
  •     "1" (dstp+(n-4)), "2" (srcp+(n-4))	\
    
  •   : "memory" );				\
    
  • } \
  • switch (n & 3) { \
  •   case 3: dstp[2] = srcp[2];		\
    
  •   case 2: dstp[1] = srcp[1];		\
    
  •   case 1: dstp[0] = srcp[0];		\
    
  •   	break;				\
    
  •   default:				\
    
  •   	break;				\
    
  • }
    +}while(0)

-#define SDL_memmove(dst, src, len)
-{ \

  • if ( dst < src ) { \
  •   SDL_memcpy(dst, src, len);				\
    
  • } else { \
  •   SDL_revcpy(dst, src, len);				\
    
  • }
    -}
    +#define SDL_memmove(dst, src, len)
    +do { \
  • if ( (dst) < (src) ) { \
  •   SDL_memcpy((dst), (src), (len));	\
    
  • } else { \
  •   SDL_revcpy((dst), (src), (len));	\
    
  • }
    +}while(0)

-#define SDL_memset4(dst, val, len)
-{ int u0, u1, u2; \

  • asm volatile ( \
  •   "cld\n\t"						\
    
  •   "rep ; stosl\n\t"					\
    
  •   : "=&D" (u0), "=&a" (u1), "=&c" (u2)			\
    
  •   : "0" (dst), "1" (val), "2" (len/4)			\
    
  •   : "memory" );						\
    

-}
+#define SDL_memset4(dst, val, len)
+do { \

  • Uint32 u0, u1, u2; \
  • asm volatile ( \
  •   "cld\n\t"					\
    
  •   "rep ; stosl\n\t"				\
    
  •   : "=&D" (u0), "=&a" (u1), "=&c" (u2)		\
    
  •   : "0" (dst), "1" (val), "2" (len)		\
    
  •   : "memory" );					\
    

+}while(0)

#endif /* GNU C and x86 */

@@ -105,18 +108,20 @@
#define SDL_revcpy(dst, src, len) memmove(dst, src, len)
#endif
#ifndef SDL_memset4
-#define SDL_memset4(dst, val, len)
-{ int count = (len)/4; \

  • int n = (count+3)/4; \
  • Uint32 *p = (Uint32 *)(dst); \
  •    switch (count % 4) {						\
    
  •    case 0: do {    *p++ = val;					\
    
  •    case 3:         *p++ = val;					\
    
  •    case 2:         *p++ = val;					\
    
  •    case 1:         *p++ = val;					\
    
  •       } while ( --n > 0 );					\
    
  • }
    -}
    +#define SDL_memset4(dst, val, len)
    +do{ \
  • unsigned _count = (len); \
  • unsigned _n = (_count + 3) / 4; \
  • Uint32 *_p = (Uint32 *)(dst); \
  • Uint32 _val = (val); \
  •    switch (_count % 4) {			\
    
  •    case 0: do {    *_p++ = _val;		\
    
  •    case 3:         *_p++ = _val;		\
    
  •    case 2:         *_p++ = _val;		\
    
  •    case 1:         *_p++ = _val;		\
    
  •   } while ( --_n );		\
    
  • }
    +}while(0)
    #endif

#endif /* _SDL_memops_h */

Here’s an improvement to SDL_FillRect(), mainly for PowerPC but other
platforms should see some improvement for 16bpp as well. CC:ed to the list
so that PPC owners can flame me. Many thanks to Fingolfin for his heroic
testing efforts despite my silly typos.

Thanks! It’s in CVS, along with a patch to make surface locks recursive.

See ya!
-Sam Lantinga, Lead Programmer, Loki Entertainment Software