SDL-1.2: atari:video: use optimized c2p routines

From 0237f339e6bec39c56e1364787fabdb0245d478f Mon Sep 17 00:00:00 2001
From: Miro Kropacek <[EMAIL REDACTED]>
Date: Sun, 27 Oct 2024 13:18:21 +0100
Subject: [PATCH] atari:video: use optimized c2p routines

They are well proven in ScummVM and much more efficient than the
previous ones.

The TT version skips every second line, which leads to a different look
but is also much faster. Perhaps in the future I'll add an env variable
to switch between doubled lines and skipped lines.

The ST version has been fixed and heavily optimized in the process, too.
---
 src/video/ataricommon/SDL_ataric2p.S   | 1468 ++++++++++++++++++------
 src/video/ataricommon/SDL_ataric2p_s.h |  108 +-
 src/video/gem/SDL_gemvideo.c           |   48 +-
 src/video/xbios/SDL_xbios.c            |   53 +-
 src/video/xbios/SDL_xbios_st.c         |    8 +-
 5 files changed, 1216 insertions(+), 469 deletions(-)

diff --git a/src/video/ataricommon/SDL_ataric2p.S b/src/video/ataricommon/SDL_ataric2p.S
index 97f374115..d50f29706 100644
--- a/src/video/ataricommon/SDL_ataric2p.S
+++ b/src/video/ataricommon/SDL_ataric2p.S
@@ -26,442 +26,1172 @@
 	Chunky to planar conversion routine
 	1 byte/pixel -> 4 or 8 bit planes
 
-	Patrice Mandin
-	Xavier Joubert
 	Mikael Kalms
+	Miro Kropacek
 */
 
-	.globl	SYM(SDL_Atari_C2pConvert)
 	.globl	SYM(SDL_Atari_C2pConvert8)
+	.globl	SYM(SDL_Atari_C2pConvert8_tt)
+	.globl	SYM(SDL_Atari_C2pConvert8_rect)
 	.globl	SYM(SDL_Atari_C2pConvert4)
-	.globl	SYM(SDL_Atari_C2pConvert4_pal)
+	.globl	SYM(SDL_Atari_C2pConvert4_rect)
+
+	.globl	SYM(SDL_Atari_C2pPalette4)
 
 /* ------------	Conversion C2P, 8 bits ------------ */
 
 	.text
+
+| void SDL_Atari_C2pConvert8(const byte *pChunky, const byte *pChunkyEnd, byte *pScreen);
 SYM(SDL_Atari_C2pConvert8):
 #if !defined(__mcoldfire__)
-	movel	sp@(4),c2p_source
-	movel	sp@(8),c2p_dest
-	movel	sp@(12),c2p_width
-	movel	sp@(16),c2p_height
-	movel	sp@(20),c2p_dblligne
-	movel	sp@(24),c2p_srcpitch
-	movel	sp@(28),c2p_dstpitch
-
-	moveml	d2-d7/a2-a6,sp@-
-
-	movel	c2p_source,c2p_cursrc
-	movel	c2p_dest,c2p_curdst
-	movel	#0x0f0f0f0f,d4
-	movel	#0x00ff00ff,d5
-	movel	#0x55555555,d6
-	movew	c2p_height+2,c2p_row
-	movew	c2p_width+2,d0
-	andw	#-8,d0
-	movew	d0,c2p_rowlen
-
-SDL_Atari_C2p8_rowloop:
-
-	movel	c2p_cursrc,a0
-	movel	c2p_curdst,a1
-
-	movel	a0,a2
-	addw	c2p_rowlen,a2
-
-	movel	a0@+,d0
-	movel	a0@+,d1
-	movel	a0@+,d2
-	movel	a0@+,d3
-/*
-	d0 = a7a6a5a4a3a2a1a0 b7b6b5b4b3b2b1b0 c7c6c5c4c3c2c1c0 d7d6d5d4d3d2d1d0
-	d1 = e7e6e5e4e3e2e1e0 f7f6f5f4f3f2f1f0 g7g6g5g4g3g2g1g0 h7h6h5h4h3h2h1h0
-	d2 = i7i6i5i4i3i2i1i0 j7j6j5j4j3j2j1j0 k7k6k5k4k3k2k1k0 l7l6l5l4l3l2l1l0
-	d3 = m7m6m5m4m3m2m1m0 n7n6n5n4n3n2n1n0 o7o6o5o4o3o2o1o0 p7p6p5p4p3p2p1p0
-*/
-	movel	d1,d7
-	lsrl	#4,d7
-	eorl	d0,d7
-	andl	d4,d7
-	eorl	d7,d0
-	lsll	#4,d7
-	eorl	d7,d1
-
-	movel	d3,d7
-	lsrl	#4,d7
-	eorl	d2,d7
-	andl	d4,d7
-	eorl	d7,d2
-	lsll	#4,d7
-	eorl	d7,d3
-
-	movel	d2,d7
-	lsrl	#8,d7
-	eorl	d0,d7
- 	andl	d5,d7
-	eorl	d7,d0
-	lsll	#8,d7
-	eorl	d7,d2
-
-	movel	d3,d7
-	lsrl	#8,d7
-	eorl	d1,d7
- 	andl	d5,d7
-	eorl	d7,d1
-	lsll	#8,d7
-	eorl	d7,d3
-/*
-	d0 = a7a6a5a4e7e6e5e4 i7i6i5i4m7m6m5m4 c7c6c5c4g7g6g5g4 k7k6k5k4o7o6o5o4
-	d1 = a3a2a1a0e3e2e1e0 i3i2i1i0m3m2m1m0 c3c2c1c0g3g2g1g0 k3k2k1k0o3o2o1o0
-	d2 = b7b6b5b4f7f6f5f4 j7j6j5j4n7n6n5n4 d7d6d5d4h7h6h5h4 l7l6l5l4p7p6p5p4
-	d3 = b3b2b1b0f3f2f1f0 j3j2j1j0n3n2n1n0 d3d2d1d0h3h2h1h0 l3l2l1l0p3p2p1p0
-*/
-	bras	SDL_Atari_C2p8_start
+#ifdef __FASTCALL__
+									| a0: chunky
+	move.l	a1,d0					| chunky end
+	move.l	4(sp),a1				| screen
+#else
+	move.l	(4,sp),a0				| chunky
+	move.l	(8,sp),d0				| chunky end
+	move.l	(12,sp),a1				| screen
+#endif
+	movem.l	d2-d7/a2-a6,-(sp)
+	move.l	d0,a2
+	move.l	#0x0f0f0f0f,d4
+	move.l	#0x00ff00ff,d5
+	move.l	#0x55555555,d6
+
+	move.l	(a0)+,d0
+	move.l	(a0)+,d1
+	move.l	(a0)+,d2
+	move.l	(a0)+,d3
+
+	| a7a6a5a4a3a2a1a0 b7b6b5b4b3b2b1b0 c7c6c5c4c3c2c1c0 d7d6d5d4d3d2d1d0
+	| e7e6e5e4e3e2e1e0 f7f6f5f4f3f2f1f0 g7g6g5g4g3g2g1g0 h7h6h5h4h3h2h1h0
+	| i7i6i5i4i3i2i1i0 j7j6j5j4j3j2j1j0 k7k6k5k4k3k2k1k0 l7l6l5l4l3l2l1l0
+	| m7m6m5m4m3m2m1m0 n7n6n5n4n3n2n1n0 o7o6o5o4o3o2o1o0 p7p6p5p4p3p2p1p0
+
+	move.l	d1,d7
+	lsr.l	#4,d7
+	eor.l	d0,d7
+	and.l	d4,d7
+	eor.l	d7,d0
+	lsl.l	#4,d7
+	eor.l	d7,d1
+	move.l	d3,d7
+	lsr.l	#4,d7
+	eor.l	d2,d7
+	and.l	d4,d7
+	eor.l	d7,d2
+	lsl.l	#4,d7
+	eor.l	d7,d3
+
+	| a7a6a5a4e7e6e5e4 b7b6b5b4f7f6f5f4 c7c6c5c4g7g6g5g4 d7d6d5d4h7h6h5h4
+	| a3a2a1a0e3e2e1e0 b3b2b1b0f3f2f1f0 c3c2c1c0g3g2g1g0 d3d2d1d0h3h2h1h0
+	| i7i6i5i4m7m6m5m4 j7j6j5j4n7n6n5n4 k7k6k5k4o7o6o5o4 l7l6l5l4p7p6p5p4
+	| i3i2i1i0m3m2m1m0 j3j2j1j0n3n2n1n0 k3k2k1k0o3o2o1o0 l3l2l1l0p3p2p1p0
+
+	move.l	d2,d7
+	lsr.l	#8,d7
+	eor.l	d0,d7
+	and.l	d5,d7
+	eor.l	d7,d0
+	lsl.l	#8,d7
+	eor.l	d7,d2
+	move.l	d3,d7
+	lsr.l	#8,d7
+	eor.l	d1,d7
+	and.l	d5,d7
+	eor.l	d7,d1
+	lsl.l	#8,d7
+	eor.l	d7,d3
+
+	| a7a6a5a4e7e6e5e4 i7i6i5i4m7m6m5m4 c7c6c5c4g7g6g5g4 k7k6k5k4o7o6o5o4
+	| a3a2a1a0e3e2e1e0 i3i2i1i0m3m2m1m0 c3c2c1c0g3g2g1g0 k3k2k1k0o3o2o1o0
+	| b7b6b5b4f7f6f5f4 j7j6j5j4n7n6n5n4 d7d6d5d4h7h6h5h4 l7l6l5l4p7p6p5p4
+	| b3b2b1b0f3f2f1f0 j3j2j1j0n3n2n1n0 d3d2d1d0h3h2h1h0 l3l2l1l0p3p2p1p0
+
+	bra.s	c2p1x1_8_start
+
+c2p1x1_8_pix16:
+	move.l	(a0)+,d0
+	move.l	(a0)+,d1
+	move.l	(a0)+,d2
+	move.l	(a0)+,d3
+
+	| a7a6a5a4a3a2a1a0 b7b6b5b4b3b2b1b0 c7c6c5c4c3c2c1c0 d7d6d5d4d3d2d1d0
+	| e7e6e5e4e3e2e1e0 f7f6f5f4f3f2f1f0 g7g6g5g4g3g2g1g0 h7h6h5h4h3h2h1h0
+	| i7i6i5i4i3i2i1i0 j7j6j5j4j3j2j1j0 k7k6k5k4k3k2k1k0 l7l6l5l4l3l2l1l0
+	| m7m6m5m4m3m2m1m0 n7n6n5n4n3n2n1n0 o7o6o5o4o3o2o1o0 p7p6p5p4p3p2p1p0
+
+	move.l	d1,d7
+	lsr.l	#4,d7
+	move.l	a3,(a1)+
+	eor.l	d0,d7
+	and.l	d4,d7
+	eor.l	d7,d0
+	lsl.l	#4,d7
+	eor.l	d7,d1
+	move.l	d3,d7
+	lsr.l	#4,d7
+	eor.l	d2,d7
+	and.l	d4,d7
+	eor.l	d7,d2
+	move.l	a4,(a1)+
+	lsl.l	#4,d7
+	eor.l	d7,d3
+
+	| a7a6a5a4e7e6e5e4 b7b6b5b4f7f6f5f4 c7c6c5c4g7g6g5g4 d7d6d5d4h7h6h5h4
+	| a3a2a1a0e3e2e1e0 b3b2b1b0f3f2f1f0 c3c2c1c0g3g2g1g0 d3d2d1d0h3h2h1h0
+	| i7i6i5i4m7m6m5m4 j7j6j5j4n7n6n5n4 k7k6k5k4o7o6o5o4 l7l6l5l4p7p6p5p4
+	| i3i2i1i0m3m2m1m0 j3j2j1j0n3n2n1n0 k3k2k1k0o3o2o1o0 l3l2l1l0p3p2p1p0
+
+	move.l	d2,d7
+	lsr.l	#8,d7
+	eor.l	d0,d7
+	and.l	d5,d7
+	eor.l	d7,d0
+	move.l	a5,(a1)+
+	lsl.l	#8,d7
+	eor.l	d7,d2
+	move.l	d3,d7
+	lsr.l	#8,d7
+	eor.l	d1,d7
+	and.l	d5,d7
+	eor.l	d7,d1
+	move.l	a6,(a1)+
+	lsl.l	#8,d7
+	eor.l	d7,d3
+
+	| a7a6a5a4e7e6e5e4 i7i6i5i4m7m6m5m4 c7c6c5c4g7g6g5g4 k7k6k5k4o7o6o5o4
+	| a3a2a1a0e3e2e1e0 i3i2i1i0m3m2m1m0 c3c2c1c0g3g2g1g0 k3k2k1k0o3o2o1o0
+	| b7b6b5b4f7f6f5f4 j7j6j5j4n7n6n5n4 d7d6d5d4h7h6h5h4 l7l6l5l4p7p6p5p4
+	| b3b2b1b0f3f2f1f0 j3j2j1j0n3n2n1n0 d3d2d1d0h3h2h1h0 l3l2l1l0p3p2p1p0
+
+c2p1x1_8_start:
+	move.l	d2,d7
+	lsr.l	#1,d7
+	eor.l	d0,d7
+	and.l	d6,d7
+	eor.l	d7,d0
+	add.l	d7,d7
+	eor.l	d7,d2
+	move.l	d3,d7
+	lsr.l	#1,d7
+	eor.l	d1,d7
+	and.l	d6,d7
+	eor.l	d7,d1
+	add.l	d7,d7
+	eor.l	d7,d3
+
+	| a7b7a5b5e7f7e5f5 i7j7i5j5m7n7m5n5 c7d7c5d5g7h7g5h5 k7l7k5l5o7p7o5p5
+	| a3b3a1b1e3f3e1f1 i3j3i1j1m3n3m1n1 c3d3c1d1g3h3g1h1 k3l3k1l1o3p3o1p1
+	| a6b6a4b4e6f6e4f4 i6j6i4j4m6n6m4n4 c6d6c4d4g6h6g4h4 k6l6k4l4o6p6o4p4
+	| a2b2a0b0e2f2e0f0 i2j2i0j0m2n2m0n0 c2d2c0d0g2h2g0h0 k2l2k0l0o2p2o0p0
+
+	move.w	d2,d7
+	move.w	d0,d2
+	swap	d2
+	move.w	d2,d0
+	move.w	d7,d2
+	move.w	d3,d7
+	move.w	d1,d3
+	swap	d3
+	move.w	d3,d1
+	move.w	d7,d3
+
+	| a7b7a5b5e7f7e5f5 i7j7i5j5m7n7m5n5 a6b6a4b4e6f6e4f4 i6j6i4j4m6n6m4n4
+	| a3b3a1b1e3f3e1f1 i3j3i1j1m3n3m1n1 a2b2a0b0e2f2e0f0 i2j2i0j0m2n2m0n0
+	| c7d7c5d5g7h7g5h5 k7l7k5l5o7p7o5p5 c6d6c4d4g6h6g4h4 k6l6k4l4o6p6o4p4
+	| c3d3c1d1g3h3g1h1 k3l3k1l1o3p3o1p1 c2d2c0d0g2h2g0h0 k2l2k0l0o2p2o0p0
+
+	move.l	d2,d7
+	lsr.l	#2,d7
+	eor.l	d0,d7
+	and.l	#0x33333333,d7
+	eor.l	d7,d0
+	lsl.l	#2,d7
+	eor.l	d7,d2
+	move.l	d3,d7
+	lsr.l	#2,d7
+	eor.l	d1,d7
+	and.l	#0x33333333,d7
+	eor.l	d7,d1
+	lsl.l	#2,d7
+	eor.l	d7,d3
+
+	| a7b7c7d7e7f7g7h7 i7j7k7l7m7n7o7p7 a6b6c6d6e6f6g6h6 i6j6k6l6m6n6o6p6
+	| a3b3c3d3e3f3g3h3 i3j3k3l3m3n3o3p3 a2b2c2d2e2f2g2h2 i2j2k2l2m2n2o2p2
+	| a5b5c5d5e5f5g5h5 i5j5k5l5m5n5o5p5 a4b4c4d4e4f4g4h4 i4j4k4l4m4n4o4p4
+	| a1b1c1d1e1f1g1h1 i1j1k1l1m1n1o1p1 a0b0c0d0e0f0g0h0 i0j0k0l0m0n0o0p0
 
-SDL_Atari_C2p8_pix16:
+	swap	d0
+	swap	d1
+	swap	d2
+	swap	d3
 
-	movel	a0@+,d0
-	movel	a0@+,d1
-	movel	a0@+,d2
-	movel	a0@+,d3
-/*
-	d0 = a7a6a5a4a3a2a1a0 b7b6b5b4b3b2b1b0 c7c6c5c4c3c2c1c0 d7d6d5d4d3d2d1d0
-	d1 = e7e6e5e4e3e2e1e0 f7f6f5f4f3f2f1f0 g7g6g5g4g3g2g1g0 h7h6h5h4h3h2h1h0
-	d2 = i7i6i5i4i3i2i1i0 j7j6j5j4j3j2j1j0 k7k6k5k4k3k2k1k0 l7l6l5l4l3l2l1l0
-	d3 = m7m6m5m4m3m2m1m0 n7n6n5n4n3n2n1n0 o7o6o5o4o3o2o1o0 p7p6p5p4p3p2p1p0
-*/
-	movel	d1,d7
-	lsrl	#4,d7
-	movel	a3,a1@+
-	eorl	d0,d7
-	andl	d4,d7
-	eorl	d7,d0
-	lsll	#4,d7
-	eorl	d7,d1
-
-	movel	d3,d7
-	lsrl	#4,d7
-	eorl	d2,d7
-	andl	d4,d7
-	eorl	d7,d2
-	movel	a4,a1@+
-	lsll	#4,d7
-	eorl	d7,d3
-
-	movel	d2,d7
-	lsrl	#8,d7
-	eorl	d0,d7
-	andl	d5,d7
-	eorl	d7,d0
-	movel	a5,a1@+
-	lsll	#8,d7
-	eorl	d7,d2
-
-	movel	d3,d7
-	lsrl	#8,d7
-	eorl	d1,d7
-	andl	d5,d7
-	eorl	d7,d1
-	movel	a6,a1@+
-	lsll	#8,d7
-	eorl	d7,d3
-/*
-	d0 = a7a6a5a4e7e6e5e4 i7i6i5i4m7m6m5m4 c7c6c5c4g7g6g5g4 k7k6k5k4o7o6o5o4
-	d1 = a3a2a1a0e3e2e1e0 i3i2i1i0m3m2m1m0 c3c2c1c0g3g2g1g0 k3k2k1k0o3o2o1o0
-	d2 = b7b6b5b4f7f6f5f4 j7j6j5j4n7n6n5n4 d7d6d5d4h7h6h5h4 l7l6l5l4p7p6p5p4
-	d3 = b3b2b1b0f3f2f1f0 j3j2j1j0n3n2n1n0 d3d2d1d0h3h2h1h0 l3l2l1l0p3p2p1p0
-*/
+	move.l	d0,a6
+	move.l	d2,a5
+	move.l	d1,a4
+	move.l	d3,a3
+
+	cmp.l	a0,a2
+	bne	c2p1x1_8_pix16
+
+	move.l	a3,(a1)+
+	move.l	a4,(a1)+
+	move.l	a5,(a1)+
+	move.l	a6,(a1)+
+
+	movem.l	(sp)+,d2-d7/a2-a6
+#endif
+	rts
 
-SDL_Atari_C2p8_start:
-
-	movel	d2,d7
-	lsrl	#1,d7
-	eorl	d0,d7
-	andl	d6,d7
-	eorl	d7,d0
-	addl	d7,d7
-	eorl	d7,d2
-
-	movel	d3,d7
-	lsrl	#1,d7
-	eorl	d1,d7
-	andl	d6,d7
-	eorl	d7,d1
-	addl	d7,d7
-	eorl	d7,d3
-/*
-	d0 = a7b7a5b5e7f7e5f5 i7j7i5j5m7n7m5n5 c7d7c5d5g7h7g5h5 k7l7k5l5o7p7o5p5
-	d1 = a3b3a1b1e3f3e1f1 i3j3i1j1m3n3m1n1 c3d3c1d1g3h3g1h1 k3l3k1l1o3p3o1p1
-	d2 = a6b6a4b4e6f6e4f4 i6j6i4j4m6n6m4n4 c6d6c4d4g6h6g4h4 k6l6k4l4o6p6o4p4
-	d3 = a2b2a0b0e2f2e0f0 i2j2i0j0m2n2m0n0 c2d2c0d0g2h2g0h0 k2l2k0l0o2p2o0p0
-*/
-	movew	d2,d7
-	movew	d0,d2
-	swap	d2
-	movew	d2,d0
-	movew	d7,d2
 
-	movew	d3,d7
-	movew	d1,d3
+| void SDL_Atari_C2pConvert8_tt(const byte *pChunky, const byte *pChunkyEnd, byte *pScreen, uint32 screenPitch);
+SYM(SDL_Atari_C2pConvert8_tt):
+#if !defined(__mcoldfire__)
+	movem.l	d2-d7/a2-a6,-(sp)			| 6 + 5 = 11 longs
+
+#ifdef __FASTCALL__
+										| a0: chunky
+	move.l	a1,a2						| a2: chunky end
+	move.l	(11*4+4,sp),a1				| a1: screen
+										| d0.l: screen pitch (double width)
+#else
+	move.l	(11*4+4,sp),a0				| a0: chunky
+	move.l	(11*4+8,sp),a2				| a2: chunky end
+	move.l	(11*4+12,sp),a1				| a1: screen
+	move.l	(11*4+16,sp),d0				| d0.l: screen pitch (double width)
+#endif
+
+	move.l	sp,old_sp
+
+	move.l	d0,screen_pitch
+
+	lsr.l	#1,d0
+	lea	(a1,d0.l),a7				| a7: end of first dst line
+
+	move.l	d0,screen_offset
+
+	move.l	#0x0f0f0f0f,d4
+	move.l	#0x00ff00ff,d5
+	move.l	#0x55555555,d6
+
+	move.l	(a0)+,d0
+	move.l	(a0)+,d1
+	move.l	(a0)+,d2
+	move.l	(a0)+,d3
+
+	| a7a6a5a4a3a2a1a0 b7b6b5b4b3b2b1b0 c7c6c5c4c3c2c1c0 d7d6d5d4d3d2d1d0
+	| e7e6e5e4e3e2e1e0 f7f6f5f4f3f2f1f0 g7g6g5g4g3g2g1g0 h7h6h5h4h3h2h1h0
+	| i7i6i5i4i3i2i1i0 j7j6j5j4j3j2j1j0 k7k6k5k4k3k2k1k0 l7l6l5l4l3l2l1l0
+	| m7m6m5m4m3m2m1m0 n7n6n5n4n3n2n1n0 o7o6o5o4o3o2o1o0 p7p6p5p4p3p2p1p0
+
+	move.l	d1,d7
+	lsr.l	#4,d7
+	eor.l	d0,d7
+	and.l	d4,d7
+	eor.l	d7,d0
+	lsl.l	#4,d7
+	eor.l	d7,d1
+	move.l	d3,d7
+	lsr.l	#4,d7
+	eor.l	d2,d7
+	and.l	d4,d7
+	eor.l	d7,d2
+	lsl.l	#4,d7
+	eor.l	d7,d3
+
+	| a7a6a5a4e7e6e5e4 b7b6b5b4f7f6f5f4 c7c6c5c4g7g6g5g4 d7d6d5d4h7h6h5h4
+	| a3a2a1a0e3e2e1e0 b3b2b1b0f3f2f1f0 c3c2c1c0g3g2g1g0 d3d2d1d0h3h2h1h0
+	| i7i6i5i4m7m6m5m4 j7j6j5j4n7n6n5n4 k7k6k5k4o7o6o5o4 l7l6l5l4p7p6p5p4
+	| i3i2i1i0m3m2m1m0 j3j2j1j0n3n2n1n0 k3k2k1k0o3o2o1o0 l3l2l1l0p3p2p1p0
+
+	move.l	d2,d7
+	lsr.l	#8,d7
+	eor.l	d0,d7
+	and.l	d5,d7
+	eor.l	d7,d0
+	lsl.l	#8,d7
+	eor.l	d7,d2
+	move.l	d3,d7
+	lsr.l	#8,d7
+	eor.l	d1,d7
+	and.l	d5,d7
+	eor.l	d7,d1
+	lsl.l	#8,d7
+	eor.l	d7,d3
+
+	| a7a6a5a4e7e6e5e4 i7i6i5i4m7m6m5m4 c7c6c5c4g7g6g5g4 k7k6k5k4o7o6o5o4
+	| a3a2a1a0e3e2e1e0 i3i2i1i0m3m2m1m0 c3c2c1c0g3g2g1g0 k3k2k1k0o3o2o1o0
+	| b7b6b5b4f7f6f5f4 j7j6j5j4n7n6n5n4 d7d6d5d4h7h6h5h4 l7l6l5l4p7p6p5p4
+	| b3b2b1b0f3f2f1f0 j3j2j1j0n3n2n1n0 d3d2d1d0h3h2h1h0 l3l2l1l0p3p2p1p0
+
+	bra.s	c2p1x1_8_tt_start
+
+c2p1x1_8_tt_pix16:
+	move.l	(a0)+,d0
+	move.l	(a0)+,d1
+	move.l	(a0)+,d2
+	move.l	(a0)+,d3
+
+	| a7a6a5a4a3a2a1a0 b7b6b5b4b3b2b1b0 c7c6c5c4c3c2c1c0 d7d6d5d4d3d2d1d0
+	| e7e6e5e4e3e2e1e0 f7f6f5f4f3f2f1f0 g7g6g5g4g3g2g1g0 h7h6h5h4h3h2h1h0
+	| i7i6i5i4i3i2i1i0 j7j6j5j4j3j2j1j0 k7k6k5k4k3k2k1k0 l7l6l5l4l3l2l1l0
+	| m7m6m5m4m3m2m1m0 n7n6n5n4n3n2n1n0 o7o6o5o4o3o2o1o0 p7p6p5p4p3p2p1p0
+
+	move.l	d1,d7
+	lsr.l	#4,d7
+	move.l	a3,(a1)+
+	eor.l	d0,d7
+	and.l	d4,d7
+	eor.l	d7,d0
+	lsl.l	#4,d7
+	eor.l	d7,d1
+	move.l	d3,d7
+	lsr.l	#4,d7
+	eor.l	d2,d7
+	and.l	d4,d7
+	eor.l	d7,d2
+	move.l	a4,(a1)+
+	lsl.l	#4,d7
+	eor.l	d7,d3
+
+	| a7a6a5a4e7e6e5e4 b7b6b5b4f7f6f5f4 c7c6c5c4g7g6g5g4 d7d6d5d4h7h6h5h4
+	| a3a2a1a0e3e2e1e0 b3b2b1b0f3f2f1f0 c3c2c1c0g3g2g1g0 d3d2d1d0h3h2h1h0
+	| i7i6i5i4m7m6m5m4 j7j6j5j4n7n6n5n4 k7k6k5k4o7o6o5o4 l7l6l5l4p7p6p5p4
+	| i3i2i1i0m3m2m1m0 j3j2j1j0n3n2n1n0 k3k2k1k0o3o2o1o0 l3l2l1l0p3p2p1p0
+
+	move.l	d2,d7
+	lsr.l	#8,d7
+	eor.l	d0,d7
+	and.l	d5,d7
+	eor.l	d7,d0
+	move.l	a5,(a1)+
+	lsl.l	#8,d7
+	eor.l	d7,d2
+	move.l	d3,d7
+	lsr.l	#8,d7
+	eor.l	d1,d7
+	and.l	d5,d7
+	eor.l	d7,d1
+	move.l	a6,(a1)+
+	lsl.l	#8,d7
+	eor.l	d7,d3
+
+	| a7a6a5a4e7e6e5e4 i7i6i5i4m7m6m5m4 c7c6c5c4g7g6g5g4 k7k6k5k4o7o6o5o4
+	| a3a2a1a0e3e2e1e0 i3i2i1i0m3m2m1m0 c3c2c1c0g3g2g1g0 k3k2k1k0o3o2o1o0
+	| b7b6b5b4f7f6f5f4 j7j6j5j4n7n6n5n4 d7d6d5d4h7h6h5h4 l7l6l5l4p7p6p5p4
+	| b3b2b1b0f3f2f1f0 j3j2j1j0n3n2n1n0 d3d2d1d0h3h2h1h0 l3l2l1l0p3p2p1p0
+
+	cmp.l	a1,a7					| end of dst line?
+	bne.s	c2p1x1_8_tt_start
+
+	add.l	(screen_offset,pc),a1
+	add.l	(screen_pitch,pc),a7
+
+c2p1x1_8_tt_start:
+	move.l	d2,d7
+	lsr.l	#1,d7
+	eor.l	d0,d7
+	and.l	d6,d7
+	eor.l	d7,d0
+	add.l	d7,d7
+	eor.l	d7,d2
+	move.l	d3,d7
+	lsr.l	#1,d7
+	eor.l	d1,d7
+	and.l	d6,d7
+	eor.l	d7,d1
+	add.l	d7,d7
+	eor.l	d7,d3
+
+	| a7b7a5b5e7f7e5f5 i7j7i5j5m7n7m5n5 c7d7c5d5g7h7g5h5 k7l7k5l5o7p7o5p5
+	| a3b3a1b1e3f3e1f1 i3j3i1j1m3n3m1n1 c3d3c1d1g3h3g1h1 k3l3k1l1o3p3o1p1
+	| a6b6a4b4e6f6e4f4 i6j6i4j4m6n6m4n4 c6d6c4d4g6h6g4h4 k6l6k4l4o6p6o4p4
+	| a2b2a0b0e2f2e0f0 i2j2i0j0m2n2m0n0 c2d2c0d0g2h2g0h0 k2l2k0l0o2p2o0p0
+
+	move.w	d2,d7
+	move.w	d0,d2
+	swap	d2
+	move.w	d2,d0
+	move.w	d7,d2
+	move.w	d3,d7
+	move.w	d1,d3
 	swap	d3
-	movew	d3,d1
-	movew	d7,d3
-/*
-	d0 = a7b7a5b5e7f7e5f5 i7j7i5j5m7n7m5n5 a6b6a4b4e6f6e4f4 i6j6i4j4m6n6m4n4
-	d1 = a3b3a1b1e3f3e1f1 i3j3i1j1m3n3m1n1 a2b2a0b0e2f2e0f0 i2j2i0j0m2n2m0n0
-	d2 = c7d7c5d5g7h7g5h5 k7l7k5l5o7p7o5p5 c6d6c4d4g6h6g4h4 k6l6k4l4o6p6o4p4
-	d3 = c3d3c1d1g3h3g1h1 k3l3k1l1o3p3o1p1 c2d2c0d0g2h2g0h0 k2l2k0l0o2p2o0p0
-*/
-	movel	d2,d7
-	lsrl	#2,d7
-	eorl	d0,d7
-	andl	#0x33333333,d7
-	eorl	d7,d0
-	lsll	#2,d7
-	eorl	d7,d2
-
-	movel	d3,d7
-	lsrl	#2,d7
-	eorl	d1,d7
-	andl	#0x33333333,d7
-	eorl	d7,d1
-	lsll	#2,d7
-	eorl	d7,d3
-/*
-	d0 = a7b7c7d7e7f7g7h7 i7j7k7l7m7n7o7p7 a6b6c6d6e6f6g6h6 i6j6k6l6m6n6o6p6
-	d1 = a3b3c3d3e3f3g3h3 i3j3k3l3m3n3o3p3 a2b2c2d2e2f2g2h2 i2j2k2l2m2n2o2p2
-	d2 = a5b5c5d5e5f5g5h5 i5j5k5l5m5n5o5p5 a4b4c4d4e4f4g4h4 i4j4k4l4m4n4o4p4
-	d3 = a1b1c1d1e1f1g1h1 i1j1k1l1m1n1o1p1 a0b0c0d0e0f0g0h0 i0j0k0l0m0n0o0p0
-*/
+	move.w	d3,d1
+	move.w	d7,d3
+
+	| a7b7a5b5e7f7e5f5 i7j7i5j5m7n7m5n5 a6b6a4b4e6f6e4f4 i6j6i4j4m6n6m4n4
+	| a3b3a1b1e3f3e1f1 i3j3i1j1m3n3m1n1 a2b2a0b0e2f2e0f0 i2j2i0j0m2n2m0n0
+	| c7d7c5d5g7h7g5h5 k7l7k5l5o7p7o5p5 c6d6c4d4g6h6g4h4 k6l6k4l4o6p6o4p4
+	| c3d3c1d1g3h3g1h1 k3l3k1l1o3p3o1p1 c2d2c0d0g2h2g0h0 k2l2k0l0o2p2o0p0
+
+	move.l	d2,d7
+	lsr.l	#2,d7
+	eor.l	d0,d7
+	and.l	#0x33333333,d7
+	eor.l	d7,d0
+	lsl.l	#2,d7
+	eor.l	d7,d2
+	move.l	d3,d7
+	lsr.l	#2,d7
+	eor.l	d1,d7
+	and.l	#0x33333333,d7
+	eor.l	d7,d1
+	lsl.l	#2,d7
+	eor.l	d7,d3
+
+	| a7b7c7d7e7f7g7h7 i7j7k7l7m7n7o7p7 a6b6c6d6e6f6g6h6 i6j6k6l6m6n6o6p6
+	| a3b3c3d3e3f3g3h3 i3j3k3l3m3n3o3p3 a2b2c2d2e2f2g2h2 i2j2k2l2m2n2o2p2
+	| a5b5c5d5e5f5g5h5 i5j5k5l5m5n5o5p5 a4b4c4d4e4f4g4h4 i4j4k4l4m4n4o4p4
+	| a1b1c1d1e1f1g1h1 i1j1k1l1m1n1o1p1 a0b0c0d0e0f0g0h0 i0j0k0l0m0n0o0p0
+
 	swap	d0
 	swap	d1
 	swap	d2
 	swap	d3
 
-	movel	d0,a6
-	movel	d2,a5
-	movel	d1,a4
-	movel	d3,a3
+	move.l	d0,a6
+	move.l	d2,a5
+	move.l	d1,a4
+	move.l	d3,a3
 
-	cmpl	a0,a2
-	bgt		SDL_Atari_C2p8_pix16
+	cmp.l	a0,a2
+	bne	c2p1x1_8_tt_pix16
 
-	movel	a3,a1@+
-	movel	a4,a1@+
-	movel	a5,a1@+
-	movel	a6,a1@+
+	move.l	a3,(a1)+
+	move.l	a4,(a1)+
+	move.l	a5,(a1)+
+	move.l	a6,(a1)+
 
-	/* Double the line ? */
+	move.l	old_sp,sp
+	movem.l	(sp)+,d2-d7/a2-a6
+#endif
+	rts
 
-	movel	c2p_srcpitch,d0
-	movel	c2p_dstpitch,d1
 
-	tstl	c2p_dblligne
-	beqs	SDL_Atari_C2p8_nodblline
+| void SDL_Atari_C2pConvert8_rect(const byte *pChunky, const byte *pChunkyEnd, uint32 chunkyWidth, uint32 chunkyPitch, byte *pScreen, uint32 screenPitch);
+SYM(SDL_Atari_C2pConvert8_rect):
+#if !defined(__mcoldfire__)
+	movem.l	d2-d7/a2-a6,-(sp)			| 6 + 5 = 11 longs
+
+#ifdef __FASTCALL__
+										| a0: chunky
+	move.l	a1,chunky_end
+										| d0.l: chunky width
+	move.l	(11*4+4,sp),a1				| a1: screen
+	exg		d1,d2						| d2.l: chunky pitch
+										| d1.l: screen pitch
+#else
+	move.l	(11*4+4,sp),a0				| a0: chunky
+	move.l	(11*4+8,sp),chunky_end
+	move.l	(11*4+12,sp),d0				| d0.l: chunky width
+	move.l	(11*4+16,sp),d2				| d2.l: chunky pitch
+	move.l	(11*4+20,sp),a1				| a1: screen
+	move.l	(11*4+24,sp),d1				| d1.l: screen pitch
+#endif
 
-	movel	c2p_curdst,a0
-	movel	a0,a1
-	addl	d1,a1
+	move.l	sp,old_sp
+
+	lea	(a0,d0.l),a2				| a2: end of first src line
+	lea	(a1,d0.l),a7				| a7: end of first dst line
+
+	move.l	d1,screen_pitch
+
+	sub.l	d0,d1
+	move.l	d1,screen_offset
+
+	move.l	d2,chunky_pitch
+
+	sub.l	d0,d2
+	move.l	d2,chunky_offset
+
+	move.l	#0x0f0f0f0f,d4
+	move.l	#0x00ff00ff,d5
+	move.l	#0x55555555,d6
+
+	move.l	(a0)+,d0
+	move.l	(a0)+,d1
+	move.l	(a0)+,d2
+	move.l	(a0)+,d3
+
+	| a7a6a5a4a3a2a1a0 b7b6b5b4b3b2b1b0 c7c6c5c4c3c2c1c0 d7d6d5d4d3d2d1d0
+	| e7e6e5e4e3e2e1e0 f7f6f5f4f3f2f1f0 g7g6g5g4g3g2g1g0 h7h6h5h4h3h2h1h0
+	| i7i6i5i4i3i2i1i0 j7j6j5j4j3j2j1j0 k7k6k5k4k3k2k1k0 l7l6l5l4l3l2l1l0
+	| m7m6m5m4m3m2m1m0 n7n6n5n4n3n2n1n0 o7o6o5o4o3o2o1o0 p7p6p5p4p3p2p1p0
+
+	move.l	d1,d7
+	lsr.l	#4,d7
+	eor.l	d0,d7
+	and.l	d4,d7
+	eor.l	d7,d0
+	lsl.l	#4,d7
+	eor.l	d7,d1
+	move.l	d3,d7
+	lsr.l	#4,d7
+	eor.l	d2,d7
+	and.l	d4,d7
+	eor.l	d7,d2
+	lsl.l	#4,d7
+	eor.l	d7,d3
+
+	| a7a6a5a4e7e6e5e4 b7b6b5b4f7f6f5f4 c7c6c5c4g7g6g5g4 d7d6d5d4h7h6h5h4
+	| a3a2a1a0e3e2e1e0 b3b2b1b0f3f2f1f0 c3c2c1c0g3g2g1g0 d3d2d1d0h3h2h1h0
+	| i7i6i5i4m7m6m5m4 j7j6j5j4n7n6n5n4 k7k6k5k4o7o6o5o4 l7l6l5l4p7p6p5p4
+	| i3i2i1i0m3m2m1m0 j3j2j1j0n3n2n1n0 k3k2k1k0o3o2o1o0 l3l2l1l0p3p2p1p0
+
+	move.l	d2,d7
+	lsr.l	#8,d7
+	eor.l	d0,d7
+	and.l	d5,d7
+	eor.l	d7,d0
+	lsl.l	#8,d7
+	eor.l	d7,d2
+	move.l	d3,d7
+	lsr.l	#8,d7
+	eor.l	d1,d7
+	and.l	d5,d7
+	eor.l	d7,d1
+	lsl.l	#8,d7
+	eor.l	d7,d3
+
+	| a7a6a5a4e7e6e5e4 i7i6i5i4m7m6m5m4 c7c6c5c4g7g6g5g4 k7k6k5k4o7o6o5o4
+	| a3a2a1a0e3e2e1e0 i3i2i1i0m3m2m1m0 c3c2c1c0g3g2g1g0 k3k2k1k0o3o2o1o0
+	| b7b6b5b4f7f6f5f4 j7j6j5j4n7n6n5n4 d7d6d5d4h7h6h5h4 l7l6l5l4p7p6p5p4
+	| b3b2b1b0f3f2f1f0 j3j2j1j0n3n2n1n0 d3d2d1d0h3h2h1h0 l3l2l1l0p3p2p1p0
+
+	bra.s	c2p1x1_8_rect_start
+
+c2p1x1_8_rect_pix16:
+	move.l	(a0)+,d0
+	move.l	(a0)+,d1
+	move.l	(a0)+,d2
+	move.l	(a0)+,d3
+
+	| a7a6a5a4a3a2a1a0 b7b6b5b4b3b2b1b0 c7c6c5c4c3c2c1c0 d7d6d5d4d3d2d1d0
+	| e7e6e5e4e3e2e1e0 f7f6f5f4f3f2f1f0 g7g6g5g4g3g2g1g0 h7h6h5h4h3h2h1h0
+	| i7i6i5i4i3i2i1i0 j7j6j5j4j3j2j1j0 k7k6k5k4k3k2k1k0 l7l6l5l4l3l2l1l0
+	| m7m6m5m4m3m2m1m0 n7n6n5n4n3n2n1n0 o7o6o5o4o3o2o1o0 p7p6p5p4p3p2p1p0
+
+	move.l	d1,d7
+	lsr.l	#4,d7
+	move.l	a3,(a1)+
+	eor.l	d0,d7
+	and.l	d4,d7
+	eor.l	d7,d0
+	lsl.l	#4,d7
+	eor.l	d7,d1
+	move.l	d3,d7
+	lsr.l	#4,d7
+	eor.l	d2,d7
+	and.l	d4,d7
+	eor.l	d7,d2
+	move.l	a4,(a1)+
+	lsl.l	#4,d7
+	eor.l	d7,d3
+
+	| a7a6a5a4e7e6e5e4 b7b6b5b4f7f6f5f4 c7c6c5c4g7g6g5g4 d7d6d5d4h7h6h5h4
+	| a3a2a1a0e3e2e1e0 b3b2b1b0f3f2f1f0 c3c2c1c0g3g2g1g0 d3d2d1d0h3h2h1h0
+	| i7i6i5i4m7m6m5m4 j7j6j5j4n7n6n5n4 k7k6k5k4o7o6o5o4 l7l6l5l4p7p6p5p4
+	| i3i2i1i0m3m2m1m0 j3j2j1j0n3n2n1n0 k3k2k1k0o3o2o1o0 l3l2l1l0p3p2p1p0
+
+	move.l	d2,d7
+	lsr.l	#8,d7
+	eor.l	d0,d7
+	and.l	d5,d7
+	eor.l	d7,d0
+	move.l	a5,(a1)+
+	lsl.l	#8,d7
+	eor.l	d7,d2
+	move.l	d3,d7
+	lsr.l	#8,d7
+	eor.l	d1,d7
+	and.l	d5,d7
+	eor.l	d7,d1
+	move.l	a6,(a1)+
+	lsl.l	#8,d7
+	eor.l	d7,d3
+
+	| a7a6a5a4e7e6e5e4 i7i6i5i4m7m6m5m4 c7c6c5c4g7g6g5g4 k7k6k5k4o7o6o5o4
+	| a3a2a1a0e3e2e1e0 i3i2i1i0m3m2m1m0 c3c2c1c0g3g2g1g0 k3k2k1k0o3o2o1o0
+	| b7b6b5b4f7f6f5f4 j7j6j5j4n7n6n5n4 d7d6d5d4h7h6h5h4 l7l6l5l4p7p6p5p4
+	| b3b2b1b0f3f2f1f0 j3j2j1j0n3n2n1n0 d3d2d1d0h3h2h1h0 l3l2l1l0p3p2p1p0
+
+	cmp.l	a1,a7					| end of dst line?
+	bne.s	c2p1x1_8_rect_start
+
+	add.l	(screen_offset,pc),a1
+	add.l	(screen_pitch,pc),a7
+
+c2p1x1_8_rect_start:
+	move.l	d2,d7
+	lsr.l	#1,d7
+	eor.l	d0,d7
+	and.l	d6,d7
+	eor.l	d7,d0
+	add.l	d7,d7
+	eor.l	d7,d2
+	move.l	d3,d7
+	lsr.l	#1,d7
+	eor.l	d1,d7
+	and.l	d6,d7
+	eor.l	d7,d1
+	add.l	d7,d7
+	eor.l	d7,d3
+
+	| a7b7a5b5e7f7e5f5 i7j7i5j5m7n7m5n5 c7d7c5d5g7h7g5h5 k7l7k5l5o7p7o5p5
+	| a3b3a1b1e3f3e1f1 i3j3i1j1m3n3m1n1 c3d3c1d1g3h3g1h1 k3l3k1l1o3p3o1p1
+	| a6b6a4b4e6f6e4f4 i6j6i4j4m6n6m4n4 c6d6c4d4g6h6g4h4 k6l6k4l4o6p6o4p4
+	| a2b2a0b0e2f2e0f0 i2j2i0j0m2n2m0n0 c2d2c0d0g2h2g0h0 k2l2k0l0o2p2o0p0
+
+	move.w	d2,d7
+	move.w	d0,d2
+	swap	d2
+	move.w	d2,d0
+	move.w	d7,d2
+	move.w	d3,d7
+	move.w	d1,d3
+	swap	d3
+	move.w	d3,d1
+	move.w	d7,d3
+
+	| a7b7a5b5e7f7e5f5 i7j7i5j5m7n7m5n5 a6b6a4b4e6f6e4f4 i6j6i4j4m6n6m4n4
+	| a3b3a1b1e3f3e1f1 i3j3i1j1m3n3m1n1 a2b2a0b0e2f2e0f0 i2j2i0j0m2n2m0n0
+	| c7d7c5d5g7h7g5h5 k7l7k5l5o7p7o5p5 c6d6c4d4g6h6g4h4 k6l6k4l4o6p6o4p4
+	| c3d3c1d1g3h3g1h1 k3l3k1l1o3p3o1p1 c2d2c0d0g2h2g0h0 k2l2k0l0o2p2o0p0
+
+	move.l	d2,d7
+	lsr.l	#2,d7
+	eor.l	d0,d7
+	and.l	#0x33333333,d7
+	eor.l	d7,d0
+	lsl.l	#2,d7
+	eor.l	d7,d2
+	move.l	d3,d7
+	lsr.l	#2,d7
+	eor.l	d1,d7
+	and.l	#0x33333333,d7
+	eor.l	d7,d1
+	lsl.l	#2,d7
+	eor.l	d7,d3
+
+	| a7b7c7d7e7f7g7h7 i7j7k7l7m7n7o7p7 a6b6c6d6e6f6g6h6 i6j6k6l6m6n6o6p6
+	| a3b3c3d3e3f3g3h3 i3j3k3l3m3n3o3p3 a2b2c2d2e2f2g2h2 i2j2k2l2m2n2o2p2
+	| a5b5c5d5e5f5g5h5 i5j5k5l5m5n5o5p5 a4b4c4d4e4f4g4h4 i4j4k4l4m4n4o4p4
+	| a1b1c1d1e1f1g1h1 i1j1k1l1m1n1o1p1 a0b0c0d0e0f0g0h0 i0j0k0l0m0n0o0p0
 
-	movew	c2p_width+2,d7
-	lsrw	#4,d7
-	subql	#1,d7
-SDL_Atari_C2p8_dblloop:
-	movel	a0@+,a1@+
-	movel	a0@+,a1@+
-	movel	a0@+,a1@+
-	movel	a0@+,a1@+
-	dbra	d7,SDL_Atari_C2p8_dblloop
+	swap	d0
+	swap	d1
+	swap	d2
+	swap	d3
 
-	addl	d1,c2p_curdst
+	move.l	d0,a6
+	move.l	d2,a5
+	move.l	d1,a4
+	move.l	d3,a3
 
-SDL_Atari_C2p8_nodblline:
+	cmp.l	a0,a2					| end of src line?
+	bne	c2p1x1_8_rect_pix16
 
-	/* Next line */
+	cmp.l	(chunky_end,pc),a2
+	beq.s	c2p1x1_8_rect_done
 
-	addl	d0,c2p_cursrc
-	addl	d1,c2p_curdst
+	add.l	(chunky_offset,pc),a0
+	add.l	(chunky_pitch,pc),a2
 
-	subqw	#1,c2p_row
-	bne		SDL_Atari_C2p8_rowloop	
+	bra	c2p1x1_8_rect_pix16
 
-	moveml	sp@+,d2-d7/a2-a6
+c2p1x1_8_rect_done:
+	move.l	a3,(a1)+
+	move.l	a4,(a1)+
+	move.l	a5,(a1)+
+	move.l	a6,(a1)+
+
+	move.l	old_sp,sp
+	movem.l	(sp)+,d2-d7/a2-a6
 #endif
 	rts
 
 /* ------------	Conversion C2P, 4 bits ------------ */
 
+| void SDL_Atari_C2pConvert4(const byte *pChunky, const byte *pChunkyEnd, byte *pScreen);
 SYM(SDL_Atari_C2pConvert4):
 #if !defined(__mcoldfire__)
-	movel	sp@(4),c2p_source
-	movel	sp@(8),c2p_dest
-	movel	sp@(12),c2p_width
-	movel	sp@(16),c2p_height
-	movel	sp@(20),c2p_dblligne
-	movel	sp@(24),c2p_srcpitch
-	movel	sp@(28),c2p_dstpitch
-
-	moveml	d2-d7/a2-a6,sp@-
-
-	movel	c2p_source,a0
-	movel	c2p_dest,a1
-	lea	c2p_table,a2
-	movel	#0x00070001,d3
-#if defined(__mc68020__)
-	moveq	#0,d0
-#endif
-	
-	movel	c2p_height,d7
-	subql	#1,d7
-c2p4_bcly:
-	movel	a0,a4	| Save start address of source
-	movel	a1,a5	| Save start address of dest
-
-	| Conversion
-                 			
-	movel	c2p_width,d6
-	lsrw	#4,d6
-	subql	#1,d6
-c2p4_bclx:
-	| Octets 0-7
-	
-	moveq	#0,d1
-	moveq	#7,d5
-c2p4_bcl07:
-#if defined(__mc68020__)
-	moveb	a0@+,d0
-	lea		a2@(0,d0:w:4),a3
+#ifdef __FASTCALL__
+									| a0: chunky
+	move.l	a1,d0					| chunky end
+	move.l	4(sp),a1				| screen
 #else
-	moveq	#0,d0
-	moveb	a0@+,d0
-	lslw	#2,d0
-	lea		a2@(0,d0:w),a3
+	move.l	(4,sp),a0				| chunky
+	move.l	(8,sp),d0				| chunky end
+	move.l	(12,sp),a1				| screen
 #endif
-	lsll	#1,d1
-	orl		a3@,d1
-	dbra	d5,c2p4_bcl07
+	movem.l	d2-d7/a2-a6,-(sp)
+	move.l	d0,a2
+	lea	SYM(SDL_Atari_C2pPalette4),a3
+	lea	(4,a0),a4
+
+	moveq	#0,d3
+	move.l	#0x33333333,d4
+	move.l	#0x00ff00ff,d5
+	move.l	#0x55555555,d6
+
+	move.b	(a0)+,d3
+	move.b	(a3,d3.l),d0
+	lsl.b	#4,d0
+	move.b	(a4)+,d3
+	or.b	(a3,d3.l),d0
+	lsl.l	#8,d0
+
+	move.b	(a0)+,d3
+	move.b	(a3,d3.l),d0
+	lsl.b	#4,d0
+	move.b	(a4)+,d3
+	or.b	(a3,d3.l),d0
+	lsl.l	#8,d0
+
+	move.b	(a0)+,d3
+	move.b	(a3,d3.l),d0
+	lsl.b	#4,d0
+	move.b	(a4)+,d3
+	or.b	(a3,d3.l),d0
+	lsl.l	#8,d0
+
+	move.b	(a0)+,d3
+	move.b	(a3,d3.l),d0
+	lsl.b	#4,d0
+	move.b	(a4)+,d3
+	or.b	(a3,d3.l),d0
+	addq.l	#4,a0
+	addq.l	#4,a4
+
+	move.b	(a0)+,d3
+	move.b	(a3,d3.l),d1
+	lsl.b	#4,d1
+	move.b	(a4)+,d3
+	or.b	(a3,d3.l),d1
+	lsl.l	#8,d1
+
+	move.b	(a0)+,d3
+	move.b	(a3,d3.l),d1
+	lsl.b	#4,d1
+	move.b	(a4)+,d3
+	or.b	(a3,d3.l),d1
+	lsl.l	#8,d1
+
+	move.b	(a0)+,d3
+	move.b	(a3,d3.l),d1
+	lsl.b	#4,d1
+	move.b	(a4)+,d3
+	or.b	(a3,d3.l),d1
+	lsl.l	#8,d1
+
+	move.b	(a0)+,d3
+	move.b	(a3,d3.l),d1
+	lsl.b	#4,d1
+	move.b	(a4)+,d3
+	or.b	(a3,d3.l),d1
+	addq.l	#4,a0
+	addq.l	#4,a4
+
+	bra	c2p1x1_4_start
+
+c2p1x1_4_pix16:
+	move.b	(a0)+,d3
+	move.b	(a3,d3.l),d0
+	lsl.b	#4,d0
+	move.b	(a4)+,d3
+	or.b	(a3,d3.l),d0
+	lsl.l	#8,d0
+
+	move.b	(a0)+,d3
+	move.b	(a3,d3.l),d0
+	lsl.b	#4,d0
+	move.b	(a4)+,d3
+	or.b	(a3,d3.l),d0
+	lsl.l	#8,d0
+
+	move.b	(a0)+,d3
+	move.b	(a3,d3.l),d0
+	lsl.b	#4,d0
+	move.b	(a4)+,d3
+	or.b	(a3,d3.l),d0
+	lsl.l	#8,d0
+
+	move.b	(a0)+,d3
+	move.b	(a3,d3.l),d0
+	lsl.b	#4,d0
+	move.b	(a4)+,d3
+	or.b	(a3,d3.l),d0
+
+	move.l	a5,(a1)+
+	addq.l	#4,a0
+	addq.l	#4,a4
+
+	move.b	(a0)+,d3
+	move.b	(a3,d3.l),d1
+	lsl.b	#4,d1
+	move.b	(a4)+,d3
+	or.b	(a3,d3.l),d1
+	lsl.l	#8,d1
+
+	move.b	(a0)+,d3
+	move.b	(a3,d3.l),d1
+	lsl.b	#4,d1
+	move.b	(a4)+,d3
+	or.b	(a3,d3.l),d1
+	lsl.l	#8,d1
+
+	move.b	(a0)+,d3
+	move.b	(a3,d3.l),d1
+	lsl.b	#4,d1
+	move.b	(a4)+,d3
+	or.b	(a3,d3.l),d1
+	lsl.l	#8,d1
+
+	move.b	(a0)+,d3
+	move.b	(a3,d3.l),d1
+	lsl.b	#4,d1
+	move.b	(a4)+,d3
+	or.b	(a3,d3.l),d1
+
+	move.l	a6,(a1)+
+	addq.l	#4,a0
+	addq.l	#4,a4
+
+c2p1x1_4_start:
+
+	| a3a2a1a0e3e2e1e0 b3b2b1b0f3f2f1f0 c3c2c1c0g3g2g1g0 d3d2d1d0h3h2h1h0
+	| i3i2i1i0m3m2m1m0 j3j2j1j0n3n2n1n0 k3k2k1k0o3o2o1o0 l3l2l1l0p3p2p1p0
+
+	move.l	d1,d7
+	lsr.l	#8,d7
+	eor.l	d0,d7
+	and.l	d5,d7
+	eor.l	d7,d0
+	lsl.l	#8,d7
+	eor.l	d7,d1
+
+	| a3a2a1a0e3e2e1e0 i3i2i1i0m3m2m1m0 c3c2c1c0g3g2g1g0 k3k2k1k0o3o2o1o0
+	| b3b2b1b0f3f2f1f0 j3j2j1j0n3n2n1n0 d3d2d1d0h3h2h1h0 l3l2l1l0p3p2p1p0
+
+	move.l	d1,d7
+	lsr.l	#1,d7
+	eor.l	d0,d7
+	and.l	d6,d7
+	eor.l	d7,d0
+	add.l	d7,d7
+	eor.l	d7,d1
+
+	| a3b3a1b1e3f3e1f1 i3j3i1j1m3n3m1n1 c3d3c1d1g3h3g1h1 k3l3k1l1o3p3o1p1
+	| a2b2a0b0e2f2e0f0 i2j2i0j0m2n2m0n0 c2d2c0d0g2h2g0h0 k2l2k0l0o2p2o0p0
+
+	move.w	d1,d7
+	move.w	d0,d1
+	swap	d1
+	move.w	d1,d0
+	move.w	d7,d1
 
-	movepl	d1,a1@(0)
-	addw	d3,a1
-	swap	d3
-	
-	| Octets 8-15
-
-	moveq	#0,d1
-	moveq	#7,d5
-c2p4_bcl815:
-#if defined(__mc68020__)
-	moveb	a0@+,d0
-	lea		a2@(0,d0:w:4),a3
-#else
-	moveq	#0,d0
-	moveb	a0@+,d0
-	lslw	#2,d0
-	lea		a2@(0,d0:w),a3
-#endif
-	lsll	#1,d1
-	orl		a3@,d1
-	dbra	d5,c2p4_bcl815
+	| a3b3a1b1e3f3e1f1 i3j3i1j1m3n3m1n1 a2b2a0b0e2f2e0f0 i2j2i0j0m2n2m0n0
+	| c3d3c1d1g3h3g1h1 k3l3k1l1o3p3o1p1 c2d2c0d0g2h2g0h0 k2l2k0l0o2p2o0p0
 
-	movepl	d1,a1@(0)
-	addw	d3,a1
-	swap	d3
+	move.l	d1,d7
+	lsr.l	#2,d7
+	eor.l	d0,d7
+	and.l	d4,d7
+	eor.l	d7,d0
+	lsl.l	#2,d7
+	eor.l	d7,d1
 
-	dbra	d6,c2p4_bclx
+	| a3b3c3d3e3f3g3h3 i3j3k3l3m3n3o3p3 a2b2c2d2e2f2g2h2 i2j2k2l2m2n2o2p2
+	| a1b1c1d1e1f1g1h1 i1j1k1l1m1n1o1p1 a0b0c0d0e0f0g0h0 i0j0k0l0m0n0o0p0
 
-	| Double line ?
+	swap	d0
+	swap	d1
 
-	tstl	c2p_dblligne
-	beqs	c2p4_nodblligne
+	move.l	d1,a5
+	move.l	d0,a6
 
-	movel	a5,a6			| src line
-	movel	a5,a1			| dest line
-	addl	c2p_dstpitch,a1
+	cmp.l	a0,a2
+	bne	c2p1x1_4_pix16
 
-	movel	c2p_width,d6
-	lsrw	#3,d6
-	subql	#1,d6
-c2p4_copydbl:
-	movel	a6@+,a1@+
-	dbra	d6,c2p4_copydbl
+	move.l	a5,(a1)+
+	move.l	a6,(a1)+
 
-	addl	c2p_dstpitch,a5
-c2p4_nodblligne:
+	movem.l	(sp)+,d2-d7/a2-a6
+#endif
+	rts
 
-	| Next line
 
-	movel	a4,a0		
-	addl	c2p_srcpitch,a0
-	movel	a5,a1
-	addl	c2p_dstpitch,a1
+| void SDL_Atari_C2pConvert4_rect(const byte *pChunky, const byte *pChunkyEnd, uint32 chunkyWidth, uint32 chunkyPitch, byte *pScreen, uint32 screenPitch);
+SYM(SDL_Atari_C2pConvert4_rect):
+#if !defined(__mcoldfire__)
+	movem.l	d2-d7/a2-a6,-(sp)		| 6 + 5 = 11 longs
+
+#ifdef __FASTCALL__
+									| a0: chunky
+	move.l	a1,chunky_end
+									| d0.l: chunky width
+	move.l	(11*4+4,sp),a1			| a1: screen
+	exg		d1,d2					| d2.l: chunky pitch
+									| d1.l: screen pitch
+#else
+	move.l	(11*4+4,sp),a0			| a0: chunky
+	move.l	(11*4+8,sp),chunky_end
+	move.l	(11*4+12,sp),d0			| d0.l: chunky width
+	move.l	(11*4+16,sp),d2			| d2.l: chunky pitch
+	move.l	(11*4+20,sp),a1			| a1: screen
+	move.l	(11*4+24,sp),d1			| d1.l: screen pitch
+#endif
 
-	dbra	d7,c2p4_bcly
+	move.l	d0,d3					| d3.l: screen width
+	lsr.l	#1,d3					|
+
+	lea	(a0,d0.l),a2				| a2: end of first src line
+	lea	(a1,d3.l),a3				| a3: end of first dst line
+	lea	SYM(SDL_Atari_C2pPalette4),a4
+
+	move.l	d1,screen_pitch
+
+	sub.l	d3,d1
+	move.l	d1,screen_offset
+
+	move.l	d2,chunky_pitch
+
+	sub.l	d0,d2
+	move.l	d2,chunky_offset
+
+	moveq	#0,d3
+	move.l	#0x33333333,d4
+	move.l	#0x00ff00ff,d5
+	move.l	#0x55555555,d6
+
+	lea	(a0,4),a5
+
+	move.b	(a0)+,d3
+	move.b	(a4,d3.l),d0
+	lsl.b	#4,d0
+	move.b	(a5)+,d3
+	or.b	(a4,d3.l),d0
+	lsl.l	#8,d0
+
+	move.b	(a0)+,d3
+	move.b	(a4,d3.l),d0
+	lsl.b	#4,d0
+	move.b	(a5)+,d3
+	or.b	(a4,d3.l),d0
+	lsl.l	#8,d0
+
+	move.b	(a0)+,d3
+	move.b	(a4,d3.l),d0
+	lsl.b	#4,d0
+	move.b	(a5)+,d3
+	or.b	(a4,d3.l),d0
+	lsl.l	#8,d0
+
+	move.b	(a0)+,d3
+	move.b	(a4,d3.l),d0
+	lsl.b	#4,d0
+	move.b	(a5)+,d3
+	or.b	(a4,d3.l),d0
+
+	addq.l	#4,a0
+	addq.l	#4,a5
+
+	move.b	(a0)+,d3
+	move.b	(a4,d3.l),d1
+	lsl.b	#4,d1
+	move.b	(a5)+,d3
+	or.b	(a4,d3.l),d1
+	lsl.l	#8,d1
+
+	move.b	(a0)+,d3
+	move.b	(a4,d3.l),d1
+	lsl.b	#4,d1
+	move.b	(a5)+,d3
+	or.b	(a4,d3.l),d1
+	lsl.l	#8,d1
+
+	move.b	(a0)+,d3
+	move.b	(a4,d3.l),d1
+	lsl.b	#4,d1
+	move.b	(a5)+,d3
+	or.b	(a4,d3.l),d1
+	lsl.l	#8,d1
+
+	move.b	(a0)+,d3
+	move.b	(a4,d3.l),d1
+	lsl.b	#4,d1
+	move.b	(a5)+,d3
+	or.b	(a4,d3.l),d1
+
+	addq.l	#4,a0
+
+	bra	c2p1x1_4_rect_start
+
+c2p1x1_4_rect_pix16:
+	move.l	a5,(a1)+
+
+	lea	(a0,4),a5
+
+	move.b	(a0)+,d3
+	move.b	(a4,d3.l),d0
+	lsl.b	#4,d0
+	move.b	(a5)+,d3
+	or.b	(a4,d3.l),d0
+	lsl.l	#8,d0
+
+	move.b	(a0)+,d3
+	move.b	(a4,d3.l),d0
+	lsl.b	#4,d0
+	move.b	(a5)+,d3
+	or.b	(a4,d3.l),d0
+	lsl.l	#8,d0
+
+	move.b	(a0)+,d3
+	move.b	(a4,d3.l),d0
+	lsl.b	#4,d0
+	move.b	(a5)+,d3
+	or.b	(a4,d3.l),d0
+	lsl.l	#8,d0
+
+	move.b	(a0)+,d3
+	move.b	(a4,d3.l),d0
+	lsl.b	#4,d0
+	move.b	(a5)+,d3
+	or.b	(a4,d3.l),d0
+
+	move.l	a6,(a1)+
+	addq.l	#4,a0
+	addq.l	#4,a5
+
+	move.b	(a0)+,d3
+	move.b	(a4,d3.l),d1
+	lsl.b	#4,d1
+	move.b	(a5)+,d3
+	or.b	(a4,d3.l),d1
+	lsl.l	#8,d1
+
+	move.b	(a0)+,d3
+	move.b	(a4,d3.l),d1
+	lsl.b	#4,d1
+	move.b	(a5)+,d3
+	or.b	(a4,d3.l),d1
+	lsl.l	#8,d1
+
+	move.b	(a0)+,d3
+	move.b	(a4,d3.l),d1
+	lsl.b	#4,d1
+	move.b	(a5)+,d3
+	or.b	(a4,d3.l),d1
+	lsl.l	#8,d1
+
+	move.b	(a0)+,d3
+	move.b	(a4,d3.l),d1
+	lsl.b	#4,d1
+	move.b	(a5)+,d3
+	or.b	(a4,d3.l),d1
+
+	addq.l	#4,a0
+
+	cmp.l	a1,a3				| end of dst line?
+	bne.s	c2p1x1_4_rect_start
+
+	add.l	(screen_offset,pc),a1
+	add.l	(screen_pitch,pc),a3
+
+c2p1x1_4_rect_start:
+
+	| a3a2a1a0e3e2e1e0 b3b2b1b0f3f2f1f0 c3c2c1c0g3g2g1g0 d3d2d1d0h3h2h1h0
+	| i3i2i1i0m3m2m1m0 j3j2j1j0n3n2n1n0 k3k2k1k0o3o2o1o0 l3l2l1l0p3p2p1p0
+
+	

(Patch may be truncated, please check the link at the top of this post.)