This patch is split in three parts:

- 000_nasm_include.diff: add -I$(srcdir)/ to the nasm flags so that
  we can include files from asm files.
- 004_propagate_pic_to_nasm.diff: fix strip_fPIC.sh so that it does
  not strip -DPIC, which is actually useful for nasm.
- 013_hermes_pic_support.diff: fix the Hermes assembly code so that it
  generates Position-Independent Code when -DPIC is used.

I do not know what the developers' stance on PIC is, because
there is a slight performance hit on x86, but I believe it is negligible
in the present case. Some x86 systems (such as BeOS and OS X AFAIK)
require shared libraries to be PIC. Others such as Linux can cope
with non-PIC dynamic objects, but then they are not mapped into shared
segments.

I have also had great trouble testing all these MMX and x86 routines.
Is there a program somewhere that tests all of them?

Regards,
-- 
Sam.
-------------- next part --------------
--- SDL-1.2.9.patched/src/hermes/Makefile.am 2003-10-06 09:13:16 +0200
+++ SDL-1.2.9/src/hermes/Makefile.am 2006-03-01 14:00:50 +0100
@@ -7,7 +7,7 @@
 SUFFIXES = .asm
 .asm.lo:
-	$(LIBTOOL) --tag=CC --mode=compile $(STRIP_FPIC) $(NASM) @NASMFLAGS@ $< -o $*.o
+	$(LIBTOOL) --tag=CC --mode=compile $(STRIP_FPIC) $(NASM) -I$(srcdir)/ @NASMFLAGS@ $< -o $*.o
###########################################################################
-------------- next part --------------
--- SDL-1.2.9.patched/strip_fPIC.sh 2001-05-10 22:19:50 +0200
+++ SDL-1.2.9/strip_fPIC.sh 2006-03-01 13:48:03 +0100
@@ -5,8 +5,8 @@
 command=""
 while [ $# -gt 0 ]; do
     case "$1" in
-        -?PIC)
-            # Ignore -fPIC and -DPIC options
+        -fPIC)
+            # Ignore -fPIC option
             ;;
         *)
             command="$command $1"
-------------- next part --------------
diff -puriN SDL-1.2.9.orig/src/hermes/common.asm SDL-1.2.9/src/hermes/common.asm
--- SDL-1.2.9.orig/src/hermes/common.asm 1970-01-01 01:00:00.000000000 +0100
+++ SDL-1.2.9/src/hermes/common.asm 2006-03-01 16:24:45.000000000 +0100
@@ -0,0 +1,84 @@
+;
+; PIC support for HERMES
+; Copyright (c) 2006 Sam Hocevar
+; This source code is licensed under the GNU LGPL
+;
+; Please refer to the file COPYING.LIB contained in the distribution for
+; licensing conditions
+
+
+; These macros are totally harmless when PIC is not defined but can ruin
+; everything if misused in PIC mode. On x86, shared objects cannot directly
+; access global variables by address, they need to go through the GOT (global
+; offset table). Most OSes do not care about it and let you load non-PIC
+; .so objects (Linux, Win32...). However, OS X requires PIC code in its
+; .dylib objects.
+;
+; - GOT_* should be used as a suffix for global addressing, eg.
+; mov eax, [foo GOT_ebx]
+; instead of
+; mov eax, [foo]
+;
+; - picgetgot computes the GOT address into the given register in PIC
+;   mode, otherwise does nothing. You need to do this before using GOT_*.
+;
+; - picpush and picpop respectively push and pop the given register
+; in PIC mode, otherwise do nothing. You should always use them around
+; picgetgot except when sure that the register is no longer used and is
+; being restored later by other means.
+;
+; - picesp is defined to compensate the changing of esp when pushing
+;   a register into the stack, eg.
+;     mov eax, [esp + 8]
+;     picpush ebx
+;     mov eax, [picesp + 12]
+;   instead of
+;     mov eax, [esp + 8]
+;     picpush ebx
+;     mov eax, [esp + 12]
+;
+; - picjmp jumps to a global address:
+; picgetgot ebx
+; picjmp ebx, far_address
+; instead of
+; jmp far_address
+;
+%ifdef PIC
+    EXTERN _GLOBAL_OFFSET_TABLE_
+    %define GOT_eax + eax wrt ..gotoff
+    %define GOT_ebx + ebx wrt ..gotoff
+    %define GOT_ecx + ecx wrt ..gotoff
+    %define GOT_edx + edx wrt ..gotoff
+    %macro picgetgot 1          ; arg 1: register that receives the GOT address (clobbered)
+        call %%getgot           ; push the run-time address of the next instruction
+    %%getgot:
+        pop %1                  ; fetch that address into the chosen register
+        add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%getgot wrt ..gotpc
+    %endmacro
+    %macro picpush 1            ; save a register that is about to hold the GOT address
+        push %1
+    %endmacro
+    %macro picpop 1             ; restore a register saved with picpush
+        pop %1
+    %endmacro
+    %define picesp esp+4
+    %macro picjmp 2             ; arg 1: register holding the GOT address, arg 2: global target
+        jmp [%2 + %1 wrt ..got] ; indirect jump through the target's GOT entry
+    %endmacro
+%else
+    %define GOT_eax
+    %define GOT_ebx
+    %define GOT_ecx
+    %define GOT_edx
+    %macro picgetgot 1          ; non-PIC build: expands to nothing
+    %endmacro
+    %macro picpush 1            ; non-PIC build: expands to nothing
+    %endmacro
+    %macro picpop 1             ; non-PIC build: expands to nothing
+    %endmacro
+    %define picesp esp
+    %macro picjmp 2             ; non-PIC build: plain direct jump, arg 1 is ignored
+        jmp %2
+    %endmacro
+%endif
diff -puriN SDL-1.2.9.orig/src/hermes/mmxp2_32.asm SDL-1.2.9/src/hermes/mmxp2_32.asm
--- SDL-1.2.9.orig/src/hermes/mmxp2_32.asm 2006-03-01 14:30:22.000000000 +0100
+++ SDL-1.2.9/src/hermes/mmxp2_32.asm 2006-03-01 16:24:45.000000000 +0100
@@ -18,6 +18,8 @@
 ; are, they're terrible on p5 MMXs, but less so on pIIs. Someone needs to
 ; optimise them for p5 MMXs...
+%include "common.asm"
+
 BITS 32
@@ -50,8 +52,9 @@ mmx32_bgr555_mul dd 00082000h,00082000h
 _ConvertMMXpII32_24RGB888:
+        picgetgot edx ; edx is not used yet
         ; set up mm6 as the mask, mm7 as zero
-        movq mm6, qword [mmx32_rgb888_mask]
+        movq mm6, qword [mmx32_rgb888_mask GOT_edx]
         pxor mm7, mm7
         mov edx, ecx ; save ecx
@@ -108,16 +112,18 @@ _ConvertMMXpII32_24RGB888:
         dec ecx
         jnz .L3
 .L4:
-        jmp _mmxreturn
+        picgetgot ecx
+        picjmp ecx, _mmxreturn
 _ConvertMMXpII32_16RGB565:
+        picgetgot edx ; edx is not used yet
         ; set up masks
-        movq mm5, [mmx32_rgb565_b]
-        movq mm6, [mmx32_rgb565_g]
-        movq mm7, [mmx32_rgb565_r]
+        movq mm5, [mmx32_rgb565_b GOT_edx]
+        movq mm6, [mmx32_rgb565_g GOT_edx]
+        movq mm7, [mmx32_rgb565_r GOT_edx]
         mov edx, ecx
         shr ecx, 2
@@ -176,14 +182,16 @@ _ConvertMMXpII32_16RGB565:
         jnz .L3
 .L4:
-        jmp _mmxreturn
+        picgetgot ecx
+        picjmp ecx, _mmxreturn
 _ConvertMMXpII32_16BGR565:
-        movq mm5, [mmx32_rgb565_r]
-        movq mm6, [mmx32_rgb565_g]
-        movq mm7, [mmx32_rgb565_b]
+        picgetgot edx ; edx is not used yet
+        movq mm5, [mmx32_rgb565_r GOT_edx]
+        movq mm6, [mmx32_rgb565_g GOT_edx]
+        movq mm7, [mmx32_rgb565_b GOT_edx]
         mov edx, ecx
         shr ecx, 2
@@ -245,7 +253,8 @@ _ConvertMMXpII32_16BGR565:
         jnz .L3
 .L4:
-        jmp _mmxreturn
+        picgetgot ecx
+        picjmp ecx, _mmxreturn
 _ConvertMMXpII32_16BGR555:
@@ -253,7 +262,8 @@ _ConvertMMXpII32_16BGR555:
         ; except it uses a different multiplier for the pmaddwd
         ; instruction. cool huh.
-        movq mm7, qword [mmx32_bgr555_mul]
+        picgetgot ebx ; ebx is not used yet
+        movq mm7, qword [mmx32_bgr555_mul GOT_ebx]
         jmp _convert_bgr555_cheat
 ; This is the same as the Intel version... they obviously went to
@@ -263,9 +273,10 @@ _ConvertMMXpII32_16BGR555:
 ; (I think) a more accurate name...
 _ConvertMMXpII32_16RGB555:
-        movq mm7,qword [mmx32_rgb555_mul]
+        picgetgot ebx ; ebx is not used yet
+        movq mm7,qword [mmx32_rgb555_mul GOT_ebx]
 _convert_bgr555_cheat:
-        movq mm6,qword [mmx32_rgb555_g]
+        movq mm6,qword [mmx32_rgb555_g GOT_ebx]
         mov edx,ecx ; Save ecx
@@ -280,10 +291,10 @@ _convert_bgr555_cheat:
         movq mm0,[esi]
         movq mm3,mm2
-        pand mm3,qword [mmx32_rgb555_rb]
+        pand mm3,qword [mmx32_rgb555_rb GOT_ebx]
         movq mm1,mm0
-        pand mm1,qword [mmx32_rgb555_rb]
+        pand mm1,qword [mmx32_rgb555_rb GOT_ebx]
         pmaddwd mm3,mm7
         pmaddwd mm1,mm7
@@ -302,13 +313,13 @@ _convert_bgr555_cheat:
         movq mm0,mm4
         psrld mm1,6
-        pand mm0,qword [mmx32_rgb555_rb]
+        pand mm0,qword [mmx32_rgb555_rb GOT_ebx]
         packssdw mm1,mm3
         movq mm3,mm5
         pmaddwd mm0,mm7
-        pand mm3,qword [mmx32_rgb555_rb]
+        pand mm3,qword [mmx32_rgb555_rb GOT_ebx]
         pand mm4,mm6
         movq [edi],mm1
@@ -329,10 +340,10 @@ _convert_bgr555_cheat:
         movq mm3,mm2
         movq mm1,mm0
-        pand mm3,qword [mmx32_rgb555_rb]
+        pand mm3,qword [mmx32_rgb555_rb GOT_ebx]
         packssdw mm5,mm4
-        pand mm1,qword [mmx32_rgb555_rb]
+        pand mm1,qword [mmx32_rgb555_rb GOT_ebx]
         pand mm2,mm6
         movq [edi+8],mm5
@@ -380,7 +391,8 @@ _convert_bgr555_cheat:
         jnz .L3
 .L4:
-        jmp _mmxreturn
+        picgetgot ecx
+        picjmp ecx, _mmxreturn
 SECTION .note.GNU-stack
diff -puriN SDL-1.2.9.orig/src/hermes/x86_main.asm SDL-1.2.9/src/hermes/x86_main.asm
--- SDL-1.2.9.orig/src/hermes/x86_main.asm 2006-03-01 14:12:59.000000000 +0100
+++ SDL-1.2.9/src/hermes/x86_main.asm 2006-03-01 16:28:44.000000000 +0100
@@ -9,6 +9,8 @@
 ; Most routines are (c) Glenn Fiedler (ptc at gaffer.org), used with permission
 ;
+%include "common.asm"
+
 BITS 32
 GLOBAL _ConvertX86
@@ -17,11 +19,6 @@ GLOBAL _x86return
 GLOBAL _Hermes_X86_CPU
-SECTION .data
-cpu_flags dd 0
 SECTION .text
 ;; _ConvertX86:
@@ -110,16 +107,18 @@ _Hermes_X86_CPU:
         xor eax,ecx
         je .L1
-        pusha
+        push ebx
+        push ecx
+        push edx
         mov eax,1
         cpuid
-        mov [cpu_flags],edx
-        popa
+        mov eax,edx
-        mov eax,[cpu_flags]
+        pop edx
+        pop ecx
+        pop ebx
 .L1:
         ret
diff -puriN SDL-1.2.9.orig/src/hermes/x86p_16.asm SDL-1.2.9/src/hermes/x86p_16.asm
--- SDL-1.2.9.orig/src/hermes/x86p_16.asm 2006-03-01 14:12:59.000000000 +0100
+++ SDL-1.2.9/src/hermes/x86p_16.asm 2006-03-01 16:24:45.000000000 +0100
@@ -10,7 +10,8 @@
 ; Used with permission.
 ;
+%include "common.asm"
+
 BITS 32
 GLOBAL _ConvertX86p16_32RGB888
@@ -57,7 +58,8 @@ _ConvertX86p16_16BGR565:
         dec ecx
         jnz .L1
 .L2
-        jmp _x86return
+        picgetgot ebx
+        picjmp ebx, _x86return
 .L3 ; head
         mov eax,edi
@@ -135,7 +137,8 @@ _ConvertX86p16_16BGR565:
         add edi,BYTE 2
 .L7
-        jmp _x86return
+        picgetgot ebx
+        picjmp ebx, _x86return
@@ -164,7 +167,8 @@ _ConvertX86p16_16RGB555:
         dec ecx
         jnz .L1
 .L2
-        jmp _x86return
+        picgetgot ebx
+        picjmp ebx, _x86return
 .L3 ; head
         mov eax,edi
@@ -243,7 +247,8 @@ _ConvertX86p16_16RGB555:
         jmp SHORT .L6
 .L7     pop ebp
-        jmp _x86return
+        picgetgot ebx
+        picjmp ebx, _x86return
@@ -277,7 +282,8 @@ _ConvertX86p16_16BGR555:
         dec ecx
         jnz .L1
 .L2
-        jmp _x86return
+        picgetgot ebx
+        picjmp ebx, _x86return
 .L3 ; head
         mov eax,edi
@@ -361,7 +367,8 @@ _ConvertX86p16_16BGR555:
         add edi,BYTE 2
 .L7
-        jmp _x86return
+        picgetgot ebx
+        picjmp ebx, _x86return
@@ -394,7 +401,8 @@ _ConvertX86p16_8RGB332:
         dec ecx
         jnz .L1
 .L2
-        jmp _x86return
+        picgetgot ebx
+        picjmp ebx, _x86return
 .L3     mov eax,edi
         and eax,BYTE 11b
@@ -492,6 +500,7 @@ _ConvertX86p16_8RGB332:
         jnz .L6
 .L7     pop ebp
-        jmp _x86return
+        picgetgot ebx
+        picjmp ebx, _x86return
 SECTION .note.GNU-stack
diff -puriN SDL-1.2.9.orig/src/hermes/x86p_32.asm SDL-1.2.9/src/hermes/x86p_32.asm
--- SDL-1.2.9.orig/src/hermes/x86p_32.asm 2006-03-01 14:12:59.000000000 +0100
+++ SDL-1.2.9/src/hermes/x86p_32.asm 2006-03-01 16:24:45.000000000 +0100
@@ -9,6 +9,7 @@
 ; Most routines are (c) Glenn Fiedler (ptc at gaffer.org), used with permission
 ;
+%include "common.asm"
 BITS 32
@@ -53,7 +54,8 @@ _ConvertX86p32_32BGR888:
         dec ecx
         jnz .L1
 .L2
-        jmp _x86return
+        picgetgot ebx
+        picjmp ebx, _x86return
 .L3 ; save ebp
         push ebp
@@ -113,7 +115,8 @@ _ConvertX86p32_32BGR888:
         jnz .L5
 .L6     pop ebp
-        jmp _x86return
+        picgetgot ebx
+        picjmp ebx, _x86return
@@ -133,7 +136,8 @@ _ConvertX86p32_32RGBA888:
         dec ecx
         jnz .L1
 .L2
-        jmp _x86return
+        picgetgot ebx
+        picjmp ebx, _x86return
 .L3 ; save ebp
         push ebp
@@ -184,7 +188,8 @@ _ConvertX86p32_32RGBA888:
         jnz .L5
 .L6     pop ebp
-        jmp _x86return
+        picgetgot ebx
+        picjmp ebx, _x86return
@@ -204,7 +209,8 @@ _ConvertX86p32_32BGRA888:
         dec ecx
         jnz .L1
 .L2
-        jmp _x86return
+        picgetgot ebx
+        picjmp ebx, _x86return
 .L3 ; save ebp
         push ebp
@@ -257,7 +263,8 @@ _ConvertX86p32_32BGRA888:
         jnz .L5
 .L6     pop ebp
-        jmp _x86return
+        picgetgot ebx
+        picjmp ebx, _x86return
@@ -282,7 +289,8 @@ _ConvertX86p32_24RGB888:
         dec ecx
         jnz .L1
 .L2
-        jmp _x86return
+        picgetgot ebx
+        picjmp ebx, _x86return
 .L3 ; head
         mov edx,edi
@@ -354,7 +362,8 @@ _ConvertX86p32_24RGB888:
         jnz .L6
 .L7     pop ebp
-        jmp _x86return
+        picgetgot ebx
+        picjmp ebx, _x86return
@@ -380,7 +389,8 @@ _ConvertX86p32_24BGR888:
         dec ecx
         jnz .L1
 .L2
-        jmp _x86return
+        picgetgot ebx
+        picjmp ebx, _x86return
 .L3 ; head
         mov edx,edi
@@ -455,7 +465,8 @@ _ConvertX86p32_24BGR888:
 .L7
         pop ebp
-        jmp _x86return
+        picgetgot ebx
+        picjmp ebx, _x86return
@@ -484,7 +495,8 @@ _ConvertX86p32_16RGB565:
         jnz .L1
 .L2: ; End of short loop
-        jmp _x86return
+        picgetgot ebx
+        picjmp ebx, _x86return
 .L3 ; head
@@ -570,7 +582,8 @@ _ConvertX86p32_16RGB565:
         add edi,BYTE 2
 .L7:
-        jmp _x86return
+        picgetgot ebx
+        picjmp ebx, _x86return
@@ -599,7 +612,8 @@ _ConvertX86p32_16BGR565:
         dec ecx
         jnz .L1
 .L2
-        jmp _x86return
+        picgetgot ebx
+        picjmp ebx, _x86return
 .L3 ; head
         mov ebx,edi
@@ -684,7 +698,8 @@ _ConvertX86p32_16BGR565:
         add edi,BYTE 2
 .L7
-        jmp _x86return
+        picgetgot ebx
+        picjmp ebx, _x86return
@@ -713,7 +728,8 @@ _ConvertX86p32_16RGB555:
         dec ecx
         jnz .L1
 .L2
-        jmp _x86return
+        picgetgot ebx
+        picjmp ebx, _x86return
 .L3 ; head
         mov ebx,edi
@@ -795,7 +811,8 @@ _ConvertX86p32_16RGB555:
         add edi,BYTE 2
 .L7
-        jmp _x86return
+        picgetgot ebx
+        picjmp ebx, _x86return
@@ -825,7 +842,8 @@ _ConvertX86p32_16BGR555:
         dec ecx
         jnz .L1
 .L2
-        jmp _x86return
+        picgetgot ebx
+        picjmp ebx, _x86return
 .L3 ; head
         mov ebx,edi
@@ -910,7 +928,8 @@ _ConvertX86p32_16BGR555:
         add edi,BYTE 2
 .L7
-        jmp _x86return
+        picgetgot ebx
+        picjmp ebx, _x86return
@@ -1040,5 +1059,6 @@ _ConvertX86p32_8RGB332:
         jnz .L3
 .L4:
-        jmp _x86return
+        picgetgot ebx
+        picjmp ebx, _x86return