; tge/engine/terrain/blender_asm.asm
; 2017-04-17 06:17:10 -06:00
;
; 1368 lines
; 35 KiB
; NASM
; Executable File
;
;-----------------------------------------------------------------------------
; Torque Game Engine
; Copyright (C) GarageGames.com, Inc.
;-----------------------------------------------------------------------------
; syntax: export_fn <function name>
; Declares <function name> as a global symbol and places its entry label at
; the current position. Unless LINUX is defined the symbol is emitted with a
; leading underscore; ELF object files take the bare name.
%macro export_fn 1
%ifndef LINUX
global _%1
_%1:
%else
; No underscore needed for ELF object files
global %1
%1:
%endif
%endmacro
; syntax: global_var <variable name>
; Exports a variable that is defined later in this file. Unless LINUX is
; defined, the exported symbol carries a leading underscore and the plain name
; is aliased to it, so the rest of the file keeps using the undecorated name.
%macro global_var 1
%ifndef LINUX
global _%1
%define %1 _%1
%else
global %1
%endif
%endmacro
; syntax: extern_var <variable name>
; Imports a variable defined in another object file. Unless LINUX is defined,
; the imported symbol carries a leading underscore and the plain name is
; aliased to it, so code in this file refers to the undecorated name.
%macro extern_var 1
%ifndef LINUX
extern _%1
%define %1 _%1
%else
extern %1
%endif
%endmacro
; push registers
; Standard entry sequence for every exported routine in this file: sets up
; ebp as the frame pointer (stack arguments start at [ebp+8]) and saves
; ebx, esi and edi. The push order here must stay the exact mirror of the
; pop order in the epilogue macro.
%macro prologue 0
push ebp
mov ebp, esp ; set up ebp for parameter access
; pushad ;
push ebx
push esi
push edi
%endmacro
; pop registers
; Exit sequence matching the prologue macro: restores edi, esi, ebx and ebp
; in the reverse of the order they were pushed. It does not reload esp from
; ebp, so the routine must leave esp exactly where the prologue left it, and
; it does not emit the ret itself.
%macro epilogue 0
pop edi
pop esi
pop ebx
; popad
pop ebp
%endmacro
; Static working storage and constants for the routines in this file.
; The routines keep their scratch state at these fixed addresses rather than
; on the stack, so two calls running at the same time would share it.
segment .data
; doSquareX loop counters: ix = 8-byte column steps per row, iy = rows left
ix dd 0
iy dd 0
; times 2 dd == 64 bits == sizeof(QWORD) in blender.cc
; doSquareX: per-pixel horizontal alpha step for each source texture (a..d)
delta_a times 2 dd 0
delta_b times 2 dd 0
delta_c times 2 dd 0
delta_d times 2 dd 0
; doSquareX: current alpha at the start of the row (alpha_X0) and at the
; other end of the row (alpha_X1) for each source texture
alpha_a0 times 2 dd 0
alpha_b0 times 2 dd 0
alpha_c0 times 2 dd 0
alpha_d0 times 2 dd 0
alpha_a1 times 2 dd 0
alpha_b1 times 2 dd 0
alpha_c1 times 2 dd 0
alpha_d1 times 2 dd 0
; written by doSquare3/doSquare4 but not read anywhere in this file
alpha_c2 times 2 dd 0
alpha_d2 times 2 dd 0
; doSquareX: per-row step applied to alpha_X0 (ldelt_X) and alpha_X1 (rdelt_X)
ldelt_a times 2 dd 0
ldelt_b times 2 dd 0
ldelt_c times 2 dd 0
ldelt_d times 2 dd 0
rdelt_a times 2 dd 0
rdelt_b times 2 dd 0
rdelt_c times 2 dd 0
rdelt_d times 2 dd 0
; all-zero qword, used as the zero operand for unpack/pack instructions
zero times 2 dd 0
; bit-field masks applied to the low word of a lumel value
redLightMask dw 0xf800, 0, 0, 0
greenLightMask dw 0x07c0, 0, 0, 0
blueLightMask dw 0x003e, 0, 0, 0
; doLumelPlus1Mip: per-pixel light steps for the first (delta2) and second
; (delta3) row of each row pair
delta2 times 2 dd 0
delta3 times 2 dd 0
; doLumelPlus1Mip: left/right edge light values, their per-row steps, and the
; per-row steps doubled (one loop pass covers two source rows)
rdelt_x2 times 2 dd 0
ldelt_x2 times 2 dd 0
leftq times 2 dd 0
rightq times 2 dd 0
rdeltq times 2 dd 0
ldeltq times 2 dd 0
; constants for packing two 8-bit-per-channel pixels into two 16-bit pixels:
; mulfact = 2^13, 2^3, 2^13, 2^3 (pmaddwd factors); redblue/green select the
; top five bits of the channel bytes; alpha supplies bit 0 of each 16-bit pixel
mulfact dw 0x2000, 0x0008, 0x2000, 0x0008
redblue dw 0x00f8, 0x00f8, 0x00f8, 0x00f8
green dw 0xf800, 0, 0xf800, 0
alpha dw 0x0001, 0x0001, 0, 0
; single-field masks used when assembling one 16-bit pixel from channel words
mask_7c0 dw 0x07c0, 0, 0, 0
mask_f8 dw 0x00f8, 0, 0, 0
mask_f800000000 dw 0, 0, 0x00f8, 0
; cheatmips: keep the low word of each dword (0xffff) / the same after a
; one-bit right shift (0x7fff)
mask_0000ffff0000ffff dw 0xffff, 0, 0xffff, 0
mask_00007fff00007fff dw 0x7fff, 0, 0x7fff, 0
; externs for global variables declared in blender.cc
extern_var lumels
; declare global variables
; (exported so code outside this file can set them; sTargetTexelsPerLumel and
; nextdstrow are not referenced by any routine in this file)
global_var sTargetTexelsPerLumel_log2
global_var sTargetTexelsPerLumel
global_var sTargetTexelsPerLumelDiv2
global_var nextsrcrow
global_var nextdstrow
global_var mip0_dstrowadd
global_var mip1_dstrowadd
global_var minus1srcrowsPlus8
global_var srcrows_x2_MinusTPL
; define global variables
sTargetTexelsPerLumel_log2 dd 0
sTargetTexelsPerLumel dd 0
sTargetTexelsPerLumelDiv2 dd 0
nextsrcrow dd 0
nextdstrow dd 0
mip0_dstrowadd dd 0
mip1_dstrowadd dd 0
minus1srcrowsPlus8 dd 0
srcrows_x2_MinusTPL dd 0
segment .text
; parameter accessors for all of the doSquareX functions
; Stack arguments relative to the frame pointer set up by the prologue macro
; ([ebp+0] = saved ebp, [ebp+4] = return address, first argument at [ebp+8]).
%define dst [ebp+8]
%define sq_shift [ebp+12]
%define aoff [ebp+16]
%define bmp_ptrs [ebp+20]
%define alpha_ptrs [ebp+24]
; void doSquare4(
; U32 *dst,
; int sq_shift,
; const int *aoff,
; const U32 *const *bmp_ptrs,
; const U8 *const *alpha_ptrs );
;
; Blends four source bitmaps into dst using per-texture alpha values that are
; interpolated across a square of (1 << sq_shift) rows.
; dst        - output pixels, written 8 bytes (two U32 pixels) at a time
; sq_shift   - log2 of the square size; also the shift used to turn corner
;              differences into per-row / per-pixel steps
; aoff       - four offsets into each alpha map, one per corner of the square
; bmp_ptrs   - four source bitmap pointers (read 8 bytes at a time)
; alpha_ptrs - four alpha map pointers, one per source bitmap
; Source and destination pointers only ever advance by 8 per step; no row
; stride is added between rows.
; Uses the static ix/iy/alpha_*/delta_*/ldelt_*/rdelt_* storage and all eight
; MMX registers; executes emms before returning.
; NOTE(review): with sq_shift == 0, ix becomes 0 and the dec/jnz column loop
; would wrap around; callers presumably always pass sq_shift >= 1 - confirm.
export_fn doSquare4
prologue
; init iy
mov eax, 1
mov cl, sq_shift
shl eax, cl
mov dword [iy], eax
; init ix
shr eax, 1
mov dword [ix], eax
movd mm1, sq_shift
; get alpha values for the corners of the square for each texture type.
; replicate the values into 4 words of the qwords. Also calc vertical
; stepping values for the alpha values on left and right edges.
; load alpha value into bl and shift left by 7 (x128) for precision. then
; punpcklwd mm0, mm0 followed by punpckldq mm0, mm0
; to replicate the low word into all words of mm0.
; shift down difference by sqshift to divide by pixels per square to get
; increment.
mov esi, aoff
mov edi, alpha_ptrs
; ---- texture a: alpha map alpha_ptrs[0] ----
mov eax, [edi]
mov edx, eax ; edx keeps the alpha map base for the other corners
add eax, [esi] ; corner aoff[0]
xor ebx,ebx
mov bl, [eax]
shl ebx, 7
mov eax, edx
movd mm2, ebx
punpcklwd mm2, mm2
add eax, [esi+8] ; corner aoff[2]
punpckldq mm2, mm2
xor ebx, ebx
mov bl, [eax]
shl ebx, 7
mov eax, edx
movd mm0, ebx
punpcklwd mm0, mm0
punpckldq mm0, mm0
movq [alpha_a0], mm2
psubw mm0, mm2 ; corner[2] - corner[0]
add eax, [esi+4] ; corner aoff[1]
xor ebx, ebx
mov bl, [eax]
shl ebx, 7
mov eax, edx
movd mm4, ebx
punpcklwd mm4, mm4
add eax, [esi+12] ; corner aoff[3]
punpckldq mm4, mm4
xor ebx, ebx
mov bl, [eax]
shl ebx, 7
movd mm3, ebx
movq [alpha_a1], mm4
punpcklwd mm3, mm3
punpckldq mm3, mm3
psraw mm0, mm1
psubw mm3, mm4 ; corner[3] - corner[1]
movq [ldelt_a], mm0
psraw mm3, mm1
movq [rdelt_a], mm3
; ---- texture b: alpha map alpha_ptrs[1], same sequence as texture a ----
mov eax, [edi+4]
mov edx, eax
add eax, [esi]
xor ebx, ebx
mov bl, [eax]
shl ebx, 7
mov eax, edx
movd mm2, ebx
punpcklwd mm2, mm2
add eax, [esi+8]
punpckldq mm2, mm2
xor ebx, ebx
mov bl, [eax]
shl ebx, 7
mov eax, edx
movd mm0, ebx
punpcklwd mm0, mm0
punpckldq mm0, mm0
movq [alpha_b0], mm2
psubw mm0, mm2
add eax, [esi+4]
xor ebx, ebx
mov bl, [eax]
shl ebx, 7
mov eax, edx
movd mm4, ebx
punpcklwd mm4, mm4
add eax, [esi+12]
punpckldq mm4, mm4
xor ebx, ebx
mov bl, [eax]
shl ebx, 7
movd mm3, ebx
movq [alpha_b1], mm4
punpcklwd mm3, mm3
punpckldq mm3, mm3
psraw mm0, mm1
psubw mm3, mm4
movq [ldelt_b], mm0
psraw mm3, mm1
movq [rdelt_b], mm3
; ---- texture c: alpha map alpha_ptrs[2] ----
mov eax, [edi+8]
mov edx, eax
add eax, [esi]
xor ebx, ebx
mov bl, [eax]
shl ebx, 7
mov eax, edx
movd mm2, ebx
punpcklwd mm2, mm2
add eax, [esi+8]
punpckldq mm2, mm2
xor ebx, ebx
mov bl, [eax]
shl ebx, 7
mov eax, edx
movd mm0, ebx
punpcklwd mm0, mm0
punpckldq mm0, mm0
movq [alpha_c0], mm2
movq [alpha_c2], mm0 ; stored but not read anywhere in this file
psubw mm0, mm2
add eax, [esi+4]
xor ebx, ebx
mov bl, [eax]
shl ebx, 7
mov eax, edx
movd mm4, ebx
punpcklwd mm4, mm4
add eax, [esi+12]
punpckldq mm4, mm4
xor ebx, ebx
mov bl, [eax]
shl ebx, 7
movd mm3, ebx
movq [alpha_c1], mm4
punpcklwd mm3, mm3
punpckldq mm3, mm3
psraw mm0, mm1
psubw mm3, mm4
movq [ldelt_c], mm0
psraw mm3, mm1
movq [rdelt_c], mm3
; ---- texture d: alpha map alpha_ptrs[3] ----
mov eax, [edi+12]
mov edx, eax
add eax, [esi]
xor ebx, ebx
mov bl, [eax]
shl ebx, 7
mov eax, edx
movd mm2, ebx
punpcklwd mm2, mm2
add eax, [esi+8]
punpckldq mm2, mm2
xor ebx, ebx
mov bl, [eax]
shl ebx, 7
mov eax, edx
movd mm0, ebx
punpcklwd mm0, mm0
punpckldq mm0, mm0
movq [alpha_d0], mm2
movq [alpha_d2], mm0 ; stored but not read anywhere in this file
psubw mm0, mm2
add eax, [esi+4]
xor ebx, ebx
mov bl, [eax]
shl ebx, 7
mov eax, edx
movd mm4, ebx
punpcklwd mm4, mm4
add eax, [esi+12]
punpckldq mm4, mm4
xor ebx, ebx
mov bl, [eax]
shl ebx, 7
movd mm3, ebx
movq [alpha_d1], mm4
punpcklwd mm3, mm3
punpckldq mm3, mm3
psraw mm0, mm1
psubw mm3, mm4
movq [ldelt_d], mm0
psraw mm3, mm1
movq [rdelt_d], mm3
; ---- set up loop registers ----
; eax/ebx/ecx/edx = source bitmap pointers a/b/c/d, edi = destination
; mm4..mm7 = row-start alphas a..d; mm0/mm2/mm3 = row-end alphas a/b/c
mov esi, bmp_ptrs
mov eax, [esi]
mov ebx, [esi+4]
mov ecx, [esi+8]
mov edx, [esi+12]
movq mm0, [alpha_a1]
movq mm2, [alpha_b1]
movq mm3, [alpha_c1]
movq mm4, [alpha_a0]
movq mm5, [alpha_b0]
movq mm6, [alpha_c0]
movq mm7, [alpha_d0]
mov edi, dst
yloop4:
; mm1 should be sq_shift at this point
; calculate alpha step increments...word-size steps are replicated
; to fill qword.
psubw mm0, mm4
psraw mm0, mm1 ;mm0 = (right-left) >> sq_shift
movq [delta_a], mm0 ;delta = ainc ainc ainc ainc
psubw mm2, mm5
psraw mm2, mm1 ;mm2 = (right-left) >> sq_shift
movq [delta_b], mm2 ;delta = binc binc binc binc
psubw mm3, mm6
psraw mm3, mm1 ;mm3 = (right-left) >> sq_shift
movq [delta_c], mm3 ;delta = cinc cinc cinc cinc
movq mm0, [alpha_d1]
psubw mm0, mm7
psraw mm0, mm1 ;mm0 = (right-left) >> sq_shift
movq [delta_d], mm0 ;delta = dinc dinc dinc dinc
mov esi, [ix] ; column-step counter for this row
pxor mm2, mm2 ; mm2 = 0, used to zero-extend bytes to words
xloop4:
; Each pass handles two pixels from every source: unpack one pixel to words,
; scale by the current alpha (pmulhw keeps the high 16 bits), step the alpha,
; repeat for the second pixel, repack, and accumulate into mm0.
; mm1 and mm3 are scratch here (mm1 no longer holds sq_shift).
; -- source a --
movq mm0, [eax]
movq mm1, mm0
punpcklbw mm0, mm2
pmulhw mm0, mm4
paddw mm4, [delta_a]
punpckhbw mm1, mm2
pmulhw mm1, mm4
paddw mm4, [delta_a]
packuswb mm0, mm1
; -- source b --
movq mm3, [ebx]
movq mm1, mm3
punpcklbw mm3, mm2
pmulhw mm3, mm5
paddw mm5, [delta_b]
punpckhbw mm1, mm2
pmulhw mm1, mm5
paddw mm5, [delta_b]
packuswb mm3, mm1
paddb mm0, mm3
; -- source c --
movq mm3, [ecx]
movq mm1, mm3
punpcklbw mm3, mm2
pmulhw mm3, mm6
paddw mm6, [delta_c]
punpckhbw mm1, mm2
pmulhw mm1, mm6
paddw mm6, [delta_c]
packuswb mm3, mm1
paddb mm0, mm3
; -- source d --
movq mm3, [edx]
movq mm1, mm3
punpcklbw mm3, mm2
pmulhw mm3, mm7
paddw mm7, [delta_d]
punpckhbw mm1, mm2
pmulhw mm1, mm7
paddw mm7, [delta_d]
packuswb mm3, mm1
paddb mm0, mm3
; double result, to make up for alpha vals being signed (max = 127)
; so our math turns out a bit short, example:
; (0x7f00 * 0xff) >> 16 = 0x7e....* 2 = 252...not quite 255
; would have been (0xff00 * 0xff) >> 16 = 0xfe = 254,
; if I could do an unsigned pmulhw...
; pmulhuw is in an intel document I found, but doesn't compile....
paddb mm0, mm0
movq [edi], mm0
add eax, 8
add ebx, 8
add ecx, 8
add edx, 8
add edi, 8
dec esi
jnz near xloop4
; ---- end of row: step the row-start and row-end alphas for the next row ----
; The in-register alphas were advanced across the row, so the row-start
; values are reloaded from memory, stepped, and written back.
movq mm4, [alpha_a0]
paddw mm4, [ldelt_a]
movq [alpha_a0], mm4
movq mm5, [alpha_b0]
paddw mm5, [ldelt_b]
movq [alpha_b0], mm5
movq mm6, [alpha_c0]
paddw mm6, [ldelt_c]
movq [alpha_c0], mm6
movq mm7, [alpha_d0]
paddw mm7, [ldelt_d]
movq [alpha_d0], mm7
; alpha_d1 is only updated in memory; yloop4 reloads it itself
movq mm0, [alpha_d1]
paddw mm0, [rdelt_d]
movq [alpha_d1], mm0
movq mm2, [alpha_b1]
paddw mm2, [rdelt_b]
movq [alpha_b1], mm2
movq mm3, [alpha_c1]
paddw mm3, [rdelt_c]
movq [alpha_c1], mm3
movq mm0, [alpha_a1]
paddw mm0, [rdelt_a]
movq [alpha_a1], mm0
movd mm1, sq_shift ; top of loop expects this
dec dword [iy]
jnz near yloop4
emms
epilogue
ret
; void doSquare3(
; U32 *dst,
; int sq_shift,
; const int *aoff,
; const U32 *const *bmp_ptrs,
; const U8 *const *alpha_ptrs );
;
; Three-texture variant of the square blender: blends bmp_ptrs[0..2] into dst
; using alpha values from alpha_ptrs[0..2], interpolated across a square of
; (1 << sq_shift) rows. Only the first three entries of bmp_ptrs and
; alpha_ptrs are read. Source and destination pointers advance by 8 bytes
; (two U32 pixels) per step with no row stride added between rows.
; Uses the static ix/iy/alpha_*/delta_*/ldelt_*/rdelt_* storage and all eight
; MMX registers; executes emms before returning.
; NOTE(review): with sq_shift == 0, ix becomes 0 and the dec/jnz column loop
; would wrap around; callers presumably always pass sq_shift >= 1 - confirm.
export_fn doSquare3
prologue
; init iy
mov eax, 1
mov cl, sq_shift
shl eax, cl
mov dword [iy], eax
; init ix
shr eax, 1
mov dword [ix], eax
movd mm1, sq_shift
; get alpha values for the corners of the square for each texture type.
; replicate the values into 4 words of the qwords. Also calc vertical
; stepping values for the alpha values on left and right edges.
; load alpha value into bl and shift left by 7 (x128) for precision. then
; punpcklwd mm0, mm0 followed by punpckldq mm0, mm0
; to replicate the low word into all words of mm0.
; shift down difference by sqshift to divide by pixels per square to get
; increment.
mov esi, aoff
mov edi, alpha_ptrs
; ---- texture a: alpha map alpha_ptrs[0] ----
mov eax, [edi]
mov edx, eax ; edx keeps the alpha map base for the other corners
add eax, [esi] ; corner aoff[0]
xor ebx, ebx
mov bl, [eax]
shl ebx, 7
mov eax, edx
movd mm2, ebx
punpcklwd mm2, mm2
add eax, [esi+8] ; corner aoff[2]
punpckldq mm2, mm2
xor ebx, ebx
mov bl, [eax]
shl ebx, 7
mov eax, edx
movd mm0, ebx
punpcklwd mm0, mm0
punpckldq mm0, mm0
movq [alpha_a0], mm2
psubw mm0, mm2 ; corner[2] - corner[0]
add eax, [esi+4] ; corner aoff[1]
xor ebx, ebx
mov bl, [eax]
shl ebx, 7
mov eax, edx
movd mm4, ebx
punpcklwd mm4, mm4
add eax, [esi+12] ; corner aoff[3]
punpckldq mm4, mm4
xor ebx, ebx
mov bl, [eax]
shl ebx, 7
movd mm3, ebx
movq [alpha_a1], mm4
punpcklwd mm3, mm3
punpckldq mm3, mm3
psraw mm0, mm1
psubw mm3, mm4 ; corner[3] - corner[1]
movq [ldelt_a], mm0
psraw mm3, mm1
movq [rdelt_a], mm3
; ---- texture b: alpha map alpha_ptrs[1], same sequence as texture a ----
mov eax, [edi+4]
mov edx, eax
add eax, [esi]
xor ebx, ebx
mov bl, [eax]
shl ebx, 7
mov eax, edx
movd mm2, ebx
punpcklwd mm2, mm2
add eax, [esi+8]
punpckldq mm2, mm2
xor ebx, ebx
mov bl, [eax]
shl ebx, 7
mov eax, edx
movd mm0, ebx
punpcklwd mm0, mm0
punpckldq mm0, mm0
movq [alpha_b0], mm2
psubw mm0, mm2
add eax, [esi+4]
xor ebx, ebx
mov bl, [eax]
shl ebx, 7
mov eax, edx
movd mm4, ebx
punpcklwd mm4, mm4
add eax, [esi+12]
punpckldq mm4, mm4
xor ebx, ebx
mov bl, [eax]
shl ebx, 7
movd mm3, ebx
movq [alpha_b1], mm4
punpcklwd mm3, mm3
punpckldq mm3, mm3
psraw mm0, mm1
psubw mm3, mm4
movq [ldelt_b], mm0
psraw mm3, mm1
movq [rdelt_b], mm3
; ---- texture c: alpha map alpha_ptrs[2] ----
mov eax, [edi+8]
mov edx, eax
add eax, [esi]
xor ebx, ebx
mov bl, [eax]
shl ebx, 7
mov eax, edx
movd mm2, ebx
punpcklwd mm2, mm2
add eax, [esi+8]
punpckldq mm2, mm2
xor ebx, ebx
mov bl, [eax]
shl ebx, 7
mov eax, edx
movd mm0, ebx
punpcklwd mm0, mm0
punpckldq mm0, mm0
movq [alpha_c0], mm2
movq [alpha_c2], mm0 ; stored but not read anywhere in this file
psubw mm0, mm2
add eax, [esi+4]
xor ebx, ebx
mov bl, [eax]
shl ebx, 7
mov eax, edx
movd mm4, ebx
punpcklwd mm4, mm4
add eax, [esi+12]
punpckldq mm4, mm4
xor ebx, ebx
mov bl, [eax]
shl ebx, 7
movd mm3, ebx
movq [alpha_c1], mm4
punpcklwd mm3, mm3
punpckldq mm3, mm3
psraw mm0, mm1
psubw mm3, mm4
movq [ldelt_c], mm0
psraw mm3, mm1
movq [rdelt_c], mm3
; ---- set up loop registers ----
; eax/ebx/ecx = source bitmap pointers a/b/c, edi = destination
; mm4..mm6 = row-start alphas a..c; mm0/mm2/mm3 = row-end alphas a/b/c
mov esi, bmp_ptrs
mov eax, [esi]
mov ebx, [esi+4]
mov ecx, [esi+8]
movq mm0, [alpha_a1]
movq mm2, [alpha_b1]
movq mm3, [alpha_c1]
movq mm4, [alpha_a0]
movq mm5, [alpha_b0]
movq mm6, [alpha_c0]
mov edi, dst
yloop3:
; mm1 should be sq_shift at this point
; mm0 should be alpha_a1
; mm2 should be alpha_b1
; mm3 should be alpha_c1
; calculate alpha step increments...word-size steps are replicated
; to fill qword.
psubw mm0, mm4
psraw mm0, mm1 ;mm0 = (right-left) >> sq_shift
movq [delta_a], mm0 ;delta = ainc ainc ainc ainc
psubw mm2, mm5
psraw mm2, mm1 ;mm2 = (right-left) >> sq_shift
movq [delta_b], mm2 ;delta = binc binc binc binc
psubw mm3, mm6
psraw mm3, mm1 ;mm3 = (right-left) >> sq_shift
movq [delta_c], mm3 ;delta = cinc cinc cinc cinc
mov esi, [ix] ; column-step counter for this row
pxor mm2, mm2 ; mm2 = 0, used to zero-extend bytes to words
movq mm7, [delta_a] ; mm7 is free in this variant, so delta_a lives in a register
xloop3:
; Each pass handles two pixels from every source: unpack one pixel to words,
; scale by the current alpha (pmulhw keeps the high 16 bits), step the alpha,
; repeat for the second pixel, repack, and accumulate into mm0.
; mm1 and mm3 are scratch here (mm1 no longer holds sq_shift).
; -- source a --
movq mm0, [eax]
movq mm1, mm0
punpcklbw mm0, mm2
pmulhw mm0, mm4
paddw mm4, mm7
punpckhbw mm1, mm2
pmulhw mm1, mm4
paddw mm4, mm7
packuswb mm0, mm1
; -- source b --
movq mm3, [ebx]
movq mm1, mm3
punpcklbw mm3, mm2
pmulhw mm3, mm5
paddw mm5, [delta_b]
punpckhbw mm1, mm2
pmulhw mm1, mm5
paddw mm5, [delta_b]
packuswb mm3, mm1
paddb mm0, mm3
; -- source c --
movq mm3, [ecx]
movq mm1, mm3
punpcklbw mm3, mm2
pmulhw mm3, mm6
paddw mm6, [delta_c]
punpckhbw mm1, mm2
pmulhw mm1, mm6
paddw mm6, [delta_c]
packuswb mm3, mm1
paddb mm0, mm3
; double the sum: the alphas were scaled by 128 rather than 256
paddb mm0, mm0
movq [edi], mm0
add eax, 8
add ebx, 8
add ecx, 8
add edi, 8
dec esi
jnz near xloop3
; ---- end of row: step the row-start and row-end alphas for the next row ----
; The in-register alphas were advanced across the row, so the row-start
; values are reloaded from memory, stepped, and written back.
movq mm4, [alpha_a0]
paddw mm4, [ldelt_a]
movq [alpha_a0], mm4
movq mm5, [alpha_b0]
paddw mm5, [ldelt_b]
movq [alpha_b0], mm5
movq mm6, [alpha_c0]
paddw mm6, [ldelt_c]
movq [alpha_c0], mm6
movq mm2, [alpha_b1]
paddw mm2, [rdelt_b]
movq [alpha_b1], mm2
movq mm3, [alpha_c1]
paddw mm3, [rdelt_c]
movq [alpha_c1], mm3
movq mm0, [alpha_a1]
paddw mm0, [rdelt_a]
movq [alpha_a1], mm0
movd mm1, sq_shift ; top of loop expects this
dec dword [iy]
jnz near yloop3
emms
epilogue
ret
; void doSquare2(
; U32 *dst,
; int sq_shift,
; const int *aoff,
; const U32 *const *bmp_ptrs,
; const U8 *const *alpha_ptrs );
;
; Two-texture variant of the square blender: blends bmp_ptrs[0..1] into dst
; using alpha values from alpha_ptrs[0..1], interpolated across a square of
; (1 << sq_shift) rows. Only the first two entries of bmp_ptrs and alpha_ptrs
; are read. Source and destination pointers advance by 8 bytes (two U32
; pixels) per step with no row stride added between rows.
; Uses the static ix/iy/alpha_*/delta_*/ldelt_*/rdelt_* storage and all eight
; MMX registers; executes emms before returning.
; NOTE(review): with sq_shift == 0, ix becomes 0 and the dec/jnz column loop
; would wrap around; callers presumably always pass sq_shift >= 1 - confirm.
export_fn doSquare2
prologue
; init iy
mov eax, 1
mov cl, sq_shift
shl eax, cl
mov dword [iy], eax
; init ix
shr eax, 1
mov dword [ix], eax
movd mm1, sq_shift
; get alpha values for the corners of the square for each texture type.
; replicate the values into 4 words of the qwords. Also calc vertical
; stepping values for the alpha values on left and right edges.
; each alpha byte is loaded into bl and shifted left by 7 (x128), then
; punpcklwd mm0, mm0 followed by punpckldq mm0, mm0
; to replicate the low word into all words of mm0.
; shift down difference by sqshift to divide by pixels per square to get
; increment.
mov esi, aoff
mov edi, alpha_ptrs
; ---- texture a: alpha map alpha_ptrs[0] ----
mov eax, [edi]
mov edx, eax ; edx keeps the alpha map base for the other corners
add eax, [esi] ; corner aoff[0]
xor ebx, ebx
mov bl, [eax]
shl ebx, 7
mov eax, edx
movd mm2, ebx
punpcklwd mm2, mm2
add eax, [esi+8] ; corner aoff[2]
punpckldq mm2, mm2
xor ebx, ebx
mov bl, [eax]
shl ebx, 7
mov eax, edx
movd mm0, ebx
punpcklwd mm0, mm0
punpckldq mm0, mm0
movq [alpha_a0], mm2
psubw mm0, mm2 ; corner[2] - corner[0]
add eax, [esi+4] ; corner aoff[1]
xor ebx, ebx
mov bl, [eax]
shl ebx, 7
mov eax, edx
movd mm4, ebx
punpcklwd mm4, mm4
add eax, [esi+12] ; corner aoff[3]
punpckldq mm4, mm4
xor ebx, ebx
mov bl, [eax]
shl ebx, 7
movd mm3, ebx
movq [alpha_a1], mm4
punpcklwd mm3, mm3
punpckldq mm3, mm3
psraw mm0, mm1
psubw mm3, mm4 ; corner[3] - corner[1]
movq [ldelt_a], mm0
psraw mm3, mm1
movq [rdelt_a], mm3
; ---- texture b: alpha map alpha_ptrs[1], same sequence as texture a ----
mov eax, [edi+4]
mov edx, eax
add eax, [esi]
xor ebx, ebx
mov bl, [eax]
shl ebx, 7
mov eax, edx
movd mm2, ebx
punpcklwd mm2, mm2
add eax, [esi+8]
punpckldq mm2, mm2
xor ebx, ebx
mov bl, [eax]
shl ebx, 7
mov eax, edx
movd mm0, ebx
punpcklwd mm0, mm0
punpckldq mm0, mm0
movq [alpha_b0], mm2
psubw mm0, mm2
add eax, [esi+4]
xor ebx, ebx
mov bl, [eax]
shl ebx, 7
mov eax, edx
movd mm4, ebx
punpcklwd mm4, mm4
add eax, [esi+12]
punpckldq mm4, mm4
xor ebx, ebx
mov bl, [eax]
shl ebx, 7
movd mm3, ebx
movq [alpha_b1], mm4
punpcklwd mm3, mm3
punpckldq mm3, mm3
psraw mm0, mm1
psubw mm3, mm4
movq [ldelt_b], mm0
psraw mm3, mm1
movq [rdelt_b], mm3
; ---- set up loop registers ----
; eax/ebx = source bitmap pointers a/b, edi = destination
; mm4/mm5 = row-start alphas a/b; mm0/mm2 = row-end alphas a/b
mov esi, bmp_ptrs
mov eax, [esi]
mov ebx, [esi+4]
movq mm0, [alpha_a1]
movq mm2, [alpha_b1]
movq mm4, [alpha_a0]
movq mm5, [alpha_b0]
mov edi, dst
yloop2:
; mm1 should be sq_shift at this point
; mm0 should be alpha_a1
; mm2 should be alpha_b1
; calculate alpha step increments...word-size steps are replicated
; to fill qword.
psubw mm0, mm4
psraw mm0, mm1 ;mm0 = (right-left) >> sq_shift
movq [delta_a], mm0 ;delta = ainc ainc ainc ainc
psubw mm2, mm5
psraw mm2, mm1 ;mm2 = (right-left) >> sq_shift
movq [delta_b], mm2 ;delta = binc binc binc binc
mov esi, [ix] ; column-step counter for this row
pxor mm2, mm2 ; mm2 = 0, used to zero-extend bytes to words
; both per-pixel steps fit in registers in this variant
movq mm6, [delta_a]
movq mm7, [delta_b]
xloop2:
; Each pass handles two pixels from both sources: unpack one pixel to words,
; scale by the current alpha (pmulhw keeps the high 16 bits), step the alpha,
; repeat for the second pixel, repack, and add the two results.
; mm1 is scratch here (it no longer holds sq_shift).
movq mm0, [eax]
movq mm3, [ebx]
; -- source a --
movq mm1, mm0
punpcklbw mm0, mm2
pmulhw mm0, mm4
paddw mm4, mm6
punpckhbw mm1, mm2
pmulhw mm1, mm4
paddw mm4, mm6
packuswb mm0, mm1
; -- source b --
movq mm1, mm3
punpcklbw mm3, mm2
pmulhw mm3, mm5
paddw mm5, mm7
punpckhbw mm1, mm2
pmulhw mm1, mm5
paddw mm5, mm7
packuswb mm3, mm1
paddb mm0, mm3
; double the sum: the alphas were scaled by 128 rather than 256
paddb mm0, mm0
movq [edi], mm0
add edi, 8
add eax, 8
add ebx, 8
dec esi
jnz xloop2
; ---- end of row: step the row-start and row-end alphas for the next row ----
; The in-register alphas were advanced across the row, so the row-start
; values are reloaded from memory, stepped, and written back.
movq mm4, [alpha_a0]
paddw mm4, [ldelt_a]
movq [alpha_a0], mm4
movq mm5, [alpha_b0]
paddw mm5, [ldelt_b]
movq [alpha_b0], mm5
movq mm2, [alpha_b1]
paddw mm2, [rdelt_b]
movq [alpha_b1], mm2
movq mm0, [alpha_a1]
paddw mm0, [rdelt_a]
movq [alpha_a1], mm0
movd mm1, sq_shift ; top of loop expects this
dec dword [iy]
jnz near yloop2
emms
epilogue
ret
; params for doLumelPlus1Mip
%define dstmip0 [ebp+8]
%define dstmip1 [ebp+12]
%define srcptr [ebp+16]
; void doLumelPlus1Mip( U16 *dstmip0, U16 *dstmip1, const U32 *srcptr );
;
; Lights a block of 32-bit source pixels with light values interpolated
; between the four entries of the external lumels array, converts the result
; to 16-bit pixels in dstmip0, and writes one combined 16-bit pixel per 2x2
; source block to dstmip1.
; Each pass of the inner loop consumes a 2x2 block of source pixels; both
; loops run sTargetTexelsPerLumelDiv2 times.
; Reads the globals sTargetTexelsPerLumel_log2, sTargetTexelsPerLumelDiv2,
; nextsrcrow, minus1srcrowsPlus8, srcrows_x2_MinusTPL, mip0_dstrowadd and
; mip1_dstrowadd, which must be set before the call.
; Uses the static leftq/rightq/ldeltq/rdeltq/ldelt_x2/rdelt_x2/delta2/delta3
; storage and all eight MMX registers; executes emms before returning.
export_fn doLumelPlus1Mip
prologue
movd mm7, [sTargetTexelsPerLumel_log2] ; shift count for the per-row steps
; ---- lumels[0]: spread the three masked bit fields into separate words ----
movd mm0, [lumels]
movq mm4, mm0
pand mm0, [redLightMask]
movq mm5, mm4
pand mm4, [greenLightMask]
psllq mm0, 31
pand mm5, [blueLightMask]
psllq mm4, 20
paddw mm0, mm4
psllq mm5, 9
paddw mm0, mm5 ; mm0 = 0000rrrrggggbbbb qword for lp[0]
movq [leftq], mm0
; ---- lumels[2]: per-row step for the left edge ----
movd mm1, [lumels+8] ; get lp2
movq mm4, mm1
pand mm1, [redLightMask]
movq mm5, mm4
pand mm4, [greenLightMask]
psllq mm1, 31
pand mm5, [blueLightMask]
psllq mm4, 20
paddw mm1, mm4
psllq mm5, 9
paddw mm1, mm5 ; mm1 = 0000rrrrggggbbbb qword for lp[2]
psubw mm1, mm0
psraw mm1, mm7 ; (lp[2] - lp[0]) >> sTargetTexelsPerLumel_log2
movq [ldeltq], mm1
psllw mm1, 1 ; doubled: one outer-loop pass covers two rows
movq [ldelt_x2], mm1
; ---- lumels[1]: right edge start value ----
movd mm2, [lumels+4] ; get lp[1]
movq mm4, mm2
pand mm2, [redLightMask]
movq mm5, mm4
pand mm4, [greenLightMask]
psllq mm2, 31
pand mm5, [blueLightMask]
psllq mm4, 20
paddw mm2, mm4
psllq mm5, 9
paddw mm2, mm5 ; mm2 = 0000rrrrggggbbbb qword for lp[1]
movq [rightq], mm2
; ---- lumels[3]: per-row step for the right edge ----
movd mm3, [lumels+12] ; get lp3
movq mm4, mm3
pand mm3, [redLightMask]
movq mm5, mm4
pand mm4, [greenLightMask]
psllq mm3, 31
pand mm5, [blueLightMask]
psllq mm4, 20
paddw mm3, mm4
psllq mm5, 9
paddw mm3, mm5 ; mm3 = 0000rrrrggggbbbb qword for lp[3]
psubw mm3, mm2
psraw mm3, mm7 ; (lp[3] - lp[1]) >> sTargetTexelsPerLumel_log2
movq [rdeltq], mm3
psllw mm3, 1 ; doubled: one outer-loop pass covers two rows
movq [rdelt_x2], mm3
; ---- loop setup: edi = mip0 output, esi = source, edx = mip1 output ----
mov edi, dstmip0
mov esi, srcptr
mov edx, dstmip1
pxor mm6, mm6 ; value is overwritten at the top of yloop_dlpm before being read
mov ecx, [sTargetTexelsPerLumelDiv2] ; yloop count
movq mm2, [leftq]
movq mm3, [rightq]
; mm2 is left, mm3 is right
yloop_dlpm:
; Per row pair: mm1 = light at the start of the first row (left), mm6 = light
; at the start of the second row (left + ldelt); delta2/delta3 are the
; per-pixel light steps along the first and second row.
movd mm7, [sTargetTexelsPerLumel_log2]
movq mm6, mm2
movq mm1, mm2 ; mm1 is light1
movq mm5, mm3
movq mm4, mm3
paddw mm5, [rdeltq] ; right + rdelt
psubw mm4, mm6 ; right - left
paddw mm6, [ldeltq] ; left + ldelt
psraw mm4, mm7
movq [delta2], mm4
psubw mm5, mm6 ; mm6 is light2
psraw mm5, mm7
movq [delta3], mm5
mov ebx, [sTargetTexelsPerLumelDiv2] ; loop count
; do 4 source pixels per loop
; mm1 is light1
; mm6 is light2
pxor mm7, mm7 ; value is overwritten by the second-row load before being read
xloop_dlpm:
; get first of source, col 0 and 1
movq mm4, [esi]
add esi, [nextsrcrow] ; move to the second row of the pair
movq mm5, mm4
punpcklbw mm4, [zero]
pmulhw mm4, mm1 ; mm1 is light factor for first row
paddw mm1, [delta2]
punpckhbw mm5, [zero]
pmulhw mm5, mm1
paddw mm1, [delta2]
movq mm7, [esi] ; second row, cols 0 and 1 (processed below)
add esi, [minus1srcrowsPlus8] ; back to the first row, next two columns
movq mm0, mm4
paddw mm0, mm5 ; mm0 is the avg, for mip1[0,1]
packuswb mm4, mm5 ; put both pixels in same qword
paddw mm4, mm4 ; double it, because lighting mul halved it
paddusb mm4, mm4
movq mm5, mm4 ; save the original data
pand mm4, [redblue] ; mask out all but the 5MSBits of red and blue
pmaddwd mm4, [mulfact] ; multiply each word by
; 2^13, 2^3, 2^13, 2^3 and add results
pand mm5, [green] ; mask out all but the 5MSBits of green
por mm4, mm5 ; combine the red, green, and blue bits
psrld mm4, 6 ; shift into position
packssdw mm4, [zero] ; pack into single dword
pslld mm4, 1 ; shift into final position
por mm4, [alpha] ; add the alpha bit
; write 2 pixels to mip0
movd [edi], mm4
; get second row, cols 0 and 1
movq mm5, mm7
pxor mm4, mm4 ; mm4 = 0 for the unpack and pack below
punpcklbw mm7, mm4
pmulhw mm7, mm6 ; mm6 is light factor for 2nd row
paddw mm6, [delta3]
punpckhbw mm5, mm4
pmulhw mm5, mm6
paddw mm6, [delta3]
; accumulate the second-row pixels onto the first-row sum for the mip1 pixel
paddw mm0, mm7
paddw mm0, mm5
psrlw mm0, 1 ; mm0 is mip1[0,1] average
paddusb mm0, mm0
packuswb mm7, mm5 ; put both pixels in same qword
paddw mm7, mm7 ; double it, because lighting mul halved it
paddusb mm7, mm7
movq mm5, mm7 ; save the original data
pand mm7, [redblue] ; mask out all but the 5MSBits of red and blue
pmaddwd mm7, [mulfact] ; multiply each word by
; 2^13, 2^3, 2^13, 2^3 and add results
pand mm5, [green] ; mask out all but the 5MSBits of green
por mm7, mm5 ; combine the red, green, and blue bits
psrld mm7, 6 ; shift into position
packssdw mm7, mm4 ; pack into single dword
pslld mm7, 1 ; shift into final position
por mm7, [alpha] ; add the alpha bit
; write 2 16-bit pixels to mip0, 2nd row
; NOTE(review): 0x100 is a hard-coded byte offset to the second mip0 row,
; while the exported nextdstrow variable is never read in this file - confirm
; that the fixed offset is intended.
movd [edi+0x100], mm7
; assemble the single mip1 pixel from the three channel words of mm0
movq mm5, mm0
movq mm4, mm0
pand mm4, [mask_f8] ; red
psrlq mm5, 13
pand mm5, [mask_7c0] ; green
psllq mm4, 8
pand mm0, [mask_f800000000] ; blue
paddw mm5, mm4
psrlq mm0, 34
paddw mm0, mm5
; write 1 pixels to mip1
movd eax, mm0
mov [edx], ax
; increment ptrs
add edx, 2 ; mip1
add edi, 4 ; mip0
dec ebx
jnz near xloop_dlpm
; ---- end of row pair: apply the row adjustments and step both edge lights ----
add esi, [srcrows_x2_MinusTPL]
add edx, [mip1_dstrowadd]
add edi, [mip0_dstrowadd]
paddw mm2, [ldelt_x2] ; mm2 is left
paddw mm3, [rdelt_x2] ; mm3 is right
dec ecx
jnz near yloop_dlpm
emms
epilogue
ret
; params for do1x1Lumel
%define dstptr [ebp+8]
%define srcptr [ebp+12]
; void do1x1Lumel( U16 *dstptr, const U32 *srcptr );
;
; Lights a single 32-bit source pixel with lumels[0] and stores it as one
; 16-bit pixel at dstptr.
; dstptr - receives one 16-bit pixel
; srcptr - points at one 32-bit source pixel
; Output layout: source byte 0 -> bits 11-15, byte 1 -> bits 6-10,
; byte 2 -> bits 1-5, bit 0 = 0.
; Uses mm0 and mm4-mm7; executes emms before returning.
export_fn do1x1Lumel
prologue
; spread the three masked bit fields of lumels[0] into separate words
movd mm0, [lumels]
movq mm4, mm0
pand mm0, [redLightMask]
movq mm5, mm4
pand mm4, [greenLightMask]
psllq mm0, 31
pand mm5, [blueLightMask]
psllq mm4, 20
paddw mm0, mm4
psllq mm5, 9
paddw mm0, mm5 ; mm0 = 0000rrrrggggbbbb qword for lp[0]
mov edi, dstptr
mov esi, srcptr
pxor mm6, mm6
movd mm4, [esi]
punpcklbw mm4, mm6 ; mm6 is expected to be 0 here
pmulhw mm4, mm0 ; scale each channel word by its light factor
paddw mm4, mm4 ; double it, because lighting mul halved it
paddusb mm4, mm4
; assemble the 16-bit pixel from the top five bits of each channel word
movq mm7, mm4
movq mm6, mm4
; Keep only bits 3-7 of the third channel word before shifting it down.
; Without this mask, bit 2 of that channel lands in bit 0 of the output
; pixel; doLumelPlus1Mip applies the same mask on its mip1 path.
pand mm4, [mask_f800000000]
psrlq mm4, 34
pand mm7, [mask_f8]
psrlq mm6, 13
psllq mm7, 8
pand mm6, [mask_7c0]
paddw mm4, mm7
paddw mm4, mm6
movd eax, mm4
mov [edi],ax
emms
epilogue
ret
; params for cheatmips
%define srcptr [ebp+8]
%define dstmip0 [ebp+12]
%define dstmip1 [ebp+16]
%define wid [ebp+20]
; void cheatmips( U16 *srcptr, U16 *dstmip0, U16 *dstmip1, int wid );
;
; Builds two reduced copies of a 16-bit image by dropping pixels (no
; averaging): dstmip0 receives every second pixel of every second source row,
; and dstmip1 receives every second pixel of every second dstmip0 row.
; srcptr  - source pixels, wid pixels (2*wid bytes) per row
; dstmip0 - first reduced output, written 16 bytes per inner step
; dstmip1 - second reduced output, written 8 bytes per inner step on the
;           rows where the row counter is even
; wid     - source width in pixels; wid/2 source row pairs are processed
; Bit 0 of every output pixel is cleared by the shift-right/shift-left pair.
; The stack copy of wid is doubled in place to serve as the row skip in bytes.
; When wid < 16 there is not a single 16-pixel block per row; the routine
; then returns without writing anything (the block counter would otherwise
; start at zero and wrap around).
export_fn cheatmips
prologue
mov esi, srcptr
mov edi, dstmip0
mov edx, dstmip1
mov ecx, wid
shr ecx, 1 ; ecx = number of source row pairs
mov eax, ecx
shr eax, 3 ; eax = 16-pixel (32-byte) blocks per source row
jz near done_cm ; nothing to do for wid < 16 (ZF set by the shr above)
shl dword wid, 1 ; wid now holds the source row size in bytes
movq mm6, [mask_0000ffff0000ffff]
movq mm7, [mask_00007fff00007fff]
yloop_cm:
mov ebx, eax
xloop_cm:
; Keep the low word of every dword (every second pixel). The values are
; shifted right one bit first so packssdw's signed saturation cannot clip
; them, then shifted back left after packing.
movq mm0, [esi]
movq mm1, [esi+8]
movq mm2, [esi+16]
movq mm3, [esi+24]
pand mm0, mm6
psrlw mm1, 1
pand mm1, mm7
psrlw mm0, 1
pand mm2, mm6
psrlw mm3, 1
pand mm3, mm7
psrlw mm2, 1
packssdw mm0, mm1
packssdw mm2, mm3
psllw mm0, 1 ;mip1, qw 0
movq [edi], mm0
psllw mm2, 1 ;mip1, qw 1
movq [edi+8], mm2
; the second reduced image only takes rows where the row counter is even
test ecx, 1
jnz nomip2
; repeat the same every-second-pixel reduction on the 8 pixels just produced
movq mm1, mm0
movq mm3, mm2
pand mm1, mm6
psrlw mm3, 1
pand mm3, mm7
psrlw mm1, 1
packssdw mm1, mm3
psllw mm1, 1 ;mip2, qw 0
movq [edx], mm1
add edx, 8
nomip2:
add esi, 32
add edi, 16
dec ebx
jnz xloop_cm
add esi, wid ; skip the odd source row
dec ecx
jnz near yloop_cm
done_cm:
emms
epilogue
ret
; params for cheatmips4x4
%define srcptr [ebp+8]
%define dstmip0 [ebp+12]
%define dstmip1 [ebp+16]
; void cheatmips4x4( U16 *srcptr, U16 *dstmip0, U16 *dstmip1 );
;
; Fixed-size pixel-dropping reduction: reads 8 bytes at srcptr and 8 bytes at
; srcptr+16, keeps the low word of every dword, writes the resulting four
; 16-bit pixels to dstmip0 and the first of them to dstmip1. Bit 0 of every
; output pixel is cleared by the shift-right/shift-left pair.
export_fn cheatmips4x4
prologue
mov esi, srcptr
movq mm0, [esi]
movq mm1, [esi+16]
; first source qword: keep the low word of each dword, pre-shifted right
pand mm0, [mask_0000ffff0000ffff]
psrlw mm0, 1
; second source qword: same selection, shift applied before the mask
psrlw mm1, 1
pand mm1, [mask_00007fff00007fff]
; the pre-shift keeps every value below 0x8000 so the signed pack is exact
packssdw mm0, mm1
psllw mm0, 1 ; mip1, qw 0
mov edi, dstmip0
movq [edi], mm0
; the first packed pixel doubles as the single pixel of the next level
movd eax, mm0
mov edx, dstmip1
mov [edx], ax
emms
epilogue
ret