;-----------------------------------------------------------------------------
; Torque Game Engine
; Copyright (C) GarageGames.com, Inc.
;-----------------------------------------------------------------------------

;-----------------------------------------------------------------------------
; Symbol helpers.
; When LINUX is defined the C-visible names are emitted undecorated (ELF);
; in every other configuration they get a leading underscore.
;-----------------------------------------------------------------------------

; syntax: export_fn name
; Declares `name` global and places its entry label at the current position.
%macro export_fn 1
%ifndef LINUX
        global  _%1
_%1:
%else
        ; No underscore needed for ELF object files
        global  %1
%1:
%endif
%endmacro

; syntax: global_var name
; Exports a variable that this file defines. In the underscore configuration
; the bare name is aliased to the decorated one, so the rest of the file can
; keep using the undecorated spelling.
%macro global_var 1
%ifndef LINUX
        global  _%1
        %define %1 _%1
%else
        global  %1
%endif
%endmacro

; syntax: extern_var name
; Imports a variable defined elsewhere, with the same aliasing as global_var.
%macro extern_var 1
%ifndef LINUX
        extern  _%1
        %define %1 _%1
%else
        extern  %1
%endif
%endmacro

; push registers
; Sets up ebp as the frame pointer (first stack argument is at [ebp+8]) and
; saves ebx, esi and edi. Must be paired with `epilogue`.
%macro prologue 0
        push    ebp
        mov     ebp, esp                ; set up ebp for parameter access
        ; pushad
        push    ebx
        push    esi
        push    edi
%endmacro

; pop registers
; Restores what `prologue` saved, in reverse order.
%macro epilogue 0
        pop     edi
        pop     esi
        pop     ebx
        ; popad
        pop     ebp
%endmacro

segment .data

; Loop counters used by the doSquareN routines.
ix:             dd 0
iy:             dd 0

; Every entry below is two dwords == 64 bits == sizeof(QWORD) in blender.cc,
; so each one can be moved to or from an MMX register with a single movq.

; per-pixel alpha steps along a row (one per texture layer a..d)
delta_a:        dd 0, 0
delta_b:        dd 0, 0
delta_c:        dd 0, 0
delta_d:        dd 0, 0

; current left-edge alpha per layer
alpha_a0:       dd 0, 0
alpha_b0:       dd 0, 0
alpha_c0:       dd 0, 0
alpha_d0:       dd 0, 0

; current right-edge alpha per layer
alpha_a1:       dd 0, 0
alpha_b1:       dd 0, 0
alpha_c1:       dd 0, 0
alpha_d1:       dd 0, 0

; third-corner alpha, stored for layers c and d only
alpha_c2:       dd 0, 0
alpha_d2:       dd 0, 0

; per-row alpha steps down the left edge
ldelt_a:        dd 0, 0
ldelt_b:        dd 0, 0
ldelt_c:        dd 0, 0
ldelt_d:        dd 0, 0

; per-row alpha steps down the right edge
rdelt_a:        dd 0, 0
rdelt_b:        dd 0, 0
rdelt_c:        dd 0, 0
rdelt_d:        dd 0, 0

; all-zero qword, used as the unpack/pack partner operand
zero:           dd 0, 0

; channel masks applied to a 16-bit lumel value
redLightMask:   dw 0xf800, 0, 0, 0
greenLightMask: dw 0x07c0, 0, 0, 0
blueLightMask:  dw 0x003e, 0, 0, 0

; lighting interpolation state for doLumelPlus1Mip
delta2:         dd 0, 0
delta3:         dd 0, 0
rdelt_x2:       dd 0, 0
ldelt_x2:       dd 0, 0
leftq:          dd 0, 0
rightq:         dd 0, 0
rdeltq:         dd 0, 0
ldeltq:         dd 0, 0

; constants for the 32-bit -> 16-bit pixel conversion
mulfact:        dw 0x2000, 0x0008, 0x2000, 0x0008      ; 2^13, 2^3, 2^13, 2^3
redblue:        dw 0x00f8, 0x00f8, 0x00f8, 0x00f8
green:          dw 0xf800, 0, 0xf800, 0
alpha:          dw 0x0001, 0x0001, 0, 0

mask_7c0:       dw 0x07c0, 0, 0, 0
mask_f8:        dw 0x00f8, 0, 0, 0
mask_f800000000: dw 0, 0, 0x00f8, 0

; masks used by the cheatmips routines
mask_0000ffff0000ffff: dw 0xffff, 0, 0xffff, 0
mask_00007fff00007fff: dw 0x7fff, 0, 0x7fff, 0

; externs for global variables declared in blender.cc
extern_var lumels

; declare global variables
global_var sTargetTexelsPerLumel_log2
global_var sTargetTexelsPerLumel
global_var sTargetTexelsPerLumelDiv2
global_var nextsrcrow
global_var nextdstrow
global_var mip0_dstrowadd
global_var mip1_dstrowadd
global_var minus1srcrowsPlus8
global_var srcrows_x2_MinusTPL

; define global variables
sTargetTexelsPerLumel_log2      dd 0
sTargetTexelsPerLumel           dd 0
sTargetTexelsPerLumelDiv2       dd 0
nextsrcrow                      dd 0
nextdstrow                      dd 0
mip0_dstrowadd                  dd 0
mip1_dstrowadd                  dd 0
minus1srcrowsPlus8              dd 0
srcrows_x2_MinusTPL             dd 0

segment .text

;-----------------------------------------------------------------------------
; Conventions shared by every routine below
;   * 32-bit code; arguments are read from the stack through ebp
;     (first argument at [ebp+8]); the routines end in a plain `ret`.
;   * `prologue` / `epilogue` save and restore ebp, ebx, esi, edi.
;     eax, ecx, edx and mm0-mm7 are overwritten.
;   * Every routine executes `emms` before returning.
;   * Working state lives in the static .data variables of this file, so the
;     routines are not reentrant.
;   * A loop whose trip count would be zero is skipped entirely; the
;     dec/jnz loops would otherwise wrap and run ~2^32 iterations.
;-----------------------------------------------------------------------------

;-----------------------------------------------------------------------------
; Private helper macros. They expand to straight-line code (no labels) and
; keep the instruction order of the hand-written sequences they replace.
;-----------------------------------------------------------------------------

; load_corner_alphas map_ptr, left, right, left_step, right_step [, third]
;
; In:    %1  = memory operand holding the base pointer of one alpha map
;        esi = aoff (four byte offsets into the map, one per square corner)
;        mm1 = sq_shift
; Out:   [%2] = alpha at aoff[0]  (left edge start),  replicated to 4 words
;        [%3] = alpha at aoff[1]  (right edge start), replicated to 4 words
;        [%4] = (alpha at aoff[2] - alpha at aoff[0]) >> sq_shift   per-row step
;        [%5] = (alpha at aoff[3] - alpha at aoff[1]) >> sq_shift   per-row step
;        [%6] = alpha at aoff[2], replicated (only when a 6th argument is given)
; Clobb: eax, ebx, edx, mm0, mm2, mm3, mm4, flags
;
; Each alpha byte is shifted left by 7 (x128), so the largest value is
; 0x7f80 and stays positive when treated as a signed word by pmulhw/psraw.
; punpcklwd r,r followed by punpckldq r,r replicates the low word of r into
; all four words.
%macro load_corner_alphas 5-6
        mov     eax, %1
        mov     edx, eax                ; edx = map base, kept for the reloads
        add     eax, [esi]
        xor     ebx, ebx
        mov     bl, [eax]               ; alpha at aoff[0]
        shl     ebx, 7
        mov     eax, edx
        movd    mm2, ebx
        punpcklwd mm2, mm2
        add     eax, [esi+8]
        punpckldq mm2, mm2
        xor     ebx, ebx
        mov     bl, [eax]               ; alpha at aoff[2]
        shl     ebx, 7
        mov     eax, edx
        movd    mm0, ebx
        punpcklwd mm0, mm0
        punpckldq mm0, mm0
        movq    [%2], mm2
%if %0 == 6
        movq    [%6], mm0
%endif
        psubw   mm0, mm2
        add     eax, [esi+4]
        xor     ebx, ebx
        mov     bl, [eax]               ; alpha at aoff[1]
        shl     ebx, 7
        mov     eax, edx
        movd    mm4, ebx
        punpcklwd mm4, mm4
        add     eax, [esi+12]
        punpckldq mm4, mm4
        xor     ebx, ebx
        mov     bl, [eax]               ; alpha at aoff[3]
        shl     ebx, 7
        movd    mm3, ebx
        movq    [%3], mm4
        punpcklwd mm3, mm3
        punpckldq mm3, mm3
        psraw   mm0, mm1
        psubw   mm3, mm4
        movq    [%4], mm0
        psraw   mm3, mm1
        movq    [%5], mm3
%endmacro

; blend_add_layer src_reg, alpha_reg, alpha_step
;
; Reads two 32-bit pixels from [%1], scales the first by alpha %2 and the
; second by %2 + %3, and adds the result into mm0. %2 ends up advanced by
; two steps. Requires mm2 == 0. Clobbers mm1, mm3.
%macro blend_add_layer 3
        movq    mm3, [%1]
        movq    mm1, mm3
        punpcklbw mm3, mm2
        pmulhw  mm3, %2
        paddw   %2, %3
        punpckhbw mm1, mm2
        pmulhw  mm1, %2
        paddw   %2, %3
        packuswb mm3, mm1
        paddb   mm0, mm3
%endmacro

; step_alpha reg, value, step
;
; [%2] += [%3] (four packed words); the new value is left in register %1.
%macro step_alpha 3
        movq    %1, [%2]
        paddw   %1, [%3]
        movq    [%2], %1
%endmacro

; expand_lumel dst_reg, src
;
; Loads a dword from %2 and spreads the three 5-bit fields selected by
; redLightMask / greenLightMask / blueLightMask of its low 16 bits into
; separate words of %1, giving the 0000rrrrggggbbbb layout used as the
; pmulhw lighting factor. Clobbers mm4, mm5.
%macro expand_lumel 2
        movd    %1, %2
        movq    mm4, %1
        pand    %1, [redLightMask]
        movq    mm5, mm4
        pand    mm4, [greenLightMask]
        psllq   %1, 31
        pand    mm5, [blueLightMask]
        psllq   mm4, 20
        paddw   %1, mm4
        psllq   mm5, 9
        paddw   %1, mm5
%endmacro

; parameter accessors for all of the doSquareX functions
%define dst             [ebp+8]
%define sq_shift        [ebp+12]
%define aoff            [ebp+16]
%define bmp_ptrs        [ebp+20]
%define alpha_ptrs      [ebp+24]

;-----------------------------------------------------------------------------
; void doSquare4(
;        U32 *dst,
;        int sq_shift,
;        const int *aoff,
;        const U32 *const *bmp_ptrs,
;        const U8 *const *alpha_ptrs );
;
; Blends four source bitmaps into dst. The square is (1 << sq_shift) pixels
; on a side; dst and the four bitmaps are walked contiguously, two pixels
; (8 bytes) per inner iteration. Each layer's weight is interpolated from the
; four alpha-map samples addressed by aoff[0..3]: linearly down the left and
; right edges, then linearly across every row.
;
; Nothing is written when (1 << sq_shift) >> 1 is zero.
;-----------------------------------------------------------------------------
export_fn doSquare4
        prologue

        ; init iy  (row count = 1 << sq_shift)
        mov     eax, 1
        mov     cl, sq_shift
        shl     eax, cl
        mov     dword [iy], eax

        ; init ix  (pixel pairs per row)
        shr     eax, 1
        mov     dword [ix], eax
        test    eax, eax
        jz      near .done              ; zero-width row: nothing to blend

        movd    mm1, sq_shift

        ; get alpha values for the corners of the square for each texture
        ; type, replicated into all 4 words of a qword. Also calc vertical
        ; stepping values for the alpha values on left and right edges
        ; (difference shifted down by sq_shift = divided by rows per square).
        mov     esi, aoff
        mov     edi, alpha_ptrs

        load_corner_alphas [edi],    alpha_a0, alpha_a1, ldelt_a, rdelt_a
        load_corner_alphas [edi+4],  alpha_b0, alpha_b1, ldelt_b, rdelt_b
        load_corner_alphas [edi+8],  alpha_c0, alpha_c1, ldelt_c, rdelt_c, alpha_c2
        load_corner_alphas [edi+12], alpha_d0, alpha_d1, ldelt_d, rdelt_d, alpha_d2

        ; eax/ebx/ecx/edx = source bitmap cursors for layers a/b/c/d
        mov     esi, bmp_ptrs
        mov     eax, [esi]
        mov     ebx, [esi+4]
        mov     ecx, [esi+8]
        mov     edx, [esi+12]

        movq    mm0, [alpha_a1]
        movq    mm2, [alpha_b1]
        movq    mm3, [alpha_c1]
        movq    mm4, [alpha_a0]
        movq    mm5, [alpha_b0]
        movq    mm6, [alpha_c0]
        movq    mm7, [alpha_d0]

        mov     edi, dst

.yloop:
        ; On entry: mm1 = sq_shift
        ;           mm0 / mm2 / mm3 = right-edge alpha of layers a / b / c
        ;           mm4..mm7        = left-edge alpha of layers a..d
        ; calculate alpha step increments...word-size steps are replicated
        ; to fill qword.
        psubw   mm0, mm4
        psraw   mm0, mm1                ; mm0 = (right-left) >> sq_shift
        movq    [delta_a], mm0          ; delta = ainc ainc ainc ainc

        psubw   mm2, mm5
        psraw   mm2, mm1                ; mm2 = (right-left) >> sq_shift
        movq    [delta_b], mm2

        psubw   mm3, mm6
        psraw   mm3, mm1                ; mm3 = (right-left) >> sq_shift
        movq    [delta_c], mm3

        movq    mm0, [alpha_d1]
        psubw   mm0, mm7
        psraw   mm0, mm1                ; mm0 = (right-left) >> sq_shift
        movq    [delta_d], mm0

        mov     esi, [ix]               ; esi = pixel pairs left in this row
        pxor    mm2, mm2                ; zero partner for byte->word unpack

.xloop:
        ; layer a starts the accumulator in mm0 (mm1 is scratch from here on)
        movq    mm0, [eax]
        movq    mm1, mm0
        punpcklbw mm0, mm2
        pmulhw  mm0, mm4
        paddw   mm4, [delta_a]
        punpckhbw mm1, mm2
        pmulhw  mm1, mm4
        paddw   mm4, [delta_a]
        packuswb mm0, mm1

        blend_add_layer ebx, mm5, [delta_b]
        blend_add_layer ecx, mm6, [delta_c]
        blend_add_layer edx, mm7, [delta_d]

        ; double result, to make up for alpha vals being signed (max = 127)
        ; so our math turns out a bit short, example:
        ; (0x7f00 * 0xff) >> 16 = 0x7e....* 2 = 252...not quite 255
        ; would have been (0xff00 * 0xff) >> 16 = 0xfe = 254,
        ; if I could do an unsigned pmulhw...
        ; pmulhuw is in an intel document I found, but doesn't compile....
        paddb   mm0, mm0
        movq    [edi], mm0

        add     eax, 8
        add     ebx, 8
        add     ecx, 8
        add     edx, 8
        add     edi, 8

        dec     esi
        jnz     near .xloop

        ; step every edge alpha one row down and reload the registers the
        ; top of the loop expects
        step_alpha mm4, alpha_a0, ldelt_a
        step_alpha mm5, alpha_b0, ldelt_b
        step_alpha mm6, alpha_c0, ldelt_c
        step_alpha mm7, alpha_d0, ldelt_d
        step_alpha mm0, alpha_d1, rdelt_d       ; mm0 is reloaded below
        step_alpha mm2, alpha_b1, rdelt_b
        step_alpha mm3, alpha_c1, rdelt_c
        step_alpha mm0, alpha_a1, rdelt_a

        movd    mm1, sq_shift           ; top of loop expects this

        dec     dword [iy]
        jnz     near .yloop

.done:
        emms
        epilogue
        ret

;-----------------------------------------------------------------------------
; void doSquare3(
;        U32 *dst,
;        int sq_shift,
;        const int *aoff,
;        const U32 *const *bmp_ptrs,
;        const U8 *const *alpha_ptrs );
;
; Three-layer variant of doSquare4: same traversal and interpolation, using
; bmp_ptrs[0..2] and alpha_ptrs[0..2] only.
;
; Nothing is written when (1 << sq_shift) >> 1 is zero.
;-----------------------------------------------------------------------------
export_fn doSquare3
        prologue

        ; init iy  (row count = 1 << sq_shift)
        mov     eax, 1
        mov     cl, sq_shift
        shl     eax, cl
        mov     dword [iy], eax

        ; init ix  (pixel pairs per row)
        shr     eax, 1
        mov     dword [ix], eax
        test    eax, eax
        jz      near .done              ; zero-width row: nothing to blend

        movd    mm1, sq_shift

        ; corner alphas and vertical edge steps, one set per layer
        mov     esi, aoff
        mov     edi, alpha_ptrs

        load_corner_alphas [edi],    alpha_a0, alpha_a1, ldelt_a, rdelt_a
        load_corner_alphas [edi+4],  alpha_b0, alpha_b1, ldelt_b, rdelt_b
        load_corner_alphas [edi+8],  alpha_c0, alpha_c1, ldelt_c, rdelt_c, alpha_c2

        ; eax/ebx/ecx = source bitmap cursors for layers a/b/c
        mov     esi, bmp_ptrs
        mov     eax, [esi]
        mov     ebx, [esi+4]
        mov     ecx, [esi+8]

        movq    mm0, [alpha_a1]
        movq    mm2, [alpha_b1]
        movq    mm3, [alpha_c1]
        movq    mm4, [alpha_a0]
        movq    mm5, [alpha_b0]
        movq    mm6, [alpha_c0]

        mov     edi, dst

.yloop:
        ; mm1 should be sq_shift at this point
        ; mm0 should be alpha_a1
        ; mm2 should be alpha_b1
        ; mm3 should be alpha_c1
        ; calculate alpha step increments...word-size steps are replicated
        ; to fill qword.
        psubw   mm0, mm4
        psraw   mm0, mm1                ; mm0 = (right-left) >> sq_shift
        movq    [delta_a], mm0          ; delta = ainc ainc ainc ainc

        psubw   mm2, mm5
        psraw   mm2, mm1                ; mm2 = (right-left) >> sq_shift
        movq    [delta_b], mm2

        psubw   mm3, mm6
        psraw   mm3, mm1                ; mm3 = (right-left) >> sq_shift
        movq    [delta_c], mm3

        mov     esi, [ix]               ; esi = pixel pairs left in this row
        pxor    mm2, mm2                ; zero partner for byte->word unpack
        movq    mm7, [delta_a]          ; layer a's step stays in a register

.xloop:
        ; layer a starts the accumulator in mm0 (mm1 is scratch from here on)
        movq    mm0, [eax]
        movq    mm1, mm0
        punpcklbw mm0, mm2
        pmulhw  mm0, mm4
        paddw   mm4, mm7
        punpckhbw mm1, mm2
        pmulhw  mm1, mm4
        paddw   mm4, mm7
        packuswb mm0, mm1

        blend_add_layer ebx, mm5, [delta_b]
        blend_add_layer ecx, mm6, [delta_c]

        paddb   mm0, mm0                ; double: the signed alphas top out at 127
        movq    [edi], mm0

        add     eax, 8
        add     ebx, 8
        add     ecx, 8
        add     edi, 8

        dec     esi
        jnz     near .xloop

        ; step every edge alpha one row down
        step_alpha mm4, alpha_a0, ldelt_a
        step_alpha mm5, alpha_b0, ldelt_b
        step_alpha mm6, alpha_c0, ldelt_c
        step_alpha mm2, alpha_b1, rdelt_b
        step_alpha mm3, alpha_c1, rdelt_c
        step_alpha mm0, alpha_a1, rdelt_a

        movd    mm1, sq_shift           ; top of loop expects this

        dec     dword [iy]
        jnz     near .yloop

.done:
        emms
        epilogue
        ret

;-----------------------------------------------------------------------------
; void doSquare2(
;        U32 *dst,
;        int sq_shift,
;        const int *aoff,
;        const U32 *const *bmp_ptrs,
;        const U8 *const *alpha_ptrs );
;
; Two-layer variant of doSquare4, using bmp_ptrs[0..1] and alpha_ptrs[0..1].
; Both per-pixel alpha steps fit in registers (mm6, mm7) here.
;
; Nothing is written when (1 << sq_shift) >> 1 is zero.
;-----------------------------------------------------------------------------
export_fn doSquare2
        prologue

        ; init iy  (row count = 1 << sq_shift)
        mov     eax, 1
        mov     cl, sq_shift
        shl     eax, cl
        mov     dword [iy], eax

        ; init ix  (pixel pairs per row)
        shr     eax, 1
        mov     dword [ix], eax
        test    eax, eax
        jz      near .done              ; zero-width row: nothing to blend

        movd    mm1, sq_shift

        ; corner alphas and vertical edge steps, one set per layer
        mov     esi, aoff
        mov     edi, alpha_ptrs

        load_corner_alphas [edi],    alpha_a0, alpha_a1, ldelt_a, rdelt_a
        load_corner_alphas [edi+4],  alpha_b0, alpha_b1, ldelt_b, rdelt_b

        ; eax/ebx = source bitmap cursors for layers a/b
        mov     esi, bmp_ptrs
        mov     eax, [esi]
        mov     ebx, [esi+4]

        movq    mm0, [alpha_a1]
        movq    mm2, [alpha_b1]
        movq    mm4, [alpha_a0]
        movq    mm5, [alpha_b0]

        mov     edi, dst

.yloop:
        ; mm1 should be sq_shift at this point
        ; mm0 should be alpha_a1
        ; mm2 should be alpha_b1
        ; calculate alpha step increments...word-size steps are replicated
        ; to fill qword.
        psubw   mm0, mm4
        psraw   mm0, mm1                ; mm0 = (right-left) >> sq_shift
        movq    [delta_a], mm0          ; delta = ainc ainc ainc ainc

        psubw   mm2, mm5
        psraw   mm2, mm1                ; mm2 = (right-left) >> sq_shift
        movq    [delta_b], mm2

        mov     esi, [ix]               ; esi = pixel pairs left in this row
        pxor    mm2, mm2                ; zero partner for byte->word unpack
        movq    mm6, [delta_a]
        movq    mm7, [delta_b]

.xloop:
        movq    mm0, [eax]
        movq    mm3, [ebx]

        ; layer a (mm1 is scratch from here on)
        movq    mm1, mm0
        punpcklbw mm0, mm2
        pmulhw  mm0, mm4
        paddw   mm4, mm6
        punpckhbw mm1, mm2
        pmulhw  mm1, mm4
        paddw   mm4, mm6
        packuswb mm0, mm1

        ; layer b
        movq    mm1, mm3
        punpcklbw mm3, mm2
        pmulhw  mm3, mm5
        paddw   mm5, mm7
        punpckhbw mm1, mm2
        pmulhw  mm1, mm5
        paddw   mm5, mm7
        packuswb mm3, mm1

        paddb   mm0, mm3
        paddb   mm0, mm0                ; double: the signed alphas top out at 127
        movq    [edi], mm0

        add     edi, 8
        add     eax, 8
        add     ebx, 8

        dec     esi
        jnz     .xloop

        ; step every edge alpha one row down
        step_alpha mm4, alpha_a0, ldelt_a
        step_alpha mm5, alpha_b0, ldelt_b
        step_alpha mm2, alpha_b1, rdelt_b
        step_alpha mm0, alpha_a1, rdelt_a

        movd    mm1, sq_shift           ; top of loop expects this

        dec     dword [iy]
        jnz     near .yloop

.done:
        emms
        epilogue
        ret

; params for doLumelPlus1Mip
%define dstmip0 [ebp+8]
%define dstmip1 [ebp+12]
%define srcptr  [ebp+16]

; Byte offset from a mip0 pixel to the pixel one row below it, as hard-coded
; in the second-row store of doLumelPlus1Mip.
; NOTE(review): nextdstrow is exported from this file but never read here;
; confirm with blender.cc whether this fixed offset is intentional.
%define MIP0_SECOND_ROW 0x100

;-----------------------------------------------------------------------------
; void doLumelPlus1Mip( U16 *dstmip0, U16 *dstmip1, const U32 *srcptr );
;
; Lights one lumel's worth of 32-bit source texels and converts them to
; 16-bit pixels in dstmip0, writing one extra pixel to dstmip1 for every
; 2x2 source texels.
;
; The lighting factor is interpolated between the four values lumels[0..3]
; (read as dwords at lumels+0/+4/+8/+12): lumels[0] -> lumels[2] down the
; left edge, lumels[1] -> lumels[3] down the right edge, and left -> right
; across each row.
;
; Reads the globals sTargetTexelsPerLumel_log2, sTargetTexelsPerLumelDiv2,
; nextsrcrow, minus1srcrowsPlus8, srcrows_x2_MinusTPL, mip0_dstrowadd and
; mip1_dstrowadd, which the caller must have set up.
;
; Nothing is written when sTargetTexelsPerLumelDiv2 is zero.
;-----------------------------------------------------------------------------
export_fn doLumelPlus1Mip
        prologue

        movd    mm7, [sTargetTexelsPerLumel_log2]

        expand_lumel mm0, [lumels]      ; mm0 = 0000rrrrggggbbbb qword for lp[0]
        movq    [leftq], mm0

        expand_lumel mm1, [lumels+8]    ; mm1 = 0000rrrrggggbbbb qword for lp[2]
        psubw   mm1, mm0
        psraw   mm1, mm7
        movq    [ldeltq], mm1           ; left-edge step per source row
        psllw   mm1, 1
        movq    [ldelt_x2], mm1         ; ...and per pair of rows

        expand_lumel mm2, [lumels+4]    ; mm2 = 0000rrrrggggbbbb qword for lp[1]
        movq    [rightq], mm2

        expand_lumel mm3, [lumels+12]   ; mm3 = 0000rrrrggggbbbb qword for lp[3]
        psubw   mm3, mm2
        psraw   mm3, mm7
        movq    [rdeltq], mm3           ; right-edge step per source row
        psllw   mm3, 1
        movq    [rdelt_x2], mm3         ; ...and per pair of rows

        mov     edi, dstmip0
        mov     esi, srcptr
        mov     edx, dstmip1
        pxor    mm6, mm6                ; (mm6 is reloaded at the top of .yloop)
        mov     ecx, [sTargetTexelsPerLumelDiv2]        ; yloop count
        test    ecx, ecx
        jz      near .done              ; zero rows: nothing to do

        movq    mm2, [leftq]
        movq    mm3, [rightq]
        ; mm2 is left, mm3 is right

.yloop:
        movd    mm7, [sTargetTexelsPerLumel_log2]

        movq    mm6, mm2
        movq    mm1, mm2                ; mm1 is light1
        movq    mm5, mm3
        movq    mm4, mm3

        paddw   mm5, [rdeltq]           ; right + rdelt
        psubw   mm4, mm6                ; right - left
        paddw   mm6, [ldeltq]           ; left + ldelt
        psraw   mm4, mm7
        movq    [delta2], mm4           ; per-pixel light step, first row
        psubw   mm5, mm6                ; mm6 is light2
        psraw   mm5, mm7
        movq    [delta3], mm5           ; per-pixel light step, second row

        mov     ebx, [sTargetTexelsPerLumelDiv2]        ; loop count

        ; do 4 source pixels per loop
        ; mm1 is light1
        ; mm6 is light2
        pxor    mm7, mm7                ; (mm7 is reloaded inside .xloop)

.xloop:
        ; get first row of source, col 0 and 1
        movq    mm4, [esi]
        add     esi, [nextsrcrow]
        movq    mm5, mm4
        punpcklbw mm4, [zero]
        pmulhw  mm4, mm1                ; mm1 is light factor for first row
        paddw   mm1, [delta2]
        punpckhbw mm5, [zero]
        pmulhw  mm5, mm1
        paddw   mm1, [delta2]

        ; fetch second row now; it is processed further down
        movq    mm7, [esi]
        add     esi, [minus1srcrowsPlus8]

        movq    mm0, mm4
        paddw   mm0, mm5                ; mm0 accumulates the sum for mip1[0,1]

        packuswb mm4, mm5               ; put both pixels in same qword
        paddw   mm4, mm4                ; double it, because lighting mul halved it
        paddusb mm4, mm4

        movq    mm5, mm4                ; save the original data
        pand    mm4, [redblue]          ; mask out all but the 5MSBits of red and blue
        pmaddwd mm4, [mulfact]          ; multiply each word by
                                        ; 2^13, 2^3, 2^13, 2^3 and add results
        pand    mm5, [green]            ; mask out all but the 5MSBits of green
        por     mm4, mm5                ; combine the red, green, and blue bits
        psrld   mm4, 6                  ; shift into position
        packssdw mm4, [zero]            ; pack into single dword
        pslld   mm4, 1                  ; shift into final position
        por     mm4, [alpha]            ; add the alpha bit

        ; write 2 pixels to mip0
        movd    [edi], mm4

        ; get second row, cols 0 and 1
        movq    mm5, mm7
        pxor    mm4, mm4
        punpcklbw mm7, mm4
        pmulhw  mm7, mm6                ; mm6 is light factor for 2nd row
        paddw   mm6, [delta3]
        punpckhbw mm5, mm4
        pmulhw  mm5, mm6
        paddw   mm6, [delta3]

        paddw   mm0, mm7
        paddw   mm0, mm5
        psrlw   mm0, 1                  ; mm0 is mip1[0,1] average
        paddusb mm0, mm0

        packuswb mm7, mm5               ; put both pixels in same qword
        paddw   mm7, mm7                ; double it, because lighting mul halved it
        paddusb mm7, mm7

        movq    mm5, mm7                ; save the original data
        pand    mm7, [redblue]          ; mask out all but the 5MSBits of red and blue
        pmaddwd mm7, [mulfact]          ; multiply each word by
                                        ; 2^13, 2^3, 2^13, 2^3 and add results
        pand    mm5, [green]            ; mask out all but the 5MSBits of green
        por     mm7, mm5                ; combine the red, green, and blue bits
        psrld   mm7, 6                  ; shift into position
        packssdw mm7, mm4               ; pack into single dword (mm4 == 0)
        pslld   mm7, 1                  ; shift into final position
        por     mm7, [alpha]            ; add the alpha bit

        ; write 2 16-bit pixels to mip0, 2nd row
        movd    [edi+MIP0_SECOND_ROW], mm7

        ; pack the averaged pixel in mm0 down to 16 bits
        movq    mm5, mm0
        movq    mm4, mm0
        pand    mm4, [mask_f8]          ; red
        psrlq   mm5, 13
        pand    mm5, [mask_7c0]         ; green
        psllq   mm4, 8
        pand    mm0, [mask_f800000000]  ; blue
        paddw   mm5, mm4
        psrlq   mm0, 34
        paddw   mm0, mm5

        ; write 1 pixel to mip1
        movd    eax, mm0
        mov     [edx], ax

        ; increment ptrs
        add     edx, 2                  ; mip1
        add     edi, 4                  ; mip0

        dec     ebx
        jnz     near .xloop

        ; move all three cursors to the start of the next pair of rows
        add     esi, [srcrows_x2_MinusTPL]
        add     edx, [mip1_dstrowadd]
        add     edi, [mip0_dstrowadd]

        paddw   mm2, [ldelt_x2]         ; mm2 is left
        paddw   mm3, [rdelt_x2]         ; mm3 is right

        dec     ecx
        jnz     near .yloop

.done:
        emms
        epilogue
        ret

; params for do1x1Lumel
%define dstptr  [ebp+8]
%define srcptr  [ebp+12]

;-----------------------------------------------------------------------------
; void do1x1Lumel( U16 *dstptr, const U32 *srcptr );
;
; Lights the single 32-bit texel at *srcptr with lumels[0] and stores the
; resulting 16-bit pixel at *dstptr.
;-----------------------------------------------------------------------------
export_fn do1x1Lumel
        prologue

        expand_lumel mm0, [lumels]      ; mm0 = 0000rrrrggggbbbb qword for lp[0]

        mov     edi, dstptr
        mov     esi, srcptr
        pxor    mm6, mm6

        movd    mm4, [esi]
        punpcklbw mm4, mm6              ; mm6 is expected to be 0 here
        pmulhw  mm4, mm0
        paddw   mm4, mm4
        paddusb mm4, mm4

        ; pack the three channel words down to one 16-bit pixel
        movq    mm7, mm4
        movq    mm6, mm4
        psrlq   mm4, 34
        pand    mm7, [mask_f8]
        psrlq   mm6, 13
        psllq   mm7, 8
        pand    mm6, [mask_7c0]
        paddw   mm4, mm7
        paddw   mm4, mm6

        movd    eax, mm4
        mov     [edi], ax

        emms
        epilogue
        ret

; params for cheatmips
%define srcptr  [ebp+8]
%define dstmip0 [ebp+12]
%define dstmip1 [ebp+16]
%define wid     [ebp+20]

;-----------------------------------------------------------------------------
; void cheatmips( U16 *srcptr, U16 *dstmip0, U16 *dstmip1, int wid );
;
; Builds two smaller images from a 16-bit source by dropping pixels (no
; filtering). The outer loop runs wid/2 times. Each inner iteration reads
; 32 bytes of source and writes 16 bytes to dstmip0, keeping the low word of
; every source dword. On passes where the row counter is even it also
; writes 8 bytes to dstmip1, keeping the low word of every dword just
; written to dstmip0. After each pass the source cursor skips a further
; wid*2 bytes.
;
; Pixels are shifted right by one before packssdw and left by one after it,
; so the signed pack never saturates; bit 0 of every output pixel is
; therefore zero.
;
; The argument slot holding wid is overwritten with wid*2.
; Nothing is written when wid < 16 (the inner trip count wid/16 is zero).
;-----------------------------------------------------------------------------
export_fn cheatmips
        prologue

        mov     esi, srcptr
        mov     edi, dstmip0
        mov     edx, dstmip1
        mov     ecx, wid
        shr     ecx, 1                  ; ecx = outer (row) count = wid / 2
        mov     eax, ecx
        shr     eax, 3                  ; eax = inner count = wid / 16
        test    eax, eax
        jz      near .done              ; too narrow for even one 32-byte group
        shl     dword wid, 1            ; wid slot now = bytes per source row

        movq    mm6, [mask_0000ffff0000ffff]
        movq    mm7, [mask_00007fff00007fff]

.yloop:
        mov     ebx, eax

.xloop:
        movq    mm0, [esi]
        movq    mm1, [esi+8]
        movq    mm2, [esi+16]
        movq    mm3, [esi+24]

        ; keep the low word of each dword, pre-shifted right by one
        pand    mm0, mm6
        psrlw   mm1, 1
        pand    mm1, mm7
        psrlw   mm0, 1
        pand    mm2, mm6
        psrlw   mm3, 1
        pand    mm3, mm7
        psrlw   mm2, 1

        packssdw mm0, mm1
        packssdw mm2, mm3
        psllw   mm0, 1
        movq    [edi], mm0              ; dstmip0, qword 0
        psllw   mm2, 1
        movq    [edi+8], mm2            ; dstmip0, qword 1

        test    ecx, 1
        jnz     .nomip2                 ; odd row counter: no dstmip1 output

        ; same reduction applied to the two qwords just produced
        movq    mm1, mm0
        movq    mm3, mm2
        pand    mm1, mm6
        psrlw   mm3, 1
        pand    mm3, mm7
        psrlw   mm1, 1
        packssdw mm1, mm3
        psllw   mm1, 1
        movq    [edx], mm1              ; dstmip1, qword 0
        add     edx, 8

.nomip2:
        add     esi, 32
        add     edi, 16

        dec     ebx
        jnz     .xloop

        add     esi, wid                ; skip the following source row

        dec     ecx
        jnz     near .yloop

.done:
        emms
        epilogue
        ret

; params for cheatmips4x4
%define srcptr  [ebp+8]
%define dstmip0 [ebp+12]
%define dstmip1 [ebp+16]

;-----------------------------------------------------------------------------
; void cheatmips4x4( U16 *srcptr, U16 *dstmip0, U16 *dstmip1 );
;
; Fixed-size counterpart of cheatmips. Reads the qwords at srcptr+0 and
; srcptr+16, keeps the low word of each dword, and writes the four resulting
; pixels (8 bytes) to dstmip0 and the first of them (2 bytes) to dstmip1.
; As in cheatmips, bit 0 of every output pixel is cleared by the
; shift / pack / shift sequence.
;-----------------------------------------------------------------------------
export_fn cheatmips4x4
        prologue

        mov     esi, srcptr
        mov     edi, dstmip0
        mov     edx, dstmip1

        movq    mm0, [esi]
        movq    mm1, [esi+16]

        pand    mm0, [mask_0000ffff0000ffff]
        psrlw   mm1, 1
        pand    mm1, [mask_00007fff00007fff]
        psrlw   mm0, 1
        packssdw mm0, mm1
        psllw   mm0, 1
        movq    [edi], mm0              ; dstmip0, qword 0

        movd    eax, mm0
        mov     [edx], ax               ; dstmip1, first pixel only

        emms
        epilogue
        ret