//-----------------------------------------------------------------------------
// Torque Game Engine
// Copyright (C) GarageGames.com, Inc.
//-----------------------------------------------------------------------------

#include "terrain/blender.h"
#include "platform/profiler.h"

/*************Configuration Switches*******************************/

// It is not yet clear whether the alphaTable is a net win; it needs more
// measurement and tuning.  The table removes integer multiplies from the
// inner loops but increases RAM/cache traffic.
#define USE_ALPHA_TABLE_AT_4 1
#define USE_ALPHA_TABLE_AT_3 1
#define USE_ALPHA_TABLE_AT_2 1


#if defined(TORQUE_SUPPORTS_NASM)
#  define BLENDER_USE_ASM
#endif

/*************Explanation*******************************/
// doc block moved to blender.h -- bjg
// The manifest CONSTANTS mentioned there are defined right below.

// Primary Manifest Values
#define TEXELS_PER_SOURCE_EDGE_LOG2    8
#define TEXELS_PER_TARGET_EDGE_LOG2    TerrainTextureMipLevel
#define LUMELS_PER_TILE_EDGE_LOG2      9
#define SQUARES_PER_TILE_EDGE_LOG2     8
#define MAX_MIPMAP_LEVELS              4
#define MAXIMUM_TEXTURES               4
#define SQUARES_PER_MIPMAP_EDGE_LOG2   (10 - TEXELS_PER_TARGET_EDGE_LOG2)

// Derived Manifest Values
#define MAX_TEXELS_PER_SQUARE_EDGE_LOG2 (TEXELS_PER_SOURCE_EDGE_LOG2 - SQUARES_PER_MIPMAP_EDGE_LOG2)
#define MIN_TEXELS_PER_SQUARE_EDGE_LOG2 (MAX_TEXELS_PER_SQUARE_EDGE_LOG2 - MAX_MIPMAP_LEVELS + 1)
#define LUMELS_PER_SQUARE_EDGE_LOG2     (LUMELS_PER_TILE_EDGE_LOG2 - SQUARES_PER_TILE_EDGE_LOG2)

#define MAX_TEXELS_PER_SQUARE_EDGE   (1 << MAX_TEXELS_PER_SQUARE_EDGE_LOG2)
#define MAX_TEXELS_PER_SQUARE_LOG2   (MAX_TEXELS_PER_SQUARE_EDGE_LOG2 << 1)
#define MAX_TEXELS_PER_SQUARE        (MAX_TEXELS_PER_SQUARE_EDGE*MAX_TEXELS_PER_SQUARE_EDGE)
#define LUMELS_PER_SQUARE_EDGE       (1 << LUMELS_PER_SQUARE_EDGE_LOG2)
#define LUMELS_PER_SQUARE            (LUMELS_PER_SQUARE_EDGE*LUMELS_PER_SQUARE_EDGE)
#define LUMELS_PER_TILE_EDGE         (1 << LUMELS_PER_TILE_EDGE_LOG2)
#define LUMELS_PER_TILE_EDGE_MASK    (LUMELS_PER_TILE_EDGE - 1)
#define SQUARES_PER_TILE_EDGE        (1 << SQUARES_PER_TILE_EDGE_LOG2)
#define SQUARES_PER_TILE_EDGE_MASK   (SQUARES_PER_TILE_EDGE - 1)
#define SQUARES_PER_MIPMAP_EDGE      (1 << SQUARES_PER_MIPMAP_EDGE_LOG2)
#define SQUARES_PER_MIPMAP_EDGE_MASK (SQUARES_PER_MIPMAP_EDGE - 1)
#define SQUARES_PER_MIPMAP           (SQUARES_PER_MIPMAP_EDGE*SQUARES_PER_MIPMAP_EDGE)
#define TEXELS_PER_SOURCE_EDGE       (1 << TEXELS_PER_SOURCE_EDGE_LOG2)
#define TEXELS_PER_TARGET_EDGE       (1 << TEXELS_PER_TARGET_EDGE_LOG2)
#define TEXELS_PER_SOURCE_BMP        (TEXELS_PER_SOURCE_EDGE * TEXELS_PER_SOURCE_EDGE)
#define TEXELS_PER_TARGET_BMP        (TEXELS_PER_TARGET_EDGE * TEXELS_PER_TARGET_EDGE)

// Paul's leftovers
#define ALPHA_WID (1 << SQUARES_PER_TILE_EDGE_LOG2)
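
// Worked example of the derived values, assuming TerrainTextureMipLevel == 8
// (a 256x256 target texture; the actual value comes from the engine headers):
//    SQUARES_PER_MIPMAP_EDGE_LOG2    = 10 - 8     = 2  ->  4x4 squares per source mip
//    MAX_TEXELS_PER_SQUARE_EDGE_LOG2 =  8 - 2     = 6  ->  64x64 texels per square
//    MIN_TEXELS_PER_SQUARE_EDGE_LOG2 =  6 - 4 + 1 = 3  ->  8x8 texels at the smallest mip
//    LUMELS_PER_SQUARE_EDGE_LOG2     =  9 - 8     = 1  ->  2x2 lumels per square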

// -------------------------
// The following defines are for indexing into the source image maps.
// Based on endianness, RGBA becomes ABGR, and we need to change indexing.
#if defined(TORQUE_BIG_ENDIAN)
#define SRC_IS_ABGR     1
#define SRC_OFF_RED     0
#define SRC_OFF_GREEN   1
#define SRC_OFF_BLUE    2
#else
#define SRC_IS_ABGR     0
#define SRC_OFF_RED     0
#define SRC_OFF_GREEN   1
#define SRC_OFF_BLUE    2
#endif
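
// Note: the byte offsets come out identical on both endiannesses because
// addSourceTexture() below packs the channels endian-appropriately when it
// widens the 24-bit source texels to 32 bits; only SRC_IS_ABGR, which selects
// the 5551 packing order, actually differs.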

#ifdef __VEC__
vector unsigned int vlumels;
bool Blender::smUseVecBlender = false;
#endif

// lumels[] carries the four corner light-map samples of the current lumel
// cell; it lives in extern "C" so the NASM helpers can reach it by name.
extern "C"
{
   U32 lumels[4];
}

#if defined(BLENDER_USE_ASM)

extern "C"
{
   // declarations for the NASM functions
   void doSquare4( U32 *dst, int sq_shift, const int *aoff,
                   const U32 *const *bmp_ptrs, const U8 *const *alpha_ptrs );
   void doSquare3( U32 *dst, int sq_shift, const int *aoff,
                   const U32 *const *bmp_ptrs, const U8 *const *alpha_ptrs );
   void doSquare2( U32 *dst, int sq_shift, const int *aoff,
                   const U32 *const *bmp_ptrs, const U8 *const *alpha_ptrs );
   void doLumelPlus1Mip( U16 *dstmip0, U16 *dstmip1, const U32 *srcptr );
   void do1x1Lumel( U16 *dstptr, const U32 *srcptr );
   void cheatmips( U16 *srcptr, U16 *dstmip0, U16 *dstmip1, int wid );
   void cheatmips4x4( const U16 *srcptr, U16 *dstmip0, U16 *dstmip1 );

   // global vars used for parameter passing
   extern U32 sTargetTexelsPerLumel_log2;
   extern U32 sTargetTexelsPerLumel;
   extern U32 sTargetTexelsPerLumelDiv2;

   extern U32 nextsrcrow;
   extern U32 nextdstrow;
   extern U32 mip0_dstrowadd;
   extern U32 mip1_dstrowadd;
   extern U32 minus1srcrowsPlus8;
   extern U32 srcrows_x2_MinusTPL;
}

#else /* BLENDER_USE_ASM */

static U8 alphaTable[64 * 256];
class InitAlphaTable
{
  public:
   // Build the alpha blending table for the C versions.
   // This is just a cached pre-calculation of some math...
   InitAlphaTable()
   {
      for (U32 alpha = 0; alpha < 64; ++alpha)
         for (U32 pix = 0; pix < 256; ++pix)
         {
            alphaTable[(alpha << 8) | pix] = (pix * alpha + 32) / 63;
         }
   }
};
static InitAlphaTable initAlphaTable;   // Okay, cheesy
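
// The table folds the per-texel "scale by a 6-bit alpha" step into a single
// byte lookup.  A minimal sketch of the equivalence (illustrative only):
//
//    U8 blendOne( U8 pix, U8 alpha6 )         // alpha6 in [0, 63]
//    {
//       U8 direct = (pix * alpha6 + 32) / 63; // the math being cached
//       U8 cached = alphaTable[(alpha6 << 8) | pix]; // alpha in index bits 8..13
//       return cached;                        // == direct by construction
//    }
//
// e.g. alpha6 == 63 returns pix unchanged and alpha6 == 0 returns 0; a full
// blend sums one such lookup per layer, with the layer alphas totalling 63.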

// Blend one square's worth of four source textures into bmp_dest, weighting
// each texture by an alpha that is bilinearly interpolated from its four
// corner values.
static void doSquare4( U32 *bmp_dest, int sq_shift, const int *alphaOffsets, const U32 *const *bmp_ptrs,
                       const U8 *const *alpha_ptrs )
{
   int squareSize = 1 << sq_shift;
   int squareSizeColors = squareSize;

   int left_scan_edge_alpha_a;
   int left_scan_edge_alpha_b;
   int left_scan_edge_alpha_c;
   int left_scan_edge_alpha_d;

   int delta_left_alpha_a;
   int delta_left_alpha_b;
   int delta_left_alpha_c;
   int delta_left_alpha_d;

   int right_scan_edge_alpha_a;
   int right_scan_edge_alpha_b;
   int right_scan_edge_alpha_c;
   int right_scan_edge_alpha_d;

   int delta_right_alpha_a;
   int delta_right_alpha_b;
   int delta_right_alpha_c;
   int delta_right_alpha_d;

   {
      // Corner alphas are promoted to fixed point (MAX_TEXELS_PER_SQUARE_LOG2
      // fraction bits) so the per-row edge deltas survive the integer divide.
      int top_left_alpha  = alpha_ptrs[0][ alphaOffsets[0] ] << MAX_TEXELS_PER_SQUARE_LOG2;
      int top_right_alpha = alpha_ptrs[0][ alphaOffsets[1] ] << MAX_TEXELS_PER_SQUARE_LOG2;
      int bot_left_alpha  = alpha_ptrs[0][ alphaOffsets[2] ] << MAX_TEXELS_PER_SQUARE_LOG2;
      int bot_right_alpha = alpha_ptrs[0][ alphaOffsets[3] ] << MAX_TEXELS_PER_SQUARE_LOG2;

      delta_left_alpha_a  = (bot_left_alpha - top_left_alpha) / squareSize;
      delta_right_alpha_a = (bot_right_alpha - top_right_alpha) / squareSize;

      left_scan_edge_alpha_a  = top_left_alpha;
      right_scan_edge_alpha_a = top_right_alpha;

      top_left_alpha  = alpha_ptrs[1][ alphaOffsets[0] ] << MAX_TEXELS_PER_SQUARE_LOG2;
      top_right_alpha = alpha_ptrs[1][ alphaOffsets[1] ] << MAX_TEXELS_PER_SQUARE_LOG2;
      bot_left_alpha  = alpha_ptrs[1][ alphaOffsets[2] ] << MAX_TEXELS_PER_SQUARE_LOG2;
      bot_right_alpha = alpha_ptrs[1][ alphaOffsets[3] ] << MAX_TEXELS_PER_SQUARE_LOG2;

      delta_left_alpha_b  = (bot_left_alpha - top_left_alpha) / squareSize;
      delta_right_alpha_b = (bot_right_alpha - top_right_alpha) / squareSize;

      left_scan_edge_alpha_b  = top_left_alpha;
      right_scan_edge_alpha_b = top_right_alpha;

      top_left_alpha  = alpha_ptrs[2][ alphaOffsets[0] ] << MAX_TEXELS_PER_SQUARE_LOG2;
      top_right_alpha = alpha_ptrs[2][ alphaOffsets[1] ] << MAX_TEXELS_PER_SQUARE_LOG2;
      bot_left_alpha  = alpha_ptrs[2][ alphaOffsets[2] ] << MAX_TEXELS_PER_SQUARE_LOG2;
      bot_right_alpha = alpha_ptrs[2][ alphaOffsets[3] ] << MAX_TEXELS_PER_SQUARE_LOG2;

      delta_left_alpha_c  = (bot_left_alpha - top_left_alpha) / squareSize;
      delta_right_alpha_c = (bot_right_alpha - top_right_alpha) / squareSize;

      left_scan_edge_alpha_c  = top_left_alpha;
      right_scan_edge_alpha_c = top_right_alpha;

      top_left_alpha  = alpha_ptrs[3][ alphaOffsets[0] ] << MAX_TEXELS_PER_SQUARE_LOG2;
      top_right_alpha = alpha_ptrs[3][ alphaOffsets[1] ] << MAX_TEXELS_PER_SQUARE_LOG2;
      bot_left_alpha  = alpha_ptrs[3][ alphaOffsets[2] ] << MAX_TEXELS_PER_SQUARE_LOG2;
      bot_right_alpha = alpha_ptrs[3][ alphaOffsets[3] ] << MAX_TEXELS_PER_SQUARE_LOG2;

      delta_left_alpha_d  = (bot_left_alpha - top_left_alpha) / squareSize;
      delta_right_alpha_d = (bot_right_alpha - top_right_alpha) / squareSize;

      left_scan_edge_alpha_d  = top_left_alpha;
      right_scan_edge_alpha_d = top_right_alpha;
   }

   U8* sourcePtr_a = (U8*)bmp_ptrs[0];
   U8* sourcePtr_b = (U8*)bmp_ptrs[1];
   U8* sourcePtr_c = (U8*)bmp_ptrs[2];
   U8* sourcePtr_d = (U8*)bmp_ptrs[3];
   U8* destPtr = (U8*)bmp_dest;

   for(S32 iy = squareSize; --iy >= 0;)
   {
      // Step each texture's alpha down the left and right edges, then derive
      // the per-texel step across this scanline.
      int scan_alpha_a = left_scan_edge_alpha_a;
      left_scan_edge_alpha_a += delta_left_alpha_a;
      int delta_scan_alpha_a = (right_scan_edge_alpha_a - scan_alpha_a) / squareSize;
      right_scan_edge_alpha_a += delta_right_alpha_a;

      int scan_alpha_b = left_scan_edge_alpha_b;
      left_scan_edge_alpha_b += delta_left_alpha_b;
      int delta_scan_alpha_b = (right_scan_edge_alpha_b - scan_alpha_b) / squareSize;
      right_scan_edge_alpha_b += delta_right_alpha_b;

      int scan_alpha_c = left_scan_edge_alpha_c;
      left_scan_edge_alpha_c += delta_left_alpha_c;
      int delta_scan_alpha_c = (right_scan_edge_alpha_c - scan_alpha_c) / squareSize;
      right_scan_edge_alpha_c += delta_right_alpha_c;

      int scan_alpha_d = left_scan_edge_alpha_d;
      left_scan_edge_alpha_d += delta_left_alpha_d;
      int delta_scan_alpha_d = (right_scan_edge_alpha_d - scan_alpha_d) / squareSize;
      right_scan_edge_alpha_d += delta_right_alpha_d;

      for(S32 ix = squareSizeColors; --ix >= 0;)
      {
#if USE_ALPHA_TABLE_AT_4
         int six_bit_alpha_a = (scan_alpha_a >> (MAX_TEXELS_PER_SQUARE_LOG2 - 6)) & 0x3F00;
         int six_bit_alpha_b = (scan_alpha_b >> (MAX_TEXELS_PER_SQUARE_LOG2 - 6)) & 0x3F00;
         int six_bit_alpha_c = (scan_alpha_c >> (MAX_TEXELS_PER_SQUARE_LOG2 - 6)) & 0x3F00;
         int six_bit_alpha_d = (scan_alpha_d >> (MAX_TEXELS_PER_SQUARE_LOG2 - 6)) & 0x3F00;
         destPtr[0] = alphaTable[six_bit_alpha_a | sourcePtr_a[SRC_OFF_RED]] +
                      alphaTable[six_bit_alpha_b | sourcePtr_b[SRC_OFF_RED]] +
                      alphaTable[six_bit_alpha_c | sourcePtr_c[SRC_OFF_RED]] +
                      alphaTable[six_bit_alpha_d | sourcePtr_d[SRC_OFF_RED]];
         destPtr[1] = alphaTable[six_bit_alpha_a | sourcePtr_a[SRC_OFF_GREEN]] +
                      alphaTable[six_bit_alpha_b | sourcePtr_b[SRC_OFF_GREEN]] +
                      alphaTable[six_bit_alpha_c | sourcePtr_c[SRC_OFF_GREEN]] +
                      alphaTable[six_bit_alpha_d | sourcePtr_d[SRC_OFF_GREEN]];
         destPtr[2] = alphaTable[six_bit_alpha_a | sourcePtr_a[SRC_OFF_BLUE]] +
                      alphaTable[six_bit_alpha_b | sourcePtr_b[SRC_OFF_BLUE]] +
                      alphaTable[six_bit_alpha_c | sourcePtr_c[SRC_OFF_BLUE]] +
                      alphaTable[six_bit_alpha_d | sourcePtr_d[SRC_OFF_BLUE]];
#else
#define EIGHT_BITS_SQ_SHIFT_2 (0xff * MAX_TEXELS_PER_SQUARE)
         destPtr[0] = (scan_alpha_a * sourcePtr_a[SRC_OFF_RED] +
                       scan_alpha_b * sourcePtr_b[SRC_OFF_RED] +
                       scan_alpha_c * sourcePtr_c[SRC_OFF_RED] +
                       scan_alpha_d * sourcePtr_d[SRC_OFF_RED]) / EIGHT_BITS_SQ_SHIFT_2;
         destPtr[1] = (scan_alpha_a * sourcePtr_a[SRC_OFF_GREEN] +
                       scan_alpha_b * sourcePtr_b[SRC_OFF_GREEN] +
                       scan_alpha_c * sourcePtr_c[SRC_OFF_GREEN] +
                       scan_alpha_d * sourcePtr_d[SRC_OFF_GREEN]) / EIGHT_BITS_SQ_SHIFT_2;
         destPtr[2] = (scan_alpha_a * sourcePtr_a[SRC_OFF_BLUE] +
                       scan_alpha_b * sourcePtr_b[SRC_OFF_BLUE] +
                       scan_alpha_c * sourcePtr_c[SRC_OFF_BLUE] +
                       scan_alpha_d * sourcePtr_d[SRC_OFF_BLUE]) / EIGHT_BITS_SQ_SHIFT_2;
#endif
         scan_alpha_a += delta_scan_alpha_a;
         scan_alpha_b += delta_scan_alpha_b;
         scan_alpha_c += delta_scan_alpha_c;
         scan_alpha_d += delta_scan_alpha_d;
         sourcePtr_a += 4;
         sourcePtr_b += 4;
         sourcePtr_c += 4;
         sourcePtr_d += 4;
         destPtr += 4;
      }
   }
}
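
// The scan-edge/delta walk above is ordinary bilinear interpolation done
// incrementally in fixed point.  With corner alphas a (top-left),
// b (top-right), c (bottom-left), d (bottom-right) and edge length
// N = 1 << sq_shift, the weight of a texture at texel (x, y) is
//
//    alpha(x, y) = ( a*(N - x)*(N - y) + b*x*(N - y)
//                  + c*(N - x)*y       + d*x*y ) / (N * N)
//
// The loops never evaluate that product directly: the left and right edges
// step by (c - a)/N and (d - b)/N per row, and each scanline steps by
// (right - left)/N per texel, which reproduces the same surface (up to
// fixed-point truncation) with two adds per texel per texture.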

// Same as doSquare4(), but blending three source textures.
static void doSquare3( U32 *bmp_dest, int sq_shift, const int *alphaOffsets, const U32 *const *bmp_ptrs,
                       const U8 *const *alpha_ptrs )
{
   int squareSize = 1 << sq_shift;
   int squareSizeColors = squareSize;

   int left_scan_edge_alpha_a;
   int left_scan_edge_alpha_b;
   int left_scan_edge_alpha_c;

   int delta_left_alpha_a;
   int delta_left_alpha_b;
   int delta_left_alpha_c;

   int right_scan_edge_alpha_a;
   int right_scan_edge_alpha_b;
   int right_scan_edge_alpha_c;

   int delta_right_alpha_a;
   int delta_right_alpha_b;
   int delta_right_alpha_c;

   {
      int top_left_alpha  = alpha_ptrs[0][ alphaOffsets[0] ] << MAX_TEXELS_PER_SQUARE_LOG2;
      int top_right_alpha = alpha_ptrs[0][ alphaOffsets[1] ] << MAX_TEXELS_PER_SQUARE_LOG2;
      int bot_left_alpha  = alpha_ptrs[0][ alphaOffsets[2] ] << MAX_TEXELS_PER_SQUARE_LOG2;
      int bot_right_alpha = alpha_ptrs[0][ alphaOffsets[3] ] << MAX_TEXELS_PER_SQUARE_LOG2;

      delta_left_alpha_a  = (bot_left_alpha - top_left_alpha) / squareSize;
      delta_right_alpha_a = (bot_right_alpha - top_right_alpha) / squareSize;

      left_scan_edge_alpha_a  = top_left_alpha;
      right_scan_edge_alpha_a = top_right_alpha;

      top_left_alpha  = alpha_ptrs[1][ alphaOffsets[0] ] << MAX_TEXELS_PER_SQUARE_LOG2;
      top_right_alpha = alpha_ptrs[1][ alphaOffsets[1] ] << MAX_TEXELS_PER_SQUARE_LOG2;
      bot_left_alpha  = alpha_ptrs[1][ alphaOffsets[2] ] << MAX_TEXELS_PER_SQUARE_LOG2;
      bot_right_alpha = alpha_ptrs[1][ alphaOffsets[3] ] << MAX_TEXELS_PER_SQUARE_LOG2;

      delta_left_alpha_b  = (bot_left_alpha - top_left_alpha) / squareSize;
      delta_right_alpha_b = (bot_right_alpha - top_right_alpha) / squareSize;

      left_scan_edge_alpha_b  = top_left_alpha;
      right_scan_edge_alpha_b = top_right_alpha;

      top_left_alpha  = alpha_ptrs[2][ alphaOffsets[0] ] << MAX_TEXELS_PER_SQUARE_LOG2;
      top_right_alpha = alpha_ptrs[2][ alphaOffsets[1] ] << MAX_TEXELS_PER_SQUARE_LOG2;
      bot_left_alpha  = alpha_ptrs[2][ alphaOffsets[2] ] << MAX_TEXELS_PER_SQUARE_LOG2;
      bot_right_alpha = alpha_ptrs[2][ alphaOffsets[3] ] << MAX_TEXELS_PER_SQUARE_LOG2;

      delta_left_alpha_c  = (bot_left_alpha - top_left_alpha) / squareSize;
      delta_right_alpha_c = (bot_right_alpha - top_right_alpha) / squareSize;

      left_scan_edge_alpha_c  = top_left_alpha;
      right_scan_edge_alpha_c = top_right_alpha;
   }

   U8* sourcePtr_a = (U8*)bmp_ptrs[0];
   U8* sourcePtr_b = (U8*)bmp_ptrs[1];
   U8* sourcePtr_c = (U8*)bmp_ptrs[2];
   U8* destPtr = (U8*)bmp_dest;

   for(S32 iy = squareSize; --iy >= 0;)
   {
      int scan_alpha_a = left_scan_edge_alpha_a;
      left_scan_edge_alpha_a += delta_left_alpha_a;
      int delta_scan_alpha_a = (right_scan_edge_alpha_a - scan_alpha_a) / squareSize;
      right_scan_edge_alpha_a += delta_right_alpha_a;

      int scan_alpha_b = left_scan_edge_alpha_b;
      left_scan_edge_alpha_b += delta_left_alpha_b;
      int delta_scan_alpha_b = (right_scan_edge_alpha_b - scan_alpha_b) / squareSize;
      right_scan_edge_alpha_b += delta_right_alpha_b;

      int scan_alpha_c = left_scan_edge_alpha_c;
      left_scan_edge_alpha_c += delta_left_alpha_c;
      int delta_scan_alpha_c = (right_scan_edge_alpha_c - scan_alpha_c) / squareSize;
      right_scan_edge_alpha_c += delta_right_alpha_c;

      for(S32 ix = squareSizeColors; --ix >= 0;)
      {
#if USE_ALPHA_TABLE_AT_3
         int six_bit_alpha_a = (scan_alpha_a >> (MAX_TEXELS_PER_SQUARE_LOG2 - 6)) & 0x3F00;
         int six_bit_alpha_b = (scan_alpha_b >> (MAX_TEXELS_PER_SQUARE_LOG2 - 6)) & 0x3F00;
         int six_bit_alpha_c = (scan_alpha_c >> (MAX_TEXELS_PER_SQUARE_LOG2 - 6)) & 0x3F00;
         destPtr[0] = alphaTable[six_bit_alpha_a | sourcePtr_a[SRC_OFF_RED]] +
                      alphaTable[six_bit_alpha_b | sourcePtr_b[SRC_OFF_RED]] +
                      alphaTable[six_bit_alpha_c | sourcePtr_c[SRC_OFF_RED]];
         destPtr[1] = alphaTable[six_bit_alpha_a | sourcePtr_a[SRC_OFF_GREEN]] +
                      alphaTable[six_bit_alpha_b | sourcePtr_b[SRC_OFF_GREEN]] +
                      alphaTable[six_bit_alpha_c | sourcePtr_c[SRC_OFF_GREEN]];
         destPtr[2] = alphaTable[six_bit_alpha_a | sourcePtr_a[SRC_OFF_BLUE]] +
                      alphaTable[six_bit_alpha_b | sourcePtr_b[SRC_OFF_BLUE]] +
                      alphaTable[six_bit_alpha_c | sourcePtr_c[SRC_OFF_BLUE]];
#else
#define EIGHT_BITS_SQ_SHIFT_2 (0xff * MAX_TEXELS_PER_SQUARE)
         destPtr[0] = (scan_alpha_a * sourcePtr_a[SRC_OFF_RED] +
                       scan_alpha_b * sourcePtr_b[SRC_OFF_RED] +
                       scan_alpha_c * sourcePtr_c[SRC_OFF_RED]) / EIGHT_BITS_SQ_SHIFT_2;
         destPtr[1] = (scan_alpha_a * sourcePtr_a[SRC_OFF_GREEN] +
                       scan_alpha_b * sourcePtr_b[SRC_OFF_GREEN] +
                       scan_alpha_c * sourcePtr_c[SRC_OFF_GREEN]) / EIGHT_BITS_SQ_SHIFT_2;
         destPtr[2] = (scan_alpha_a * sourcePtr_a[SRC_OFF_BLUE] +
                       scan_alpha_b * sourcePtr_b[SRC_OFF_BLUE] +
                       scan_alpha_c * sourcePtr_c[SRC_OFF_BLUE]) / EIGHT_BITS_SQ_SHIFT_2;
#endif
         scan_alpha_a += delta_scan_alpha_a;
         scan_alpha_b += delta_scan_alpha_b;
         scan_alpha_c += delta_scan_alpha_c;
         sourcePtr_a += 4;
         sourcePtr_b += 4;
         sourcePtr_c += 4;
         destPtr += 4;
      }
   }
}

// Same as doSquare4(), but blending two source textures.
static void doSquare2( U32 *bmp_dest, int sq_shift, const int *alphaOffsets, const U32 *const *bmp_ptrs,
                       const U8 *const *alpha_ptrs )
{
   int squareSize = 1 << sq_shift;
   int squareSizeColors = squareSize;

   int left_scan_edge_alpha_a;
   int left_scan_edge_alpha_b;

   int delta_left_alpha_a;
   int delta_left_alpha_b;

   int right_scan_edge_alpha_a;
   int right_scan_edge_alpha_b;

   int delta_right_alpha_a;
   int delta_right_alpha_b;

   {
      int top_left_alpha  = alpha_ptrs[0][ alphaOffsets[0] ] << MAX_TEXELS_PER_SQUARE_LOG2;
      int top_right_alpha = alpha_ptrs[0][ alphaOffsets[1] ] << MAX_TEXELS_PER_SQUARE_LOG2;
      int bot_left_alpha  = alpha_ptrs[0][ alphaOffsets[2] ] << MAX_TEXELS_PER_SQUARE_LOG2;
      int bot_right_alpha = alpha_ptrs[0][ alphaOffsets[3] ] << MAX_TEXELS_PER_SQUARE_LOG2;

      delta_left_alpha_a  = (bot_left_alpha - top_left_alpha) / squareSize;
      delta_right_alpha_a = (bot_right_alpha - top_right_alpha) / squareSize;

      left_scan_edge_alpha_a  = top_left_alpha;
      right_scan_edge_alpha_a = top_right_alpha;

      top_left_alpha  = alpha_ptrs[1][ alphaOffsets[0] ] << MAX_TEXELS_PER_SQUARE_LOG2;
      top_right_alpha = alpha_ptrs[1][ alphaOffsets[1] ] << MAX_TEXELS_PER_SQUARE_LOG2;
      bot_left_alpha  = alpha_ptrs[1][ alphaOffsets[2] ] << MAX_TEXELS_PER_SQUARE_LOG2;
      bot_right_alpha = alpha_ptrs[1][ alphaOffsets[3] ] << MAX_TEXELS_PER_SQUARE_LOG2;

      delta_left_alpha_b  = (bot_left_alpha - top_left_alpha) / squareSize;
      delta_right_alpha_b = (bot_right_alpha - top_right_alpha) / squareSize;

      left_scan_edge_alpha_b  = top_left_alpha;
      right_scan_edge_alpha_b = top_right_alpha;
   }

   U8* sourcePtr_a = (U8*)bmp_ptrs[0];
   U8* sourcePtr_b = (U8*)bmp_ptrs[1];
   U8* destPtr = (U8*)bmp_dest;

   for(S32 iy = squareSize; --iy >= 0;)
   {
      int scan_alpha_a = left_scan_edge_alpha_a;
      left_scan_edge_alpha_a += delta_left_alpha_a;
      int delta_scan_alpha_a = (right_scan_edge_alpha_a - scan_alpha_a) / squareSize;
      right_scan_edge_alpha_a += delta_right_alpha_a;

      int scan_alpha_b = left_scan_edge_alpha_b;
      left_scan_edge_alpha_b += delta_left_alpha_b;
      int delta_scan_alpha_b = (right_scan_edge_alpha_b - scan_alpha_b) / squareSize;
      right_scan_edge_alpha_b += delta_right_alpha_b;

      for(S32 ix = squareSizeColors; --ix >= 0;)
      {
#if USE_ALPHA_TABLE_AT_2
         int six_bit_alpha_a = (scan_alpha_a >> (MAX_TEXELS_PER_SQUARE_LOG2 - 6)) & 0x3F00;
         int six_bit_alpha_b = (scan_alpha_b >> (MAX_TEXELS_PER_SQUARE_LOG2 - 6)) & 0x3F00;
         destPtr[0] = alphaTable[six_bit_alpha_a | sourcePtr_a[SRC_OFF_RED]] +
                      alphaTable[six_bit_alpha_b | sourcePtr_b[SRC_OFF_RED]];
         destPtr[1] = alphaTable[six_bit_alpha_a | sourcePtr_a[SRC_OFF_GREEN]] +
                      alphaTable[six_bit_alpha_b | sourcePtr_b[SRC_OFF_GREEN]];
         destPtr[2] = alphaTable[six_bit_alpha_a | sourcePtr_a[SRC_OFF_BLUE]] +
                      alphaTable[six_bit_alpha_b | sourcePtr_b[SRC_OFF_BLUE]];
#else
#define EIGHT_BITS_SQ_SHIFT_2 (0xff * MAX_TEXELS_PER_SQUARE)
         destPtr[0] = (scan_alpha_a * sourcePtr_a[SRC_OFF_RED] +
                       scan_alpha_b * sourcePtr_b[SRC_OFF_RED] +
                       EIGHT_BITS_SQ_SHIFT_2 / 2) / EIGHT_BITS_SQ_SHIFT_2;
         destPtr[1] = (scan_alpha_a * sourcePtr_a[SRC_OFF_GREEN] +
                       scan_alpha_b * sourcePtr_b[SRC_OFF_GREEN] +
                       EIGHT_BITS_SQ_SHIFT_2 / 2) / EIGHT_BITS_SQ_SHIFT_2;
         destPtr[2] = (scan_alpha_a * sourcePtr_a[SRC_OFF_BLUE] +
                       scan_alpha_b * sourcePtr_b[SRC_OFF_BLUE] +
                       EIGHT_BITS_SQ_SHIFT_2 / 2) / EIGHT_BITS_SQ_SHIFT_2;
#endif
         scan_alpha_a += delta_scan_alpha_a;
         scan_alpha_b += delta_scan_alpha_b;
         sourcePtr_a += 4;
         sourcePtr_b += 4;
         destPtr += 4;
      }
   }
}

#endif /* BLENDER_USE_ASM */

// old C extruder: builds the next mip level by box-filtering 2x2 blocks of
// 5551 texels, one channel at a time.
static void extrude5551( const U16 *srcMip, U16 *mip, U32 height, U32 width )
{
   const U16 *src = srcMip;
   U16 *dst = mip;
   U32 stride = width << 1;

   for(U32 y = 0; y < height; y++)
   {
      for(U32 x = 0; x < width; x++)
      {
         U32 a = src[0];
         U32 b = src[1];
         U32 c = src[stride];
         U32 d = src[stride+1];
#if SRC_IS_ABGR
         dst[x] = ((( ((a >> 10) & 0x1f) + ((b >> 10) & 0x1f) + ((c >> 10) & 0x1f) + ((d >> 10) & 0x1f) ) >> 2) << 10) |
                  ((( ((a >> 5) & 0x1f) + ((b >> 5) & 0x1f) + ((c >> 5) & 0x1f) + ((d >> 5) & 0x1f) ) >> 2) << 5) |
                  ((( ((a >> 0) & 0x1f) + ((b >> 0) & 0x1f) + ((c >> 0) & 0x1f) + ((d >> 0) & 0x1f) ) >> 2) << 0);
#else
         dst[x] = ((((a >> 11) + (b >> 11) + (c >> 11) + (d >> 11)) >> 2) << 11) |
                  ((( ((a >> 6) & 0x1f) + ((b >> 6) & 0x1f) + ((c >> 6) & 0x1f) + ((d >> 6) & 0x1f) ) >> 2) << 6) |
                  ((( ((a >> 1) & 0x1f) + ((b >> 1) & 0x1f) + ((c >> 1) & 0x1f) + ((d >> 1) & 0x1f) ) >> 2) << 1);
#endif
         src += 2;
      }
      src += stride;
      dst += width;
   }
}
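
// For reference, the 16-bit texel layouts extrude5551() is averaging
// (illustrative decomposition; the shifts match the code above):
//
//    #if SRC_IS_ABGR                  // ABGR1555: alpha in bit 15
//       b = (texel >> 10) & 0x1f;  g = (texel >> 5) & 0x1f;  r = texel & 0x1f;
//    #else                            // RGBA5551: alpha in bit 0
//       r = (texel >> 11) & 0x1f;  g = (texel >> 6) & 0x1f;  b = (texel >> 1) & 0x1f;
//    #endif
//
// Each destination texel is the per-channel mean of a 2x2 source block; the
// alpha bit is never written, so it comes out 0.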

/// Take the first mip in the array and extrude the rest into the other
/// entries of the array; i.e. power is 7 for 128x128, but there should be
/// power+1 entries in the array.
static void extrude( U16 **mips, U32 power )
{
   U32 width = 1 << (power - 1);

   for ( U32 i = 0; i < power; i++ )
   {
      extrude5551( mips[i], mips[i+1], width, width );
      width >>= 1;
   }
}
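
// Minimal usage sketch (illustrative only; the caller owns the buffers and
// their sizes): for a 128x128 top mip, power == 7 and mips[] holds
// power + 1 == 8 pointers.
//
//    U16 *mips[8];
//    // mips[0] -> the finished 128x128 image,
//    // mips[1..7] -> storage for 64x64 down through 1x1.
//    extrude( mips, 7 );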

/// Blend textures!
///
/// - level is between 2 (high detail) and 5 (low detail) inclusive;
/// - x and y are in alpha sized squares (not tex squares)
/// - lmap is light map data, format is 5551 (1555 ABGR)
/// - destmips is a list of 16-bit 5551 RGBA (1555 ABGR) mip-map buffers for the result.

// Note: to use the AltiVec implementation you must build with the -faltivec
// compiler option and run on a G4, G5, or better.

#ifdef __VEC__

// In-place transpose of a size x size matrix of U32s held four-per-vector;
// in this file it is only ever called with size == 4 (a single 4x4 block).
void transpose(long size, vector unsigned int *X)
{
   long vWidth = size/4;

   long i,vi,j,vj;
   vector unsigned int R0,R1,R2,R3;
   vector unsigned int T0,T1,T2,T3;
   vector unsigned int C0,C1,C2,C3;

   for (i=0, vi=0; i<vWidth; i++, vi+=size) {

      // Transpose the 4x4 block on the diagonal in place.
      R0 = X[ vi + 0*vWidth + i ];
      R1 = X[ vi + 1*vWidth + i ];
      R2 = X[ vi + 2*vWidth + i ];
      R3 = X[ vi + 3*vWidth + i ];

      T0 = vec_mergeh(R0,R2);
      T1 = vec_mergeh(R1,R3);
      T2 = vec_mergel(R0,R2);
      T3 = vec_mergel(R1,R3);

      X[ vi + 0*vWidth + i ] = vec_mergeh(T0,T1);
      X[ vi + 1*vWidth + i ] = vec_mergel(T0,T1);
      X[ vi + 2*vWidth + i ] = vec_mergeh(T2,T3);
      X[ vi + 3*vWidth + i ] = vec_mergel(T2,T3);

      // Transpose and swap each off-diagonal pair of 4x4 blocks.
      for (j=i+1, vj=vi+size; j<(vWidth); j++, vj+=size)
      {
         R0 = X[ vi + 0*vWidth + j ];
         R1 = X[ vi + 1*vWidth + j ];
         R2 = X[ vi + 2*vWidth + j ];
         R3 = X[ vi + 3*vWidth + j ];

         C0 = X[ vj + 0*vWidth + i ];
         C1 = X[ vj + 1*vWidth + i ];
         C2 = X[ vj + 2*vWidth + i ];
         C3 = X[ vj + 3*vWidth + i ];

         T0 = vec_mergeh(R0,R2);
         T1 = vec_mergeh(R1,R3);
         T2 = vec_mergel(R0,R2);
         T3 = vec_mergel(R1,R3);

         X[ vj + 0*vWidth + i ] = vec_mergeh(T0,T1);
         X[ vj + 1*vWidth + i ] = vec_mergel(T0,T1);
         X[ vj + 2*vWidth + i ] = vec_mergeh(T2,T3);
         X[ vj + 3*vWidth + i ] = vec_mergel(T2,T3);

         T0 = vec_mergeh(C0,C2);
         T1 = vec_mergeh(C1,C3);
         T2 = vec_mergel(C0,C2);
         T3 = vec_mergel(C1,C3);

         X[ vi + 0*vWidth + j ] = vec_mergeh(T0,T1);
         X[ vi + 1*vWidth + j ] = vec_mergel(T0,T1);
         X[ vi + 2*vWidth + j ] = vec_mergeh(T2,T3);
         X[ vi + 3*vWidth + j ] = vec_mergel(T2,T3);
      }
   }
}

// Broadcast a single U32 into all four lanes of a vector.  (The parameter is
// const: blend_vec() passes the address of a const local.)
inline vector unsigned int vec_loadAndSplatU32( const unsigned int *scalarPtr )
{
   register vector unsigned char splatMap = vec_lvsl( 0, scalarPtr );
   const register vector unsigned int result = vec_lde( 0, scalarPtr );

   splatMap = (vector unsigned char) vec_splat( (vector unsigned int) splatMap, 0 );

   return( vec_perm( result, result, splatMap ) );
}
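
// How the splat works: vec_lvsl(0, p) yields the byte indices
// { off, off+1, ..., off+15 } where off = (uintptr_t)p & 15, and vec_lde()
// loads the 16-byte block containing *p.  Splatting word 0 of the lvsl result
// repeats { off .. off+3 } across the permute map, so vec_perm() copies the
// four bytes of the loaded scalar into every word of the result.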

// Move the alpha calculation out of the inner loops of blend_vec() to allow the compiler to work its magic
class AlphaCalc
{
  public:
   AlphaCalc( const U8 * const alphaTable )
      : vec_two( (vector unsigned int)( 2 ) ),
        index_const( (vector unsigned int)( 0x3F00 ) ),
        v255( (vector unsigned int)( 255 ) ),
        dstcol_const( (vector unsigned int) (0xf8) ),
#if SRC_IS_ABGR
        col_adjust1( (vector unsigned int) (7, 2, 0, 0) ),
        col_adjust2( (vector unsigned int) (0, 0, 3, 0) ),
#else
        // for some strange and nonexistent altivec processor which uses RGBA5551
        col_adjust1( (vector unsigned int) (8, 3, 0, 0) ),
        col_adjust2( (vector unsigned int) (0, 0, 2, 0) ),
#endif
        globalAlphaTable( alphaTable )
   {
   }

   inline U16 Calc( vector unsigned int hscan_component, vector unsigned int src ) const
   {
      u_tmp.v = vec_or(vec_and(vec_sr(hscan_component, vec_two), index_const), src);

      vector unsigned int dstcol;
      unsigned int *sloader = (unsigned int *) &dstcol;
      sloader[0] = globalAlphaTable[u_tmp.s[0]];
      sloader[1] = globalAlphaTable[u_tmp.s[1]];
      sloader[2] = globalAlphaTable[u_tmp.s[2]];

      dstcol = vec_add( dstcol, dstcol );
      dstcol = vec_min( dstcol, v255 );

      // NOTE that on Mac, color order is flipped (ABGR1555 instead of RGBA5551), so:
      // 1. we already reversed color order via BIG_ENDIAN indexing above, but
      // 2. we need to change the shifts for alpha being the high bit instead of the low.
      u_tmp.v = vec_sr(vec_sl(vec_and(dstcol, dstcol_const), col_adjust1), col_adjust2);

      return( (unsigned short) (u_tmp.s[0] | u_tmp.s[1] | u_tmp.s[2]) );
   }

  private:
   const vector unsigned int vec_two;
   const vector unsigned int index_const;
   const vector unsigned int v255;
   const vector unsigned int dstcol_const;
   const vector unsigned int col_adjust1;
   const vector unsigned int col_adjust2;

   mutable union {
      vector unsigned int v;
      U32 s[4];
   } u_tmp;

   const U8 * const globalAlphaTable;
};
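
// Note on Calc(): the table result is doubled and then saturated at 255,
// mirroring the scalar path in blend_c() below -- presumably so the light map
// can brighten a texel (up to roughly 2x) as well as darken it.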

/* Kyle Goodwin's AltiVec texture blender implementation.  There are still a
   few areas this could be improved, but given the enormous performance
   increase this already grants, the remaining areas aren't significant.
   Unfortunately the code below is highly unreadable, for two main reasons:
   the original blender being rewritten was unreadable, and for speed I chose
   to use exclusively atomic, 1-to-1 assembly-equivalent vector operations.
   The only exception to this is the one matrix transpose operation, which
   could possibly be optimized further.  I often must reference vector values
   through normal pointers or as part of a union, but since this impedes
   parallelization I did it as little as possible.  All in all this works out
   quite well and brings Mac performance in line with PC performance.
*/

inline void Blender::blend_vec( int x, int y, int squaresPerTargetEdge_log2, const U16 *lmap, U16 **destmips )
{
   PROFILE_START(ALTIVEC_BLEND);
   const U32 squaresPerTargetEdge(1 << squaresPerTargetEdge_log2);   // 32 (low detail) to 4 (high detail).
   const U32 texelsPerSquareEdge_log2(TEXELS_PER_TARGET_EDGE_LOG2 - squaresPerTargetEdge_log2); // 5 (high detail) to 2 (low detail)
   const U32 texelsPerSquareEdge(1 << texelsPerSquareEdge_log2);     // == TEXELS_PER_TARGET_EDGE / squaresPerTargetEdge; 4 (low) to 32 (high) detail.
   const U32 texelsPerSquare_log2(texelsPerSquareEdge_log2 << 1);    // 10 (high detail) to 4 (low detail)
   const U32 sourceMipMapIndex(MAX_TEXELS_PER_SQUARE_EDGE_LOG2 - texelsPerSquareEdge_log2);
   const U32 targetTexelsPerLumel_log2(texelsPerSquareEdge_log2 - LUMELS_PER_SQUARE_EDGE_LOG2);
   const U32 targetTexelsPerLumel(1 << targetTexelsPerLumel_log2);

   const U32 yStrideThroughTarget(TEXELS_PER_TARGET_EDGE);
   const U32 yStrideThroughSquare(texelsPerSquareEdge);
   const U32 xStrideAcrossLumels(targetTexelsPerLumel);
   const U32 yStrideThroughTargetAcrossLumels(yStrideThroughTarget << targetTexelsPerLumel_log2);
   const U32 yStrideThroughSquareAcrossLumels(yStrideThroughSquare << targetTexelsPerLumel_log2);

   const U32 *const*const allSourceBitMaps = &bmpdata[sourceMipMapIndex * num_src_bmps];

   const AlphaCalc alphaCalc( alphaTable );

   // yInTarget & xInTarget index through the SQUAREs of the DESTINATION MIP-MAP.
   // All Source MIP-MAPs are 2D arrays of squares:
   //    SQUARE source_mip_map_2D[SQUARES_PER_MIPMAP_EDGE][SQUARES_PER_MIPMAP_EDGE];
   // But they are stored as 1D arrays:
   //    SQUARE source_mip_map_1D[SQUARES_PER_MIPMAP_EDGE*SQUARES_PER_MIPMAP_EDGE];
   // therefore the following are equivalent:
   //    source_mip_map_2D[Y][X]
   //    source_mip_map_1D[(Y * SQUARES_PER_MIPMAP_EDGE) + X]
   //    source_mip_map_1D[(Y << SQUARES_PER_MIPMAP_EDGE_LOG2) + X]
   //    source_mip_map_1D[(Y << SQUARES_PER_MIPMAP_EDGE_LOG2) | X]
   // This loop is from [0] through [squaresPerTargetEdge - 1] of the destination
   // and from [y] through [y + squaresPerTargetEdge - 1] of the source.
   // A single terrain TILE is equivalently:
   //    SQUARE terrain_tile_2D[SQUARES_PER_TILE_EDGE][SQUARES_PER_TILE_EDGE];
   // or
   //    SQUARE terrain_tile_1D[SQUARES_PER_TILE_EDGE*SQUARES_PER_TILE_EDGE];
   // therefore the following are equivalent:
   //    terrain_tile_2D[Y][X]
   //    terrain_tile_1D[(Y << SQUARES_PER_MIPMAP_EDGE_LOG2) | X]
   // Neither source_mip_map_1D nor terrain_tile_1D appear explicitly.

   for ( int yInTarget = 0; yInTarget < squaresPerTargetEdge; ++yInTarget )
   {
      // This whole section is called "doing 2-dimensional array indexing the hard way".
      // yInTile & after_yInTile are the top and bottom of the source square we are
      // actually processing, masked to tile size, which is what causes the
      // "repeating" effect.
      const int yInTile((y + yInTarget) & SQUARES_PER_TILE_EDGE_MASK);
      const int after_yInTile((yInTile + 1) & SQUARES_PER_TILE_EDGE_MASK);

      // yInTile_offset and after_yInTile_offset are the offsets into the terrain_tile_1D format arrays
      const int yInTile_offset(yInTile << SQUARES_PER_TILE_EDGE_LOG2);
      const int after_yInTile_offset(after_yInTile << SQUARES_PER_TILE_EDGE_LOG2);

      // yInSource is the row index in squares into the source_mip_map_2D
      const int yInSource(yInTile & SQUARES_PER_MIPMAP_EDGE_MASK);
      // yInSource_offset is the offset in squares into the source_mip_map_1D, times the size
      // of the squares.
      const int yInSource_offset(yInSource << (texelsPerSquare_log2 + SQUARES_PER_MIPMAP_EDGE_LOG2));

      // This loop is from [yInTarget][0] through [yInTarget][squaresPerTargetEdge - 1] of the destination
      // and from [yInTile][x] through [yInTile][x + squaresPerTargetEdge - 1] of the source.

      for ( int xInTarget = 0; xInTarget < squaresPerTargetEdge; xInTarget++ )
      {
         // xInTile & after_xInTile are the left and right side of the source square we are
         // actually processing, masked to tile size, which is what causes the
         // "repeating" effect.
         const int xInTile((x + xInTarget) & SQUARES_PER_TILE_EDGE_MASK);
         const int after_xInTile((xInTile + 1) & SQUARES_PER_TILE_EDGE_MASK);
         // xInSource is the column index in squares into the source_mip_map_2D
         const int xInSource(xInTile & SQUARES_PER_MIPMAP_EDGE_MASK);
         // As you can see the GRID is accessed in TILE co-ordinates
         const U32 gridflags(GRIDFLAGS( xInTile, yInTile ));

         // Cache the source textures at our mip-map level as specified by the GRID-FLAGS
         const U32 *sourceSquareBitMaps[MAXIMUM_TEXTURES];
         // Cache the Alpha-Maps as specified by the GRID-FLAGS
         const U8 *alphaMaps[MAXIMUM_TEXTURES];

         // Pre-calculate (U8*) &source_mip_map_2D[yInSource][xInSource] --
         //    ( (yInSource * SQUARES_PER_MIPMAP_EDGE) + xInSource ) * sizeof(SQUARE)
         const int bitmapOffset(yInSource_offset | (xInSource << texelsPerSquare_log2));

         int numTexturesToBlend = 0;

         for ( int i = 0; i < num_src_bmps; ++i )
         {
            if ( gridflags & (MATERIALSTART << i) ) // Gridflags tell us which materials are used for this square
            {  // Cache Alpha maps and bitmaps.
               alphaMaps[ numTexturesToBlend ] = alpha_data[ i ];
               sourceSquareBitMaps[ numTexturesToBlend++ ] = &allSourceBitMaps[ i ][bitmapOffset];

               if ( numTexturesToBlend == MAXIMUM_TEXTURES )
                  break; // Why? What happens if more than (4) textures should be blended?
            }
         }

         const U32 *bufferToLightFrom = blendbuffer;

         if ( numTexturesToBlend < 2 )
         {
            // don't copy the square over...just leave it and tell
            // lighting code to use src bmp as the source instead of
            // the blend_buffer;
            bufferToLightFrom = sourceSquareBitMaps[ 0 ];
         }
         else
         {
            int alphaOffsets[4];

            alphaOffsets[0] = yInTile_offset | xInTile;             // precalculate offsets for tile-coords[yInTile][xInTile]
            alphaOffsets[1] = yInTile_offset | after_xInTile;       // and so on for the square bounded by
            alphaOffsets[2] = after_yInTile_offset | xInTile;       // [yInTile][xInTile]       [yInTile][after_xInTile]
            alphaOffsets[3] = after_yInTile_offset | after_xInTile; // [after_yInTile][xInTile] [after_yInTile][after_xInTile]

            switch( numTexturesToBlend ) // Blend 1 square of the numTexturesToBlend bit-maps into the blend buffer
            {
               case 2:
                  doSquare2( blendbuffer, texelsPerSquareEdge_log2, alphaOffsets, sourceSquareBitMaps, alphaMaps );
                  break;
               case 3:
                  doSquare3( blendbuffer, texelsPerSquareEdge_log2, alphaOffsets, sourceSquareBitMaps, alphaMaps );
                  break;
               default: // more subtle paranoia
                  doSquare4( blendbuffer, texelsPerSquareEdge_log2, alphaOffsets, sourceSquareBitMaps, alphaMaps );
                  break;
            }
         }

         // copy in the lighting info

         // Once again we make with the linear 2D array.  Only destmips[0] is
         // written here; extrude() below builds the lower mips.
         const U32 xTexelInTarget(xInTarget << texelsPerSquareEdge_log2);
         const U32 yTexelInTarget(yInTarget << texelsPerSquareEdge_log2);
         const U32 yTexelInTarget_offset((yTexelInTarget << TEXELS_PER_TARGET_EDGE_LOG2));
         U16 *const bits0 = &destmips[0][ yTexelInTarget_offset + xTexelInTarget ];

         const U32 base_xInLightmap(xInTile << LUMELS_PER_SQUARE_EDGE_LOG2);
         const U32 base_yInLightmap(yInTile << LUMELS_PER_SQUARE_EDGE_LOG2);

         U32 yInLightmap_offset(base_yInLightmap << LUMELS_PER_TILE_EDGE_LOG2);
         U32 next_yInLightmap(base_yInLightmap);
         U32 yTexelInTargetSquare_offset(0);
         U32 yTexelInSquare_offset(0);

         for(U32 yLumelInSquare(0); yLumelInSquare < LUMELS_PER_SQUARE_EDGE; ++yLumelInSquare)
         {
            next_yInLightmap = (next_yInLightmap + 1) & LUMELS_PER_TILE_EDGE_MASK;
            const U32 next_yInLightmap_offset(next_yInLightmap << LUMELS_PER_TILE_EDGE_LOG2);

            U32 xInLightmap(base_xInLightmap);
            U32 xTexelInSquare_offset = 0;

            for(U32 xLumelInSquare(0); xLumelInSquare < LUMELS_PER_SQUARE_EDGE; ++xLumelInSquare)
            {
               const U32 next_xInLightmap((xInLightmap + 1) & LUMELS_PER_TILE_EDGE_MASK);
               U32 texelInTargetSquare_offset = yTexelInTargetSquare_offset + xTexelInSquare_offset;
               U32 texelInSquare_offset = yTexelInSquare_offset + xTexelInSquare_offset;

               // Load the four corner lumels of this lumel cell into vlumels.
               unsigned int *loader = (unsigned int *) &vlumels;
               loader[0] = lmap[xInLightmap | yInLightmap_offset];
               loader[1] = lmap[next_xInLightmap | yInLightmap_offset];
               loader[2] = lmap[xInLightmap | next_yInLightmap_offset];
               loader[3] = lmap[next_xInLightmap | next_yInLightmap_offset];

               // Split the LUMELs into colors
               vector unsigned int col[4];
               vector unsigned int col_const = (vector unsigned int) (0x1f << 11);

               col[2] = vec_and(vlumels, col_const);
               col[1] = vec_and(vec_sl(vlumels, (vector unsigned int) (5)), col_const);
               col[0] = vec_and(vec_sl(vlumels, (vector unsigned int) (10)), col_const);
               // col[3] is left uninitialized; the transpose mixes it in as a
               // fourth lane that Calc() never reads.

               vector unsigned int vec_targetTexelsPerLumel_log2 = vec_loadAndSplatU32( &targetTexelsPerLumel_log2 );

               transpose(4, col); // transpose the matrix since we were using rows and now we need columns

               // One for each color component
               vector unsigned int left_component_delta = vec_sr(vec_sub(col[2], col[0]), vec_targetTexelsPerLumel_log2);
               vector unsigned int right_component_delta = vec_sr(vec_sub(col[3], col[1]), vec_targetTexelsPerLumel_log2);

               vector unsigned int vscan_left_component = col[0];
               vector unsigned int vscan_right_component = col[1];

               // Now we interpolate the color shifts across the square
               for(U32 yTexelInLumel = 0; yTexelInLumel < targetTexelsPerLumel; ++yTexelInLumel)
               {
                  vector unsigned int across_component_delta = vec_sr(vec_sub(vscan_right_component, vscan_left_component), vec_targetTexelsPerLumel_log2);
                  vector unsigned int hscan_component = vscan_left_component;

                  vscan_left_component = vec_add(vscan_left_component, left_component_delta);
                  vscan_right_component = vec_add(vscan_right_component, right_component_delta);

                  U16 *dstbits = &bits0[ texelInTargetSquare_offset ];
                  const U8 *srcbits = (U8 *)&bufferToLightFrom[ texelInSquare_offset ];

                  for(U32 xTexelInLumel = 0; xTexelInLumel < targetTexelsPerLumel; ++xTexelInLumel)
                  {
                     vector unsigned int cur_srcbits;
                     loader = (unsigned int *) &cur_srcbits;
                     loader[0] = srcbits[0];
                     loader[1] = srcbits[1];
                     loader[2] = srcbits[2];

                     *dstbits++ = alphaCalc.Calc( hscan_component, cur_srcbits );

                     srcbits += 4;
                  }

                  texelInTargetSquare_offset += yStrideThroughTarget;
                  texelInSquare_offset += yStrideThroughSquare;
               }

               xTexelInSquare_offset += xStrideAcrossLumels;
               xInLightmap = next_xInLightmap;
            }
            yInLightmap_offset = next_yInLightmap_offset;
            yTexelInTargetSquare_offset += yStrideThroughTargetAcrossLumels;
            yTexelInSquare_offset += yStrideThroughSquareAcrossLumels;
         }
         // end of lighting.
      }
   }

   extrude( destmips, TEXELS_PER_TARGET_EDGE_LOG2 );
   PROFILE_END();
}

void Blender::blend( int x, int y, int squaresPerTargetEdge_log2, const U16 *lmap, U16 **destmips )
{
   if(smUseVecBlender)
      blend_vec(x, y, squaresPerTargetEdge_log2, lmap, destmips);
   else
      blend_c(x, y, squaresPerTargetEdge_log2, lmap, destmips);
}

#else
void Blender::blend( int x, int y, int squaresPerTargetEdge_log2, const U16 *lmap, U16 **destmips )
{
   blend_c(x, y, squaresPerTargetEdge_log2, lmap, destmips);
}
#endif
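
// Minimal call-site sketch (illustrative; the names and setup here are
// hypothetical -- the real callers live in the terrain renderer):
//
//    Blender blender( numMaterials, MAX_MIPMAP_LEVELS, alphaMaps );
//    for ( int i = 0; i < numMaterials; i++ )
//       blender.addSourceTexture( i, materialMipChains[i] );
//    ...
//    blender.blend( sqX, sqY, squaresPerTargetEdge_log2, lightMap, destMips );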

inline void Blender::blend_c( int x, int y, int squaresPerTargetEdge_log2, const U16 *lmap, U16 **destmips )
{
   PROFILE_START(Blender);
   const int squaresPerTargetEdge(1 << squaresPerTargetEdge_log2);   // 32 (low detail) to 4 (high detail).
   const int texelsPerSquareEdge_log2(TEXELS_PER_TARGET_EDGE_LOG2 - squaresPerTargetEdge_log2); // 5 (high detail) to 2 (low detail)
   const int texelsPerSquareEdge(1 << texelsPerSquareEdge_log2);     // == TEXELS_PER_TARGET_EDGE / squaresPerTargetEdge; 4 (low) to 32 (high) detail.
   const int texelsPerSquare_log2(texelsPerSquareEdge_log2 << 1);    // 10 (high detail) to 4 (low detail)
   const int sourceMipMapIndex(MAX_TEXELS_PER_SQUARE_EDGE_LOG2 - texelsPerSquareEdge_log2);
   const int targetTexelsPerLumel_log2(texelsPerSquareEdge_log2 - LUMELS_PER_SQUARE_EDGE_LOG2);
   const int targetTexelsPerLumel(1 << targetTexelsPerLumel_log2);
   const U32 yStrideThroughTarget(TEXELS_PER_TARGET_EDGE);
   const U32 yStrideThroughSquare(texelsPerSquareEdge);
   const U32 xStrideAcrossLumels(targetTexelsPerLumel);
   const U32 yStrideThroughTargetAcrossLumels(yStrideThroughTarget << targetTexelsPerLumel_log2);
   const U32 yStrideThroughSquareAcrossLumels(yStrideThroughSquare << targetTexelsPerLumel_log2);

#if defined(BLENDER_USE_ASM)
   // These are all secret parameters passed to the assembly language code through statics.
   sTargetTexelsPerLumel_log2 = targetTexelsPerLumel_log2;
   sTargetTexelsPerLumel = targetTexelsPerLumel;

   sTargetTexelsPerLumelDiv2 = targetTexelsPerLumel >> 1;
   nextsrcrow = ((yStrideThroughSquare) << 2);
   nextdstrow = ((yStrideThroughTarget) << 1);

   mip0_dstrowadd = (nextdstrow << 1) - (targetTexelsPerLumel << 1);
   mip1_dstrowadd = (nextdstrow >> 1) - (targetTexelsPerLumel);
   minus1srcrowsPlus8 = 8 - nextsrcrow;
   srcrows_x2_MinusTPL = (nextsrcrow << 1) - (targetTexelsPerLumel << 2);
#endif

   const U32 *const*const allSourceBitMaps = &bmpdata[sourceMipMapIndex * num_src_bmps];

   // yInTarget & xInTarget index through the SQUAREs of the DESTINATION MIP-MAP.
   // All Source MIP-MAPs are 2D arrays of squares:
   //    SQUARE source_mip_map_2D[SQUARES_PER_MIPMAP_EDGE][SQUARES_PER_MIPMAP_EDGE];
   // But they are stored as 1D arrays:
   //    SQUARE source_mip_map_1D[SQUARES_PER_MIPMAP_EDGE*SQUARES_PER_MIPMAP_EDGE];
   // therefore the following are equivalent:
   //    source_mip_map_2D[Y][X]
   //    source_mip_map_1D[(Y * SQUARES_PER_MIPMAP_EDGE) + X]
   //    source_mip_map_1D[(Y << SQUARES_PER_MIPMAP_EDGE_LOG2) + X]
   //    source_mip_map_1D[(Y << SQUARES_PER_MIPMAP_EDGE_LOG2) | X]
   // This loop is from [0] through [squaresPerTargetEdge - 1] of the destination
   // and from [y] through [y + squaresPerTargetEdge - 1] of the source.
   // A single terrain TILE is equivalently:
   //    SQUARE terrain_tile_2D[SQUARES_PER_TILE_EDGE][SQUARES_PER_TILE_EDGE];
   // or
   //    SQUARE terrain_tile_1D[SQUARES_PER_TILE_EDGE*SQUARES_PER_TILE_EDGE];
   // therefore the following are equivalent:
   //    terrain_tile_2D[Y][X]
   //    terrain_tile_1D[(Y << SQUARES_PER_MIPMAP_EDGE_LOG2) | X]
   // Neither source_mip_map_1D nor terrain_tile_1D appear explicitly.

   for ( int yInTarget = 0; yInTarget < squaresPerTargetEdge; yInTarget++ )
   {
      // This whole section is called "doing 2-dimensional array indexing the hard way".
      // yInTile & after_yInTile are the top and bottom of the source square we are
      // actually processing, masked to tile size, which is what causes the
      // "repeating" effect.
      const int yInTile((y + yInTarget) & SQUARES_PER_TILE_EDGE_MASK);
      const int after_yInTile((yInTile + 1) & SQUARES_PER_TILE_EDGE_MASK);

      // yInTile_offset and after_yInTile_offset are the offsets into the terrain_tile_1D format arrays
      const int yInTile_offset(yInTile << SQUARES_PER_TILE_EDGE_LOG2);
      const int after_yInTile_offset(after_yInTile << SQUARES_PER_TILE_EDGE_LOG2);

      // yInSource is the row index in squares into the source_mip_map_2D
      const int yInSource(yInTile & SQUARES_PER_MIPMAP_EDGE_MASK);
      // yInSource_offset is the offset in squares into the source_mip_map_1D, times the size
      // of the squares.
      const int yInSource_offset(yInSource << (texelsPerSquare_log2 + SQUARES_PER_MIPMAP_EDGE_LOG2));

      // This loop is from [yInTarget][0] through [yInTarget][squaresPerTargetEdge - 1] of the destination
      // and from [yInTile][x] through [yInTile][x + squaresPerTargetEdge - 1] of the source.

      for ( int xInTarget = 0; xInTarget < squaresPerTargetEdge; xInTarget++ )
      {
         // xInTile & after_xInTile are the left and right side of the source square we are
         // actually processing, masked to tile size, which is what causes the
         // "repeating" effect.
         const int xInTile((x + xInTarget) & SQUARES_PER_TILE_EDGE_MASK);
         const int after_xInTile((xInTile + 1) & SQUARES_PER_TILE_EDGE_MASK);
         // xInSource is the column index in squares into the source_mip_map_2D
         const int xInSource(xInTile & SQUARES_PER_MIPMAP_EDGE_MASK);
         // As you can see the GRID is accessed in TILE co-ordinates
         const U32 gridflags(GRIDFLAGS( xInTile, yInTile ));

         int numTexturesToBlend = 0;
         int alphaOffsets[4];
         alphaOffsets[0] = yInTile_offset | xInTile;             // precalculate offsets for tile-coords[yInTile][xInTile]
         alphaOffsets[1] = yInTile_offset | after_xInTile;       // and so on for the square bounded by
         alphaOffsets[2] = after_yInTile_offset | xInTile;       // [yInTile][xInTile]       [yInTile][after_xInTile]
         alphaOffsets[3] = after_yInTile_offset | after_xInTile; // [after_yInTile][xInTile] [after_yInTile][after_xInTile]

         // Cache the source textures at our mip-map level as specified by the GRID-FLAGS
         const U32 *sourceSquareBitMaps[MAXIMUM_TEXTURES];
         // Cache the Alpha-Maps as specified by the GRID-FLAGS
         const U8 *alphaMaps[MAXIMUM_TEXTURES];

         // Pre-calculate (U8*) &source_mip_map_2D[yInSource][xInSource] --
         //    ( (yInSource * SQUARES_PER_MIPMAP_EDGE) + xInSource ) * sizeof(SQUARE)
         const int bitmapOffset(yInSource_offset | (xInSource << texelsPerSquare_log2));

         for ( int i = 0; i < num_src_bmps; i++ )
            if ( gridflags & (MATERIALSTART << i) ) // Gridflags tell us which materials are used for this square
            {  // Cache Alpha maps and bitmaps.
               alphaMaps[ numTexturesToBlend ] = alpha_data[ i ];
               sourceSquareBitMaps[ numTexturesToBlend++ ] = &allSourceBitMaps[ i ][bitmapOffset];

               if ( numTexturesToBlend == MAXIMUM_TEXTURES )
                  break; // Why? What happens if more than (4) textures should be blended?
            }

         const U32 *bufferToLightFrom = blendbuffer;
         switch( numTexturesToBlend ) // Blend 1 square of the numTexturesToBlend bit-maps into the blend buffer
         {
            case 0: // paranoia
            case 1:
               // don't copy the square over...just leave it and tell
               // lighting code to use src bmp as the source instead of
               // the blend_buffer;
               bufferToLightFrom = sourceSquareBitMaps[ 0 ];
               break;
            case 2:
               doSquare2( blendbuffer, texelsPerSquareEdge_log2, alphaOffsets, sourceSquareBitMaps, alphaMaps );
               break;
            case 3:
               doSquare3( blendbuffer, texelsPerSquareEdge_log2, alphaOffsets, sourceSquareBitMaps, alphaMaps );
               break;
            default: // more subtle paranoia
               doSquare4( blendbuffer, texelsPerSquareEdge_log2, alphaOffsets, sourceSquareBitMaps, alphaMaps );
               break;
         }

         // copy in the lighting info

         // Once again we make with the linear 2D array.
         // (bits1 and bits2 are only consumed by the assembly mip path.)
         const U32 xTexelInTarget(xInTarget << texelsPerSquareEdge_log2);
         const U32 yTexelInTarget(yInTarget << texelsPerSquareEdge_log2);
         const U32 yTexelInTarget_offset((yTexelInTarget << TEXELS_PER_TARGET_EDGE_LOG2));
         U16 *const bits0 = &destmips[0][ yTexelInTarget_offset + xTexelInTarget ];
         U16 *const bits1 = &destmips[1][ (yTexelInTarget_offset >> 2) + (xTexelInTarget >> 1) ];
         U16 *const bits2 = &destmips[2][ (yTexelInTarget_offset >> 4) + (xTexelInTarget >> 2) ];

         const U32 base_xInLightmap(xInTile << LUMELS_PER_SQUARE_EDGE_LOG2);
         const U32 base_yInLightmap(yInTile << LUMELS_PER_SQUARE_EDGE_LOG2);

         U32 yInLightmap_offset(base_yInLightmap << LUMELS_PER_TILE_EDGE_LOG2);
         U32 next_yInLightmap(base_yInLightmap);
         U32 yTexelInTargetSquare_offset(0);
         U32 yTexelInSquare_offset(0);

         for(U32 yLumelInSquare(0); yLumelInSquare < LUMELS_PER_SQUARE_EDGE; ++yLumelInSquare)
         {
            next_yInLightmap = (next_yInLightmap + 1) & LUMELS_PER_TILE_EDGE_MASK;
            U32 next_yInLightmap_offset(next_yInLightmap << LUMELS_PER_TILE_EDGE_LOG2);

            U32 xInLightmap(base_xInLightmap);
            U32 xTexelInSquare_offset = 0;

            for(U32 xLumelInSquare(0); xLumelInSquare < LUMELS_PER_SQUARE_EDGE; ++xLumelInSquare)
            {
               U32 next_xInLightmap((xInLightmap + 1) & LUMELS_PER_TILE_EDGE_MASK);
               U32 texelInTargetSquare_offset = yTexelInTargetSquare_offset + xTexelInSquare_offset;
               U32 texelInSquare_offset = yTexelInSquare_offset + xTexelInSquare_offset;

               // lumels are secret parameters to subroutines
               lumels[0] = U32(lmap[xInLightmap | yInLightmap_offset]);
               lumels[1] = U32(lmap[next_xInLightmap | yInLightmap_offset]);
               lumels[2] = U32(lmap[xInLightmap | next_yInLightmap_offset]);
               lumels[3] = U32(lmap[next_xInLightmap | next_yInLightmap_offset]);

               PROFILE_START(BlenderInASM);
#if defined(BLENDER_USE_ASM)
               if ( targetTexelsPerLumel > 1 )
               {
                  doLumelPlus1Mip( &bits0[ texelInTargetSquare_offset ],
                                   &bits1[ (yTexelInTargetSquare_offset >> 2) + (xTexelInSquare_offset >> 1) ],
                                   &bufferToLightFrom[ texelInSquare_offset ] );
               }
               else
                  do1x1Lumel( &bits0[ texelInTargetSquare_offset ], &bufferToLightFrom[ texelInSquare_offset ] );
#else
               // Split the LUMELs into colors
               U32 col[3][4];

               U32 i;
               for(i = 0; i < 4; i++)
               {
                  col[2][i] = (lumels[i]) & (0x1f << 11);
                  col[1][i] = (lumels[i] << 5) & (0x1f << 11);
                  col[0][i] = (lumels[i] << 10) & (0x1f << 11);
               }

               // One for each color component
               U32 left_component_delta[3];
               U32 right_component_delta[3];
               U32 vscan_left_component[3];
               U32 vscan_right_component[3];

               for(i = 0; i < 3; i++)
               {
                  left_component_delta[i] = (col[i][2] - col[i][0]) >> targetTexelsPerLumel_log2;
                  right_component_delta[i] = (col[i][3] - col[i][1]) >> targetTexelsPerLumel_log2;

                  vscan_left_component[i] = col[i][0];
                  vscan_right_component[i] = col[i][1];
               }

               // Now we interpolate the color shifts across the square
               for(U32 yTexelInLumel = 0; yTexelInLumel < targetTexelsPerLumel; yTexelInLumel++)
               {
                  U32 across_component_delta[3];
                  U32 hscan_component[3];

                  for(i = 0; i < 3; i++)
                  {
                     across_component_delta[i] = (vscan_right_component[i] - vscan_left_component[i]) >> targetTexelsPerLumel_log2;
                     hscan_component[i] = vscan_left_component[i];
                     vscan_left_component[i] += left_component_delta[i];
                     vscan_right_component[i] += right_component_delta[i];
                  }

                  U16 *dstbits = &bits0[ texelInTargetSquare_offset ];
                  const U8 *srcbits = (U8 *)&bufferToLightFrom[ texelInSquare_offset ];

                  for(U32 xTexelInLumel = 0; xTexelInLumel < targetTexelsPerLumel; xTexelInLumel++)
                  {
                     PROFILE_START(BlendInnermost);
                     U16 dstcol[3];

                     for(i = 0; i < 3; i++) // Unroll this dumb loop?
                     {
                        U32 index = (hscan_component[i] >> 2) & 0x3F00;
                        dstcol[i] = alphaTable[index | srcbits[i]];
                        hscan_component[i] += across_component_delta[i];
                     }

                     // Apply the light at 2x and saturate.
                     const U16 max = 255;
                     dstcol[0] += dstcol[0];
                     dstcol[1] += dstcol[1];
                     dstcol[2] += dstcol[2];
                     dstcol[0] = (dstcol[0] > max) ? max : dstcol[0];
                     dstcol[1] = (dstcol[1] > max) ? max : dstcol[1];
                     dstcol[2] = (dstcol[2] > max) ? max : dstcol[2];

#if SRC_IS_ABGR
                     // NOTE that on Mac, color order is flipped (ABGR1555 instead of RGBA5551), so:
                     // 1. we already reversed color order via BIG_ENDIAN indexing above, but
                     // 2. we need to change the shifts for alpha being the high bit instead of the low.
                     *dstbits++ = ((dstcol[0] & 0xf8) << 7) | ((dstcol[1] & 0xf8) << 2) | ((dstcol[2] & 0xf8) >> 3);
#else
                     *dstbits++ = ((dstcol[0] & 0xf8) << 8) | ((dstcol[1] & 0xf8) << 3) | ((dstcol[2] & 0xf8) >> 2);
#endif
                     srcbits += 4;
                     PROFILE_END();
                  }

                  texelInTargetSquare_offset += yStrideThroughTarget;
                  texelInSquare_offset += yStrideThroughSquare;
               }
#endif
               PROFILE_END();
               xTexelInSquare_offset += xStrideAcrossLumels;
               xInLightmap = next_xInLightmap;
            }
            yInLightmap_offset = next_yInLightmap_offset;
            yTexelInTargetSquare_offset += yStrideThroughTargetAcrossLumels;
            yTexelInSquare_offset += yStrideThroughSquareAcrossLumels;
         }
         // end of lighting.
      }
   }

#if defined(BLENDER_USE_ASM)
   if ( targetTexelsPerLumel > 1 )
   {
      cheatmips( destmips[1], destmips[2], destmips[3], 64 );
      cheatmips( destmips[3], destmips[4], destmips[5], 16 );
      cheatmips4x4( destmips[5], destmips[6], destmips[7] );
   }
   else
#endif
      extrude( destmips, TEXELS_PER_TARGET_EDGE_LOG2 );
   PROFILE_END();
}

/// dc - note that this takes a 24-bit texture and places it in 32-bit space,
/// probably for better alignment/access.
void Blender::addSourceTexture( int textureIndex, const U8 **bmps )
{
   int texelsPerSquareEdge = MAX_TEXELS_PER_SQUARE_EDGE;

   for ( int mipLevel = 0; mipLevel < num_mip_levels; mipLevel++, texelsPerSquareEdge >>= 1)
   {
      U32 *dst = bmpdata[ mipLevel * num_src_bmps + textureIndex ];
      const U8 *rowPtr = bmps[ mipLevel ];
      const U32 texelStride(3);
      const U32 colStride(texelsPerSquareEdge * texelStride);
      const U32 blkStride(colStride * SQUARES_PER_MIPMAP_EDGE);
      const U32 rowStride(texelsPerSquareEdge * blkStride);

      // copy the bmp data over, changing the format so each block
      // is contiguous.
      for ( int row = 0; row < SQUARES_PER_MIPMAP_EDGE; row++, rowPtr += rowStride)
      {
         const U8* colPtr = rowPtr;

         for ( int col = 0; col < SQUARES_PER_MIPMAP_EDGE; col++, colPtr += colStride)
         {
            const U8 *blkPtr = colPtr;

            for ( int py = 0; py < texelsPerSquareEdge; py++, blkPtr += blkStride)
            {
               const U8* texelPtr = blkPtr;
               for ( int px = 0; px < texelsPerSquareEdge; px++, texelPtr += texelStride )
#if SRC_IS_ABGR
                  *dst++ = (U32(texelPtr[0]) << 24) | (U32(texelPtr[1]) << 16) | (U32(texelPtr[2]) << 8);
#else
                  *dst++ = texelPtr[0] | (U32(texelPtr[1]) << 8) | (U32(texelPtr[2]) << 16);
#endif
            }
         }
      }
   }
}
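
// Layout sketch: after this copy, each mip level is stored square-by-square
// rather than scanline-by-scanline, i.e. conceptually
//
//    U32 mip[SQUARES_PER_MIPMAP_EDGE][SQUARES_PER_MIPMAP_EDGE]
//           [texelsPerSquareEdge][texelsPerSquareEdge];
//
// so the doSquare*() blenders can walk one square's texels as a contiguous run.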

#define CACHE_ROUND_SHIFT  12
#define CACHE_ROUND_ADJUST ((1 << CACHE_ROUND_SHIFT) - 1)
#define CACHE_ROUND_MASK   (~CACHE_ROUND_ADJUST)
#define DWORD_STAGGER      0

static U32 *round_to_cache_start( U32 *ptr )
{
   return ( (U32 *) ((dsize_t(ptr) + CACHE_ROUND_ADJUST) & CACHE_ROUND_MASK) );
}
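
// Example: with CACHE_ROUND_SHIFT == 12 the allocation below is rounded up to
// the next 4 KiB boundary, e.g. a pointer at 0x10234 becomes 0x11000, while
// an already aligned pointer is returned unchanged.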

Blender::Blender( int num_src, int num_mips, U8 **alphas )
{
   int bmps_size = MAX_TEXELS_PER_SQUARE; // blending buffer (1 square)
   int mip_size = TEXELS_PER_SOURCE_BMP;
   int i, j;

   alpha_data = new U8*[num_src];
   for (i = 0; i < num_src; i++)
      alpha_data[i] = alphas[i];

   num_src_bmps = num_src;
   num_mip_levels = num_mips;

   bmpdata = new U32*[ num_src * num_mips ];

   // Total U32s needed: the blend buffer plus every mip of every source bitmap.
   for ( i = 0; i < num_mips; i++ )
   {
      bmps_size += (mip_size + DWORD_STAGGER) * num_src;
      mip_size >>= 2;
   }

   bmp_alloc_ptr = new U32[ bmps_size + CACHE_ROUND_ADJUST ];
   U32 *bmps = round_to_cache_start( bmp_alloc_ptr );

   // buffer that we'll be blending into, and lighting out of.
   blendbuffer = bmps;

   U32 *curbmp = blendbuffer + MAX_TEXELS_PER_SQUARE;
   int bmp_size = TEXELS_PER_SOURCE_BMP;
   int bmpnum = 0;

   // initialize pointers into buffer for source textures.
   for ( j = 0; j < num_mips; j++ )
   {
      for ( i = 0; i < num_src; i++ )
      {
         bmpdata[ bmpnum ] = curbmp;

         curbmp += (bmp_size + DWORD_STAGGER);
         bmpnum++;
      }

      bmp_size >>= 2;
   }
}

Blender::~Blender()
{
   if ( bmp_alloc_ptr )
      delete [] bmp_alloc_ptr;

   if ( bmpdata )
      delete [] bmpdata;

   if ( alpha_data )
      delete [] alpha_data;
}