554 lines
17 KiB
C++
Executable File
554 lines
17 KiB
C++
Executable File
//-----------------------------------------------------------------------------
|
|
// Torque Game Engine
|
|
// Copyright (C) GarageGames.com, Inc.
|
|
//-----------------------------------------------------------------------------
|
|
|
|
#include "core/unicode.h"
|
|
#include "core/frameAllocator.h"
|
|
#include "platform/profiler.h"
|
|
#include <stdio.h>
|
|
|
|
//-----------------------------------------------------------------------------
|
|
/// replacement character. Standard correct value is 0xFFFD.
|
|
/// Substituted by the converters in this file for anything they refuse to pass
/// through: malformed UTF-8, surrogate code points, and code points above the BMP.
#define kReplacementChar 0xFFFD
|
|
|
|
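/// Look up table. Shift a byte >> 1, then look up the total length in bytes of the
/// UTF-8 sequence that byte starts (the lead byte itself included).
/// Contains 0 for bytes that cannot start a sequence: trailing bytes and 0xFE / 0xFF.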
|
|
#pragma mark U8 firstByteLUT
|
|
U8 firstByteLUT[128] =
{
   // Index is (byte >> 1), so each entry covers two adjacent byte values.

   // bytes 0x00-0x7F : single byte ascii
   1, 1, 1, 1, 1, 1, 1, 1,   // index 0x00-0x07
   1, 1, 1, 1, 1, 1, 1, 1,   // index 0x08-0x0F
   1, 1, 1, 1, 1, 1, 1, 1,   // index 0x10-0x17
   1, 1, 1, 1, 1, 1, 1, 1,   // index 0x18-0x1F
   1, 1, 1, 1, 1, 1, 1, 1,   // index 0x20-0x27
   1, 1, 1, 1, 1, 1, 1, 1,   // index 0x28-0x2F
   1, 1, 1, 1, 1, 1, 1, 1,   // index 0x30-0x37
   1, 1, 1, 1, 1, 1, 1, 1,   // index 0x38-0x3F

   // bytes 0x80-0xBF : trailing utf8, never legal as a first byte
   0, 0, 0, 0, 0, 0, 0, 0,   // index 0x40-0x47
   0, 0, 0, 0, 0, 0, 0, 0,   // index 0x48-0x4F
   0, 0, 0, 0, 0, 0, 0, 0,   // index 0x50-0x57
   0, 0, 0, 0, 0, 0, 0, 0,   // index 0x58-0x5F

   // bytes 0xC0-0xDF : first of 2
   2, 2, 2, 2, 2, 2, 2, 2,   // index 0x60-0x67
   2, 2, 2, 2, 2, 2, 2, 2,   // index 0x68-0x6F

   // bytes 0xE0-0xEF : first of 3
   3, 3, 3, 3, 3, 3, 3, 3,   // index 0x70-0x77

   // bytes 0xF0-0xF7 : first of 4, 0xF8-0xFB : first of 5,
   // 0xFC-0xFD : first of 6, 0xFE-0xFF : illegal in utf-8
   4, 4, 4, 4, 5, 5, 6, 0,   // index 0x78-0x7F
};
|
|
|
|
/// Look up table. Shift a 16-bit word >> 10, then look up whether it is a surrogate,
|
|
/// and which part. 0 means non-surrogate, 1 means 1st in pair, 2 means 2nd in pair.
|
|
#pragma mark U8 surrogateLUT
|
|
U8 surrogateLUT[64] =
{
   // Index is (code unit >> 10), so each entry covers a run of 0x400 code units.
   0, 0, 0, 0, 0, 0, 0, 0,   // index 0x00-0x07
   0, 0, 0, 0, 0, 0, 0, 0,   // index 0x08-0x0F
   0, 0, 0, 0, 0, 0, 0, 0,   // index 0x10-0x17
   0, 0, 0, 0, 0, 0, 0, 0,   // index 0x18-0x1F
   0, 0, 0, 0, 0, 0, 0, 0,   // index 0x20-0x27
   0, 0, 0, 0, 0, 0, 0, 0,   // index 0x28-0x2F
   0, 0, 0, 0, 0, 0, 1, 2,   // index 0x30-0x37 : 0x36 = 0xD800.. (1st in pair), 0x37 = 0xDC00.. (2nd in pair)
   0, 0, 0, 0, 0, 0, 0, 0,   // index 0x38-0x3F
};
|
|
|
|
/// Look up table. Feed value from firstByteLUT in, gives you
|
|
/// the mask for the data bits of that UTF-8 code unit.
|
|
#pragma mark U8 byteMask8LUT[]
|
|
// Index 0 is the mask for a trailing byte; index N (1..6) is the mask for the
// first byte of an N byte sequence.
U8 byteMask8LUT[] = { 0x3f, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; // data bits kept per entry: 6, 7, 5, 4, 3, 2, 1
|
|
|
|
/// Mask for the data bits of a UTF-16 surrogate.
|
|
#pragma mark U8 byteMaskLow10
|
|
// Selects the low 10 bits of a code unit; applied to both halves of a surrogate pair.
U16 byteMaskLow10 = 0x03ff;
|
|
|
|
#pragma mark -
|
|
//-----------------------------------------------------------------------------
|
|
/// Returns true when codepoint lies in the UTF-16 surrogate range U+D800..U+DFFF.
/// The end points are surrogates themselves, so both comparisons are inclusive.
inline bool isSurrogateRange(U32 codepoint)
{
   return ( 0xD800 <= codepoint && codepoint <= 0xDFFF );
}
|
|
|
|
/// Returns true when codepoint is beyond the Basic Multilingual Plane,
/// i.e. it does not fit in 16 bits.
inline bool isAboveBMP(U32 codepoint)
{
   return ( codepoint >= 0x10000 );
}
|
|
|
|
#pragma mark -
|
|
//-----------------------------------------------------------------------------
|
|
/// Converts a null terminated UTF-8 string to UTF-16 inside a caller supplied buffer.
///
/// @param unistring  Null terminated UTF-8 input.
/// @param outbuffer  Receives the UTF-16 code units.
/// @param len        Capacity of outbuffer in UTF-16 code units, terminator included.
/// @return Number of code units written, not counting the terminator.
///
/// Output that does not fit is truncated. The result is always null terminated,
/// except when len is 0, in which case outbuffer is not touched at all.
const U32 convertUTF8toUTF16(const UTF8 *unistring, UTF16 *outbuffer, U32 len)
{
   // No room for even the terminator: nothing can be written safely.
   if(len == 0)
      return 0;

   PROFILE_START(convertUTF8toUTF16);

   // Keep the last slot free so the terminator always lands inside the buffer.
   const U32 maxUnits = len - 1;
   U32 nCodepoints = 0;

   while(*unistring != '\0' && nCodepoints < maxUnits)
   {
      // The decoder overwrites walked with the number of bytes it consumed.
      U32 walked = 1;
      const UTF32 middleman = oneUTF8toUTF32(unistring, &walked);
      outbuffer[nCodepoints++] = oneUTF32toUTF16(middleman);
      unistring += walked;
   }

   outbuffer[nCodepoints] = 0;

   PROFILE_END();
   return nCodepoints;
}
|
|
|
|
//-----------------------------------------------------------------------------
|
|
/// Converts a null terminated UTF-8 string to UTF-32 inside a caller supplied buffer.
///
/// @param unistring  Null terminated UTF-8 input.
/// @param outbuffer  Receives the UTF-32 code points.
/// @param len        Capacity of outbuffer in UTF-32 units, terminator included.
/// @return Number of code points written, not counting the terminator.
///
/// Output that does not fit is truncated. The result is always null terminated,
/// except when len is 0, in which case outbuffer is not touched at all.
const U32 convertUTF8toUTF32(const UTF8 *unistring, UTF32 *outbuffer, U32 len)
{
   // No room for even the terminator: nothing can be written safely.
   if(len == 0)
      return 0;

   PROFILE_START(convertUTF8toUTF32);

   // Keep the last slot free so the terminator always lands inside the buffer.
   const U32 maxUnits = len - 1;
   U32 nCodepoints = 0;

   while(*unistring != '\0' && nCodepoints < maxUnits)
   {
      // The decoder overwrites walked with the number of bytes it consumed.
      U32 walked = 1;
      outbuffer[nCodepoints++] = oneUTF8toUTF32(unistring, &walked);
      unistring += walked;
   }

   outbuffer[nCodepoints] = 0;

   PROFILE_END();
   return nCodepoints;
}
|
|
|
|
//-----------------------------------------------------------------------------
|
|
/// Converts a null terminated UTF-16 string to UTF-8 inside a caller supplied buffer.
///
/// @param unistring  Null terminated UTF-16 input.
/// @param outbuffer  Receives the UTF-8 bytes.
/// @param len        Capacity of outbuffer in bytes, terminator included.
/// @return Number of bytes written, not counting the terminator.
///
/// A code point is only converted while 3 bytes plus the terminator still fit,
/// so the output may stop short of len. The result is always null terminated,
/// except when len is 0, in which case outbuffer is not touched at all.
const U32 convertUTF16toUTF8( const UTF16 *unistring, UTF8 *outbuffer, U32 len)
{
   // No room for even the terminator: nothing can be written safely.
   if(len == 0)
      return 0;

   PROFILE_START(convertUTF16toUTF8);

   U32 nCodeunits = 0;

   // Written as an addition on purpose: "len - 3" wraps around to a huge value
   // for len < 3 because U32 is unsigned, which would disable the bounds check.
   while(*unistring != 0 && nCodeunits + 3 < len)
   {
      // The decoder overwrites walked with the number of UTF-16 units it consumed.
      U32 walked = 1;
      const UTF32 middleman = oneUTF16toUTF32(unistring, &walked);
      nCodeunits += oneUTF32toUTF8(middleman, &outbuffer[nCodeunits]);
      unistring += walked;
   }

   outbuffer[nCodeunits] = '\0';

   PROFILE_END();
   return nCodeunits;
}
|
|
|
|
//-----------------------------------------------------------------------------
|
|
/// Converts a null terminated UTF-16 string to UTF-32 inside a caller supplied buffer.
///
/// @param unistring  Null terminated UTF-16 input.
/// @param outbuffer  Receives the UTF-32 code points.
/// @param len        Capacity of outbuffer in UTF-32 units, terminator included.
/// @return Number of code points written, not counting the terminator.
///
/// Output that does not fit is truncated. The result is always null terminated,
/// except when len is 0, in which case outbuffer is not touched at all.
const U32 convertUTF16toUTF32(const UTF16 *unistring, UTF32 *outbuffer, U32 len)
{
   // No room for even the terminator: nothing can be written safely.
   if(len == 0)
      return 0;

   PROFILE_START(convertUTF16toUTF32);

   // Keep the last slot free so the terminator always lands inside the buffer.
   const U32 maxUnits = len - 1;
   U32 nCodepoints = 0;

   while(*unistring != 0 && nCodepoints < maxUnits)
   {
      // The decoder overwrites walked with the number of UTF-16 units it consumed.
      U32 walked = 1;
      outbuffer[nCodepoints++] = oneUTF16toUTF32(unistring, &walked);
      unistring += walked;
   }

   outbuffer[nCodepoints] = 0;

   PROFILE_END();
   return nCodepoints;
}
|
|
|
|
//-----------------------------------------------------------------------------
|
|
/// Converts a null terminated UTF-32 string to UTF-8 inside a caller supplied buffer.
///
/// @param unistring  Null terminated UTF-32 input.
/// @param outbuffer  Receives the UTF-8 bytes.
/// @param len        Capacity of outbuffer in bytes, terminator included.
/// @return Number of bytes written, not counting the terminator.
///
/// A code point is only converted while 3 bytes plus the terminator still fit,
/// so the output may stop short of len. The result is always null terminated,
/// except when len is 0, in which case outbuffer is not touched at all.
const U32 convertUTF32toUTF8( const UTF32 *unistring, UTF8 *outbuffer, U32 len)
{
   // No room for even the terminator: nothing can be written safely.
   if(len == 0)
      return 0;

   PROFILE_START(convertUTF32toUTF8);

   U32 nCodeunits = 0;

   // Written as an addition on purpose: "len - 3" wraps around to a huge value
   // for len < 3 because U32 is unsigned, which would disable the bounds check.
   while(*unistring != 0 && nCodeunits + 3 < len)
   {
      nCodeunits += oneUTF32toUTF8(*unistring, &outbuffer[nCodeunits]);
      unistring++;
   }

   outbuffer[nCodeunits] = '\0';

   PROFILE_END();
   return nCodeunits;
}
|
|
|
|
//-----------------------------------------------------------------------------
|
|
/// Converts a null terminated UTF-32 string to UTF-16 inside a caller supplied buffer.
///
/// @param unistring  Null terminated UTF-32 input.
/// @param outbuffer  Receives the UTF-16 code units.
/// @param len        Capacity of outbuffer in UTF-16 code units, terminator included.
/// @return Number of code units written, not counting the terminator.
///
/// Output that does not fit is truncated. The result is always null terminated,
/// except when len is 0, in which case outbuffer is not touched at all.
const U32 convertUTF32toUTF16(const UTF32 *unistring, UTF16 *outbuffer, U32 len)
{
   // No room for even the terminator: nothing can be written safely.
   if(len == 0)
      return 0;

   PROFILE_START(convertUTF32toUTF16);

   // Keep the last slot free so the terminator always lands inside the buffer.
   const U32 maxUnits = len - 1;
   U32 nCodepoints = 0;

   while(*unistring != 0 && nCodepoints < maxUnits)
   {
      outbuffer[nCodepoints++] = oneUTF32toUTF16(*unistring);
      unistring++;
   }

   outbuffer[nCodepoints] = 0;

   PROFILE_END();
   return nCodepoints;
}
|
|
|
|
#pragma mark -
|
|
//-----------------------------------------------------------------------------
|
|
// Functions that convert buffers of unicode code points
|
|
//-----------------------------------------------------------------------------
|
|
/// Converts a null terminated UTF-8 string into a newly allocated UTF-16 string.
///
/// @param unistring  Null terminated UTF-8 input.
/// @return Null terminated UTF-16 array allocated with new []; release it with delete [].
const UTF16* convertUTF8toUTF16( const UTF8* unistring)
{
   PROFILE_START(convertUTF8toUTF16);

   // Every output unit consumes at least one input byte, so the byte length is an
   // upper bound on the unit count. The extra slot holds the terminator.
   const U32 capacity = dStrlen(unistring) + 1;
   FrameTemp<UTF16> buf(capacity);

   // perform conversion; +1 accounts for the terminator the converter appended.
   const U32 nUnits = convertUTF8toUTF16( unistring, buf, capacity) + 1;

   // allocate the return array (not a single element) and copy by byte size.
   UTF16 *ret = new UTF16[nUnits];
   dMemcpy(ret, buf, nUnits * sizeof(UTF16));

   PROFILE_END();
   return ret;
}
|
|
|
|
//-----------------------------------------------------------------------------
|
|
/// Converts a null terminated UTF-8 string into a newly allocated UTF-32 string.
///
/// @param unistring  Null terminated UTF-8 input.
/// @return Null terminated UTF-32 array allocated with new []; release it with delete [].
const UTF32* convertUTF8toUTF32( const UTF8* unistring)
{
   PROFILE_START(convertUTF8toUTF32);

   // Every output code point consumes at least one input byte, so the byte length
   // is an upper bound on the count. The extra slot holds the terminator.
   const U32 capacity = dStrlen(unistring) + 1;
   FrameTemp<UTF32> buf(capacity);

   // perform conversion; +1 accounts for the terminator the converter appended.
   const U32 nUnits = convertUTF8toUTF32( unistring, buf, capacity) + 1;

   // allocate the return array (not a single element) and copy by byte size.
   UTF32 *ret = new UTF32[nUnits];
   dMemcpy(ret, buf, nUnits * sizeof(UTF32));

   PROFILE_END();
   return ret;
}
|
|
|
|
|
|
//-----------------------------------------------------------------------------
|
|
/// Converts a null terminated UTF-16 string into a newly allocated UTF-8 string.
///
/// @param unistring  Null terminated UTF-16 input.
/// @return Null terminated UTF-8 array allocated with new []; release it with delete [].
const UTF8* convertUTF16toUTF8( const UTF16* unistring)
{
   PROFILE_START(convertUTF16toUTF8);

   // Up to 3 bytes per input code unit, plus one byte for the terminator. The
   // buffer converter only starts a code point while 3 bytes and the terminator
   // still fit, so without the extra byte the last character could be dropped.
   const U32 capacity = dStrlen(unistring) * 3 + 1;
   FrameTemp<UTF8> buf(capacity);

   // perform conversion; +1 accounts for the terminator the converter appended.
   const U32 nUnits = convertUTF16toUTF8( unistring, buf, capacity) + 1;

   // allocate the return array (not a single element) and copy by byte size.
   UTF8 *ret = new UTF8[nUnits];
   dMemcpy(ret, buf, nUnits * sizeof(UTF8));

   PROFILE_END();
   return ret;
}
|
|
|
|
//-----------------------------------------------------------------------------
|
|
/// Converts a null terminated UTF-16 string into a newly allocated UTF-32 string.
///
/// @param unistring  Null terminated UTF-16 input.
/// @return Null terminated UTF-32 array allocated with new []; release it with delete [].
const UTF32* convertUTF16toUTF32(const UTF16* unistring)
{
   PROFILE_START(convertUTF16toUTF32);

   // Every output code point consumes at least one input unit, so the input length
   // is an upper bound on the count. The extra slot holds the terminator.
   const U32 capacity = dStrlen(unistring) + 1;
   FrameTemp<UTF32> buf(capacity);

   // perform conversion; +1 accounts for the terminator the converter appended.
   const U32 nUnits = convertUTF16toUTF32( unistring, buf, capacity) + 1;

   // allocate the return array (not a single element) and copy by byte size.
   UTF32 *ret = new UTF32[nUnits];
   dMemcpy(ret, buf, nUnits * sizeof(UTF32));

   PROFILE_END();
   return ret;
}
|
|
|
|
//-----------------------------------------------------------------------------
|
|
/// Converts a null terminated UTF-32 string into a newly allocated UTF-8 string.
///
/// @param unistring  Null terminated UTF-32 input.
/// @return Null terminated UTF-8 array allocated with new []; release it with delete [].
const UTF8* convertUTF32toUTF8( const UTF32* unistring)
{
   PROFILE_START(convertUTF32toUTF8);

   // Up to 3 bytes per input code point, plus one byte for the terminator. The
   // buffer converter only starts a code point while 3 bytes and the terminator
   // still fit, so without the extra byte the last character could be dropped.
   const U32 capacity = dStrlen(unistring) * 3 + 1;
   FrameTemp<UTF8> buf(capacity);

   // perform conversion; +1 accounts for the terminator the converter appended.
   const U32 nUnits = convertUTF32toUTF8( unistring, buf, capacity) + 1;

   // allocate the return array (not a single element) and copy by byte size.
   UTF8 *ret = new UTF8[nUnits];
   dMemcpy(ret, buf, nUnits * sizeof(UTF8));

   PROFILE_END();
   return ret;
}
|
|
|
|
//-----------------------------------------------------------------------------
|
|
/// Converts a null terminated UTF-32 string into a newly allocated UTF-16 string.
///
/// @param unistring  Null terminated UTF-32 input.
/// @return Null terminated UTF-16 array allocated with new []; release it with delete [].
const UTF16* convertUTF32toUTF16(const UTF32* unistring)
{
   PROFILE_START(convertUTF32toUTF16);

   // The converter emits exactly one UTF-16 unit per input code point.
   // The extra slot holds the terminator.
   const U32 capacity = dStrlen(unistring) + 1;
   FrameTemp<UTF16> buf(capacity);

   // perform conversion; +1 accounts for the terminator the converter appended.
   const U32 nUnits = convertUTF32toUTF16( unistring, buf, capacity) + 1;

   // allocate the return array (not a single element) and copy by byte size.
   UTF16 *ret = new UTF16[nUnits];
   dMemcpy(ret, buf, nUnits * sizeof(UTF16));

   PROFILE_END();
   return ret;
}
|
|
|
|
#pragma mark -
|
|
//-----------------------------------------------------------------------------
|
|
// Functions that convert one unicode codepoint at a time
|
|
//-----------------------------------------------------------------------------
|
|
/// Decodes the UTF-8 sequence starting at codepoint into a single UTF-32 value.
///
/// @param codepoint    Points at the first byte of the sequence.
/// @param unitsWalked  Optional. Receives the number of bytes consumed; 1 when the
///                     sequence was malformed, so the caller resynchronises on the
///                     very next byte.
/// @return The decoded code point, or kReplacementChar for malformed input,
///         surrogate code points and anything above the BMP.
const UTF32 oneUTF8toUTF32( const UTF8* codepoint, U32 *unitsWalked)
{
   PROFILE_START(oneUTF8toUTF32);

   // Classify the first byte: the table yields the full sequence length,
   // or 0 when this byte cannot begin a sequence.
   const unsigned char lead = codepoint[0];
   U32 seqLen = firstByteLUT[lead >> 1];
   UTF32 decoded = kReplacementChar;

   if(seqLen == 0)
   {
      // Got a medial or an illegal byte. Emit the replacement and claim a single
      // byte, so a run of garbage becomes a run of replacements instead of
      // swallowing the next good character.
      seqLen = 1;
   }
   else
   {
      // Payload bits of the first byte.
      decoded = byteMask8LUT[seqLen] & lead;

      // Fold in the trailing bytes, 6 bits apiece.
      for(U32 idx = 1; idx < seqLen; idx++)
      {
         const U8 trail = codepoint[idx];
         if( firstByteLUT[trail >> 1] != 0 )
         {
            // Did not get a medial where one was required. Same recovery as above.
            decoded = kReplacementChar;
            seqLen = 1;
            break;
         }
         decoded = (decoded << 6) | (trail & 0x3f);
      }
   }

   if(unitsWalked != NULL)
      *unitsWalked = seqLen;

   // Surrogate code points are illegal. Code points above the BMP were read in full
   // so they do not foul the byte stream, but they are dropped here as well.
   if(isSurrogateRange(decoded) || isAboveBMP(decoded))
      decoded = kReplacementChar;

   PROFILE_END();
   return decoded;
}
|
|
|
|
//-----------------------------------------------------------------------------
|
|
/// Decodes the UTF-16 sequence starting at codepoint into a single UTF-32 value.
///
/// @param codepoint    Points at the first code unit of the sequence.
/// @param unitsWalked  Optional. Receives the number of code units consumed:
///                     2 for a well formed surrogate pair, otherwise 1.
/// @return The decoded code point, or kReplacementChar for unpaired surrogates
///         and for anything above the BMP.
const UTF32 oneUTF16toUTF32(const UTF16* codepoint, U32 *unitsWalked)
{
   PROFILE_START(oneUTF16toUTF32);

   // Start from the error outcome; the valid cases below overwrite it.
   U32 unitCount = 1;
   UTF32 ret = kReplacementChar;

   const UTF16 codeunit1 = codepoint[0];

   switch(surrogateLUT[codeunit1 >> 10])
   {
      case 0: // simple, the code unit is the code point
         ret = codeunit1;
         break;

      case 1: // leading surrogate, a trailing surrogate must follow
      {
         const UTF16 codeunit2 = codepoint[1];
         if( surrogateLUT[codeunit2 >> 10] == 2)
         {
            // A pair encodes (code point - 0x10000) as two 10 bit halves, so the
            // offset has to be added back. Without it a pair would decode to an
            // unrelated BMP character instead of its real value.
            ret = 0x10000 + ( ((UTF32)(codeunit1 & byteMaskLow10) << 10) | (codeunit2 & byteMaskLow10) );
            unitCount = 2;
         }
         // else: no trailing surrogate where one was expected, keep the replacement.
         break;
      }

      default: // 2: trailing surrogate with no leading one, keep the replacement.
         break;
   }

   if(unitsWalked != NULL)
      *unitsWalked = unitCount;

   // codepoints in the surrogate range are illegal, and should be replaced.
   if(isSurrogateRange(ret))
      ret = kReplacementChar;

   // codepoints outside the Basic Multilingual Plane add complexity to our UTF16
   // string classes. They were read correctly so they wont foul the stream, but
   // they are replaced here so they wont foul anything else.
   // NOTE: these are perfectly legal codepoints, we just dont want to deal with them.
   if(isAboveBMP(ret))
      ret = kReplacementChar;

   PROFILE_END();
   return ret;
}
|
|
|
|
//-----------------------------------------------------------------------------
|
|
/// Narrows a UTF-32 code point to a single UTF-16 code unit.
///
/// @param codepoint  The code point to convert.
/// @return codepoint itself when it is a non-surrogate BMP value, otherwise
///         kReplacementChar.
const UTF16 oneUTF32toUTF16(const UTF32 codepoint)
{
   // Everything past the BMP is rejected, whether it is a legal code point we
   // simply do not want to deal with or a value outside the codeable range.
   // Surrogate code points are illegal outright.
   const bool rejected = isAboveBMP(codepoint) || isSurrogateRange(codepoint);

   return rejected ? (UTF16)kReplacementChar : (UTF16)codepoint;
}
|
|
|
|
//-----------------------------------------------------------------------------
|
|
/// Encodes a single UTF-32 code point as UTF-8.
///
/// @param codepoint             The code point to encode. Surrogates and values
///                              above the BMP are encoded as kReplacementChar.
/// @param threeByteCodeunitBuf  Receives the encoded bytes; must have room for 3.
///                              No terminator is written.
/// @return Number of bytes written (1 to 3).
const U32 oneUTF32toUTF8(const UTF32 codepoint, UTF8 *threeByteCodeunitBuf)
{
   PROFILE_START(oneUTF32toUTF8);

   U32 bytecount = 0;
   U32 working = codepoint;

   //-----------------
   // Illegal (surrogate) and unwanted (above BMP) code points become the replacement.
   if(isSurrogateRange(working) || isAboveBMP(working))
      working = kReplacementChar;

   //-----------------
   if( working < (1 << 7))          // codeable in 7 bits
      bytecount = 1;
   else if( working < (1 << 11))    // codeable in 11 bits
      bytecount = 2;
   else if( working < (1 << 16))    // codeable in 16 bits
      bytecount = 3;

   AssertISV( bytecount > 0, "Error converting to UTF-8 in oneUTF32toUTF8(). isAboveBMP() should have caught this!");

   //-----------------
   // The marker bits of a byte are the complement of its data mask, shifted up by
   // one. The complement is narrowed to U8 before shifting so that no negative
   // value is ever left-shifted.
   U8 mask = byteMask8LUT[0];                         // 0011 1111
   U8 marker = (U8)( (U32)(U8)~mask << 1 );           // 1000 0000

   // Process the low order bytes, shifting the codepoint down 6 each pass.
   for( int i = (int)bytecount - 1; i > 0; i--)
   {
      threeByteCodeunitBuf[i] = marker | (working & mask);
      working >>= 6;
   }

   // Process the 1st byte. filter based on the # of expected bytes.
   mask = byteMask8LUT[bytecount];
   marker = (U8)( (U32)(U8)~mask << 1 );
   threeByteCodeunitBuf[0] = marker | (working & mask);

   PROFILE_END();
   return bytecount;
}
|
|
|
|
//-----------------------------------------------------------------------------
|
|
/// Counts the UTF-16 code units in unistring up to, but not including, the terminator.
const U32 dStrlen(const UTF16 *unistring)
{
   const UTF16 *cursor = unistring;

   // Walk forward to the terminating zero unit.
   while(*cursor != 0)
      ++cursor;

   return (U32)(cursor - unistring);
}
|
|
|
|
//-----------------------------------------------------------------------------
|
|
/// Counts the UTF-32 units in unistring up to, but not including, the terminator.
const U32 dStrlen(const UTF32 *unistring)
{
   const UTF32 *cursor = unistring;

   // Walk forward to the terminating zero unit.
   while(*cursor != 0)
      ++cursor;

   return (U32)(cursor - unistring);
}
|
|
|
|
|
|
/* alternate utf-8 decode impl for speed, no error checking,
|
|
left here for your amusement:
|
|
|
|
U32 codeunit = codepoint + expectedByteCount - 1;
|
|
U32 i = 0;
|
|
switch(expectedByteCount)
|
|
{
|
|
case 6: ret |= ( *(codeunit--) & 0x3f ); i++;
|
|
case 5: ret |= ( *(codeunit--) & 0x3f ) << (6 * i++);
|
|
case 4: ret |= ( *(codeunit--) & 0x3f ) << (6 * i++);
|
|
case 3: ret |= ( *(codeunit--) & 0x3f ) << (6 * i++);
|
|
case 2: ret |= ( *(codeunit--) & 0x3f ) << (6 * i++);
|
|
case 1: ret |= *(codeunit) & byteMask8LUT[expectedByteCount] << (6 * i);
|
|
}
|
|
*/
|