SNAP Library , User Reference  2013-01-07 14:03:36
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
TUniCodec Class Reference

#include <unicode.h>

List of all members.

Public Types

enum  { DefaultReplacementChar = 0xfffd }

Public Member Functions

 TUniCodec ()
 TUniCodec (TUnicodeErrorHandling errorHandling_, bool strict_, int replacementChar_, bool skipBom_)
template<typename TSrcVec , typename TDestCh >
size_t DecodeUtf8 (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
template<typename TSrcVec , typename TDestCh >
size_t DecodeUtf8 (const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true) const
template<typename TSrcVec , typename TDestCh >
size_t EncodeUtf8 (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
template<typename TSrcVec , typename TDestCh >
size_t EncodeUtf8 (const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true) const
template<typename TSrcVec >
TStr EncodeUtf8Str (const TSrcVec &src, size_t srcIdx, const size_t srcCount) const
template<typename TSrcVec >
TStr EncodeUtf8Str (const TSrcVec &src) const
template<typename TSrcVec , typename TDestCh >
size_t DecodeUtf16FromBytes (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const
template<typename TSrcVec , typename TDestCh >
size_t DecodeUtf16FromWords (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool clrDest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const
template<typename TSrcVec , typename TDestCh >
size_t EncodeUtf16ToWords (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const
template<typename TSrcVec , typename TDestCh >
size_t EncodeUtf16ToBytes (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const
void TestUtf8 ()
void TestUtf16 ()

Public Attributes

int replacementChar
TUnicodeErrorHandling errorHandling
bool strict
bool skipBom

Protected Types

enum  {
  DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0),
  DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0),
  DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0),
  DefineByte = (1, 0, 0, 0, 0, 0, 0, 0)
}
enum  { Utf16FirstSurrogate = 0xd800, Utf16SecondSurrogate = 0xdc00 }
typedef TUniVecIdx TVecIdx

Protected Member Functions

void TestUtf8 (bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV &src, const TIntV &expectedDest, FILE *f)
void TestDecodeUtf8 (TRnd &rnd, const TStr &testCaseDesc)
void WordsToBytes (const TIntV &src, TIntV &dest)
void TestUtf16 (bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV &src, const TIntV &expectedDest, const TUtf16BomHandling bomHandling, const TUniByteOrder defaultByteOrder, const bool insertBom, FILE *f)
void TestDecodeUtf16 (TRnd &rnd, const TStr &testCaseDesc, const TUtf16BomHandling bomHandling, const TUniByteOrder defaultByteOrder, const bool insertBom)

Static Protected Member Functions

static bool IsMachineLittleEndian ()
static uint GetRndUint (TRnd &rnd)
static uint GetRndUint (TRnd &rnd, uint minVal, uint maxVal)
static int SwapBytes (int x)

Friends

class TUniCaseFolding

Detailed Description

Definition at line 54 of file unicode.h.


Member Typedef Documentation

typedef TUniVecIdx TUniCodec::TVecIdx [protected]

Definition at line 118 of file unicode.h.


Member Enumeration Documentation

anonymous enum
Enumerator:
DefaultReplacementChar 

Definition at line 59 of file unicode.h.

{ DefaultReplacementChar = 0xfffd };
anonymous enum [protected]
Enumerator:
DefineByte 
DefineByte 
DefineByte 
DefineByte 
DefineByte 
DefineByte 
DefineByte 
DefineByte 
DefineByte 
DefineByte 
DefineByte 
DefineByte 
DefineByte 

Definition at line 101 of file unicode.h.

             {
#define DefineByte(b7, b6, b5, b4, b3, b2, b1, b0) _ ## b7 ## b6 ## b5 ## b4 ## _ ## b3 ## b2 ## b1 ## b0 = (b7 << 7) | (b6 << 6) | (b5 << 5) | (b4 << 4) | (b3 << 3) | (b2 << 2) | (b1 << 1) | b0
                DefineByte(1, 0, 0, 0, 0, 0, 0, 0),
                DefineByte(1, 1, 0, 0, 0, 0, 0, 0),
                DefineByte(1, 1, 1, 0, 0, 0, 0, 0),
                DefineByte(1, 1, 1, 1, 0, 0, 0, 0),
                DefineByte(1, 1, 1, 1, 1, 0, 0, 0),
                DefineByte(1, 1, 1, 1, 1, 1, 0, 0),
                DefineByte(1, 1, 1, 1, 1, 1, 1, 0),
                DefineByte(0, 0, 1, 1, 1, 1, 1, 1),
                DefineByte(0, 0, 0, 1, 1, 1, 1, 1),
                DefineByte(0, 0, 0, 0, 1, 1, 1, 1),
                DefineByte(0, 0, 0, 0, 0, 1, 1, 1),
                DefineByte(0, 0, 0, 0, 0, 0, 1, 1)
#undef DefineByte
        };
anonymous enum [protected]
Enumerator:
Utf16FirstSurrogate 
Utf16SecondSurrogate 

Definition at line 156 of file unicode.h.

             {
                Utf16FirstSurrogate = 0xd800,
                Utf16SecondSurrogate = 0xdc00
        };

Constructor & Destructor Documentation

TUniCodec::TUniCodec ( ) [inline]

Definition at line 91 of file unicode.h.

TUniCodec::TUniCodec ( TUnicodeErrorHandling  errorHandling_,
bool  strict_,
int  replacementChar_,
bool  skipBom_ 
) [inline]

Definition at line 95 of file unicode.h.

                                                                                                           :
                replacementChar(replacementChar_), errorHandling(errorHandling_), strict(strict_), skipBom(skipBom_)
        {
        }

Member Function Documentation

template<typename TSrcVec , typename TDestCh >
size_t TUniCodec::DecodeUtf16FromBytes ( const TSrcVec &  src,
size_t  srcIdx,
const size_t  srcCount,
TVec< TDestCh > &  dest,
const bool  clrDest,
const TUtf16BomHandling  bomHandling = bomAllowed,
const TUniByteOrder  defaultByteOrder = boMachineEndian 
) const

Definition at line 2205 of file unicode.h.

{
        IAssert(srcCount % 2 == 0);
        IAssert(bomHandling == bomAllowed || bomHandling == bomRequired || bomHandling == bomIgnored);
        IAssert(defaultByteOrder == boMachineEndian || defaultByteOrder == boBigEndian || defaultByteOrder == boLittleEndian);
        if (clrDest) dest.Clr();
        size_t nDecoded = 0;
        if (srcCount <= 0) return nDecoded;
        const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount;
        bool littleEndian = false;
  bool leDefault = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && IsMachineLittleEndian()));
        if (bomHandling == bomIgnored) littleEndian = leDefault;
        else if (bomHandling == bomAllowed || bomHandling == bomRequired)
        {
                int byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff;
                if (byte1 == 0xfe && byte2 == 0xff) { littleEndian = false; if (skipBom) srcIdx += 2; }
                else if (byte1 == 0xff && byte2 == 0xfe) { littleEndian = true; if (skipBom) srcIdx += 2; }
                else if (bomHandling == bomAllowed) littleEndian = leDefault;
                else { // Report an error.
                        switch (errorHandling) {
                        case uehThrow: throw TUnicodeException(srcIdx, byte1, "BOM expected at the beginning of the input vector (" + TInt::GetStr(byte1, "%02x") + " " + TInt::GetStr(byte2, "%02x") + " found instead).");
                        case uehAbort: case uehReplace: case uehIgnore: return size_t(-1);
                        default: Fail; } }
        }
        else Fail;
        while (srcIdx < srcEnd)
        {
                const size_t charSrcIdx = srcIdx;
                uint byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; srcIdx += 2;
                uint c = littleEndian ? (byte1 | (byte2 << 8)) : (byte2 | (byte1 << 8));
                if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023)
                {
                        // c is the first character in a surrogate pair.  Read the next character.
                        if (! (srcIdx + 2 <= srcEnd)) {
                                switch (errorHandling) {
                                case uehThrow: throw TUnicodeException(charSrcIdx, c, "The second character of a surrogate pair is missing.");
                                case uehAbort: return nDecoded;
                                case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
                                case uehIgnore: continue;
                                default: Fail; } }
                        uint byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; srcIdx += 2;
                        uint c2 = littleEndian ? (byte1 | (byte2 << 8)) : (byte2 | (byte1 << 8));
                        // c2 should be the second character of the surrogate pair.
                        if (c2 < Utf16SecondSurrogate || Utf16SecondSurrogate + 1023 < c2) {
                                switch (errorHandling) {
                                case uehThrow: throw TUnicodeException(charSrcIdx + 2, c2, "The second character of a surrogate pair should be in the range " + TInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." + TInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + ", not " + TInt::GetStr(c2, "04x") + ".");
                                case uehAbort: return nDecoded;
                                // with uehReplace and uehIgnore, we'll just skip the first character of the surrogate pair; we'll process the second one during the next iteration, this time as an ordinary character
                                case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx -= 2; continue;
                                case uehIgnore: srcIdx -= 2; continue;
                                default: Fail; } }
                        // c and c2 each contain 10 bits of information.
                        uint cc = ((c - Utf16FirstSurrogate) << 10) | (c2 - Utf16SecondSurrogate);
                        cc += 0x10000;
                        dest.Add(TDestCh(cc)); nDecoded++; continue;
                }
                else if (strict && Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) {
                        switch (errorHandling) {
                        case uehThrow: throw TUnicodeException(charSrcIdx, c, "This 16-bit value should be used only as the second character of a surrogate pair.");
                        case uehAbort: return nDecoded;
                        case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
                        case uehIgnore: continue;
                        default: Fail; } }
                // If 'c' is the first character in the input stream, and it's a BOM, we might have to skip it.
                if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom) continue;
                // Otherwise, store 'c' to the destination vector.
                dest.Add(TDestCh(c)); nDecoded++;
        }
        return nDecoded;
}
template<typename TSrcVec , typename TDestCh >
size_t TUniCodec::DecodeUtf16FromWords ( const TSrcVec &  src,
size_t  srcIdx,
const size_t  srcCount,
TVec< TDestCh > &  dest,
bool  clrDest,
const TUtf16BomHandling  bomHandling = bomAllowed,
const TUniByteOrder  defaultByteOrder = boMachineEndian 
) const

Definition at line 2289 of file unicode.h.

{
        IAssert(bomHandling == bomAllowed || bomHandling == bomRequired || bomHandling == bomIgnored);
        IAssert(defaultByteOrder == boMachineEndian || defaultByteOrder == boBigEndian || defaultByteOrder == boLittleEndian);
        if (clrDest) dest.Clr();
        size_t nDecoded = 0;
        if (srcCount <= 0) return nDecoded;
        const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount;
        bool swap = false;
  bool isMachineLe = IsMachineLittleEndian();
        bool isDefaultLe = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && isMachineLe));
        if (bomHandling == bomIgnored) swap = (isDefaultLe != isMachineLe);
        else if (bomHandling == bomAllowed || bomHandling == bomRequired)
        {
                int c = uint(src[TVecIdx(srcIdx)]) & 0xffff;
                if (c == 0xfeff) { swap = false; if (skipBom) srcIdx += 1; }
                else if (c == 0xfffe) { swap = true; if (skipBom) srcIdx += 1; }
                else if (bomHandling == bomAllowed) swap = (isMachineLe != isDefaultLe);
                else { // Report an error.
                        switch (errorHandling) {
                        case uehThrow: throw TUnicodeException(srcIdx, c, "BOM expected at the beginning of the input vector (" + TInt::GetStr(c, "%04x") + " found instead).");
                        case uehAbort: case uehReplace: case uehIgnore: return size_t(-1);
                        default: Fail; } }
        }
        else Fail;
        while (srcIdx < srcEnd)
        {
                const size_t charSrcIdx = srcIdx;
                uint c = uint(src[TVecIdx(srcIdx)]) & 0xffffu; srcIdx++;
                if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8);
                if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023)
                {
                        // c is the first character in a surrogate pair.  Read the next character.
                        if (! (srcIdx < srcEnd)) {
                                switch (errorHandling) {
                                case uehThrow: throw TUnicodeException(charSrcIdx, c, "The second character of a surrogate pair is missing.");
                                case uehAbort: return nDecoded;
                                case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
                                case uehIgnore: continue;
                                default: Fail; } }
                        uint c2 = uint(src[TVecIdx(srcIdx)]) & 0xffffu; srcIdx++;
                        if (swap) c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8);
                        // c2 should be the second character of the surrogate pair.
                        if (c2 < Utf16SecondSurrogate || Utf16SecondSurrogate + 1023 < c2) {
                                switch (errorHandling) {
                                case uehThrow: throw TUnicodeException(charSrcIdx + 1, c2, "The second character of a surrogate pair should be in the range " + TInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." + TInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + ", not " + TInt::GetStr(c2, "04x") + ".");
                                case uehAbort: return nDecoded;
                                // with uehReplace and uehIgnore, we'll just skip the first character of the surrogate pair; we'll process the second one during the next iteration, this time as an ordinary character
                                case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx -= 1; continue;
                                case uehIgnore: srcIdx -= 1; continue;
                                default: Fail; } }
                        // c and c2 each contain 10 bits of information.
                        uint cc = ((c - Utf16FirstSurrogate) << 10) | (c2 - Utf16SecondSurrogate);
                        cc += 0x10000;
                        dest.Add(TDestCh(cc)); nDecoded++; continue;
                }
                else if (strict && Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) {
                        switch (errorHandling) {
                        case uehThrow: throw TUnicodeException(charSrcIdx, c, "This 16-bit value should be used only as the second character of a surrogate pair.");
                        case uehAbort: return nDecoded;
                        case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
                        case uehIgnore: continue;
                        default: Fail; } }
                // If 'c' is the first character in the input stream, and it's a BOM, we might have to skip it.
                if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom) continue;
                // Otherwise, store 'c' to the destination vector.
                dest.Add(TDestCh(c)); nDecoded++;
        }
        return nDecoded;
}
template<typename TSrcVec , typename TDestCh >
size_t TUniCodec::DecodeUtf8 ( const TSrcVec &  src,
size_t  srcIdx,
const size_t  srcCount,
TVec< TDestCh > &  dest,
const bool  clrDest = true 
) const

Definition at line 2031 of file unicode.h.

{
        size_t nDecoded = 0;
        if (clrDest) dest.Clr();
        const size_t origSrcIdx = srcIdx;
        const size_t srcEnd = srcIdx + srcCount;
        while (srcIdx < srcEnd)
        {
                const size_t charSrcIdx = srcIdx;
                uint c = src[TVecIdx(srcIdx)] & 0xff; srcIdx++;
                if ((c & _1000_0000) == 0) {
                        // c is one of the characters 0..0x7f, encoded as a single byte.
                        dest.Add(TDestCh(c)); nDecoded++; continue; }
                else if ((c & _1100_0000) == _1000_0000) {
                        // No character in a valid UTF-8-encoded string should begin with a byte of the form 10xxxxxx.
                        // We must have been thrown into the middle of a multi-byte character.
                        switch (errorHandling) {
                        case uehThrow: throw TUnicodeException(charSrcIdx, c, "Invalid character: 10xxxxxx.");
                        case uehAbort: return nDecoded;
                        case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
                        case uehIgnore: continue;
                        default: Fail; } }
                else
                {
                        // c introduces a sequence of 2..6 bytes, depending on how many
                        // of the most significant bits of c are set.
                        uint nMoreBytes = 0, nBits = 0, minVal = 0;
                        if ((c & _1110_0000) == _1100_0000) nMoreBytes = 1, nBits = 5, minVal = 0x80;
                        else if ((c & _1111_0000) == _1110_0000) nMoreBytes = 2, nBits = 4, minVal = 0x800;
                        else if ((c & _1111_1000) == _1111_0000) nMoreBytes = 3, nBits = 3, minVal = 0x10000;
                        else if ((c & _1111_1100) == _1111_1000) nMoreBytes = 4, nBits = 2, minVal = 0x200000;
                        else if ((c & _1111_1110) == _1111_1100) nMoreBytes = 5, nBits = 1, minVal = 0x4000000;
                        else {
                                // c is of the form 1111111x, which is invalid even in the early definitions of UTF-8
                                // (which allowed the encoding of codepoints up to 2^31 - 1).  However, in principle this
                                // could be used to encode 32-bit integers with the msb set: 1aaabbbbccccddddeeeeffffgggghhhh
                                // could be encoded as 1111111a 10aabbbb 10ccccdd 10ddeeee 10ffffgg 10gghhhh.
                                if (strict)  {
                                        switch (errorHandling) {
                                        case uehThrow: throw TUnicodeException(charSrcIdx, c, "Invalid character: 1111111x.");
                                        case uehAbort: return nDecoded;
                                        // In the case of uehReplace and uehIgnore, we'll read the next 5 bytes
                                        // and try to decode the character.  Then, since 'strict' is true and
                                        // the codepoint is clearly >= 2^31, we'll notice this as an error later
                                        // and (in the case of uehReplace) insert a replacement character then.
                                        // This is probably better than inserting a replacement character right
                                        // away and then trying to read the next byte as if a new character
                                        // was beginning there -- if the current byte is really followed by five
                                        // 10xxxxxx bytes, we'll just get six replacement characters in a row.
                                        case uehReplace: break; //dest.Add(TDestCh(replacementChar)); continue;
                                        case uehIgnore: break; // continue;
                                        default: Fail; } }
                                nMoreBytes = 5; nBits = 2; minVal = 0x80000000u; }
                        // Decode this multi-byte sequence.
                        uint cOut = c & ((1 << nBits) - 1); // First extract the nBits least significant bits from c.
                        bool cancel = false;
                        for (uint i = 0; i < nMoreBytes && ! cancel; i++) {
                                // See if there are enough bytes left in the source vector.
                                if (! (srcIdx < srcEnd)) {
                                        switch (errorHandling) {
                                        case uehThrow: throw TUnicodeException(charSrcIdx, c, TInt::GetStr(nMoreBytes) + " more bytes expected, only " + TInt::GetStr(int(srcEnd - charSrcIdx - 1)) + " available.");
                                        case uehAbort: return nDecoded;
                                        case uehReplace: dest.Add(TDestCh(replacementChar)); cancel = true; continue;
                                        case uehIgnore: cancel = true; continue;
                                        default: Fail; } }
                                // Read the next byte.
                                c = src[TVecIdx(srcIdx)] & 0xff; srcIdx++;
                                if ((c & _1100_0000) != _1000_0000) { // Each subsequent byte should be of the form 10xxxxxx.
                                        switch (errorHandling) {
                                        case uehThrow: throw TUnicodeException(charSrcIdx, c, "Byte " + TInt::GetStr(i) + " of " + TInt::GetStr(nMoreBytes) + " extra bytes should begin with 10xxxxxx.");
                                        case uehAbort: return nDecoded;
                                        case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx--; cancel = true; continue;
                                        case uehIgnore: srcIdx--; cancel = true; continue;
                                        default: Fail; } }
                                cOut <<= 6; cOut |= (c & _0011_1111); }
                        if (cancel) continue;
                        if (strict) {
                                // err1: This codepoint has been represented by more bytes than it should have been.
                                // For example, cOut in the range 0..127 should be represented by a single byte,
                                // not by two or more bytes.
                                // - For example, this may happen in the "modified UTF-8" sometimes used for Java
                                // serialization, where the codepoint 0 is encoded as 11000000 10000000 to avoid
                                // the appearance of null bytes in the encoded stream.
                                bool err1 = (cOut < minVal);
                                // err2: Early definitions of UTF-8 allowed any 31-bit integer to be encoded, using up to 6 bytes.
                                // However, later this was restricted to the codepoints 0..0x10ffff only, because only these
                                // are valid Unicode codepoints.  Thus, no more than 4 bytes are ever necessary.
                                bool err2 = (nMoreBytes > 3 || (nMoreBytes == 3 && cOut > 0x10ffff));
                                if (err1 || err2) switch (errorHandling) {
                                        case uehThrow:
                                                if (err1) throw TUnicodeException(charSrcIdx, c, "The codepoint 0x" + TInt::GetStr(cOut, "%08x") + " has been represented by too many bytes (" + TInt::GetStr(nMoreBytes + 1) + ").");
                                                else if (err2) throw TUnicodeException(charSrcIdx, c, "Invalid multibyte sequence: it decodes into 0x" + TInt::GetStr(cOut, "%08x") + ", but only codepoints 0..0x10ffff are valid.");
                                                else { Fail; break; }
                                        case uehAbort: return nDecoded;
                                        case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
                                        case uehIgnore: continue;
                                        default: Fail; } }
                        // Add the decoded codepoint to the destination vector.
                        // If this is the first decoded character, and it's one of the byte-order marks
                        // (0xfffe and 0xfeff), we will skip it (unless skipBom is false).
                        if (! (skipBom && (cOut == 0xfffe || cOut == 0xfeff) && charSrcIdx == origSrcIdx)) {
                                dest.Add(cOut); nDecoded++; }
                } // else (multi-byte sequence)
        } // while
        return nDecoded;
}
template<typename TSrcVec , typename TDestCh >
size_t TUniCodec::DecodeUtf8 ( const TSrcVec &  src,
TVec< TDestCh > &  dest,
const bool  clrDest = true 
) const [inline]

Definition at line 135 of file unicode.h.

{ return DecodeUtf8(src, 0, src.Len(), dest, clrDest); }
template<typename TSrcVec , typename TDestCh >
size_t TUniCodec::EncodeUtf16ToBytes ( const TSrcVec &  src,
size_t  srcIdx,
const size_t  srcCount,
TVec< TDestCh > &  dest,
const bool  clrDest,
const bool  insertBom,
const TUniByteOrder  destByteOrder = boMachineEndian 
) const

Definition at line 2423 of file unicode.h.

{
        bool isDestLe = (destByteOrder == boLittleEndian || (destByteOrder == boMachineEndian && IsMachineLittleEndian()));
        size_t nEncoded = 0, srcEnd = srcIdx + srcCount;
        if (insertBom) { dest.Add(isDestLe ? 0xff : 0xfe); dest.Add(isDestLe ? 0xfe : 0xff); nEncoded++; }
        while (srcIdx < srcEnd)
        {
                uint c = uint(src[TVecIdx(srcIdx)]); srcIdx++;
                if (! (c <= 0x10ffffu)) {
                        switch (errorHandling) {
                        case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 only supports characters in the range 0..10ffff (not " + TUInt::GetStr(c, "%08x") + ").");
                        case uehAbort: return nEncoded;
#define ___OutRepl if (isDestLe) { dest.Add(replacementChar & 0xff); dest.Add((replacementChar >> 8) & 0xff); } else { dest.Add((replacementChar >> 8) & 0xff); dest.Add(replacementChar & 0xff); }
                        case uehReplace: ___OutRepl; continue;
                        case uehIgnore: continue;
                        default: Fail; } }
                if (Utf16FirstSurrogate <= c && c < Utf16FirstSurrogate + 1023) {
                        switch (errorHandling) {
                        case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 cannot encode " + TUInt::GetStr(c, "%04x") + " as it belongs to the first surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + ").");
                        case uehAbort: return nEncoded;
                        case uehReplace: ___OutRepl; continue;
                        case uehIgnore: continue;
                        default: Fail; } }
                if (Utf16SecondSurrogate <= c && c < Utf16SecondSurrogate + 1023) {
                        switch (errorHandling) {
                        case uehThrow: throw TUnicodeException(srcIdx - 1, c, "The character " + TUInt::GetStr(c, "%04x") + " belongs to the second surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + "), which is not allowed with strict == true.");
                        case uehAbort: return nEncoded;
                        case uehReplace: ___OutRepl; continue;
                        case uehIgnore: continue;
                        default: Fail; } }
#undef ___OutRepl
                // If c is <= 0xffff, it can be stored directly.
                if (c <= 0xffffu) {
                        if (isDestLe) { dest.Add(c & 0xff); dest.Add((c >> 8) & 0xff); }
                        else { dest.Add((c >> 8) & 0xff); dest.Add(c & 0xff); }
                        nEncoded++; continue; }
                // Otherwise, represent c by a pair of surrogate characters.
                c -= 0x10000u; IAssert(/*0 <= c &&*/ c <= 0xfffffu);
                uint c1 = (c >> 10) & 1023, c2 = c & 1023;
                c1 += Utf16FirstSurrogate; c2 += Utf16SecondSurrogate;
                if (isDestLe) { dest.Add(c1 & 0xff); dest.Add((c1 >> 8) & 0xff); dest.Add(c2 & 0xff); dest.Add((c2 >> 8) & 0xff); }
                else { dest.Add((c1 >> 8) & 0xff); dest.Add(c1 & 0xff); dest.Add((c2 >> 8) & 0xff); dest.Add(c2 & 0xff); }
                nEncoded++; continue;
        }
        return nEncoded;
}
template<typename TSrcVec , typename TDestCh >
size_t TUniCodec::EncodeUtf16ToWords ( const TSrcVec &  src,
size_t  srcIdx,
const size_t  srcCount,
TVec< TDestCh > &  dest,
const bool  clrDest,
const bool  insertBom,
const TUniByteOrder  destByteOrder = boMachineEndian 
) const

Definition at line 2371 of file unicode.h.

{
        bool isMachineLe = IsMachineLittleEndian();
        bool swap = (destByteOrder == boLittleEndian && ! isMachineLe) || (destByteOrder == boBigEndian && isMachineLe);
        size_t nEncoded = 0, srcEnd = srcIdx + srcCount;
        if (insertBom) { dest.Add(TDestCh(swap ? 0xfffeu : 0xfeffu)); nEncoded++; }
        while (srcIdx < srcEnd)
        {
                uint c = uint(src[TVecIdx(srcIdx)]); srcIdx++;
                if (! (c <= 0x10ffffu)) {
                        switch (errorHandling) {
                        case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 only supports characters in the range 0..10ffff (not " + TUInt::GetStr(c, "%08x") + ").");
                        case uehAbort: return nEncoded;
                        case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue;
                        case uehIgnore: continue;
                        default: Fail; } }
                if (Utf16FirstSurrogate <= c && c < Utf16FirstSurrogate + 1023) {
                        switch (errorHandling) {
                        case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 cannot encode " + TUInt::GetStr(c, "%04x") + " as it belongs to the first surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + ").");
                        case uehAbort: return nEncoded;
                        case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue;
                        case uehIgnore: continue;
                        default: Fail; } }
                if (Utf16SecondSurrogate <= c && c < Utf16SecondSurrogate + 1023) {
                        switch (errorHandling) {
                        case uehThrow: throw TUnicodeException(srcIdx - 1, c, "The character " + TUInt::GetStr(c, "%04x") + " belongs to the second surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + "), which is not allowed with strict == true.");
                        case uehAbort: return nEncoded;
                        case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue;
                        case uehIgnore: continue;
                        default: Fail; } }
                // If c is <= 0xffff, it can be stored directly.
                if (c <= 0xffffu) {
                        if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8);
                        dest.Add(TDestCh(c)); nEncoded++; continue; }
                // Otherwise, represent c by a pair of surrogate characters.
                c -= 0x10000u; IAssert(/*0 <= c &&*/ c <= 0xfffffu);
                uint c1 = (c >> 10) & 1023, c2 = c & 1023;
                c1 += Utf16FirstSurrogate; c2 += Utf16SecondSurrogate;
                if (swap) {
                        c1 = ((c1 >> 8) & 0xff) | ((c1 & 0xff) << 8);
                        c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8); }
                dest.Add(TDestCh(c1));
                dest.Add(TDestCh(c2));
                nEncoded++; continue;
        }
        return nEncoded;
}
template<typename TSrcVec , typename TDestCh >
size_t TUniCodec::EncodeUtf8 ( const TSrcVec &  src,
size_t  srcIdx,
const size_t  srcCount,
TVec< TDestCh > &  dest,
const bool  clrDest = true 
) const

Definition at line 2147 of file unicode.h.

{
        size_t nEncoded = 0;
        for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++)
        {
                uint c = uint(src[TVecIdx(srcIdx)]);
                bool err = false;
                if (strict && c > 0x10ffff) {
                        err = true;
                        switch (errorHandling) {
                        case uehThrow: throw TUnicodeException(srcIdx, c, "Invalid character (0x" + TInt::GetStr(c, "%x") + "; only characters in the range 0..0x10ffff are allowed).");
                        case uehAbort: return nEncoded;
                        case uehReplace: c = replacementChar; break;
                        case uehIgnore: continue;
                        default: Fail; } }
                if (c < 0x80u)
                        dest.Add(TDestCh(c & 0xffu));
                else if (c < 0x800u) {
                        dest.Add(TDestCh(_1100_0000 | ((c >> 6) & _0001_1111)));
                        dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
                else if (c < 0x10000u) {
                        dest.Add(TDestCh(_1110_0000 | ((c >> 12) & _0000_1111)));
                        dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
                        dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
                else if (c < 0x200000u) {
                        dest.Add(TDestCh(_1111_0000 | ((c >> 18) & _0000_0111)));
                        dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
                        dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
                        dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
                else if (c < 0x4000000u) {
                        dest.Add(TDestCh(_1111_1000 | ((c >> 24) & _0000_0011)));
                        dest.Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111)));
                        dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
                        dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
                        dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
                else {
                        dest.Add(TDestCh(_1111_1100 | ((c >> 30) & _0000_0011)));
                        dest.Add(TDestCh(_1000_0000 | ((c >> 24) & _0011_1111)));
                        dest.Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111)));
                        dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
                        dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
                        dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
                if (! err) nEncoded++;
        }
        return nEncoded;
}
template<typename TSrcVec , typename TDestCh >
size_t TUniCodec::EncodeUtf8 ( const TSrcVec &  src,
TVec< TDestCh > &  dest,
const bool  clrDest = true 
) const [inline]

Definition at line 144 of file unicode.h.

{ return EncodeUtf8(src, 0, src.Len(), dest, clrDest); }
template<typename TSrcVec >
TStr TUniCodec::EncodeUtf8Str ( const TSrcVec &  src,
size_t  srcIdx,
const size_t  srcCount 
) const [inline]

Definition at line 148 of file unicode.h.

{ TVec<char> temp; EncodeUtf8(src, srcIdx, srcCount, temp); TStr retVal = &(temp[0]); return retVal; }
template<typename TSrcVec >
TStr TUniCodec::EncodeUtf8Str ( const TSrcVec &  src) const [inline]

Definition at line 149 of file unicode.h.

{ TVec<char> temp; EncodeUtf8(src, temp); temp.Add(0); TStr retVal = &(temp[0]); return retVal; }
uint TUniCodec::GetRndUint ( TRnd rnd) [static, protected]

Definition at line 62 of file unicode.cpp.

{
        uint u = rnd.GetUniDevUInt(256) & 0xff;
        u <<= 8; u |= (rnd.GetUniDevUInt(256) & 0xff);
        u <<= 8; u |= (rnd.GetUniDevUInt(256) & 0xff);
        u <<= 8; u |= (rnd.GetUniDevUInt(256) & 0xff);
        return u;
}
uint TUniCodec::GetRndUint ( TRnd rnd,
uint  minVal,
uint  maxVal 
) [static, protected]

Definition at line 71 of file unicode.cpp.

{
        if (minVal == TUInt::Mn && maxVal == TUInt::Mx) return GetRndUint(rnd);
        uint range = maxVal - minVal + 1;
        if (range > (uint(1) << (8 * sizeof(uint) - 1)))
                while (true) { uint u = GetRndUint(rnd); if (u < range) return minVal + u; }
        uint mask = 1;
        while (mask < range) mask <<= 1;
        mask -= 1;
        while (true) { uint u = GetRndUint(rnd) & mask; if (u < range) return minVal + u; }
}
bool TUniCodec::IsMachineLittleEndian ( ) [static, protected]

Definition at line 83 of file unicode.cpp.

{
        static bool isLE, initialized = false;
        if (initialized) return isLE;
        int i = 0x0201;
        char *p = (char *) (&i);
        char c1, c2;
        memcpy(&c1, p, 1); memcpy(&c2, p + 1, 1);
        if (c1 == 1 && c2 == 2) isLE = true;
        else if (c1 == 2 && c2 == 1) isLE = false;
        else {
                FailR(("TUniCodec::IsMachineLittleEndian: c1 = " + TInt::GetStr(int(uchar(c1)), "%02x") + ", c2 = " + TInt::GetStr(int(uchar(c2)), "%02x") + ".").CStr());
                isLE = true; }
        initialized = true; return isLE;
}
static int TUniCodec::SwapBytes ( int  x) [inline, static, protected]

Definition at line 249 of file unicode.h.

                                           {
                return ((x >> 8) & 0xff) | ((x & 0xff) << 8); }
void TUniCodec::TestDecodeUtf16 ( TRnd rnd,
const TStr testCaseDesc,
const TUtf16BomHandling  bomHandling,
const TUniByteOrder  defaultByteOrder,
const bool  insertBom 
) [protected]

Definition at line 345 of file unicode.cpp.

{
        TIntV src; TIntV expectedDest; int expectedRetVal = 0;
        bool expectedAbort = false;
        FILE *f = 0;
        bool isMachineLe = IsMachineLittleEndian();
        bool isDefaultLe = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && isMachineLe));
        bool swap = (isMachineLe != isDefaultLe);
        if (insertBom) {
                src.Add(swap ? 0xfffe : 0xfeff);
                if (! skipBom) { expectedRetVal += 1; expectedDest.Add(0xfeff); } }
        else if (bomHandling == bomRequired) {
                expectedAbort = true; expectedRetVal = -1; }
        // testCaseDesc should consist single characters or pairs of characters, 'c[e]', where:
        // - 'c' defines the range from which the codepoint should be taken ('A'..'E', 'X'..'Y');
        // - 'e' defines how many words will be removed from the end of the encoded sequence for this codepoint.
        //   (absent = 0, 'a' = 1).
        for (int i = 0; i < testCaseDesc.Len(); )
        {
                const char c = testCaseDesc[i++];
                uint cp = 0; int nWords = -1;
                if (c == 'X' || c == 'Y') IAssert(i > 1); // if you want a BOM at the beginning of your data, use insertBom -- if we permit X and Y here, predicting the expectedDest and expectedRetVal gets more complicated
                if (c == 'A') { cp = GetRndUint(rnd, 0u, Utf16FirstSurrogate - 1); nWords = 1; } // characters below the first surrogate range
                else if (c == 'B') { cp = GetRndUint(rnd, Utf16FirstSurrogate, Utf16FirstSurrogate + 1023); nWords = 1; } // the first surrogate range
                else if (c == 'C') { cp = GetRndUint(rnd, Utf16SecondSurrogate, Utf16SecondSurrogate + 1023); nWords = 1; } // the second surrogate range
                else if (c == 'D') { do { cp = GetRndUint(rnd, Utf16SecondSurrogate + 1024, 0xffffu); } while (cp == 0xfffe || cp == 0xfeff); nWords = 1; } // above the second surrogate range, but still in the BMP
                else if (c == 'E') { cp = GetRndUint(rnd, 0x10000u, 0x10ffffu); nWords = 2; } // above the BMP, but still within the range for UTF-16
                else if (c == 'X') { cp = 0xfffe; nWords = 1; }
                else if (c == 'Y') { cp = 0xfeff; nWords = 1; }
                else Fail;
                if (c == 'B' && i < testCaseDesc.Len()) IAssert(testCaseDesc[i] != 'C');
                // Process 'e'.
                int nToDel = 0;
                if (i < testCaseDesc.Len()) {
                        const char e = testCaseDesc[i];
                        if (e >= 'a') { i += 1; nToDel = 1; }}
                IAssert((nWords == 1 && nToDel == 0) || (nWords == 2 && (nToDel == 0 || nToDel == 1)));
                if (nWords == 2 && nToDel == 1 && i < testCaseDesc.Len()) IAssert(testCaseDesc[i] != 'C');
                // Will an error occur during the decoding of this codepoint?
                bool errHere = false;
                if (Utf16FirstSurrogate <= cp && cp <= Utf16FirstSurrogate + 1023) errHere = true;
                else if (cp > 0x10ffff) { Fail; errHere = true; }
                else if (nToDel > 0) errHere = true;
                else if (strict && (Utf16SecondSurrogate <= cp && cp <= Utf16SecondSurrogate + 1023)) errHere = true;
                // Update 'expectedDest' and 'expectedRetVal'.
                if (! expectedAbort) {
                        if (! errHere) {
                                if (src.Len() == 0 && (cp == 0xfffe || cp == 0xfeff) && skipBom) { }
                                else { expectedDest.Add(cp); expectedRetVal += 1; } }
                        else if (errorHandling == uehReplace) {
                                expectedDest.Add(replacementChar); }
                        if (errHere && (errorHandling == uehAbort || errorHandling == uehThrow)) expectedAbort = true; }
                // Update 'src'.
                if (nWords == 1) src.Add(swap ? SwapBytes(cp) : cp);
                else {
                        int c1 = ((cp - 0x10000) >> 10) & 1023; c1 += Utf16FirstSurrogate;
                        int c2 = (cp - 0x10000) & 1023; c2 += Utf16SecondSurrogate;
                        src.Add(swap ? SwapBytes(c1) : c1);
                        if (nToDel == 0) src.Add(swap ? SwapBytes(c2) : c2); }
        }
        if (f) fprintf(f, "Test case: \"%s\"\n", testCaseDesc.CStr());
        TestUtf16(true, expectedRetVal, expectedAbort && (errorHandling == uehThrow), src, expectedDest, bomHandling, defaultByteOrder, false, f);
}
void TUniCodec::TestDecodeUtf8 ( TRnd rnd,
const TStr testCaseDesc 
) [protected]

Definition at line 137 of file unicode.cpp.

{
        TIntV src; TIntV expectedDest; int expectedRetVal = 0;
        bool expectedAbort = false;
        FILE *f = 0; // stderr
        // testCaseDesc should consist of pairs or triples of characters, 'cd[e]', where:
        // - 'c' defines the range from which the codepoint should be taken ('A'..'H', 'X'..'Z');
        // - 'd' defines how many bytes the codepoint should be encoded with ('1'..'6');
        // - 'e' defines how many bytes will be removed from the end of the encoded sequence for this codepoint.
        //   (absent = 0, 'a' = 1, 'b' = 2 and so on).
        for (int i = 0; i < testCaseDesc.Len(); )
        {
                IAssert(i + 2 <= testCaseDesc.Len());
                const char c = testCaseDesc[i], d = testCaseDesc[i + 1]; i += 2;
                uint cp = 0; int nBytes = -1, minBytes = -1; bool eighties = false;
                IAssert('1' <= d && d <= '6'); nBytes = d - '0';
                if (c == 'A') { cp = GetRndUint(rnd, 0u, 0x7fu); minBytes = 1; } // 1 byte
                else if (c == 'B') { cp = GetRndUint(rnd, 0x80u, 0x7ffu); minBytes = 2; } // 2 bytes
                else if (c == 'C') { cp = GetRndUint(rnd, 0x800u, 0xffffu); minBytes = 3; } // 3 bytes
                else if (c == 'D') { cp = GetRndUint(rnd, 0x10000u, 0x10ffffu); minBytes = 4; } // 4 bytes, valid Unicode
                else if (c == 'E') { cp = GetRndUint(rnd, 0x110000u, 0x1fffffu); minBytes = 4; } // 4 bytes, invalid Unicode
                else if (c == 'F') { cp = GetRndUint(rnd, 0x200000u, 0x3ffffffu); minBytes = 5; } // 5 bytes
                else if (c == 'G') { cp = GetRndUint(rnd, 0x4000000u, 0x7fffffffu); minBytes = 6; } // 6 bytes, 31 bits
                else if (c == 'H') { cp = GetRndUint(rnd, 0x80000000u, 0xffffffffu); minBytes = 6; } // 6 bytes, 32 bits
                else if (c == 'X') { cp = 0xfffe; minBytes = 3; }
                else if (c == 'Y') { cp = 0xfeff; minBytes = 3; }
                else if (c == 'Z') { eighties = true; minBytes = 1; } // insert several random 10xxxxxx bytes (= 0x80 | random(0..0x3f))
                else Fail;
                IAssert(nBytes >= minBytes);
                // Process 'e'.
                int nToDel = 0;
                if (i < testCaseDesc.Len()) {
                        const char e = testCaseDesc[i];
                        if (e >= 'a' && e <= 'e') { i += 1; nToDel = e - 'a' + 1; }}
                IAssert(nToDel < nBytes);
                // Will an error occur during the decoding of this codepoint?
                bool errHere = false;
                if (eighties) errHere = true;
                else if (nToDel > 0) errHere = true;
                else if (strict && (cp >= 0x10ffff || nBytes > minBytes)) errHere = true;
                // Update 'expectedDest' and 'expetedRetVal'.
                if (! expectedAbort) {
                        if (! errHere) {
                                if (src.Len() == 0 && (cp == 0xfffe || cp == 0xfeff) && skipBom) { }
                                else { expectedDest.Add(cp); expectedRetVal += 1; } }
                        else if (errorHandling == uehReplace) {
                                if (eighties) for (int j = 0; j < nBytes; j++) expectedDest.Add(replacementChar);
                                else expectedDest.Add(replacementChar); }
                        if (errHere && (errorHandling == uehAbort || errorHandling == uehThrow)) expectedAbort = true; }
                // Update 'src'.
                if (eighties) for (int j = 0; j < nBytes; j++) src.Add(GetRndUint(rnd, 0x80, 0xff));
                else if (nBytes == 1) src.Add(cp);
                else {
                        int mask = (1 << nBytes) - 1; mask <<= (8 - nBytes);
                        src.Add(mask | (uint(cp) >> (6 * (nBytes - 1))));
                        for (int j = 1; j < nBytes - nToDel; j++) src.Add(0x80 | ((cp >> (6 * (nBytes - j - 1))) & _0011_1111)); }
        }
        if (f) fprintf(f, "Test case: \"%s\"\n", testCaseDesc.CStr());
        TestUtf8(true, expectedRetVal, expectedAbort && (errorHandling == uehThrow), src, expectedDest, f);
}
void TUniCodec::TestUtf16 ( bool  decode,
size_t  expectedRetVal,
bool  expectedThrow,
const TIntV src,
const TIntV expectedDest,
const TUtf16BomHandling  bomHandling,
const TUniByteOrder  defaultByteOrder,
const bool  insertBom,
FILE *  f 
) [protected]

Definition at line 288 of file unicode.cpp.

{
        TIntV srcBytes, expectedDestBytes;
        WordsToBytes(src, srcBytes); WordsToBytes(expectedDest, expectedDestBytes);
        TIntV dest;
        if (f) {
                fprintf(f, "Settings: %s  %s  %s  %s  %s replacementChar = %x  \n",
                        (errorHandling == uehAbort ? "abort" : errorHandling == uehThrow ? "throw" : errorHandling == uehIgnore ? "ignore" : errorHandling == uehReplace ? "replace" : "????"),
                        (strict ? "STRICT" : ""), (decode ? (skipBom ? "skipBom" : "") : (insertBom ? "insrtBom" : "")),
                        (bomHandling == bomAllowed ? "bomAllowed" : bomHandling == bomRequired ? "bomRequired" : "bomIgnored"),
                        (defaultByteOrder == boBigEndian ? "boBigEndian" : defaultByteOrder == boLittleEndian ? "boLittleEndian" : "boMachineEndian"),
                        uint(replacementChar));
                fprintf(f, "src: "); for (int i = 0; i < src.Len(); i++) fprintf(f, (decode ? " %04x" : " %x"), uint(src[i])); }
        for (int useBytes = 0; useBytes < 2; useBytes++)
        {
                const char *fmt = (useBytes ? " %02x" : " %04x");
                try
                {
                        dest.Clr();
                        size_t retVal;
                        if (! useBytes) {
                                if (decode) retVal = DecodeUtf16FromWords(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder);
                                else retVal = EncodeUtf16ToWords(src, 0, src.Len(), dest, true, insertBom, defaultByteOrder); }
                        else {
                                if (decode) retVal = DecodeUtf16FromBytes(srcBytes, 0, srcBytes.Len(), dest, true, bomHandling, defaultByteOrder);
                                else retVal = EncodeUtf16ToBytes(src, 0, src.Len(), dest, true, insertBom, defaultByteOrder); }
                        const TIntV& ed = (useBytes && ! decode ? expectedDestBytes : expectedDest);
                        if (f) {
                                fprintf(f, "\n -> dest:    "); for (int i = 0; i < dest.Len(); i++) fprintf(f, (decode ? " %x" :  fmt), uint(dest[i]));
                                fprintf(f, "\n    expDest  "); for (int i = 0; i < ed.Len(); i++) fprintf(f, (decode ? " %x" :  fmt), uint(ed[i]));
                                fprintf(f, "\n    retVal = %llu (expected %llu)\n", static_cast<long long unsigned int> (retVal), static_cast<long long unsigned int> (expectedRetVal)); }
                        bool ok = true;
                        if (retVal != expectedRetVal) ok = false;
                        if (dest.Len() != ed.Len()) ok = false;
                        if (ok) for (int i = 0; i < dest.Len(); i++) if (dest[i] != ed[i]) ok = false;
                        if (! ok)
                        {
                                printf("!!!\n");
                        }
                        IAssert(retVal == expectedRetVal); IAssert(! expectedThrow);
                        IAssert(dest.Len() == ed.Len());
                        for (int i = 0; i < dest.Len(); i++) IAssert(dest[i] == ed[i]);
                }
                catch (TUnicodeException e)
                {
                        if (f) {
                                fprintf(f, "\n -> expDest  "); for (int i = 0; i < expectedDest.Len(); i++) fprintf(f, (decode ? " %x" : fmt), uint(expectedDest[i]));
                                fprintf(f, "\n    exception \"%s\" at %d (char 0x%02x)\n", e.message.CStr(), int(e.srcIdx), uint(e.srcChar)); }
                        IAssert(expectedThrow);
                }
        }
}

Definition at line 412 of file unicode.cpp.

{
        TIntV utf16ReplCh; utf16ReplCh.Add(replacementChar);
        for (int skipBom_ = 0; skipBom_ < 2; skipBom_++)
        for (int strict_ = 0; strict_ < 2; strict_++)
        for (int errMode_ = 0; errMode_ < 4; errMode_++)
        for (int bomHandling_ = 0; bomHandling_ < 3; bomHandling_++)
        for (int byteOrder_ = 0; byteOrder_ < 3; byteOrder_++)
        for (int insertBom_ = 0; insertBom_ < 2; insertBom_++)
        {
                strict = (strict_ == 1); errorHandling = TUnicodeErrorHandling(errMode_); skipBom = (skipBom_ == 1);
                bool insertBom = (insertBom_ == 1);
                TUniByteOrder byteOrder = (TUniByteOrder) byteOrder_;
                TUtf16BomHandling bomHandling = (TUtf16BomHandling) bomHandling_;
                TRnd rnd = TRnd(123);
                // Test DecodeUtf16 on various random UTF-16-encoded sequences.
                for (int i = 0; i < 10; i++)
                {
                        TestDecodeUtf16(rnd, "A", bomHandling, byteOrder, insertBom);
                        TestDecodeUtf16(rnd, "AAA", bomHandling, byteOrder, insertBom);
                        TestDecodeUtf16(rnd, "B", bomHandling, byteOrder, insertBom);
                        TestDecodeUtf16(rnd, "DDAADADAAADDDAA", bomHandling, byteOrder, insertBom);
                        TestDecodeUtf16(rnd, "DEEEDAAEEDADEEAAEEADEEDDAA", bomHandling, byteOrder, insertBom);
                        TestDecodeUtf16(rnd, "DEaEaEDAAEaEDADEaEAAEEADEEDDAA", bomHandling, byteOrder, insertBom);
                        TestDecodeUtf16(rnd, "CABDEBACCEaB", bomHandling, byteOrder, insertBom);
                        TestDecodeUtf16(rnd, "EaEEEEaBBACABXABYXXEaYDDXBDCEA", bomHandling, byteOrder, insertBom);
                        TestDecodeUtf16(rnd, "EaEEEEaBDCAAXADYXXEaYDDXDCEA", bomHandling, byteOrder, insertBom);
                }
                //continue;
                // Test both DecodeUtf16 and EncodeUtf16 systematically on various characters
                // close to powers of 2.
                TIntV src, expectedDest, src2;
                expectedDest.Gen(1); src.Reserve(6); src2.Gen(1);
                for (int pow = 8; pow <= 32; pow++)
                {
                        uint uFrom, uTo;
                        if (pow == 8) uFrom = 0, uTo = 1u << pow;
                        else if (pow == 32) uFrom = TUInt::Mx - (1u << 8), uTo = TUInt::Mx;
                        else uFrom = (1u << pow) - (1u << 8), uTo = (1u << pow) + (1u << 8);
                        printf("%u..%u          \r", uFrom, uTo);
                        for (uint u = uFrom; ; u++)
                        {
                                int nWords = 0;
                                if (u < 0x10000) nWords = 1;
                                else nWords = 2;
                                bool isMachineLe = IsMachineLittleEndian(), isDestLe = (byteOrder == boLittleEndian || (byteOrder == boMachineEndian && isMachineLe));
                                bool swap = (isMachineLe != isDestLe);
                                bool err = (u > 0x10ffff) || (Utf16FirstSurrogate <= u && u <= Utf16FirstSurrogate + 1023) || (strict && Utf16SecondSurrogate <= u && u <= Utf16SecondSurrogate + 1023);
                                src.Gen(3, (err ? 0 : nWords) + (insertBom ? 1 : 0));
                                if (insertBom) src[0] = (swap ? 0xfffe : 0xfeff);
                                if (! ((u > 0x10ffff) || (Utf16FirstSurrogate <= u && u <= Utf16FirstSurrogate + 1023)))
                                {
                                        // Try to encode 'u' and see if it gets decoded correctly.
                                        if (nWords == 1) src[insertBom ? 1 : 0] = (swap ? SwapBytes(u) : u);
                                        else {
                                                int u1 = Utf16FirstSurrogate + (((u - 0x10000) >> 10) & 1023);
                                                int u2 = Utf16SecondSurrogate + ((u - 0x10000) & 1023);
                                                src[insertBom ? 1 : 0] = (swap ? SwapBytes(u1) : u1);
                                                src[insertBom ? 2 : 1] = (swap ? SwapBytes(u2) : u2); }
                                        if (! ((u == 0xfffe || u == 0xfeff) && bomHandling == bomAllowed && ! insertBom)) // this will just create a mess when decoding
                                        {
                                                expectedDest.Reserve(2, 0);
                                                if (insertBom && ! skipBom) expectedDest.Add(0xfeff);
                                                if (err && errorHandling == uehReplace) expectedDest.Add(replacementChar);
                                                else if (! err) expectedDest.Add(u);
                                                int erv = (err ? 0 : expectedDest.Len());
                                                if (skipBom && (u == 0xfeff || u == 0xfffe) && ! insertBom) expectedDest.Clr(), erv = 0;
                                                bool errD = err;
                                                if (bomHandling == bomRequired && ! insertBom) {
                                                        expectedDest.Clr(false);
                                                        if (u == 0xfeff || u == 0xfffe) { erv = (skipBom ? 0 : 1); if (! skipBom) expectedDest.Add(0xfeff); }
                                                        else { erv = -1; errD = true;
                                                                /*if (errorHandling == uehReplace) expectedDest.Add(replacementChar);*/ }}
                                                TestUtf16(true, erv, (errD && errorHandling == uehThrow), src, expectedDest, bomHandling, byteOrder, insertBom, 0);
                                        }
                                }
                                // We can also test the UTF-16 encoder.
                                src2[0] = u;
                                if (err) {
                                        src.Clr(false); if (insertBom) src.Add(swap ? 0xfffe : 0xfeff);
                                        if (errorHandling == uehReplace) {
                                                src.Add(swap ? SwapBytes(replacementChar) : replacementChar);
                                                /*if (byteOrder == boBigEndian || (byteOrder == boMachineEndian && ! TUniCodec::IsMachineLittleEndian())) { src.Add((replacementChar >> 8) & 0xff); src.Add(replacementChar & 0xff); }
                                                else { src.Add(replacementChar & 0xff); src.Add((replacementChar >> 8) & 0xff); } */
                                        }}
                                TestUtf16(false, (err ? 0 : 1) + (insertBom ? 1 : 0), (err && errorHandling == uehThrow), src2, src, bomHandling, byteOrder, insertBom, 0);
                                //
                                if (u == uTo) break;
                        }
                }
        }
}
void TUniCodec::TestUtf8 ( bool  decode,
size_t  expectedRetVal,
bool  expectedThrow,
const TIntV src,
const TIntV expectedDest,
FILE *  f 
) [protected]

Definition at line 103 of file unicode.cpp.

{
        TIntV dest;
        if (f) {
                fprintf(f, "Settings: %s  %s  %s   replacementChar = %x\n",
                        (errorHandling == uehAbort ? "abort" : errorHandling == uehThrow ? "throw" : errorHandling == uehIgnore ? "ignore" : errorHandling == uehReplace ? "replace" : "????"),
                        (strict ? "STRICT" : ""), (skipBom ? "skipBom" : ""), uint(replacementChar));
                fprintf(f, "src: "); for (int i = 0; i < src.Len(); i++) fprintf(f, (decode ? " %02x" : " %x"), uint(src[i])); }
        try
        {
                size_t retVal = (decode ? DecodeUtf8(src, 0, src.Len(), dest, true) : EncodeUtf8(src, 0, src.Len(), dest, true));
                if (f) {
                        fprintf(f, "\n -> dest:    "); for (int i = 0; i < dest.Len(); i++) fprintf(f, (decode ? " %x" :  " %02x"), uint(dest[i]));
                        fprintf(f, "\n    expDest  "); for (int i = 0; i < expectedDest.Len(); i++) fprintf(f, (decode ? " %x" :  " %02x"), uint(expectedDest[i]));
                        fprintf(f, "\n    retVal = %llu (expected %llu)\n", static_cast<long long unsigned int> (retVal), static_cast<long long unsigned int> (expectedRetVal)); }
                if (retVal != expectedRetVal)
                        printf("!!!");
                IAssert(retVal == expectedRetVal); IAssert(! expectedThrow);
                if (dest.Len() != expectedDest.Len())
                        printf("!!!");
                IAssert(dest.Len() == expectedDest.Len());
                for (int i = 0; i < dest.Len(); i++) IAssert(dest[i] == expectedDest[i]);
        }
        catch (TUnicodeException e)
        {
                if (f) {
                        fprintf(f, "\n -> expDest  "); for (int i = 0; i < expectedDest.Len(); i++) fprintf(f, " %x", uint(expectedDest[i]));
                        fprintf(f, "\n    exception \"%s\" at %d (char 0x%02x)\n", e.message.CStr(), int(e.srcIdx), uint(e.srcChar)); }
                IAssert(expectedThrow);
        }
}

Definition at line 198 of file unicode.cpp.

{
        TIntV utf8ReplCh; EncodeUtf8((TVectorBuilder(), replacementChar).v, 0, 1, utf8ReplCh, true);
        for (int skipBom_ = 0; skipBom_ < 2; skipBom_++)
        for (int strict_ = 0; strict_ < 2; strict_++)
        for (int errMode_ = 0; errMode_ < 4; errMode_++)
        {
                strict = (strict_ == 1); errorHandling = TUnicodeErrorHandling(errMode_); skipBom = (skipBom_ == 1);
                TRnd rnd = TRnd(123);
                // Test DecodeUtf8 on various random UTF-8-encoded sequences.
                for (int i = 0; i < 10; i++)
                {
                        TestDecodeUtf8(rnd, "X3A1A2A3A4A5A6B2B3B4B5B6C3C4C5C6D4D5D6E5E6F6G6");
                        TestDecodeUtf8(rnd, "X3A5dA6d");
                        TestDecodeUtf8(rnd, "X3A1B2C3D4E4F5A1G6H6Y3X3A1");
                        TestDecodeUtf8(rnd, "X3A1B2C3D4E4F5A2G6H6Y3X3A1");
                        TestDecodeUtf8(rnd, "Y3A1B2C3D4E4F5A1G6H6Y3X3A1");
                        TestDecodeUtf8(rnd, "A1B2C3D4E4F5A1G6H6Y3X3A1");
                        TestDecodeUtf8(rnd, "G6A1A1D4E4A1B2");
                        TestDecodeUtf8(rnd, "D4A1A1C3A1B2A1B2");
                        TestDecodeUtf8(rnd, "D4A1A1C3A1B2A1B2D4a");
                        TestDecodeUtf8(rnd, "X3A1B2C3D5E4F5A1G6H6Y3X3A1");
                        TestDecodeUtf8(rnd, "X3A1B2C3D4E5F5A1G6H6Y3X3A1");
                        TestDecodeUtf8(rnd, "X3A1B2C3D4aE4F5A1G6H6Y3X3A1");
                        TestDecodeUtf8(rnd, "X3A1B2C3D4bE4F5A1G6H6Y3X3A1");
                        TestDecodeUtf8(rnd, "X3A2aA3aA4aA5aA6aB2aB3aB4aB5aB6aC3aC4aC5aC6aD4aD5aD6aE5aE6aF6aG6a");
                        TestDecodeUtf8(rnd, "X3A3bA4bA5bA6aB3bB4bB5bB6bC3bC4bC5bC6bD4bD5bD6bE5bE6bF6bG6b");
                        TestDecodeUtf8(rnd, "X3A4cA5cA6cB4cB5cB6cC4cC5cC6cD4cD5cD6cE5cE6cF6cG6c");
                        TestDecodeUtf8(rnd, "X3A5dA6dB5dB6dC5dC6dD5dD6dE5dE6dF6dG6d");
                        TestDecodeUtf8(rnd, "X3A6eB6eC6eD6eE6eF6eG6e");
                }
                // Test both DecodeUtf8 and EncodeUtf8 systematically on various characters
                // close to powers of 2.
                TIntV src, expectedDest, src2;
                expectedDest.Gen(1); src.Reserve(6); src2.Gen(1);
                for (int pow = 8; pow <= 32; pow++)
                {
                        uint uFrom, uTo;
                        if (pow == 8) uFrom = 0, uTo = 1u << pow;
                        else if (pow == 32) uFrom = TUInt::Mx - (1u << 8), uTo = TUInt::Mx;
                        else uFrom = (1u << pow) - (1u << 8), uTo = (1u << pow) + (1u << 8);
                        printf("%u..%u          \r", uFrom, uTo);
                        for (uint u = uFrom; ; u++)
                        {
                                int nBytes = 0;
                                if (u < (1u << 7)) nBytes = 1;
                                else if (u < (1u << 11)) nBytes = 2;
                                else if (u < (1u << 16)) nBytes = 3;
                                else if (u < (1u << 21)) nBytes = 4;
                                else if (u < (1u << 26)) nBytes = 5;
                                else nBytes = 6;
                                src.Gen(6, nBytes);
                                if (nBytes == 1) src[0] = u;
                                else {
                                        src[0] = (((1 << nBytes) - 1) << (8 - nBytes)) | (u >> (6 * (nBytes - 1)));
                                        for (int i = 1; i < nBytes; i++) src[i] = 0x80 | ((u >> (6 * (nBytes - i - 1))) & _0011_1111); }
                                bool err = (strict && u > 0x10ffff);
                                expectedDest.Reserve(1, 0);
                                if (err && errorHandling == uehReplace) expectedDest.Add(replacementChar);
                                else if (! err) expectedDest.Add(u);
                                int erv = (err ? 0 : 1);
                                if (skipBom && (u == 0xfeff || u == 0xfffe)) expectedDest.Clr(), erv = 0;
                                TestUtf8(true, erv, (err && errorHandling == uehThrow), src, expectedDest, 0);
                                // We can also test the UTF-8 encoder.
                                src2[0] = u;
                                if (err) {
                                        if (errorHandling == uehReplace) src = utf8ReplCh;
                                        else src.Clr(false); }
                                TestUtf8(false, (err ? 0 : 1), (err && errorHandling == uehThrow), src2, src, 0);
                                //
                                if (u == uTo) break;
                        }
                }
        }
}
void TUniCodec::WordsToBytes ( const TIntV src,
TIntV dest 
) [protected]

Definition at line 278 of file unicode.cpp.

{
        dest.Clr();
        bool isLE = IsMachineLittleEndian();
        for (int i = 0; i < src.Len(); i++) {
                int c = src[i] & 0xffff;
                if (isLE) { dest.Add(c & 0xff); dest.Add((c >> 8) & 0xff); }
                else { dest.Add((c >> 8) & 0xff); dest.Add(c & 0xff); } }
}

Friends And Related Function Documentation

friend class TUniCaseFolding [friend]

Definition at line 120 of file unicode.h.


Member Data Documentation

Definition at line 64 of file unicode.h.

Definition at line 89 of file unicode.h.

Definition at line 83 of file unicode.h.


The documentation for this class was generated from the following files: