SNAP Library, User Reference
2013-01-07 14:03:36
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
|
#include <unicode.h>
Public Types | |
enum | { DefaultReplacementChar = 0xfffd } |
Public Member Functions | |
TUniCodec () | |
TUniCodec (TUnicodeErrorHandling errorHandling_, bool strict_, int replacementChar_, bool skipBom_) | |
template<typename TSrcVec , typename TDestCh > | |
size_t | DecodeUtf8 (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const |
template<typename TSrcVec , typename TDestCh > | |
size_t | DecodeUtf8 (const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true) const |
template<typename TSrcVec , typename TDestCh > | |
size_t | EncodeUtf8 (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const |
template<typename TSrcVec , typename TDestCh > | |
size_t | EncodeUtf8 (const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true) const |
template<typename TSrcVec > | |
TStr | EncodeUtf8Str (const TSrcVec &src, size_t srcIdx, const size_t srcCount) const |
template<typename TSrcVec > | |
TStr | EncodeUtf8Str (const TSrcVec &src) const |
template<typename TSrcVec , typename TDestCh > | |
size_t | DecodeUtf16FromBytes (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const |
template<typename TSrcVec , typename TDestCh > | |
size_t | DecodeUtf16FromWords (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool clrDest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const |
template<typename TSrcVec , typename TDestCh > | |
size_t | EncodeUtf16ToWords (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const |
template<typename TSrcVec , typename TDestCh > | |
size_t | EncodeUtf16ToBytes (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const |
void | TestUtf8 () |
void | TestUtf16 () |
Public Attributes | |
int | replacementChar |
TUnicodeErrorHandling | errorHandling |
bool | strict |
bool | skipBom |
Protected Types | |
enum | { DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0) } |
enum | { Utf16FirstSurrogate = 0xd800, Utf16SecondSurrogate = 0xdc00 } |
typedef TUniVecIdx | TVecIdx |
Protected Member Functions | |
void | TestUtf8 (bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV &src, const TIntV &expectedDest, FILE *f) |
void | TestDecodeUtf8 (TRnd &rnd, const TStr &testCaseDesc) |
void | WordsToBytes (const TIntV &src, TIntV &dest) |
void | TestUtf16 (bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV &src, const TIntV &expectedDest, const TUtf16BomHandling bomHandling, const TUniByteOrder defaultByteOrder, const bool insertBom, FILE *f) |
void | TestDecodeUtf16 (TRnd &rnd, const TStr &testCaseDesc, const TUtf16BomHandling bomHandling, const TUniByteOrder defaultByteOrder, const bool insertBom) |
Static Protected Member Functions | |
static bool | IsMachineLittleEndian () |
static uint | GetRndUint (TRnd &rnd) |
static uint | GetRndUint (TRnd &rnd, uint minVal, uint maxVal) |
static int | SwapBytes (int x) |
Friends | |
class | TUniCaseFolding |
typedef TUniVecIdx TUniCodec::TVecIdx [protected] |
anonymous enum |
Definition at line 59 of file unicode.h.
// Value that the default TUniCodec constructor stores into 'replacementChar'.
{ DefaultReplacementChar = 0xfffd };
anonymous enum [protected] |
DefineByte | |
DefineByte | |
DefineByte | |
DefineByte | |
DefineByte | |
DefineByte | |
DefineByte | |
DefineByte | |
DefineByte | |
DefineByte | |
DefineByte | |
DefineByte | |
DefineByte |
Definition at line 101 of file unicode.h.
{ #define DefineByte(b7, b6, b5, b4, b3, b2, b1, b0) _ ## b7 ## b6 ## b5 ## b4 ## _ ## b3 ## b2 ## b1 ## b0 = (b7 << 7) | (b6 << 6) | (b5 << 5) | (b4 << 4) | (b3 << 3) | (b2 << 2) | (b1 << 1) | b0 DefineByte(1, 0, 0, 0, 0, 0, 0, 0), DefineByte(1, 1, 0, 0, 0, 0, 0, 0), DefineByte(1, 1, 1, 0, 0, 0, 0, 0), DefineByte(1, 1, 1, 1, 0, 0, 0, 0), DefineByte(1, 1, 1, 1, 1, 0, 0, 0), DefineByte(1, 1, 1, 1, 1, 1, 0, 0), DefineByte(1, 1, 1, 1, 1, 1, 1, 0), DefineByte(0, 0, 1, 1, 1, 1, 1, 1), DefineByte(0, 0, 0, 1, 1, 1, 1, 1), DefineByte(0, 0, 0, 0, 1, 1, 1, 1), DefineByte(0, 0, 0, 0, 0, 1, 1, 1), DefineByte(0, 0, 0, 0, 0, 0, 1, 1) #undef DefineByte };
anonymous enum [protected] |
Definition at line 156 of file unicode.h.
// Lowest code unit of each of the two UTF-16 surrogate ranges.  The codec treats each range as
// running from the value given here up to that value + 1023.
{ Utf16FirstSurrogate = 0xd800, Utf16SecondSurrogate = 0xdc00 };
TUniCodec::TUniCodec | ( | ) | [inline] |
Definition at line 91 of file unicode.h.
// Default settings: DefaultReplacementChar as the replacement character, uehIgnore error
// handling, non-strict mode, and skipBom enabled.
: replacementChar(DefaultReplacementChar), errorHandling(uehIgnore), strict(false), skipBom(true) { }
TUniCodec::TUniCodec | ( | TUnicodeErrorHandling | errorHandling_, |
bool | strict_, | ||
int | replacementChar_, | ||
bool | skipBom_ | ||
) | [inline] |
Definition at line 95 of file unicode.h.
// Copies each constructor argument into the public attribute of the same name (without the trailing underscore).
: replacementChar(replacementChar_), errorHandling(errorHandling_), strict(strict_), skipBom(skipBom_) { }
size_t TUniCodec::DecodeUtf16FromBytes | ( | const TSrcVec & | src, |
size_t | srcIdx, | ||
const size_t | srcCount, | ||
TVec< TDestCh > & | dest, | ||
const bool | clrDest, | ||
const TUtf16BomHandling | bomHandling = bomAllowed , |
||
const TUniByteOrder | defaultByteOrder = boMachineEndian |
||
) | const |
Definition at line 2205 of file unicode.h.
// Decodes UTF-16 data stored as bytes (two consecutive elements of 'src' form one 16-bit
// code unit) and appends the decoded codepoints to 'dest'.
// - src, srcIdx, srcCount: the input is src[srcIdx .. srcIdx + srcCount - 1]; srcCount must be even.
// - dest: receives the decoded codepoints; it is cleared first if clrDest is true.
// - bomHandling: bomIgnored = always use the default byte order; bomAllowed = use the BOM if the
//   input starts with one, otherwise the default byte order; bomRequired = a missing BOM is an error.
// - defaultByteOrder: the byte order used when the BOM does not decide it.
// Returns the number of codepoints decoded (replacement characters are not counted), or
// size_t(-1) if a required BOM is missing and errorHandling is not uehThrow.
// Throws TUnicodeException on malformed input if errorHandling is uehThrow.
{
  IAssert(srcCount % 2 == 0);
  IAssert(bomHandling == bomAllowed || bomHandling == bomRequired || bomHandling == bomIgnored);
  IAssert(defaultByteOrder == boMachineEndian || defaultByteOrder == boBigEndian || defaultByteOrder == boLittleEndian);
  if (clrDest) dest.Clr();
  size_t nDecoded = 0;
  if (srcCount <= 0) return nDecoded;
  const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount;
  bool littleEndian = false;
  bool leDefault = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && IsMachineLittleEndian()));
  if (bomHandling == bomIgnored) littleEndian = leDefault;
  else if (bomHandling == bomAllowed || bomHandling == bomRequired)
  {
    // Look at the first two bytes to see whether they form a BOM.
    int byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff;
    if (byte1 == 0xfe && byte2 == 0xff) { littleEndian = false; if (skipBom) srcIdx += 2; }
    else if (byte1 == 0xff && byte2 == 0xfe) { littleEndian = true; if (skipBom) srcIdx += 2; }
    else if (bomHandling == bomAllowed) littleEndian = leDefault;
    else {
      // A BOM is required, but the input does not start with one: report an error.
      switch (errorHandling) {
      case uehThrow: throw TUnicodeException(srcIdx, byte1, "BOM expected at the beginning of the input vector (" + TInt::GetStr(byte1, "%02x") + " " + TInt::GetStr(byte2, "%02x") + " found instead).");
      case uehAbort: case uehReplace: case uehIgnore: return size_t(-1);
      default: Fail; }
    }
  }
  else Fail;
  while (srcIdx < srcEnd)
  {
    const size_t charSrcIdx = srcIdx;
    uint byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; srcIdx += 2;
    uint c = littleEndian ? (byte1 | (byte2 << 8)) : (byte2 | (byte1 << 8));
    if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023)
    {
      // c is the first character in a surrogate pair.  Read the next character.
      if (! (srcIdx + 2 <= srcEnd)) {
        switch (errorHandling) {
        case uehThrow: throw TUnicodeException(charSrcIdx, c, "The second character of a surrogate pair is missing.");
        case uehAbort: return nDecoded;
        case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
        case uehIgnore: continue;
        default: Fail; }
      }
      uint byte3 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte4 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; srcIdx += 2;
      uint c2 = littleEndian ? (byte3 | (byte4 << 8)) : (byte4 | (byte3 << 8));
      // c2 should be the second character of the surrogate pair.
      if (c2 < Utf16SecondSurrogate || Utf16SecondSurrogate + 1023 < c2) {
        switch (errorHandling) {
        // The format for c2 used to be "04x" (no '%'), so the offending value was never printed.
        case uehThrow: throw TUnicodeException(charSrcIdx + 2, c2, "The second character of a surrogate pair should be in the range " + TInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." + TInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + ", not " + TInt::GetStr(c2, "%04x") + ".");
        case uehAbort: return nDecoded;
        // With uehReplace and uehIgnore, we'll just skip the first character of the surrogate pair;
        // we'll process the second one during the next iteration, this time as an ordinary character.
        case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx -= 2; continue;
        case uehIgnore: srcIdx -= 2; continue;
        default: Fail; }
      }
      // c and c2 each contain 10 bits of information.
      uint cc = ((c - Utf16FirstSurrogate) << 10) | (c2 - Utf16SecondSurrogate);
      cc += 0x10000;
      dest.Add(TDestCh(cc)); nDecoded++; continue;
    }
    else if (strict && Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) {
      // In strict mode, a second-range surrogate that does not follow a first-range one is an error.
      switch (errorHandling) {
      case uehThrow: throw TUnicodeException(charSrcIdx, c, "This 16-bit value should be used only as the second character of a surrogate pair.");
      case uehAbort: return nDecoded;
      case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
      case uehIgnore: continue;
      default: Fail; }
    }
    // If 'c' is the first character in the input stream, and it's a BOM, we might have to skip it.
    if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom) continue;
    // Otherwise, store 'c' to the destination vector.
    dest.Add(TDestCh(c)); nDecoded++;
  }
  return nDecoded;
}
size_t TUniCodec::DecodeUtf16FromWords | ( | const TSrcVec & | src, |
size_t | srcIdx, | ||
const size_t | srcCount, | ||
TVec< TDestCh > & | dest, | ||
bool | clrDest, | ||
const TUtf16BomHandling | bomHandling = bomAllowed , |
||
const TUniByteOrder | defaultByteOrder = boMachineEndian |
||
) | const |
Definition at line 2289 of file unicode.h.
// Decodes UTF-16 data stored as 16-bit words (each element of 'src' is one code unit; only its
// low 16 bits are used) and appends the decoded codepoints to 'dest'.
// - src, srcIdx, srcCount: the input is src[srcIdx .. srcIdx + srcCount - 1].
// - dest: receives the decoded codepoints; it is cleared first if clrDest is true.
// - bomHandling: bomIgnored = always use the default byte order; bomAllowed = use the BOM if the
//   input starts with one, otherwise the default byte order; bomRequired = a missing BOM is an error.
// - defaultByteOrder: the byte order used when the BOM does not decide it.  If the byte order of
//   the data differs from that of the machine, the two bytes of every word are swapped.
// Returns the number of codepoints decoded (replacement characters are not counted), or
// size_t(-1) if a required BOM is missing and errorHandling is not uehThrow.
// Throws TUnicodeException on malformed input if errorHandling is uehThrow.
{
  IAssert(bomHandling == bomAllowed || bomHandling == bomRequired || bomHandling == bomIgnored);
  IAssert(defaultByteOrder == boMachineEndian || defaultByteOrder == boBigEndian || defaultByteOrder == boLittleEndian);
  if (clrDest) dest.Clr();
  size_t nDecoded = 0;
  if (srcCount <= 0) return nDecoded;
  const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount;
  bool swap = false;
  bool isMachineLe = IsMachineLittleEndian();
  bool isDefaultLe = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && isMachineLe));
  if (bomHandling == bomIgnored) swap = (isDefaultLe != isMachineLe);
  else if (bomHandling == bomAllowed || bomHandling == bomRequired)
  {
    // Look at the first word to see whether it is a BOM (0xfeff) or a byte-swapped BOM (0xfffe).
    int c = uint(src[TVecIdx(srcIdx)]) & 0xffff;
    if (c == 0xfeff) { swap = false; if (skipBom) srcIdx += 1; }
    else if (c == 0xfffe) { swap = true; if (skipBom) srcIdx += 1; }
    else if (bomHandling == bomAllowed) swap = (isMachineLe != isDefaultLe);
    else {
      // A BOM is required, but the input does not start with one: report an error.
      switch (errorHandling) {
      case uehThrow: throw TUnicodeException(srcIdx, c, "BOM expected at the beginning of the input vector (" + TInt::GetStr(c, "%04x") + " found instead).");
      case uehAbort: case uehReplace: case uehIgnore: return size_t(-1);
      default: Fail; }
    }
  }
  else Fail;
  while (srcIdx < srcEnd)
  {
    const size_t charSrcIdx = srcIdx;
    uint c = uint(src[TVecIdx(srcIdx)]) & 0xffffu; srcIdx++;
    if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8);
    if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023)
    {
      // c is the first character in a surrogate pair.  Read the next character.
      if (! (srcIdx < srcEnd)) {
        switch (errorHandling) {
        case uehThrow: throw TUnicodeException(charSrcIdx, c, "The second character of a surrogate pair is missing.");
        case uehAbort: return nDecoded;
        case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
        case uehIgnore: continue;
        default: Fail; }
      }
      uint c2 = uint(src[TVecIdx(srcIdx)]) & 0xffffu; srcIdx++;
      if (swap) c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8);
      // c2 should be the second character of the surrogate pair.
      if (c2 < Utf16SecondSurrogate || Utf16SecondSurrogate + 1023 < c2) {
        switch (errorHandling) {
        // The format for c2 used to be "04x" (no '%'), so the offending value was never printed.
        case uehThrow: throw TUnicodeException(charSrcIdx + 1, c2, "The second character of a surrogate pair should be in the range " + TInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." + TInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + ", not " + TInt::GetStr(c2, "%04x") + ".");
        case uehAbort: return nDecoded;
        // With uehReplace and uehIgnore, we'll just skip the first character of the surrogate pair;
        // we'll process the second one during the next iteration, this time as an ordinary character.
        case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx -= 1; continue;
        case uehIgnore: srcIdx -= 1; continue;
        default: Fail; }
      }
      // c and c2 each contain 10 bits of information.
      uint cc = ((c - Utf16FirstSurrogate) << 10) | (c2 - Utf16SecondSurrogate);
      cc += 0x10000;
      dest.Add(TDestCh(cc)); nDecoded++; continue;
    }
    else if (strict && Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) {
      // In strict mode, a second-range surrogate that does not follow a first-range one is an error.
      switch (errorHandling) {
      case uehThrow: throw TUnicodeException(charSrcIdx, c, "This 16-bit value should be used only as the second character of a surrogate pair.");
      case uehAbort: return nDecoded;
      case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
      case uehIgnore: continue;
      default: Fail; }
    }
    // If 'c' is the first character in the input stream, and it's a BOM, we might have to skip it.
    if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom) continue;
    // Otherwise, store 'c' to the destination vector.
    dest.Add(TDestCh(c)); nDecoded++;
  }
  return nDecoded;
}
size_t TUniCodec::DecodeUtf8 | ( | const TSrcVec & | src, |
size_t | srcIdx, | ||
const size_t | srcCount, | ||
TVec< TDestCh > & | dest, | ||
const bool | clrDest = true |
||
) | const |
Definition at line 2031 of file unicode.h.
// Decodes the UTF-8 bytes src[srcIdx .. srcIdx + srcCount - 1] and appends the resulting
// codepoints to 'dest' (which is cleared first if clrDest is true).
// Returns the number of codepoints decoded; replacement characters are not counted.
// Malformed input is handled according to 'errorHandling': throw a TUnicodeException, stop
// and return the count so far, emit 'replacementChar', or silently skip.
{
  size_t nDecoded = 0;
  if (clrDest) dest.Clr();
  const size_t origSrcIdx = srcIdx;
  const size_t srcEnd = srcIdx + srcCount;
  while (srcIdx < srcEnd)
  {
    const size_t seqStartIdx = srcIdx;
    uint c = src[TVecIdx(srcIdx)] & 0xff; srcIdx++;

    // 0xxxxxxx: one of the characters 0..0x7f, encoded as a single byte.
    if ((c & _1000_0000) == 0) {
      dest.Add(TDestCh(c)); nDecoded++; continue;
    }

    // 10xxxxxx: no character in a valid UTF-8 string begins with such a byte;
    // we must have been thrown into the middle of a multi-byte character.
    if ((c & _1100_0000) == _1000_0000) {
      switch (errorHandling) {
      case uehThrow: throw TUnicodeException(seqStartIdx, c, "Invalid character: 10xxxxxx.");
      case uehAbort: return nDecoded;
      case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
      case uehIgnore: continue;
      default: Fail; }
      continue;
    }

    // Otherwise c introduces a sequence of 2..6 bytes, depending on how many
    // of its most significant bits are set.
    uint nMoreBytes = 0, nBits = 0, minVal = 0;
    if ((c & _1110_0000) == _1100_0000) { nMoreBytes = 1; nBits = 5; minVal = 0x80; }
    else if ((c & _1111_0000) == _1110_0000) { nMoreBytes = 2; nBits = 4; minVal = 0x800; }
    else if ((c & _1111_1000) == _1111_0000) { nMoreBytes = 3; nBits = 3; minVal = 0x10000; }
    else if ((c & _1111_1100) == _1111_1000) { nMoreBytes = 4; nBits = 2; minVal = 0x200000; }
    else if ((c & _1111_1110) == _1111_1100) { nMoreBytes = 5; nBits = 1; minVal = 0x4000000; }
    else {
      // c is of the form 1111111x, which is invalid even in the early definitions of UTF-8
      // (which allowed codepoints up to 2^31 - 1).  In principle it could be used to encode
      // 32-bit integers with the msb set: 1aaabbbbccccddddeeeeffffgggghhhh could be encoded as
      // 1111111a 10aabbbb 10ccccdd 10ddeeee 10ffffgg 10gghhhh.
      if (strict) {
        switch (errorHandling) {
        case uehThrow: throw TUnicodeException(seqStartIdx, c, "Invalid character: 1111111x.");
        case uehAbort: return nDecoded;
        // For uehReplace and uehIgnore we still read the next 5 bytes and decode the value.
        // Since 'strict' is true, the 6-byte sequence is reported as an error further below
        // (inserting one replacement character for uehReplace), instead of producing one
        // replacement character here and five more for the 10xxxxxx bytes that follow.
        case uehReplace: break;
        case uehIgnore: break;
        default: Fail; }
      }
      nMoreBytes = 5; nBits = 2; minVal = 0x80000000u;
    }

    // Decode the multi-byte sequence, starting with the nBits least significant bits of c.
    uint codepoint = c & ((1 << nBits) - 1);
    bool giveUp = false;
    for (uint i = 0; i < nMoreBytes && ! giveUp; i++)
    {
      // See if there are enough bytes left in the source vector.
      if (! (srcIdx < srcEnd)) {
        switch (errorHandling) {
        case uehThrow: throw TUnicodeException(seqStartIdx, c, TInt::GetStr(nMoreBytes) + " more bytes expected, only " + TInt::GetStr(int(srcEnd - seqStartIdx - 1)) + " available.");
        case uehAbort: return nDecoded;
        case uehReplace: dest.Add(TDestCh(replacementChar)); giveUp = true; continue;
        case uehIgnore: giveUp = true; continue;
        default: Fail; }
      }
      // Read the next byte; it should be of the form 10xxxxxx.
      c = src[TVecIdx(srcIdx)] & 0xff; srcIdx++;
      if ((c & _1100_0000) != _1000_0000) {
        switch (errorHandling) {
        case uehThrow: throw TUnicodeException(seqStartIdx, c, "Byte " + TInt::GetStr(i) + " of " + TInt::GetStr(nMoreBytes) + " extra bytes should begin with 10xxxxxx.");
        case uehAbort: return nDecoded;
        // Step back so that the offending byte is examined again as the start of a new character.
        case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx--; giveUp = true; continue;
        case uehIgnore: srcIdx--; giveUp = true; continue;
        default: Fail; }
      }
      codepoint <<= 6; codepoint |= (c & _0011_1111);
    }
    if (giveUp) continue;

    if (strict) {
      // err1: the codepoint has been represented by more bytes than necessary (e.g. a value in
      // 0..127 stored in two or more bytes).  This happens, for example, in the "modified UTF-8"
      // sometimes used for Java serialization, where codepoint 0 is encoded as
      // 11000000 10000000 to avoid null bytes in the encoded stream.
      bool err1 = (codepoint < minVal);
      // err2: early definitions of UTF-8 allowed any 31-bit integer, using up to 6 bytes; later
      // this was restricted to 0..0x10ffff, so no more than 4 bytes are ever necessary.
      bool err2 = (nMoreBytes > 3 || (nMoreBytes == 3 && codepoint > 0x10ffff));
      if (err1 || err2) switch (errorHandling) {
        case uehThrow:
          if (err1) throw TUnicodeException(seqStartIdx, c, "The codepoint 0x" + TInt::GetStr(codepoint, "%08x") + " has been represented by too many bytes (" + TInt::GetStr(nMoreBytes + 1) + ").");
          else if (err2) throw TUnicodeException(seqStartIdx, c, "Invalid multibyte sequence: it decodes into 0x" + TInt::GetStr(codepoint, "%08x") + ", but only codepoints 0..0x10ffff are valid.");
          else { Fail; break; }
        case uehAbort: return nDecoded;
        case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
        case uehIgnore: continue;
        default: Fail; }
    }

    // Add the decoded codepoint to the destination vector -- unless it is a byte-order mark
    // (0xfffe or 0xfeff) at the very beginning of the input and skipBom is set.
    const bool isLeadingBom = skipBom && (codepoint == 0xfffe || codepoint == 0xfeff) && seqStartIdx == origSrcIdx;
    if (! isLeadingBom) { dest.Add(codepoint); nDecoded++; }
  } // while
  return nDecoded;
}
size_t TUniCodec::DecodeUtf8 | ( | const TSrcVec & | src, |
TVec< TDestCh > & | dest, | ||
const bool | clrDest = true |
||
) | const [inline] |
Definition at line 135 of file unicode.h.
// Convenience overload: decodes the whole of 'src' (indices 0 .. src.Len() - 1).
{
  const size_t firstIdx = 0;
  return DecodeUtf8(src, firstIdx, src.Len(), dest, clrDest);
}
size_t TUniCodec::EncodeUtf16ToBytes | ( | const TSrcVec & | src, |
size_t | srcIdx, | ||
const size_t | srcCount, | ||
TVec< TDestCh > & | dest, | ||
const bool | clrDest, | ||
const bool | insertBom, | ||
const TUniByteOrder | destByteOrder = boMachineEndian |
||
) | const |
Definition at line 2423 of file unicode.h.
// Encodes the codepoints src[srcIdx .. srcIdx + srcCount - 1] as UTF-16 and appends the result
// to 'dest' as bytes (two elements per 16-bit code unit, in the requested byte order).
// - clrDest: if true, 'dest' is cleared first (previously this flag was silently ignored).
// - insertBom: if true, a byte-order mark is written first; it is counted in the return value.
// - destByteOrder: byte order of the output; boMachineEndian means the byte order of this machine.
// Returns the number of items written (the BOM, if any, plus each successfully encoded
// codepoint); replacement characters are not counted.
// Codepoints above 0x10ffff and codepoints in either surrogate range are treated as errors
// and handled according to 'errorHandling'.
{
  bool isDestLe = (destByteOrder == boLittleEndian || (destByteOrder == boMachineEndian && IsMachineLittleEndian()));
  if (clrDest) dest.Clr();
  size_t nEncoded = 0, srcEnd = srcIdx + srcCount;
  if (insertBom) { dest.Add(isDestLe ? 0xff : 0xfe); dest.Add(isDestLe ? 0xfe : 0xff); nEncoded++; }
  // Writes the low 16 bits of the replacement character in the destination byte order.
#define ___OutRepl \
  if (isDestLe) { dest.Add(replacementChar & 0xff); dest.Add((replacementChar >> 8) & 0xff); } \
  else { dest.Add((replacementChar >> 8) & 0xff); dest.Add(replacementChar & 0xff); }
  while (srcIdx < srcEnd)
  {
    uint c = uint(src[TVecIdx(srcIdx)]); srcIdx++;
    if (! (c <= 0x10ffffu)) {
      switch (errorHandling) {
      case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 only supports characters in the range 0..10ffff (not " + TUInt::GetStr(c, "%08x") + ").");
      case uehAbort: return nEncoded;
      case uehReplace: ___OutRepl; continue;
      case uehIgnore: continue;
      default: Fail; }
    }
    // The surrogate ranges are 1024 values wide, so the upper bound is inclusive
    // (the test used to be '<', which let 0xdbff through).
    if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023) {
      switch (errorHandling) {
      case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 cannot encode " + TUInt::GetStr(c, "%04x") + " as it belongs to the first surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + ").");
      case uehAbort: return nEncoded;
      case uehReplace: ___OutRepl; continue;
      case uehIgnore: continue;
      default: Fail; }
    }
    // Same inclusive upper bound here (0xdfff used to slip through); the message now reports
    // the bounds of the second surrogate range instead of those of the first.
    // NOTE(review): the message mentions 'strict == true', but this check is applied regardless
    // of 'strict' -- confirm which of the two is intended.
    if (Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) {
      switch (errorHandling) {
      case uehThrow: throw TUnicodeException(srcIdx - 1, c, "The character " + TUInt::GetStr(c, "%04x") + " belongs to the second surrogate range (" + TUInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + "), which is not allowed with strict == true.");
      case uehAbort: return nEncoded;
      case uehReplace: ___OutRepl; continue;
      case uehIgnore: continue;
      default: Fail; }
    }
    // If c is <= 0xffff, it can be stored directly.
    if (c <= 0xffffu) {
      if (isDestLe) { dest.Add(c & 0xff); dest.Add((c >> 8) & 0xff); }
      else { dest.Add((c >> 8) & 0xff); dest.Add(c & 0xff); }
      nEncoded++; continue;
    }
    // Otherwise, represent c by a pair of surrogate characters.
    c -= 0x10000u; IAssert(/*0 <= c &&*/ c <= 0xfffffu);
    uint c1 = (c >> 10) & 1023, c2 = c & 1023;
    c1 += Utf16FirstSurrogate; c2 += Utf16SecondSurrogate;
    if (isDestLe) { dest.Add(c1 & 0xff); dest.Add((c1 >> 8) & 0xff); dest.Add(c2 & 0xff); dest.Add((c2 >> 8) & 0xff); }
    else { dest.Add((c1 >> 8) & 0xff); dest.Add(c1 & 0xff); dest.Add((c2 >> 8) & 0xff); dest.Add(c2 & 0xff); }
    nEncoded++; continue;
  }
#undef ___OutRepl
  return nEncoded;
}
size_t TUniCodec::EncodeUtf16ToWords | ( | const TSrcVec & | src, |
size_t | srcIdx, | ||
const size_t | srcCount, | ||
TVec< TDestCh > & | dest, | ||
const bool | clrDest, | ||
const bool | insertBom, | ||
const TUniByteOrder | destByteOrder = boMachineEndian |
||
) | const |
Definition at line 2371 of file unicode.h.
// Encodes the codepoints src[srcIdx .. srcIdx + srcCount - 1] as UTF-16 and appends the result
// to 'dest' as 16-bit words (one element per code unit).
// - clrDest: if true, 'dest' is cleared first (previously this flag was silently ignored).
// - insertBom: if true, a byte-order mark is written first; it is counted in the return value.
// - destByteOrder: if it names a byte order different from that of this machine, the two bytes
//   of every output word are swapped.
// Returns the number of items written (the BOM, if any, plus each successfully encoded
// codepoint); replacement characters are not counted.
// Codepoints above 0x10ffff and codepoints in either surrogate range are treated as errors
// and handled according to 'errorHandling'.
{
  bool isMachineLe = IsMachineLittleEndian();
  bool swap = (destByteOrder == boLittleEndian && ! isMachineLe) || (destByteOrder == boBigEndian && isMachineLe);
  if (clrDest) dest.Clr();
  size_t nEncoded = 0, srcEnd = srcIdx + srcCount;
  if (insertBom) { dest.Add(TDestCh(swap ? 0xfffeu : 0xfeffu)); nEncoded++; }
  while (srcIdx < srcEnd)
  {
    uint c = uint(src[TVecIdx(srcIdx)]); srcIdx++;
    if (! (c <= 0x10ffffu)) {
      switch (errorHandling) {
      case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 only supports characters in the range 0..10ffff (not " + TUInt::GetStr(c, "%08x") + ").");
      case uehAbort: return nEncoded;
      case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue;
      case uehIgnore: continue;
      default: Fail; }
    }
    // The surrogate ranges are 1024 values wide, so the upper bound is inclusive
    // (the test used to be '<', which let 0xdbff through).
    if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023) {
      switch (errorHandling) {
      case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 cannot encode " + TUInt::GetStr(c, "%04x") + " as it belongs to the first surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + ").");
      case uehAbort: return nEncoded;
      case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue;
      case uehIgnore: continue;
      default: Fail; }
    }
    // Same inclusive upper bound here (0xdfff used to slip through); the message now reports
    // the bounds of the second surrogate range instead of those of the first.
    // NOTE(review): the message mentions 'strict == true', but this check is applied regardless
    // of 'strict' -- confirm which of the two is intended.
    if (Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) {
      switch (errorHandling) {
      case uehThrow: throw TUnicodeException(srcIdx - 1, c, "The character " + TUInt::GetStr(c, "%04x") + " belongs to the second surrogate range (" + TUInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + "), which is not allowed with strict == true.");
      case uehAbort: return nEncoded;
      case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue;
      case uehIgnore: continue;
      default: Fail; }
    }
    // If c is <= 0xffff, it can be stored directly.
    if (c <= 0xffffu) {
      if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8);
      dest.Add(TDestCh(c)); nEncoded++; continue;
    }
    // Otherwise, represent c by a pair of surrogate characters.
    c -= 0x10000u; IAssert(/*0 <= c &&*/ c <= 0xfffffu);
    uint c1 = (c >> 10) & 1023, c2 = c & 1023;
    c1 += Utf16FirstSurrogate; c2 += Utf16SecondSurrogate;
    if (swap) {
      c1 = ((c1 >> 8) & 0xff) | ((c1 & 0xff) << 8);
      c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8);
    }
    dest.Add(TDestCh(c1));
    dest.Add(TDestCh(c2));
    nEncoded++; continue;
  }
  return nEncoded;
}
size_t TUniCodec::EncodeUtf8 | ( | const TSrcVec & | src, |
size_t | srcIdx, | ||
const size_t | srcCount, | ||
TVec< TDestCh > & | dest, | ||
const bool | clrDest = true |
||
) | const |
Definition at line 2147 of file unicode.h.
// Encodes the codepoints src[srcIdx .. srcIdx + srcCount - 1] as UTF-8 and appends the
// resulting bytes to 'dest'.
// - clrDest: if true, 'dest' is cleared first (previously this flag was silently ignored,
//   unlike in the decoding functions).
// Returns the number of codepoints encoded without error; a codepoint that was replaced by
// 'replacementChar' is written to 'dest' but not counted.
// In strict mode, codepoints above 0x10ffff are errors handled according to 'errorHandling';
// otherwise any 32-bit value is encoded, using up to 6 bytes.
{
  if (clrDest) dest.Clr();
  size_t nEncoded = 0;
  for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++)
  {
    uint c = uint(src[TVecIdx(srcIdx)]);
    bool err = false;
    if (strict && c > 0x10ffff) {
      err = true;
      switch (errorHandling) {
      case uehThrow: throw TUnicodeException(srcIdx, c, "Invalid character (0x" + TInt::GetStr(c, "%x") + "; only characters in the range 0..0x10ffff are allowed).");
      case uehAbort: return nEncoded;
      case uehReplace: c = replacementChar; break;  // encode the replacement character instead
      case uehIgnore: continue;
      default: Fail; }
    }
    // Choose the shortest form that can hold c; each continuation byte carries 6 bits.
    if (c < 0x80u)
      dest.Add(TDestCh(c & 0xffu));
    else if (c < 0x800u) {
      dest.Add(TDestCh(_1100_0000 | ((c >> 6) & _0001_1111)));
      dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
    else if (c < 0x10000u) {
      dest.Add(TDestCh(_1110_0000 | ((c >> 12) & _0000_1111)));
      dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
      dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
    else if (c < 0x200000u) {
      dest.Add(TDestCh(_1111_0000 | ((c >> 18) & _0000_0111)));
      dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
      dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
      dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
    else if (c < 0x4000000u) {
      dest.Add(TDestCh(_1111_1000 | ((c >> 24) & _0000_0011)));
      dest.Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111)));
      dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
      dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
      dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
    else {
      // Two bits are taken from c >> 30 here, so a value with the msb set yields a first byte
      // of the form 1111111x.
      dest.Add(TDestCh(_1111_1100 | ((c >> 30) & _0000_0011)));
      dest.Add(TDestCh(_1000_0000 | ((c >> 24) & _0011_1111)));
      dest.Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111)));
      dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
      dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
      dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
    if (! err) nEncoded++;
  }
  return nEncoded;
}
size_t TUniCodec::EncodeUtf8 | ( | const TSrcVec & | src, |
TVec< TDestCh > & | dest, | ||
const bool | clrDest = true |
||
) | const [inline] |
Definition at line 144 of file unicode.h.
// Convenience overload: encodes the whole of 'src' (indices 0 .. src.Len() - 1).
{
  const size_t firstIdx = 0;
  return EncodeUtf8(src, firstIdx, src.Len(), dest, clrDest);
}
TStr TUniCodec::EncodeUtf8Str | ( | const TSrcVec & | src, |
size_t | srcIdx, | ||
const size_t | srcCount | ||
) | const [inline] |
Definition at line 148 of file unicode.h.
// Encodes src[srcIdx .. srcIdx + srcCount - 1] as UTF-8 and returns the result as a TStr.
{
  TVec<char> temp;
  EncodeUtf8(src, srcIdx, srcCount, temp);
  // Terminate the buffer before handing it over as a C string, as the whole-vector overload
  // does.  Without this, the string constructor would read past the encoded bytes, and
  // temp[0] would be accessed on an empty vector when nothing was encoded.
  temp.Add(0);
  TStr retVal = &(temp[0]);
  return retVal;
}
TStr TUniCodec::EncodeUtf8Str | ( | const TSrcVec & | src | ) | const [inline] |
Definition at line 149 of file unicode.h.
// Encodes the whole of 'src' as UTF-8 and returns the result as a TStr.
{
  TVec<char> utf8Buf;
  EncodeUtf8(src, utf8Buf);
  utf8Buf.Add(0);  // terminating zero, so that the buffer can be read as a C string
  TStr encoded = &(utf8Buf[0]);
  return encoded;
}
uint TUniCodec::GetRndUint | ( | TRnd & | rnd | ) | [static, protected] |
Definition at line 62 of file unicode.cpp.
// Builds a random uint from four successive draws of rnd.GetUniDevUInt(256), each contributing
// 8 bits; the first draw ends up in the most significant position.
{
  uint result = 0;
  for (int byteNo = 0; byteNo < 4; byteNo++) {
    result = (result << 8) | (rnd.GetUniDevUInt(256) & 0xff);
  }
  return result;
}
uint TUniCodec::GetRndUint | ( | TRnd & | rnd, |
uint | minVal, | ||
uint | maxVal | ||
) | [static, protected] |
Definition at line 71 of file unicode.cpp.
// Returns a random uint from minVal..maxVal (both inclusive), using rejection sampling on
// top of GetRndUint(rnd).
{
  // The full range cannot be expressed as a count (maxVal - minVal + 1 would wrap to 0).
  if (minVal == TUInt::Mn && maxVal == TUInt::Mx) return GetRndUint(rnd);
  const uint range = maxVal - minVal + 1;
  const uint topBit = uint(1) << (8 * sizeof(uint) - 1);
  // Mask that keeps just enough low bits to cover 0..range-1; if 'range' exceeds the top
  // bit, no power of two fits into a uint, so all bits are kept.
  uint mask;
  if (range > topBit) mask = ~uint(0);
  else {
    mask = 1;
    while (mask < range) mask <<= 1;
    mask -= 1;
  }
  // Draw until the masked value falls below 'range'.
  for (;;) {
    const uint candidate = GetRndUint(rnd) & mask;
    if (candidate < range) return minVal + candidate;
  }
}
bool TUniCodec::IsMachineLittleEndian | ( | ) | [static, protected] |
Definition at line 83 of file unicode.cpp.
// Returns true if this machine stores the least significant byte of an int first.
// The answer is computed on the first call and cached in function-local statics.
{
  static bool isLE, initialized = false;
  if (initialized) return isLE;
  int i = 0x0201;
  const char *p = (const char *) (&i);
  const size_t n = sizeof(i);
  // Little-endian: the two low-order bytes (01, 02) come first.
  // Big-endian: they come last, as (02, 01).  The original code looked only at the first two
  // bytes, which are both 0 on a big-endian machine whose int is wider than 2 bytes, so
  // detection failed there.
  char c1 = p[0], c2 = p[1];
  char cPrev = p[n - 2], cLast = p[n - 1];
  if (c1 == 1 && c2 == 2) isLE = true;
  else if (cPrev == 2 && cLast == 1) isLE = false;
  else {
    FailR(("TUniCodec::IsMachineLittleEndian: c1 = " + TInt::GetStr(int(uchar(c1)), "%02x") + ", c2 = " + TInt::GetStr(int(uchar(c2)), "%02x") + ".").CStr());
    isLE = true; }
  initialized = true;
  return isLE;
}
static int TUniCodec::SwapBytes | ( | int | x | ) | [inline, static, protected] |
void TUniCodec::TestDecodeUtf16 | ( | TRnd & | rnd, |
const TStr & | testCaseDesc, | ||
const TUtf16BomHandling | bomHandling, | ||
const TUniByteOrder | defaultByteOrder, | ||
const bool | insertBom | ||
) | [protected] |
Definition at line 345 of file unicode.cpp.
// Builds a random UTF-16 input (as words) from 'testCaseDesc', predicts what the decoder should
// produce for the current codec settings, and passes both to TestUtf16.
//
// testCaseDesc consists of single characters or pairs of characters, 'c[e]', where:
// - 'c' selects the range from which the codepoint is taken ('A'..'E', 'X'..'Y');
// - 'e' (any character >= 'a') removes one word from the end of the encoded sequence
//   for this codepoint (absent = nothing removed).
{
  TIntV src;
  TIntV expectedDest;
  int expectedRetVal = 0;
  bool expectedAbort = false;
  FILE *f = 0;
  const bool isMachineLe = IsMachineLittleEndian();
  const bool isDefaultLe = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && isMachineLe));
  const bool swap = (isMachineLe != isDefaultLe);
  if (insertBom) {
    src.Add(swap ? 0xfffe : 0xfeff);
    if (! skipBom) { expectedRetVal += 1; expectedDest.Add(0xfeff); }
  }
  else if (bomHandling == bomRequired) {
    expectedAbort = true; expectedRetVal = -1;
  }
  const int descLen = testCaseDesc.Len();
  for (int i = 0; i < descLen; )
  {
    const char c = testCaseDesc[i++];
    uint cp = 0; int nWords = -1;
    // If you want a BOM at the beginning of your data, use insertBom -- if X and Y were permitted
    // here, predicting expectedDest and expectedRetVal would get more complicated.
    if (c == 'X' || c == 'Y') { IAssert(i > 1); }
    switch (c) {
    case 'A': // characters below the first surrogate range
      cp = GetRndUint(rnd, 0u, Utf16FirstSurrogate - 1); nWords = 1; break;
    case 'B': // the first surrogate range
      cp = GetRndUint(rnd, Utf16FirstSurrogate, Utf16FirstSurrogate + 1023); nWords = 1; break;
    case 'C': // the second surrogate range
      cp = GetRndUint(rnd, Utf16SecondSurrogate, Utf16SecondSurrogate + 1023); nWords = 1; break;
    case 'D': // above the second surrogate range, but still in the BMP (and not a BOM)
      do { cp = GetRndUint(rnd, Utf16SecondSurrogate + 1024, 0xffffu); } while (cp == 0xfffe || cp == 0xfeff);
      nWords = 1; break;
    case 'E': // above the BMP, but still within the range for UTF-16
      cp = GetRndUint(rnd, 0x10000u, 0x10ffffu); nWords = 2; break;
    case 'X': cp = 0xfffe; nWords = 1; break;
    case 'Y': cp = 0xfeff; nWords = 1; break;
    default: Fail; break;
    }
    if (c == 'B' && i < descLen) { IAssert(testCaseDesc[i] != 'C'); }
    // Process 'e'.
    int nToDel = 0;
    if (i < descLen) {
      const char e = testCaseDesc[i];
      if (e >= 'a') { i += 1; nToDel = 1; }
    }
    IAssert((nWords == 1 && nToDel == 0) || (nWords == 2 && (nToDel == 0 || nToDel == 1)));
    if (nWords == 2 && nToDel == 1 && i < descLen) { IAssert(testCaseDesc[i] != 'C'); }
    // Will an error occur during the decoding of this codepoint?
    const bool inFirstRange = (Utf16FirstSurrogate <= cp && cp <= Utf16FirstSurrogate + 1023);
    const bool inSecondRange = (Utf16SecondSurrogate <= cp && cp <= Utf16SecondSurrogate + 1023);
    if (cp > 0x10ffff) { Fail; }
    const bool errHere = inFirstRange || cp > 0x10ffff || nToDel > 0 || (strict && inSecondRange);
    // Update 'expectedDest' and 'expectedRetVal'.
    if (! expectedAbort) {
      if (errHere) {
        if (errorHandling == uehReplace) expectedDest.Add(replacementChar);
        if (errorHandling == uehAbort || errorHandling == uehThrow) expectedAbort = true;
      }
      else {
        const bool skippedBom = (src.Len() == 0 && (cp == 0xfffe || cp == 0xfeff) && skipBom);
        if (! skippedBom) { expectedDest.Add(cp); expectedRetVal += 1; }
      }
    }
    // Update 'src'.
    if (nWords == 1) src.Add(swap ? SwapBytes(cp) : cp);
    else {
      int c1 = ((cp - 0x10000) >> 10) & 1023; c1 += Utf16FirstSurrogate;
      int c2 = (cp - 0x10000) & 1023; c2 += Utf16SecondSurrogate;
      src.Add(swap ? SwapBytes(c1) : c1);
      if (nToDel == 0) src.Add(swap ? SwapBytes(c2) : c2);
    }
  }
  if (f) fprintf(f, "Test case: \"%s\"\n", testCaseDesc.CStr());
  TestUtf16(true, expectedRetVal, expectedAbort && (errorHandling == uehThrow), src, expectedDest, bomHandling, defaultByteOrder, false, f);
}
void TUniCodec::TestDecodeUtf8 | ( | TRnd & | rnd, |
const TStr & | testCaseDesc | ||
) | [protected] |
Definition at line 137 of file unicode.cpp.
{
  // Builds one randomized UTF-8 decoding test case from 'testCaseDesc', predicts
  // what the decoder should produce for it under the current settings (strict,
  // skipBom, errorHandling, replacementChar) and passes both to TestUtf8.
  //
  // Parameters:
  //   rnd          - random generator used to pick codepoints (and the bytes of 'Z').
  //   testCaseDesc - pairs or triples of characters, 'cd[e]', where:
  //     - 'c' defines the range from which the codepoint should be taken ('A'..'H', 'X'..'Z');
  //     - 'd' defines how many bytes the codepoint should be encoded with ('1'..'6');
  //     - 'e' defines how many bytes will be removed from the end of the encoded
  //       sequence for this codepoint (absent = 0, 'a' = 1, 'b' = 2 and so on, up to 'e' = 5).
  TIntV src;
  TIntV expectedDest;
  int expectedRetVal = 0;
  bool expectedAbort = false;
  FILE *f = 0; // set to stderr to get a trace of every test case

  for (int i = 0; i < testCaseDesc.Len(); )
  {
    IAssert(i + 2 <= testCaseDesc.Len());
    const char c = testCaseDesc[i], d = testCaseDesc[i + 1];
    i += 2;
    uint cp = 0;
    int nBytes = -1, minBytes = -1;
    bool eighties = false;
    IAssert('1' <= d && d <= '6');
    nBytes = d - '0';
    if (c == 'A') { cp = GetRndUint(rnd, 0u, 0x7fu); minBytes = 1; } // 1 byte
    else if (c == 'B') { cp = GetRndUint(rnd, 0x80u, 0x7ffu); minBytes = 2; } // 2 bytes
    else if (c == 'C') { cp = GetRndUint(rnd, 0x800u, 0xffffu); minBytes = 3; } // 3 bytes
    else if (c == 'D') { cp = GetRndUint(rnd, 0x10000u, 0x10ffffu); minBytes = 4; } // 4 bytes, valid Unicode
    else if (c == 'E') { cp = GetRndUint(rnd, 0x110000u, 0x1fffffu); minBytes = 4; } // 4 bytes, invalid Unicode
    else if (c == 'F') { cp = GetRndUint(rnd, 0x200000u, 0x3ffffffu); minBytes = 5; } // 5 bytes
    else if (c == 'G') { cp = GetRndUint(rnd, 0x4000000u, 0x7fffffffu); minBytes = 6; } // 6 bytes, 31 bits
    else if (c == 'H') { cp = GetRndUint(rnd, 0x80000000u, 0xffffffffu); minBytes = 6; } // 6 bytes, 32 bits
    else if (c == 'X') { cp = 0xfffe; minBytes = 3; }
    else if (c == 'Y') { cp = 0xfeff; minBytes = 3; }
    else if (c == 'Z') { eighties = true; minBytes = 1; } // insert several random 10xxxxxx bytes (= 0x80 | random(0..0x3f))
    else Fail;
    IAssert(nBytes >= minBytes);

    // Process 'e'.
    int nToDel = 0;
    if (i < testCaseDesc.Len())
    {
      const char e = testCaseDesc[i];
      if (e >= 'a' && e <= 'e') { i += 1; nToDel = e - 'a' + 1; }
    }
    IAssert(nToDel < nBytes);

    // Will an error occur during the decoding of this codepoint?
    // 0x10ffff is still inside range 'D' ("valid Unicode"), so only values
    // strictly above it count as out of range; this agrees with the
    // 'u > 0x10ffff' test used by the systematic loop in TestUtf8().
    bool errHere = false;
    if (eighties) errHere = true;
    else if (nToDel > 0) errHere = true;
    else if (strict && (cp > 0x10ffff || nBytes > minBytes)) errHere = true;

    // Update 'expectedDest' and 'expectedRetVal'. Once an abort has been
    // predicted nothing more is expected in the output, but 'src' keeps growing.
    if (! expectedAbort)
    {
      if (! errHere)
      {
        // A BOM at the very start of the input is dropped when skipBom is set.
        if (src.Len() == 0 && (cp == 0xfffe || cp == 0xfeff) && skipBom) { }
        else { expectedDest.Add(cp); expectedRetVal += 1; }
      }
      else if (errorHandling == uehReplace)
      {
        // Each stray byte of a 'Z' run is expected to be replaced on its own.
        if (eighties) for (int j = 0; j < nBytes; j++) expectedDest.Add(replacementChar);
        else expectedDest.Add(replacementChar);
      }
      if (errHere && (errorHandling == uehAbort || errorHandling == uehThrow)) expectedAbort = true;
    }

    // Update 'src'.
    if (eighties)
    {
      // Only continuation bytes (0x80..0xbf), as the description of 'Z' states.
      // The one-replacement-per-byte expectation above presumably would not
      // hold if a lead byte (0xc0 and up) were generated here.
      for (int j = 0; j < nBytes; j++) src.Add(GetRndUint(rnd, 0x80, 0xbf));
    }
    else if (nBytes == 1) src.Add(cp);
    else
    {
      // Lead byte: nBytes one-bits in the top positions, then the highest bits of cp.
      int mask = (1 << nBytes) - 1;
      mask <<= (8 - nBytes);
      src.Add(mask | (uint(cp) >> (6 * (nBytes - 1))));
      // Continuation bytes, six bits each; the last nToDel of them are left out.
      for (int j = 1; j < nBytes - nToDel; j++)
        src.Add(0x80 | ((cp >> (6 * (nBytes - j - 1))) & _0011_1111));
    }
  }
  if (f) fprintf(f, "Test case: \"%s\"\n", testCaseDesc.CStr());
  TestUtf8(true, expectedRetVal, expectedAbort && (errorHandling == uehThrow), src, expectedDest, f);
}
void TUniCodec::TestUtf16 | ( | bool | decode, |
size_t | expectedRetVal, | ||
bool | expectedThrow, | ||
const TIntV & | src, | ||
const TIntV & | expectedDest, | ||
const TUtf16BomHandling | bomHandling, | ||
const TUniByteOrder | defaultByteOrder, | ||
const bool | insertBom, | ||
FILE * | f | ||
) | [protected] |
Definition at line 288 of file unicode.cpp.
{
  // Runs one UTF-16 test twice -- first through the word-based codec entry
  // point, then through the byte-based one -- and asserts that the return
  // value and the output match the expectation.
  //
  // Parameters:
  //   decode           - true: decode 'src'; false: encode 'src'.
  //   expectedRetVal   - value the codec call must return.
  //   expectedThrow    - whether a TUnicodeException is expected.
  //   src              - input; also converted with WordsToBytes for the
  //                      byte-based decoder.
  //   expectedDest     - expected output; its WordsToBytes form is used when
  //                      encoding to bytes.
  //   bomHandling      - forwarded to the decoders.
  //   defaultByteOrder - forwarded to the decoders and encoders.
  //   insertBom        - forwarded to the encoders.
  //   f                - optional trace stream; nothing is traced when it is 0.
  TIntV srcBytes, expectedDestBytes;
  WordsToBytes(src, srcBytes);
  WordsToBytes(expectedDest, expectedDestBytes);
  TIntV dest;
  if (f)
  {
    fprintf(f, "Settings: %s %s %s %s %s replacementChar = %x \n",
      (errorHandling == uehAbort ? "abort" : errorHandling == uehThrow ? "throw" : errorHandling == uehIgnore ? "ignore" : errorHandling == uehReplace ? "replace" : "????"),
      (strict ? "STRICT" : ""),
      (decode ? (skipBom ? "skipBom" : "") : (insertBom ? "insrtBom" : "")),
      (bomHandling == bomAllowed ? "bomAllowed" : bomHandling == bomRequired ? "bomRequired" : "bomIgnored"),
      (defaultByteOrder == boBigEndian ? "boBigEndian" : defaultByteOrder == boLittleEndian ? "boLittleEndian" : "boMachineEndian"),
      uint(replacementChar));
    fprintf(f, "src: ");
    for (int i = 0; i < src.Len(); i++) fprintf(f, (decode ? " %04x" : " %x"), uint(src[i]));
  }
  for (int useBytes = 0; useBytes < 2; useBytes++)
  {
    const char *fmt = (useBytes ? " %02x" : " %04x");
    // The byte-based encoder is compared against the byte form of the
    // expectation; every other combination uses 'expectedDest' as given.
    // Declared outside the try block so that the handler traces the same list.
    const TIntV& ed = (useBytes && ! decode ? expectedDestBytes : expectedDest);
    try
    {
      dest.Clr();
      size_t retVal;
      if (! useBytes)
      {
        if (decode) retVal = DecodeUtf16FromWords(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder);
        else retVal = EncodeUtf16ToWords(src, 0, src.Len(), dest, true, insertBom, defaultByteOrder);
      }
      else
      {
        if (decode) retVal = DecodeUtf16FromBytes(srcBytes, 0, srcBytes.Len(), dest, true, bomHandling, defaultByteOrder);
        else retVal = EncodeUtf16ToBytes(src, 0, src.Len(), dest, true, insertBom, defaultByteOrder);
      }
      if (f)
      {
        fprintf(f, "\n -> dest: ");
        for (int i = 0; i < dest.Len(); i++) fprintf(f, (decode ? " %x" : fmt), uint(dest[i]));
        fprintf(f, "\n expDest ");
        for (int i = 0; i < ed.Len(); i++) fprintf(f, (decode ? " %x" : fmt), uint(ed[i]));
        fprintf(f, "\n retVal = %llu (expected %llu)\n", static_cast<long long unsigned int> (retVal), static_cast<long long unsigned int> (expectedRetVal));
      }
      // Print a marker on stdout before the assertions fire, so that a
      // mismatch is visible even when no trace stream is given.
      bool ok = true;
      if (retVal != expectedRetVal) ok = false;
      if (dest.Len() != ed.Len()) ok = false;
      if (ok) for (int i = 0; i < dest.Len(); i++) if (dest[i] != ed[i]) ok = false;
      if (! ok) { printf("!!!\n"); }
      IAssert(retVal == expectedRetVal);
      IAssert(! expectedThrow); // reaching this point means the codec did not throw
      IAssert(dest.Len() == ed.Len());
      for (int i = 0; i < dest.Len(); i++) IAssert(dest[i] == ed[i]);
    }
    catch (const TUnicodeException& e) // by const reference: no copy of the exception object
    {
      if (f)
      {
        fprintf(f, "\n -> expDest ");
        for (int i = 0; i < ed.Len(); i++) fprintf(f, (decode ? " %x" : fmt), uint(ed[i]));
        fprintf(f, "\n exception \"%s\" at %d (char 0x%02x)\n", e.message.CStr(), int(e.srcIdx), uint(e.srcChar));
      }
      IAssert(expectedThrow);
    }
  }
}
void TUniCodec::TestUtf16 | ( | ) |
Definition at line 412 of file unicode.cpp.
{
  // Self-test of the UTF-16 codec. For every combination of skipBom, strict,
  // error-handling mode, BOM handling, byte order and insertBom it
  //   1. runs a fixed list of randomized decoding test cases (TestDecodeUtf16), and
  //   2. walks through the codepoints close to powers of two, checking the
  //      decoder and the encoder on each of them (TestUtf16 with arguments).
  // Note that this overwrites the members strict, errorHandling and skipBom
  // and does not restore them afterwards.
  for (int skipBom_ = 0; skipBom_ < 2; skipBom_++)
  for (int strict_ = 0; strict_ < 2; strict_++)
  for (int errMode_ = 0; errMode_ < 4; errMode_++)
  for (int bomHandling_ = 0; bomHandling_ < 3; bomHandling_++)
  for (int byteOrder_ = 0; byteOrder_ < 3; byteOrder_++)
  for (int insertBom_ = 0; insertBom_ < 2; insertBom_++)
  {
    strict = (strict_ == 1);
    errorHandling = TUnicodeErrorHandling(errMode_);
    skipBom = (skipBom_ == 1);
    bool insertBom = (insertBom_ == 1);
    TUniByteOrder byteOrder = (TUniByteOrder) byteOrder_;
    TUtf16BomHandling bomHandling = (TUtf16BomHandling) bomHandling_;
    // Whether words have to be byte-swapped depends only on the byte order
    // under test, so it is worked out once per combination.
    // (Assumes IsMachineLittleEndian() has no side effects -- TODO confirm.)
    const bool isMachineLe = IsMachineLittleEndian();
    const bool isDestLe = (byteOrder == boLittleEndian || (byteOrder == boMachineEndian && isMachineLe));
    const bool swap = (isMachineLe != isDestLe);
    TRnd rnd = TRnd(123); // same seed for every combination

    // Test DecodeUtf16 on various random UTF-16-encoded sequences.
    for (int i = 0; i < 10; i++)
    {
      TestDecodeUtf16(rnd, "A", bomHandling, byteOrder, insertBom);
      TestDecodeUtf16(rnd, "AAA", bomHandling, byteOrder, insertBom);
      TestDecodeUtf16(rnd, "B", bomHandling, byteOrder, insertBom);
      TestDecodeUtf16(rnd, "DDAADADAAADDDAA", bomHandling, byteOrder, insertBom);
      TestDecodeUtf16(rnd, "DEEEDAAEEDADEEAAEEADEEDDAA", bomHandling, byteOrder, insertBom);
      TestDecodeUtf16(rnd, "DEaEaEDAAEaEDADEaEAAEEADEEDDAA", bomHandling, byteOrder, insertBom);
      TestDecodeUtf16(rnd, "CABDEBACCEaB", bomHandling, byteOrder, insertBom);
      TestDecodeUtf16(rnd, "EaEEEEaBBACABXABYXXEaYDDXBDCEA", bomHandling, byteOrder, insertBom);
      TestDecodeUtf16(rnd, "EaEEEEaBDCAAXADYXXEaYDDXDCEA", bomHandling, byteOrder, insertBom);
    }

    // Test both DecodeUtf16 and EncodeUtf16 systematically on various characters
    // close to powers of 2.
    TIntV src, expectedDest, src2;
    expectedDest.Gen(1);
    src.Reserve(6);
    src2.Gen(1);
    for (int pow = 8; pow <= 32; pow++)
    {
      // Range to visit: [0, 2^8] for pow == 8, the top 2^8 + 1 values of uint for
      // pow == 32 (where 1u << pow cannot be formed), otherwise 2^pow -/+ 2^8.
      uint uFrom, uTo;
      if (pow == 8) uFrom = 0, uTo = 1u << pow;
      else if (pow == 32) uFrom = TUInt::Mx - (1u << 8), uTo = TUInt::Mx;
      else uFrom = (1u << pow) - (1u << 8), uTo = (1u << pow) + (1u << 8);
      printf("%u..%u \r", uFrom, uTo);
      // The loop has no condition because uTo may be the largest uint; it is
      // left through the 'break' at the bottom.
      for (uint u = uFrom; ; u++)
      {
        int nWords = 0;
        if (u < 0x10000) nWords = 1;
        else nWords = 2;
        bool err = (u > 0x10ffff) || (Utf16FirstSurrogate <= u && u <= Utf16FirstSurrogate + 1023) || (strict && Utf16SecondSurrogate <= u && u <= Utf16SecondSurrogate + 1023);
        // NOTE(review): for a second-half surrogate in strict mode 'err' is
        // true, so no room for 'u' is counted in the length here, yet the
        // block below still stores 'u' into 'src'. Presumably none of the
        // ranges visited by this loop contains such a value (assuming
        // Utf16SecondSurrogate is 0xdc00) -- confirm before widening the ranges.
        src.Gen(3, (err ? 0 : nWords) + (insertBom ? 1 : 0));
        if (insertBom) src[0] = (swap ? 0xfffe : 0xfeff);
        if (! ((u > 0x10ffff) || (Utf16FirstSurrogate <= u && u <= Utf16FirstSurrogate + 1023)))
        {
          // Try to encode 'u' and see if it gets decoded correctly.
          if (nWords == 1) src[insertBom ? 1 : 0] = (swap ? SwapBytes(u) : u);
          else
          {
            int u1 = Utf16FirstSurrogate + (((u - 0x10000) >> 10) & 1023);
            int u2 = Utf16SecondSurrogate + ((u - 0x10000) & 1023);
            src[insertBom ? 1 : 0] = (swap ? SwapBytes(u1) : u1);
            src[insertBom ? 2 : 1] = (swap ? SwapBytes(u2) : u2);
          }
          if (! ((u == 0xfffe || u == 0xfeff) && bomHandling == bomAllowed && ! insertBom)) // this will just create a mess when decoding
          {
            expectedDest.Reserve(2, 0);
            if (insertBom && ! skipBom) expectedDest.Add(0xfeff);
            if (err && errorHandling == uehReplace) expectedDest.Add(replacementChar);
            else if (! err) expectedDest.Add(u);
            int erv = (err ? 0 : expectedDest.Len());
            if (skipBom && (u == 0xfeff || u == 0xfffe) && ! insertBom) expectedDest.Clr(), erv = 0;
            bool errD = err;
            if (bomHandling == bomRequired && ! insertBom)
            {
              // A BOM is required but none was inserted: 'u' itself has to act
              // as the BOM, anything else is expected to fail.
              expectedDest.Clr(false);
              if (u == 0xfeff || u == 0xfffe)
              {
                erv = (skipBom ? 0 : 1);
                if (! skipBom) expectedDest.Add(0xfeff);
              }
              else
              {
                erv = -1; // reaches TestUtf16 as size_t(-1)
                errD = true;
              }
            }
            TestUtf16(true, erv, (errD && errorHandling == uehThrow), src, expectedDest, bomHandling, byteOrder, insertBom, 0);
          }
        }
        // We can also test the UTF-16 encoder.
        src2[0] = u;
        if (err)
        {
          // Expected encoder output for a codepoint that cannot be encoded:
          // the BOM (if requested), followed by the replacement character in
          // replace mode.
          src.Clr(false);
          if (insertBom) src.Add(swap ? 0xfffe : 0xfeff);
          if (errorHandling == uehReplace)
            src.Add(swap ? SwapBytes(replacementChar) : replacementChar);
        }
        TestUtf16(false, (err ? 0 : 1) + (insertBom ? 1 : 0), (err && errorHandling == uehThrow), src2, src, bomHandling, byteOrder, insertBom, 0);
        if (u == uTo) break;
      }
    }
  }
}
void TUniCodec::TestUtf8 | ( | bool | decode, |
size_t | expectedRetVal, | ||
bool | expectedThrow, | ||
const TIntV & | src, | ||
const TIntV & | expectedDest, | ||
FILE * | f | ||
) | [protected] |
Definition at line 103 of file unicode.cpp.
{
  // Runs one UTF-8 test: decodes or encodes 'src' and asserts that the return
  // value and the output match the expectation.
  //
  // Parameters:
  //   decode         - true: call DecodeUtf8 on 'src'; false: call EncodeUtf8.
  //   expectedRetVal - value the codec call must return.
  //   expectedThrow  - whether a TUnicodeException is expected.
  //   src            - input sequence.
  //   expectedDest   - expected output sequence.
  //   f              - optional trace stream; nothing is traced when it is 0.
  TIntV dest;
  if (f)
  {
    fprintf(f, "Settings: %s %s %s replacementChar = %x\n",
      (errorHandling == uehAbort ? "abort" : errorHandling == uehThrow ? "throw" : errorHandling == uehIgnore ? "ignore" : errorHandling == uehReplace ? "replace" : "????"),
      (strict ? "STRICT" : ""),
      (skipBom ? "skipBom" : ""),
      uint(replacementChar));
    fprintf(f, "src: ");
    for (int i = 0; i < src.Len(); i++) fprintf(f, (decode ? " %02x" : " %x"), uint(src[i]));
  }
  try
  {
    size_t retVal = (decode ? DecodeUtf8(src, 0, src.Len(), dest, true)
                            : EncodeUtf8(src, 0, src.Len(), dest, true));
    if (f)
    {
      fprintf(f, "\n -> dest: ");
      for (int i = 0; i < dest.Len(); i++) fprintf(f, (decode ? " %x" : " %02x"), uint(dest[i]));
      fprintf(f, "\n expDest ");
      for (int i = 0; i < expectedDest.Len(); i++) fprintf(f, (decode ? " %x" : " %02x"), uint(expectedDest[i]));
      fprintf(f, "\n retVal = %llu (expected %llu)\n", static_cast<long long unsigned int> (retVal), static_cast<long long unsigned int> (expectedRetVal));
    }
    // The "!!!" marker goes to stdout before the matching assertion fires, so
    // that a mismatch is visible even when no trace stream is given.
    if (retVal != expectedRetVal) printf("!!!");
    IAssert(retVal == expectedRetVal);
    IAssert(! expectedThrow); // reaching this point means the codec did not throw
    if (dest.Len() != expectedDest.Len()) printf("!!!");
    IAssert(dest.Len() == expectedDest.Len());
    for (int i = 0; i < dest.Len(); i++) IAssert(dest[i] == expectedDest[i]);
  }
  catch (const TUnicodeException& e) // by const reference: no copy of the exception object
  {
    if (f)
    {
      fprintf(f, "\n -> expDest ");
      for (int i = 0; i < expectedDest.Len(); i++) fprintf(f, " %x", uint(expectedDest[i]));
      fprintf(f, "\n exception \"%s\" at %d (char 0x%02x)\n", e.message.CStr(), int(e.srcIdx), uint(e.srcChar));
    }
    IAssert(expectedThrow);
  }
}
void TUniCodec::TestUtf8 | ( | ) |
Definition at line 198 of file unicode.cpp.
{
  // Self-test of the UTF-8 codec. For each of the 2 x 2 x 4 combinations of
  // skipBom, strict and error-handling mode it
  //   1. runs a fixed list of randomized decoding test cases (TestDecodeUtf8), and
  //   2. walks through the codepoints close to powers of two, checking the
  //      decoder and the encoder on each of them (TestUtf8 with arguments).
  // The members strict, errorHandling and skipBom are overwritten and not restored.

  // UTF-8 form of the replacement character: what the encoder is expected to
  // emit in replace mode for a codepoint it rejects.
  TIntV utf8ReplCh;
  EncodeUtf8((TVectorBuilder(), replacementChar).v, 0, 1, utf8ReplCh, true);

  // Descriptions handed to TestDecodeUtf8, in this order, in every round.
  static const char* const decodeCases[] = {
    "X3A1A2A3A4A5A6B2B3B4B5B6C3C4C5C6D4D5D6E5E6F6G6",
    "X3A5dA6d",
    "X3A1B2C3D4E4F5A1G6H6Y3X3A1",
    "X3A1B2C3D4E4F5A2G6H6Y3X3A1",
    "Y3A1B2C3D4E4F5A1G6H6Y3X3A1",
    "A1B2C3D4E4F5A1G6H6Y3X3A1",
    "G6A1A1D4E4A1B2",
    "D4A1A1C3A1B2A1B2",
    "D4A1A1C3A1B2A1B2D4a",
    "X3A1B2C3D5E4F5A1G6H6Y3X3A1",
    "X3A1B2C3D4E5F5A1G6H6Y3X3A1",
    "X3A1B2C3D4aE4F5A1G6H6Y3X3A1",
    "X3A1B2C3D4bE4F5A1G6H6Y3X3A1",
    "X3A2aA3aA4aA5aA6aB2aB3aB4aB5aB6aC3aC4aC5aC6aD4aD5aD6aE5aE6aF6aG6a",
    "X3A3bA4bA5bA6aB3bB4bB5bB6bC3bC4bC5bC6bD4bD5bD6bE5bE6bF6bG6b",
    "X3A4cA5cA6cB4cB5cB6cC4cC5cC6cD4cD5cD6cE5cE6cF6cG6c",
    "X3A5dA6dB5dB6dC5dC6dD5dD6dE5dE6dF6dG6d",
    "X3A6eB6eC6eD6eE6eF6eG6e"
  };
  const int nDecodeCases = int(sizeof(decodeCases) / sizeof(decodeCases[0]));

  // Largest bit count that still fits into 1, 2, 3, 4 and 5 bytes; anything
  // wider takes 6 bytes.
  static const int bitLimits[] = { 7, 11, 16, 21, 26 };

  // combo enumerates (skipBom, strict, errMode) with skipBom varying slowest
  // and errMode fastest.
  for (int combo = 0; combo < 2 * 2 * 4; combo++)
  {
    const int skipBomFlag = combo / 8;
    const int strictFlag = (combo / 4) % 2;
    const int errMode = combo % 4;
    strict = (strictFlag == 1);
    errorHandling = TUnicodeErrorHandling(errMode);
    skipBom = (skipBomFlag == 1);
    TRnd rnd = TRnd(123); // same seed for every combination

    // Part 1: DecodeUtf8 on random UTF-8-encoded sequences, ten rounds.
    for (int round = 0; round < 10; round++)
      for (int k = 0; k < nDecodeCases; k++)
        TestDecodeUtf8(rnd, decodeCases[k]);

    // Part 2: DecodeUtf8 and EncodeUtf8 on the characters close to powers of 2.
    TIntV src, expectedDest, src2;
    expectedDest.Gen(1);
    src.Reserve(6);
    src2.Gen(1);
    for (int exponent = 8; exponent <= 32; exponent++)
    {
      // [0, 2^8] for the first exponent, the top 2^8 + 1 values of uint for the
      // last one (1u << 32 cannot be formed), otherwise 2^exponent -/+ 2^8.
      uint uFrom, uTo;
      if (exponent == 8)
      {
        uFrom = 0;
        uTo = 1u << exponent;
      }
      else if (exponent == 32)
      {
        uFrom = TUInt::Mx - (1u << 8);
        uTo = TUInt::Mx;
      }
      else
      {
        uFrom = (1u << exponent) - (1u << 8);
        uTo = (1u << exponent) + (1u << 8);
      }
      printf("%u..%u \r", uFrom, uTo);

      // No loop condition: uTo may be the largest uint, so the loop is left
      // through the 'break' at the bottom.
      for (uint u = uFrom; ; ++u)
      {
        // Shortest UTF-8 length that can hold 'u'.
        int nBytes = 6;
        for (int k = 0; k < 5; k++)
          if (u < (1u << bitLimits[k])) { nBytes = k + 1; break; }

        // Encode 'u' by hand into 'src'.
        src.Gen(6, nBytes);
        if (nBytes == 1)
        {
          src[0] = u;
        }
        else
        {
          int shift = 6 * (nBytes - 1);
          src[0] = (((1 << nBytes) - 1) << (8 - nBytes)) | (u >> shift);
          for (int pos = 1; pos < nBytes; pos++)
          {
            shift -= 6;
            src[pos] = 0x80 | ((u >> shift) & _0011_1111);
          }
        }

        // Only strict mode rejects values above 0x10ffff.
        const bool err = (strict && u > 0x10ffff);
        const bool wantThrow = (err && errorHandling == uehThrow);

        // Decoder expectation.
        expectedDest.Reserve(1, 0);
        if (! err) expectedDest.Add(u);
        else if (errorHandling == uehReplace) expectedDest.Add(replacementChar);
        int erv = (err ? 0 : 1);
        if (skipBom && (u == 0xfeff || u == 0xfffe))
        {
          expectedDest.Clr();
          erv = 0;
        }
        TestUtf8(true, erv, wantThrow, src, expectedDest, 0);

        // Encoder expectation: the hand-made encoding, or for a rejected
        // codepoint the encoded replacement character / nothing.
        src2[0] = u;
        if (err)
        {
          if (errorHandling == uehReplace) src = utf8ReplCh;
          else src.Clr(false);
        }
        TestUtf8(false, (err ? 0 : 1), wantThrow, src2, src, 0);

        if (u == uTo) break;
      }
    }
  }
}
void TUniCodec::WordsToBytes | ( | const TIntV & | src, |
TIntV & | dest | ||
) | [protected] |
Definition at line 278 of file unicode.cpp.
friend class TUniCaseFolding [friend] |
bool TUniCodec::skipBom |
bool TUniCodec::strict |