SNAP Library 2.1, Developer Reference  2013-09-25 10:47:25
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
unicode.h File Reference
#include "bd.h"
#include <new>
Include dependency graph for unicode.h:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Classes

class  TUnicodeException
class  TUniCodec
class  TUniCaseFolding
class  TCodecBase
class  TCodecWrapper< TCodecImpl_ >
class  TVecElt< TVector_ >
class  TVecElt< TVec< TDat > >
class  TVecElt< TChA >
class  TEncoding_ISO8859_1
class  TEncoding_ISO8859_2
class  TEncoding_ISO8859_3
class  TEncoding_ISO8859_4
class  TEncoding_YuAscii
class  TEncoding_CP437
class  TEncoding_CP852
class  TEncoding_CP1250
class  T8BitCodec< TEncoding_ >
class  TUniChInfo
class  TUniTrie< TItem_ >
class  TUniTrie< TItem_ >::TNode
class  TUniChDb
class  TUniChDb::TUcdFileReader
class  TUniChDb::TSubcatHelper
class  TUnicode

Defines

#define DefineByte(b7, b6, b5, b4, b3, b2, b1, b0)   _ ## b7 ## b6 ## b5 ## b4 ## _ ## b3 ## b2 ## b1 ## b0 = (b7 << 7) | (b6 << 6) | (b5 << 5) | (b4 << 4) | (b3 << 3) | (b2 << 2) | (b1 << 1) | b0
#define DefineUniCat(cat, c)   uc ## cat = (int(uchar(c)) & 0xff)
#define DefineUniSubCat(cat, subCat, c)   uc ## cat ## subCat = ((uc ## cat) << 8) | (int(uchar(c)) & 0xff)
#define ___UniFwd1(name)   bool name(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].name(); }
#define ___UniFwd2(name1, name2)   ___UniFwd1(name1) ___UniFwd1(name2)
#define ___UniFwd3(name1, name2, name3)   ___UniFwd2(name1, name2) ___UniFwd1(name3)
#define ___UniFwd4(name1, name2, name3, name4)   ___UniFwd3(name1, name2, name3) ___UniFwd1(name4)
#define ___UniFwd5(name1, name2, name3, name4, name5)   ___UniFwd4(name1, name2, name3, name4) ___UniFwd1(name5)
#define DECLARE_FORWARDED_PROPERTY_METHODS
#define ___UniFwd1(name)   bool name(const int cp) const { return ucd.name(cp); }
#define ___OutRepl   if (isDestLe) { dest.Add(replacementChar & 0xff); dest.Add((replacementChar >> 8) & 0xff); } else { dest.Add((replacementChar >> 8) & 0xff); dest.Add(replacementChar & 0xff); }
#define TestCurNext(curFlag, nextFlag)   if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue
#define TestCurNext2(curFlag, nextFlag, next2Flag)   if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag && (wbfNext2 & next2Flag) == next2Flag) continue
#define TestPrevCurNext(prevFlag, curFlag, nextFlag)   if ((wbfPrev & prevFlag) == prevFlag && (wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue
#define TestCur(curFlag)   ((sbfCur & ucfSb##curFlag) == ucfSb##curFlag)
#define Trans(curFlag, newState)   if (TestCur(curFlag)) { backState = st##newState; break; }
#define IsPeekAheadSkippable(sbf)   ((sbf & (ucfSbOLetter | ucfSbUpper | ucfSbLower | ucfSbSep | ucfSbSTerm | ucfSbATerm)) == 0)
#define TestCurNext(curFlag, nextFlag)   if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue
#define TestCurNext2(curFlag, nextFlag, next2Flag)   if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag && (sbfNext2 & next2Flag) == next2Flag) continue
#define TestPrevCurNext(prevFlag, curFlag, nextFlag)   if ((sbfPrev & prevFlag) == prevFlag && (sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue

Typedefs

typedef int TUniVecIdx
typedef enum TUnicodeErrorHandling_ TUnicodeErrorHandling
typedef enum TUniByteOrder_ TUniByteOrder
typedef enum TUtf16BomHandling_ TUtf16BomHandling
typedef THash< TInt, TIntV > TIntIntVH
typedef TPt< TCodecBase > PCodecBase
typedef TVec< PCodecBase > TCodecBaseV
typedef T8BitCodec< TEncoding_ISO8859_1 > TCodec_ISO8859_1
typedef T8BitCodec< TEncoding_ISO8859_2 > TCodec_ISO8859_2
typedef T8BitCodec< TEncoding_ISO8859_3 > TCodec_ISO8859_3
typedef T8BitCodec< TEncoding_ISO8859_4 > TCodec_ISO8859_4
typedef T8BitCodec< TEncoding_CP852 > TCodec_CP852
typedef T8BitCodec< TEncoding_CP437 > TCodec_CP437
typedef T8BitCodec< TEncoding_CP1250 > TCodec_CP1250
typedef T8BitCodec< TEncoding_YuAscii > TCodec_YuAscii
typedef enum TUniChCategory_ TUniChCategory
typedef enum TUniChSubCategory_ TUniChSubCategory
typedef enum TUniChFlags_ TUniChFlags
typedef enum TUniChProperties_ TUniChProperties
typedef enum TUniChPropertiesX_ TUniChPropertiesX

Enumerations

enum  TUnicodeErrorHandling_ { uehIgnore = 0, uehThrow = 1, uehReplace = 2, uehAbort = 3 }
enum  TUniByteOrder_ { boMachineEndian = 0, boLittleEndian = 1, boBigEndian = 2 }
enum  TUtf16BomHandling_ { bomAllowed = 0, bomRequired = 1, bomIgnored = 2 }
enum  TUniChCategory_ {
  DefineUniCat(Letter, 'L'), DefineUniCat(Mark, 'M'), DefineUniCat(Number, 'N'), DefineUniCat(Punctuation, 'P'),
  DefineUniCat(Symbol, 'S'), DefineUniCat(Separator, 'Z'), DefineUniCat(Other, 'C')
}
enum  TUniChSubCategory_ {
  DefineUniSubCat(Letter, Uppercase, 'u'), DefineUniSubCat(Letter, Lowercase, 'l'), DefineUniSubCat(Letter, Titlecase, 't'), DefineUniSubCat(Letter, Modifier, 'm'),
  DefineUniSubCat(Letter, Other, 'o'), DefineUniSubCat(Mark, Nonspacing, 'n'), DefineUniSubCat(Mark, SpacingCombining, 'c'), DefineUniSubCat(Mark, Enclosing, 'e'),
  DefineUniSubCat(Number, DecimalDigit, 'd'), DefineUniSubCat(Number, Letter, 'l'), DefineUniSubCat(Number, Other, 'o'), DefineUniSubCat(Punctuation, Connector, 'c'),
  DefineUniSubCat(Punctuation, Dash, 'd'), DefineUniSubCat(Punctuation, Open, 's'), DefineUniSubCat(Punctuation, Close, 'e'), DefineUniSubCat(Punctuation, InitialQuote, 'i'),
  DefineUniSubCat(Punctuation, FinalQuote, 'f'), DefineUniSubCat(Punctuation, Other, 'o'), DefineUniSubCat(Symbol, Math, 'm'), DefineUniSubCat(Symbol, Currency, 'c'),
  DefineUniSubCat(Symbol, Modifier, 'k'), DefineUniSubCat(Symbol, Other, 'o'), DefineUniSubCat(Separator, Space, 's'), DefineUniSubCat(Separator, Line, 'l'),
  DefineUniSubCat(Separator, Paragraph, 'p'), DefineUniSubCat(Other, Control, 'c'), DefineUniSubCat(Other, Format, 'f'), DefineUniSubCat(Other, Surrogate, 's'),
  DefineUniSubCat(Other, PrivateUse, 'o'), DefineUniSubCat(Other, NotAssigned, 'n')
}
enum  TUniChFlags_ {
  ucfCompatibilityDecomposition = 1, ucfCompositionExclusion = 1 << 1, ucfWbFormat = 1 << 2, ucfWbKatakana = 1 << 3,
  ucfWbALetter = 1 << 4, ucfWbMidLetter = 1 << 5, ucfWbMidNum = 1 << 6, ucfWbNumeric = 1 << 7,
  ucfWbExtendNumLet = 1 << 8, ucfSbSep = 1 << 9, ucfSbFormat = 1 << 10, ucfSbSp = 1 << 11,
  ucfSbLower = 1 << 12, ucfSbUpper = 1 << 13, ucfSbOLetter = 1 << 14, ucfSbNumeric = 1 << 15,
  ucfSbATerm = 1 << 16, ucfSbSTerm = 1 << 17, ucfSbClose = 1 << 18, ucfSbMask = ucfSbSep | ucfSbFormat | ucfSbSp | ucfSbLower | ucfSbUpper | ucfSbOLetter | ucfSbNumeric | ucfSbATerm | ucfSbSTerm | ucfSbClose,
  ucfWbMask = ucfWbFormat | ucfWbKatakana | ucfWbALetter | ucfWbMidLetter | ucfWbMidNum | ucfWbNumeric | ucfWbExtendNumLet | ucfSbSep, ucfDcpAlphabetic = 1 << 19, ucfDcpDefaultIgnorableCodePoint = 1 << 20, ucfDcpLowercase = 1 << 21,
  ucfDcpGraphemeBase = 1 << 22, ucfDcpGraphemeExtend = 1 << 23, ucfDcpIdStart = 1 << 24, ucfDcpIdContinue = 1 << 25,
  ucfDcpMath = 1 << 26, ucfDcpUppercase = 1 << 27, ucfDcpXidStart = 1 << 28, ucfDcpXidContinue = 1 << 29,
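  ucfDcpMask = ucfDcpAlphabetic | ucfDcpDefaultIgnorableCodePoint | ucfDcpLowercase | ucfDcpGraphemeBase | ucfDcpGraphemeExtend | ucfDcpIdStart | ucfDcpIdContinue | ucfDcpMath | ucfDcpUppercase | ucfDcpXidStart | ucfDcpXidContinue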
}
enum  TUniChProperties_ {
  ucfPrAsciiHexDigit = 1, ucfPrBidiControl = 2, ucfPrDash = 4, ucfPrDeprecated = 8,
  ucfPrDiacritic = 0x10, ucfPrExtender = 0x20, ucfPrGraphemeLink = 0x40, ucfPrHexDigit = 0x80,
  ucfPrHyphen = 0x100, ucfPrIdeographic = 0x200, ucfPrJoinControl = 0x400, ucfPrLogicalOrderException = 0x800,
  ucfPrNoncharacterCodePoint = 0x1000, ucfPrPatternSyntax = 0x2000, ucfPrPatternWhiteSpace = 0x4000, ucfPrQuotationMark = 0x8000,
  ucfPrSoftDotted = 0x10000, ucfPrSTerm = 0x20000, ucfPrTerminalPunctuation = 0x40000, ucfPrVariationSelector = 0x80000,
  ucfPrWhiteSpace = 0x100000
}
enum  TUniChPropertiesX_ {
  ucfPxOtherAlphabetic = 1, ucfPxOtherDefaultIgnorableCodePoint = 2, ucfPxOtherGraphemeExtend = 4, ucfPxOtherIdContinue = 8,
  ucfPxOtherIdStart = 0x10, ucfPxOtherLowercase = 0x20, ucfPxOtherMath = 0x40, ucfPxOtherUppercase = 0x80,
  ucfPxIdsBinaryOperator = 0x100, ucfPxIdsTrinaryOperator = 0x200, ucfPxRadical = 0x400, ucfPxUnifiedIdeograph = 0x800
}

Functions

bool AlwaysFalse ()
bool AlwaysTrue ()

Define Documentation

#define ___OutRepl   if (isDestLe) { dest.Add(replacementChar & 0xff); dest.Add((replacementChar >> 8) & 0xff); } else { dest.Add((replacementChar >> 8) & 0xff); dest.Add(replacementChar & 0xff); }
#define ___UniFwd1 (   name)    bool name(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].name(); }

Definition at line 2014 of file unicode.h.

#define ___UniFwd1 (   name)    bool name(const int cp) const { return ucd.name(cp); }

Definition at line 2014 of file unicode.h.

#define ___UniFwd2 (   name1,
  name2 
)    ___UniFwd1(name1) ___UniFwd1(name2)

Definition at line 1362 of file unicode.h.

#define ___UniFwd3 (   name1,
  name2,
  name3 
)    ___UniFwd2(name1, name2) ___UniFwd1(name3)

Definition at line 1363 of file unicode.h.

#define ___UniFwd4 (   name1,
  name2,
  name3,
  name4 
)    ___UniFwd3(name1, name2, name3) ___UniFwd1(name4)

Definition at line 1364 of file unicode.h.

#define ___UniFwd5 (   name1,
  name2,
  name3,
  name4,
  name5 
)    ___UniFwd4(name1, name2, name3, name4) ___UniFwd1(name5)

Definition at line 1365 of file unicode.h.

#define DECLARE_FORWARDED_PROPERTY_METHODS

Value:
___UniFwd5(IsAsciiHexDigit, IsBidiControl, IsDash, IsDeprecated, IsDiacritic) \
        ___UniFwd5(IsExtender, IsGraphemeLink, IsHexDigit, IsHyphen, IsIdeographic)  \
        ___UniFwd5(IsJoinControl, IsLogicalOrderException, IsNoncharacter, IsQuotationMark, IsSoftDotted)  \
        ___UniFwd4(IsSTerminal, IsTerminalPunctuation, IsVariationSelector, IsWhiteSpace)  \
        ___UniFwd5(IsAlphabetic, IsUppercase, IsLowercase, IsMath, IsDefaultIgnorable)  \
        ___UniFwd4(IsGraphemeBase, IsGraphemeExtend, IsIdStart, IsIdContinue)  \
        ___UniFwd2(IsXidStart, IsXidContinue)  \
        ___UniFwd3(IsCompositionExclusion, IsCompatibilityDecomposition, IsSbSep)  \
        ___UniFwd1(IsGbExtend)  \
        ___UniFwd2(IsCased, IsCurrency)

Definition at line 1367 of file unicode.h.

#define DefineByte (   b7,
  b6,
  b5,
  b4,
  b3,
  b2,
  b1,
  b0 
)    _ ## b7 ## b6 ## b5 ## b4 ## _ ## b3 ## b2 ## b1 ## b0 = (b7 << 7) | (b6 << 6) | (b5 << 5) | (b4 << 4) | (b3 << 3) | (b2 << 2) | (b1 << 1) | b0

Definition at line 102 of file unicode.h.

#define DefineUniCat (   cat,
  c 
)    uc ## cat = (int(uchar(c)) & 0xff)

Definition at line 664 of file unicode.h.

#define DefineUniSubCat (   cat,
  subCat,
  c 
)    uc ## cat ## subCat = ((uc ## cat) << 8) | (int(uchar(c)) & 0xff)

Definition at line 678 of file unicode.h.

#define TestCur (   curFlag)    ((sbfCur & ucfSb##curFlag) == ucfSb##curFlag)
#define TestCurNext (   curFlag,
  nextFlag 
)    if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue
#define TestCurNext (   curFlag,
  nextFlag 
)    if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue
#define TestCurNext2 (   curFlag,
  nextFlag,
  next2Flag 
)    if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag && (wbfNext2 & next2Flag) == next2Flag) continue
#define TestCurNext2 (   curFlag,
  nextFlag,
  next2Flag 
)    if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag && (sbfNext2 & next2Flag) == next2Flag) continue
#define TestPrevCurNext (   prevFlag,
  curFlag,
  nextFlag 
)    if ((wbfPrev & prevFlag) == prevFlag && (wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue
#define TestPrevCurNext (   prevFlag,
  curFlag,
  nextFlag 
)    if ((sbfPrev & prevFlag) == prevFlag && (sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue
#define Trans (   curFlag,
  newState 
)    if (TestCur(curFlag)) { backState = st##newState; break; }

Typedef Documentation

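typedef TPt< TCodecBase > PCodecBase

Definition at line 328 of file unicode.h.

typedef T8BitCodec< TEncoding_CP1250 > TCodec_CP1250

Definition at line 655 of file unicode.h.

typedef T8BitCodec< TEncoding_CP437 > TCodec_CP437

Definition at line 654 of file unicode.h.

typedef T8BitCodec< TEncoding_CP852 > TCodec_CP852

Definition at line 653 of file unicode.h.

typedef T8BitCodec< TEncoding_ISO8859_1 > TCodec_ISO8859_1

Definition at line 649 of file unicode.h.

typedef T8BitCodec< TEncoding_ISO8859_2 > TCodec_ISO8859_2

Definition at line 650 of file unicode.h.

typedef T8BitCodec< TEncoding_ISO8859_3 > TCodec_ISO8859_3

Definition at line 651 of file unicode.h.

typedef T8BitCodec< TEncoding_ISO8859_4 > TCodec_ISO8859_4

Definition at line 652 of file unicode.h.

typedef T8BitCodec< TEncoding_YuAscii > TCodec_YuAscii

Definition at line 656 of file unicode.h.

typedef TVec< PCodecBase > TCodecBaseV

Definition at line 330 of file unicode.h.

typedef THash< TInt, TIntV > TIntIntVH

Definition at line 269 of file unicode.h.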

typedef enum TUniChFlags_ TUniChFlags
typedef int TUniVecIdx

Definition at line 11 of file unicode.h.


Enumeration Type Documentation

enum TUniByteOrder_

Enumerator:
boMachineEndian 
boLittleEndian 
boBigEndian 

Definition at line 38 of file unicode.h.

enum TUniChCategory_

Enumerator:
ucLetter 
ucMark 
ucNumber 
ucPunctuation 
ucSymbol 
ucSeparator 
ucOther 

Definition at line 662 of file unicode.h.

{
// DefineUniCat(cat, c) pastes `uc` and `cat` together to form the enumerator name (uc<cat>)
// and gives it the value of c, converted through uchar to int and masked to its low 8 bits.
#define DefineUniCat(cat, c) uc ## cat = (int(uchar(c)) & 0xff)
        DefineUniCat(Letter, 'L'),             // ucLetter
        DefineUniCat(Mark, 'M'),               // ucMark
        DefineUniCat(Number, 'N'),             // ucNumber
        DefineUniCat(Punctuation, 'P'),        // ucPunctuation
        DefineUniCat(Symbol, 'S'),             // ucSymbol
        DefineUniCat(Separator, 'Z'),          // ucSeparator
        DefineUniCat(Other, 'C')               // ucOther
// The helper macro is removed again once the enumerators have been declared.
#undef DefineUniCat
}
enum TUniChFlags_

Enumerator:
ucfCompatibilityDecomposition 
ucfCompositionExclusion 
ucfWbFormat 
ucfWbKatakana 
ucfWbALetter 
ucfWbMidLetter 
ucfWbMidNum 
ucfWbNumeric 
ucfWbExtendNumLet 
ucfSbSep 
ucfSbFormat 
ucfSbSp 
ucfSbLower 
ucfSbUpper 
ucfSbOLetter 
ucfSbNumeric 
ucfSbATerm 
ucfSbSTerm 
ucfSbClose 
ucfSbMask 
ucfWbMask 
ucfDcpAlphabetic 
ucfDcpDefaultIgnorableCodePoint 
ucfDcpLowercase 
ucfDcpGraphemeBase 
ucfDcpGraphemeExtend 
ucfDcpIdStart 
ucfDcpIdContinue 
ucfDcpMath 
ucfDcpUppercase 
ucfDcpXidStart 
ucfDcpXidContinue 
ucfDcpMask 

Definition at line 712 of file unicode.h.

{
        // Per-character bit flags.  Every named flag below occupies its own bit (bits 0..29);
        // ucfSbMask, ucfWbMask and ucfDcpMask are OR-combinations of the flags of one group.
        ucfCompatibilityDecomposition = 1, // if this flag is not set, the decomposition is canonical
        ucfCompositionExclusion = 1 << 1,       // from CompositionExclusions.txt
        // Flags used when searching for word boundaries.  See UAX #29.
        ucfWbFormat = 1 << 2,
        ucfWbKatakana = 1 << 3,
        ucfWbALetter = 1 << 4,
        ucfWbMidLetter = 1 << 5,
        ucfWbMidNum = 1 << 6,
        ucfWbNumeric = 1 << 7,
        ucfWbExtendNumLet = 1 << 8,
        // Flags used with sentence boundaries (Sep is also used with word boundaries).  See UAX #29.
        ucfSbSep = 1 << 9,
        ucfSbFormat = 1 << 10,
        ucfSbSp = 1 << 11,
        ucfSbLower = 1 << 12,
        ucfSbUpper = 1 << 13,
        ucfSbOLetter = 1 << 14,
        ucfSbNumeric = 1 << 15,
        ucfSbATerm = 1 << 16,
        ucfSbSTerm = 1 << 17,
        ucfSbClose = 1 << 18,
        // All sentence-boundary flags (bits 9..18).
        ucfSbMask = ucfSbSep | ucfSbFormat | ucfSbSp | ucfSbLower | ucfSbUpper | ucfSbOLetter | ucfSbNumeric | ucfSbATerm | ucfSbSTerm | ucfSbClose,
        // All word-boundary flags (bits 2..8) plus ucfSbSep, which is shared with the sentence-boundary group.
        ucfWbMask = ucfWbFormat | ucfWbKatakana | ucfWbALetter | ucfWbMidLetter | ucfWbMidNum | ucfWbNumeric | ucfWbExtendNumLet | ucfSbSep,
        // Flags from DerivedCoreProperties.txt.
        // [The comments are from UCD.html.]
        // - Characters with the Alphabetic property. For more information, see Chapter 4 in [Unicode].
        //   Generated from: Other_Alphabetic + Lu + Ll + Lt + Lm + Lo + Nl
        ucfDcpAlphabetic = 1 << 19,
        // - For programmatic determination of default-ignorable code points.
        //   New characters that should be ignored in processing (unless explicitly supported)
        //   will be assigned in these ranges, permitting programs to correctly handle the default
        //   behavior of such characters when not otherwise supported.  For more information, see
        //   UAX #29: Text Boundaries [Breaks].
        //   Generated from Other_Default_Ignorable_Code_Point + Cf + Cc + Cs + Noncharacters - White_Space - annotation characters
        //   [Examples: soft hyphen, zero-width space, noncharacters (e.g. U+fffe, U+ffff, U+1fffe, U+1ffff, etc.), surrogates, language tags, variation selectors]
        ucfDcpDefaultIgnorableCodePoint = 1 << 20,
        // - Characters with the Lowercase property.  For more information, see Chapter 4 in [Unicode].
        //   Generated from: Other_Lowercase + Ll
        ucfDcpLowercase = 1 << 21,
        // - For programmatic determination of grapheme cluster boundaries.
        //   For more information, see UAX #29: Text Boundaries [Breaks].
        //   Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp - Grapheme_Extend
        ucfDcpGraphemeBase = 1 << 22,
        // - For programmatic determination of grapheme cluster boundaries.
        //   For more information, see UAX #29: Text Boundaries [Breaks].
        //   Generated from: Other_Grapheme_Extend + Me + Mn
        //   Note: depending on an application's interpretation of Co (private use), they may be either
        //         in Grapheme_Base, or in Grapheme_Extend, or in neither.
        ucfDcpGraphemeExtend = 1 << 23,
        // - Used to determine programming identifiers, as described in UAX #31: Identifier and Pattern Syntax.
        ucfDcpIdStart = 1 << 24,
        ucfDcpIdContinue = 1 << 25,
        // - Characters with the Math property. For more information, see Chapter 4 in [Unicode].
        //   Generated from: Sm + Other_Math
        ucfDcpMath = 1 << 26,
        // - Characters with the Uppercase property. For more information, see Chapter 4 in [Unicode].
        //   Generated from: Lu + Other_Uppercase
        ucfDcpUppercase = 1 << 27,
        // - Used to determine programming identifiers, as described in UAX #31: Identifier and Pattern Syntax.
        ucfDcpXidStart = 1 << 28,
        ucfDcpXidContinue = 1 << 29,
        // All DerivedCoreProperties flags (bits 19..29).
        ucfDcpMask = ucfDcpAlphabetic | ucfDcpDefaultIgnorableCodePoint | ucfDcpLowercase | ucfDcpGraphemeBase | ucfDcpGraphemeExtend |
                ucfDcpIdStart | ucfDcpIdContinue | ucfDcpMath | ucfDcpUppercase | ucfDcpXidStart | ucfDcpXidContinue,
}
enum TUniChProperties_

Enumerator:
ucfPrAsciiHexDigit 
ucfPrBidiControl 
ucfPrDash 
ucfPrDeprecated 
ucfPrDiacritic 
ucfPrExtender 
ucfPrGraphemeLink 
ucfPrHexDigit 
ucfPrHyphen 
ucfPrIdeographic 
ucfPrJoinControl 
ucfPrLogicalOrderException 
ucfPrNoncharacterCodePoint 
ucfPrPatternSyntax 
ucfPrPatternWhiteSpace 
ucfPrQuotationMark 
ucfPrSoftDotted 
ucfPrSTerm 
ucfPrTerminalPunctuation 
ucfPrVariationSelector 
ucfPrWhiteSpace 

Definition at line 780 of file unicode.h.

{
        // The flags from PropList.txt.
        // [The comments are from UCD.html.]
        // Every enumerator below has a distinct single-bit value (1 .. 0x100000).
        // - ASCII characters commonly used for the representation of hexadecimal numbers.
        //   [= 0123456789abcdefABCDEF]
        ucfPrAsciiHexDigit = 1,
        // - Those format control characters which have specific functions in the Bidirectional Algorithm.
        ucfPrBidiControl = 2,
        // - Those punctuation characters explicitly called out as dashes in the Unicode Standard,
        //   plus compatibility equivalents to those. Most of these have the Pd General Category,
        //   but some have the Sm General Category because of their use in mathematics.
        //     U+0002d  HYPHEN-MINUS
        //     U+0058a  ARMENIAN HYPHEN
        //     U+005be  HEBREW PUNCTUATION MAQAF
        //     U+01806  MONGOLIAN TODO SOFT HYPHEN
        //     U+02010  HYPHEN
        //     U+02011  NON-BREAKING HYPHEN
        //     U+02012  FIGURE DASH
        //     U+02013  EN DASH
        //     U+02014  EM DASH
        //     U+02015  HORIZONTAL BAR
        //     U+02053  SWUNG DASH
        //     U+0207b  SUPERSCRIPT MINUS
        //     U+0208b  SUBSCRIPT MINUS
        //     U+02212  MINUS SIGN
        //     U+02e17  DOUBLE OBLIQUE HYPHEN
        //     U+0301c  WAVE DASH
        //     U+03030  WAVY DASH
        //     U+030a0  KATAKANA-HIRAGANA DOUBLE HYPHEN
        //     U+0fe31  PRESENTATION FORM FOR VERTICAL EM DASH
        //     U+0fe32  PRESENTATION FORM FOR VERTICAL EN DASH
        //     U+0fe58  SMALL EM DASH
        //     U+0fe63  SMALL HYPHEN-MINUS
        //     U+0ff0d  FULLWIDTH HYPHEN-MINUS
        ucfPrDash = 4,
        // - For a machine-readable list of deprecated characters.  No characters will ever be removed
        //   from the standard, but the usage of deprecated characters is strongly discouraged.
        ucfPrDeprecated = 8,
        // - Characters that linguistically modify the meaning of another character to which they apply.
        //   Some diacritics are not combining characters, and some combining characters are not diacritics.
        ucfPrDiacritic = 0x10,
        // - Characters whose principal function is to extend the value or shape of a preceding alphabetic
        //   character.  Typical of these are length and iteration marks.
        ucfPrExtender = 0x20,
        // - Used in determining default grapheme cluster boundaries.  For more information, see UAX #29: Text Boundaries.
        ucfPrGraphemeLink = 0x40,
        // - Characters commonly used for the representation of hexadecimal numbers, plus their compatibility equivalents.
        //   [= AsciiHexDigit + fullwidth digit {0..9} + fullwidth latin {small|capital} letter {a..f}]
        ucfPrHexDigit = 0x80,
        // - Those dashes used to mark connections between pieces of words, plus the Katakana middle dot.
        //   The Katakana middle dot functions like a hyphen, but is shaped like a dot rather than a dash.
        //     U+0002d  HYPHEN-MINUS
        //     U+000ad  SOFT HYPHEN
        //     U+0058a  ARMENIAN HYPHEN
        //     U+01806  MONGOLIAN TODO SOFT HYPHEN
        //     U+02010  HYPHEN
        //     U+02011  NON-BREAKING HYPHEN
        //     U+02e17  DOUBLE OBLIQUE HYPHEN
        //     U+030fb  KATAKANA MIDDLE DOT
        //     U+0fe63  SMALL HYPHEN-MINUS
        //     U+0ff0d  FULLWIDTH HYPHEN-MINUS
        //     U+0ff65  HALFWIDTH KATAKANA MIDDLE DOT
        ucfPrHyphen = 0x100,
        // - Characters considered to be CJKV (Chinese, Japanese, Korean, and Vietnamese) ideographs.
        ucfPrIdeographic = 0x200,
        // - Those format control characters which have specific functions for control of cursive joining and ligation.
        ucfPrJoinControl = 0x400,
        // - There are a small number of characters that do not use logical order.
        //   These characters require special handling in most processing.
        ucfPrLogicalOrderException = 0x800,
        // - Code points that are permanently reserved for internal use.
        ucfPrNoncharacterCodePoint = 0x1000,
        // - Used for pattern syntax as described in UAX #31: Identifier and Pattern Syntax.
        ucfPrPatternSyntax = 0x2000,
        ucfPrPatternWhiteSpace = 0x4000,
        // - Those punctuation characters that function as quotation marks.
        //     U+00022  QUOTATION MARK
        //     U+00027  APOSTROPHE
        //     U+000ab  LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
        //     U+000bb  RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
        //     U+02018  LEFT SINGLE QUOTATION MARK
        //     U+02019  RIGHT SINGLE QUOTATION MARK
        //     U+0201a  SINGLE LOW-9 QUOTATION MARK
        //     U+0201b  SINGLE HIGH-REVERSED-9 QUOTATION MARK
        //     U+0201c  LEFT DOUBLE QUOTATION MARK
        //     U+0201d  RIGHT DOUBLE QUOTATION MARK
        //     U+0201e  DOUBLE LOW-9 QUOTATION MARK
        //     U+0201f  DOUBLE HIGH-REVERSED-9 QUOTATION MARK
        //     U+02039  SINGLE LEFT-POINTING ANGLE QUOTATION MARK
        //     U+0203a  SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
        //     U+0300c  LEFT CORNER BRACKET
        //     U+0300d  RIGHT CORNER BRACKET
        //     U+0300e  LEFT WHITE CORNER BRACKET
        //     U+0300f  RIGHT WHITE CORNER BRACKET
        //     U+0301d  REVERSED DOUBLE PRIME QUOTATION MARK
        //     U+0301e  DOUBLE PRIME QUOTATION MARK
        //     U+0301f  LOW DOUBLE PRIME QUOTATION MARK
        //     U+0fe41  PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
        //     U+0fe42  PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
        //     U+0fe43  PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
        //     U+0fe44  PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
        //     U+0ff02  FULLWIDTH QUOTATION MARK
        //     U+0ff07  FULLWIDTH APOSTROPHE
        //     U+0ff62  HALFWIDTH LEFT CORNER BRACKET
        //     U+0ff63  HALFWIDTH RIGHT CORNER BRACKET
        ucfPrQuotationMark = 0x8000,
        // - Characters with a "soft dot", like i or j. An accent placed on these characters causes the dot to disappear.
        //   An explicit _dot above_ can be added where required, such as in Lithuanian.
        ucfPrSoftDotted = 0x10000,
        // - Sentence Terminal. Used in UAX #29: Text Boundaries.
        //     U+00021  EXCLAMATION MARK
        //     U+0002e  FULL STOP
        //     U+0003f  QUESTION MARK
        //     U+0203c  DOUBLE EXCLAMATION MARK
        //     U+0203d  INTERROBANG
        //     U+02047  DOUBLE QUESTION MARK
        //     U+02048  QUESTION EXCLAMATION MARK
        //     U+02049  EXCLAMATION QUESTION MARK
        //     U+03002  IDEOGRAPHIC FULL STOP
        //     [plus many characters from other writing systems]
        ucfPrSTerm = 0x20000,
        // - Those punctuation characters that generally mark the end of textual units.
        //   [JB note: this set contains more characters than STerm.  For example, it contains
        //   the comma, colon and semicolon, whereas STerm doesn't.]
        //     U+00021  EXCLAMATION MARK
        //     U+0002c  COMMA
        //     U+0002e  FULL STOP
        //     U+0003a  COLON
        //     U+0003b  SEMICOLON
        //     U+0003f  QUESTION MARK
        //     U+0203c  DOUBLE EXCLAMATION MARK
        //     U+0203d  INTERROBANG
        //     U+02047  DOUBLE QUESTION MARK
        //     U+02048  QUESTION EXCLAMATION MARK
        //     U+02049  EXCLAMATION QUESTION MARK
        //     [plus *lots* of characters from other writing systems]
        ucfPrTerminalPunctuation = 0x40000,
        // - Indicates all those characters that qualify as Variation Selectors.
        //   For details on the behavior of these characters, see StandardizedVariants.html and
        //   Section 16.4, Variation Selectors in [Unicode].
        ucfPrVariationSelector = 0x80000,
        // - Those separator characters and control characters which should be treated by
        //   programming languages as "white space" for the purpose of parsing elements.
        //   Note: ZERO WIDTH SPACE and ZERO WIDTH NO-BREAK SPACE are not included,
        //         since their functions are restricted to line-break control.
        //         Their names are unfortunately misleading in this respect.
        //   Note: There are other senses of "whitespace" that encompass a different set of characters.
        //         [JB note: e.g. there's a BIDI class for whitespace ('WS') in UnicodeData.txt.
        //         There's also a "Sp" class in the sentence boundary algorithm, see UAX #29, sec. 5.1.]
        //   This includes the following characters:
        //     U+0009  <control>
        //     U+000a  <control>
        //     U+000b  <control>
        //     U+000c  <control>
        //     U+000d  <control>
        //     U+0020  SPACE
        //     U+0085  <control>
        //     U+00a0  NO-BREAK SPACE
        //     U+1680  OGHAM SPACE MARK
        //     U+180e  MONGOLIAN VOWEL SEPARATOR
        //     U+2000  EN QUAD
        //     U+2001  EM QUAD
        //     U+2002  EN SPACE
        //     U+2003  EM SPACE
        //     U+2004  THREE-PER-EM SPACE
        //     U+2005  FOUR-PER-EM SPACE
        //     U+2006  SIX-PER-EM SPACE
        //     U+2007  FIGURE SPACE
        //     U+2008  PUNCTUATION SPACE
        //     U+2009  THIN SPACE
        //     U+200a  HAIR SPACE
        //     U+2028  LINE SEPARATOR
        //     U+2029  PARAGRAPH SEPARATOR
        //     U+202f  NARROW NO-BREAK SPACE
        //     U+205f  MEDIUM MATHEMATICAL SPACE
        //     U+3000  IDEOGRAPHIC SPACE
        ucfPrWhiteSpace = 0x100000
}
enum TUniChPropertiesX_

Enumerator:
ucfPxOtherAlphabetic 
ucfPxOtherDefaultIgnorableCodePoint 
ucfPxOtherGraphemeExtend 
ucfPxOtherIdContinue 
ucfPxOtherIdStart 
ucfPxOtherLowercase 
ucfPxOtherMath 
ucfPxOtherUppercase 
ucfPxIdsBinaryOperator 
ucfPxIdsTrinaryOperator 
ucfPxRadical 
ucfPxUnifiedIdeograph 

Definition at line 961 of file unicode.h.

{
        // More properties from PropList.txt.
        // Every enumerator below has a distinct single-bit value (1 .. 0x800).
        // - Used to derive the properties in DerivedCoreProperties.txt.
        ucfPxOtherAlphabetic = 1,
        ucfPxOtherDefaultIgnorableCodePoint = 2,
        ucfPxOtherGraphemeExtend = 4,
        ucfPxOtherIdContinue = 8,
        ucfPxOtherIdStart = 0x10,
        ucfPxOtherLowercase = 0x20,
        ucfPxOtherMath = 0x40,
        ucfPxOtherUppercase = 0x80,
        // - Used in ideographic description sequences.
        ucfPxIdsBinaryOperator = 0x100,
        ucfPxIdsTrinaryOperator = 0x200,
        ucfPxRadical = 0x400,
        ucfPxUnifiedIdeograph = 0x800
}
enum TUniChSubCategory_

Enumerator:
ucLetterUppercase 
ucLetterLowercase 
ucLetterTitlecase 
ucLetterModifier 
ucLetterOther 
ucMarkNonspacing 
ucMarkSpacingCombining 
ucMarkEnclosing 
ucNumberDecimalDigit 
ucNumberLetter 
ucNumberOther 
ucPunctuationConnector 
ucPunctuationDash 
ucPunctuationOpen 
ucPunctuationClose 
ucPunctuationInitialQuote 
ucPunctuationFinalQuote 
ucPunctuationOther 
ucSymbolMath 
ucSymbolCurrency 
ucSymbolModifier 
ucSymbolOther 
ucSeparatorSpace 
ucSeparatorLine 
ucSeparatorParagraph 
ucOtherControl 
ucOtherFormat 
ucOtherSurrogate 
ucOtherPrivateUse 
ucOtherNotAssigned 

Definition at line 676 of file unicode.h.

{
// DefineUniSubCat(cat, subCat, c) declares the enumerator uc<cat><subCat>.  Its value is the
// category enumerator uc<cat> shifted left by 8 bits, OR-ed with c (converted through uchar
// to int and masked to its low 8 bits).
// NOTE(review): the category letter and c together look like the two-letter Unicode
// General_Category abbreviations (e.g. 'L' + 'u' = Lu) — confirm against the UCD documentation.
#define DefineUniSubCat(cat, subCat, c) uc ## cat ## subCat = ((uc ## cat) << 8) | (int(uchar(c)) & 0xff)
        DefineUniSubCat(Letter, Uppercase, 'u'),            // ucLetterUppercase
        DefineUniSubCat(Letter, Lowercase, 'l'),            // ucLetterLowercase
        DefineUniSubCat(Letter, Titlecase, 't'),            // ucLetterTitlecase
        DefineUniSubCat(Letter, Modifier, 'm'),             // ucLetterModifier
        DefineUniSubCat(Letter, Other, 'o'),                // ucLetterOther
        DefineUniSubCat(Mark, Nonspacing, 'n'),             // ucMarkNonspacing
        DefineUniSubCat(Mark, SpacingCombining, 'c'),       // ucMarkSpacingCombining
        DefineUniSubCat(Mark, Enclosing, 'e'),              // ucMarkEnclosing
        DefineUniSubCat(Number, DecimalDigit, 'd'),         // ucNumberDecimalDigit
        DefineUniSubCat(Number, Letter, 'l'),               // ucNumberLetter
        DefineUniSubCat(Number, Other, 'o'),                // ucNumberOther
        DefineUniSubCat(Punctuation, Connector, 'c'),       // ucPunctuationConnector
        DefineUniSubCat(Punctuation, Dash, 'd'),            // ucPunctuationDash
        DefineUniSubCat(Punctuation, Open, 's'),            // ucPunctuationOpen
        DefineUniSubCat(Punctuation, Close, 'e'),           // ucPunctuationClose
        DefineUniSubCat(Punctuation, InitialQuote, 'i'),    // ucPunctuationInitialQuote
        DefineUniSubCat(Punctuation, FinalQuote, 'f'),      // ucPunctuationFinalQuote
        DefineUniSubCat(Punctuation, Other, 'o'),           // ucPunctuationOther
        DefineUniSubCat(Symbol, Math, 'm'),                 // ucSymbolMath
        DefineUniSubCat(Symbol, Currency, 'c'),             // ucSymbolCurrency
        DefineUniSubCat(Symbol, Modifier, 'k'),             // ucSymbolModifier
        DefineUniSubCat(Symbol, Other, 'o'),                // ucSymbolOther
        DefineUniSubCat(Separator, Space, 's'),             // ucSeparatorSpace
        DefineUniSubCat(Separator, Line, 'l'),              // ucSeparatorLine
        DefineUniSubCat(Separator, Paragraph, 'p'),         // ucSeparatorParagraph
        DefineUniSubCat(Other, Control, 'c'),               // ucOtherControl
        DefineUniSubCat(Other, Format, 'f'),                // ucOtherFormat
        DefineUniSubCat(Other, Surrogate, 's'),             // ucOtherSurrogate
        DefineUniSubCat(Other, PrivateUse, 'o'),            // ucOtherPrivateUse
        DefineUniSubCat(Other, NotAssigned, 'n')            // ucOtherNotAssigned
}
enum TUnicodeErrorHandling_

Enumerator:
uehIgnore 
uehThrow 
uehReplace 
uehAbort 

Definition at line 18 of file unicode.h.

{
        // Selects the reaction to an error during encoding/decoding; the four
        // choices are numbered consecutively from 0 to 3.
        // What happens when an error occurs:
        uehIgnore = 0,  // - it is silently ignored (nothing is added to the output vector)
        uehThrow = 1,   // - an exception is thrown (TUnicodeException)
        uehReplace = 2, // - the replacement character is added to the output vector
        uehAbort = 3    // - the encoding/decoding process stops immediately
}
enum TUtf16BomHandling_

Enumerator:
bomAllowed 
bomRequired 
bomIgnored 

Definition at line 46 of file unicode.h.

{
        // Selects how a byte-order mark (BOM) at the start of the input is treated.
        bomAllowed = 0,   // if a BOM is present, it is used to determine the byte order; otherwise, the default byte order is used
        bomRequired = 1,  // if a BOM is present, it is used to determine the byte order; otherwise, an error is reported
        bomIgnored = 2    // the default byte order is used; if a BOM is present, it is treated like any other character
}

Function Documentation

bool AlwaysFalse ( ) [inline]

Definition at line 3221 of file unicode.h.

Referenced by TUniChDb::InitScripts(), and TUniChDb::TestFindNextWordOrSentenceBoundary().

{
        // Adds up 0 + 1 + 2 + 3 + 4 = 10 and compares the total with 100,
        // so the returned value is always false.
        // NOTE(review): presumably computed in a loop rather than written as `return false`
        // so that callers do not see a literal constant — confirm against the call sites.
        int sum = 0;
        for (int i = 0; i < 5; i++) sum += i;
        return sum > 100;
}

Here is the caller graph for this function:

bool AlwaysTrue ( ) [inline]

Definition at line 3228 of file unicode.h.

{
        // Adds up 0 + 1 + 2 + 3 + 4 = 10 and compares the total with 100,
        // so the returned value is always true.
        // NOTE(review): presumably computed in a loop rather than written as `return true`
        // so that callers do not see a literal constant — confirm against the call sites.
        int sum = 0;
        for (int i = 0; i < 5; i++) sum += i;
        return sum < 100;
}