| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673 |
- //-------------------------------------------------------------------------------------------------------
- // Copyright (C) Microsoft. All rights reserved.
- // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
- //-------------------------------------------------------------------------------------------------------
- #include "ParserPch.h"
- namespace PlatformAgnostic
- {
- namespace UnicodeText
- {
-     // Classification flags for the 128 ASCII code points, indexed by character value.
-     // Strictly speaking this is not Unicode-specific (it is used in the non-Unicode case as well);
-     // it simply lives in this namespace for convenience.
-     // Layout: eight entries per row, each row preceded by the range it covers.
-     static const CharacterTypeFlags charFlags[128] =
-     {
-         /* 0x00-0x07 */
-         UnknownChar, UnknownChar, UnknownChar, UnknownChar, UnknownChar, UnknownChar, UnknownChar, UnknownChar,
-         /* 0x08-0x0F : 0x09, 0x0B, 0x0C are spaces; 0x0A and 0x0D are line terminators */
-         UnknownChar, SpaceChar, LineCharGroup, SpaceChar, SpaceChar, LineCharGroup, UnknownChar, UnknownChar,
-         /* 0x10-0x17 */
-         UnknownChar, UnknownChar, UnknownChar, UnknownChar, UnknownChar, UnknownChar, UnknownChar, UnknownChar,
-         /* 0x18-0x1F */
-         UnknownChar, UnknownChar, UnknownChar, UnknownChar, UnknownChar, UnknownChar, UnknownChar, UnknownChar,
-         /* 0x20-0x27 : space ! " # $ % & ' */
-         SpaceChar, UnknownChar, UnknownChar, UnknownChar, LetterCharGroup, UnknownChar, UnknownChar, UnknownChar,
-         /* 0x28-0x2F : ( ) * + , - . / */
-         UnknownChar, UnknownChar, UnknownChar, UnknownChar, UnknownChar, UnknownChar, UnknownChar, UnknownChar,
-         /* 0x30-0x37 : 0 1 2 3 4 5 6 7 */
-         DecimalCharGroup, DecimalCharGroup, DecimalCharGroup, DecimalCharGroup, DecimalCharGroup, DecimalCharGroup, DecimalCharGroup, DecimalCharGroup,
-         /* 0x38-0x3F : 8 9 : ; < = > ? */
-         DecimalCharGroup, DecimalCharGroup, UnknownChar, UnknownChar, UnknownChar, UnknownChar, UnknownChar, UnknownChar,
-         /* 0x40-0x47 : @ A B C D E F G */
-         UnknownChar, HexCharGroup, HexCharGroup, HexCharGroup, HexCharGroup, HexCharGroup, HexCharGroup, LetterCharGroup,
-         /* 0x48-0x4F : H I J K L M N O */
-         LetterCharGroup, LetterCharGroup, LetterCharGroup, LetterCharGroup, LetterCharGroup, LetterCharGroup, LetterCharGroup, LetterCharGroup,
-         /* 0x50-0x57 : P Q R S T U V W */
-         LetterCharGroup, LetterCharGroup, LetterCharGroup, LetterCharGroup, LetterCharGroup, LetterCharGroup, LetterCharGroup, LetterCharGroup,
-         /* 0x58-0x5F : X Y Z [ \ ] ^ _ */
-         LetterCharGroup, LetterCharGroup, LetterCharGroup, UnknownChar, UnknownChar, UnknownChar, UnknownChar, LetterCharGroup,
-         /* 0x60-0x67 : ` a b c d e f g */
-         UnknownChar, HexCharGroup, HexCharGroup, HexCharGroup, HexCharGroup, HexCharGroup, HexCharGroup, LetterCharGroup,
-         /* 0x68-0x6F : h i j k l m n o */
-         LetterCharGroup, LetterCharGroup, LetterCharGroup, LetterCharGroup, LetterCharGroup, LetterCharGroup, LetterCharGroup, LetterCharGroup,
-         /* 0x70-0x77 : p q r s t u v w */
-         LetterCharGroup, LetterCharGroup, LetterCharGroup, LetterCharGroup, LetterCharGroup, LetterCharGroup, LetterCharGroup, LetterCharGroup,
-         /* 0x78-0x7F : x y z { | } ~ DEL */
-         LetterCharGroup, LetterCharGroup, LetterCharGroup, UnknownChar, UnknownChar, UnknownChar, UnknownChar, UnknownChar
-     };
- }
- };
- /*****************************************************************************
- *
- * The _C_xxx enum and charTypes[] table are used to map a character to
- * simple classification values and flags.
- *
- * The table holds one entry per code point in the range 0x00-0x7F, laid out
- * eight entries per row; the trailing comment on each row gives the range of
- * code points that row covers. GetCharType() reads this table directly for
- * any character that FBigChar() does not report as a big character.
- */
- static const CharTypes charTypes[128] =
- {
- _C_NUL, _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, /* 00-07 */
- _C_ERR, _C_WSP, _C_NWL, _C_WSP, _C_WSP, _C_NWL, _C_ERR, _C_ERR, /* 08-0F */
- _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, /* 10-17 */
- _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, /* 18-1F */
- _C_WSP, _C_BNG, _C_QUO, _C_SHP, _C_DOL, _C_PCT, _C_AMP, _C_APO, /* 20-27 */
- _C_LPR, _C_RPR, _C_MUL, _C_PLS, _C_CMA, _C_MIN, _C_DOT, _C_SLH, /* 28-2F */
- _C_DIG, _C_DIG, _C_DIG, _C_DIG, _C_DIG, _C_DIG, _C_DIG, _C_DIG, /* 30-37 */
- _C_DIG, _C_DIG, _C_COL, _C_SMC, _C_LT , _C_EQ , _C_GT , _C_QUE, /* 38-3F */
- _C_AT , _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, /* 40-47 */
- _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, /* 48-4F */
- _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, /* 50-57 */
- _C_LET, _C_LET, _C_LET, _C_LBR, _C_BSL, _C_RBR, _C_XOR, _C_USC, /* 58-5F */
- _C_BKQ, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, /* 60-67 */
- _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, /* 68-6F */
- _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, /* 70-77 */
- _C_LET, _C_LET, _C_LET, _C_LC , _C_BAR, _C_RC , _C_TIL, _C_ERR, /* 78-7F */
- };
- #if ENABLE_UNICODE_API
- // Default whitespace predicate for big characters: fetches the flags for ch through the
- // classifier's getBigCharFlagsFunc and reports whether the SpaceChar bit is set.
- bool Js::CharClassifier::BigCharIsWhitespaceDefault(codepoint_t ch, const Js::CharClassifier *instance)
- {
-     const auto flags = instance->getBigCharFlagsFunc(ch, instance);
-     const bool hasSpaceBit = (flags & PlatformAgnostic::UnicodeText::CharacterTypeFlags::SpaceChar) != 0;
-     return hasSpaceBit;
- }
- // Default identifier-start predicate for big characters: fetches the flags for ch through the
- // classifier's getBigCharFlagsFunc and reports whether the IdLeadChar bit is set.
- bool Js::CharClassifier::BigCharIsIdStartDefault(codepoint_t ch, const Js::CharClassifier *instance)
- {
-     const auto flags = instance->getBigCharFlagsFunc(ch, instance);
-     const bool hasIdLeadBit = (flags & PlatformAgnostic::UnicodeText::CharacterTypeFlags::IdLeadChar) != 0;
-     return hasIdLeadBit;
- }
- // Default identifier-continue predicate for big characters: fetches the flags for ch through the
- // classifier's getBigCharFlagsFunc and reports whether the IdChar bit is set.
- bool Js::CharClassifier::BigCharIsIdContinueDefault(codepoint_t ch, const Js::CharClassifier *instance)
- {
-     const auto flags = instance->getBigCharFlagsFunc(ch, instance);
-     const bool hasIdBit = (flags & PlatformAgnostic::UnicodeText::CharacterTypeFlags::IdChar) != 0;
-     return hasIdBit;
- }
- #endif
- // ES5 character-type lookup for big characters.
- //   - anything above 0xFFFF        -> _C_ERR (does not fit in a char16)
- //   - kchLS / kchPS                -> _C_NWL
- //   - legacy class Letter          -> _C_LET
- //   - legacy class Whitespace      -> _C_WSP
- //   - everything else              -> _C_ERR
- // The instance parameter is not used by this implementation.
- CharTypes Js::CharClassifier::GetBigCharTypeES5(codepoint_t codepoint, const Js::CharClassifier *instance)
- {
-     using namespace PlatformAgnostic::UnicodeText;
-     const bool fitsInChar16 = !(codepoint > 0xFFFF);
-     if (!fitsInChar16)
-     {
-         return CharTypes::_C_ERR;
-     }
-     const bool isLineOrParagraphSeparator = (codepoint == kchLS || codepoint == kchPS);
-     if (isLineOrParagraphSeparator)
-     {
-         return _C_NWL;
-     }
-     auto legacyClass = GetLegacyCharacterClassificationType((char16)codepoint);
-     if (legacyClass == CharacterClassificationType::Letter)
-     {
-         return CharTypes::_C_LET;
-     }
-     return (legacyClass == CharacterClassificationType::Whitespace) ? CharTypes::_C_WSP : CharTypes::_C_ERR;
- }
- // ES5 character-flags lookup for big characters.
- // U+200C and U+200D are reported as IdChar, code points above 0xFFFF as UnknownChar,
- // and everything else is delegated to GetLegacyCharacterTypeFlags.
- // The instance parameter is not used by this implementation.
- PlatformAgnostic::UnicodeText::CharacterTypeFlags Js::CharClassifier::GetBigCharFlagsES5(codepoint_t ch, const Js::CharClassifier *instance)
- {
-     using namespace PlatformAgnostic::UnicodeText;
-     switch (ch)
-     {
-     // In ES5 the unicode <ZWNJ> and <ZWJ> could be identifier parts
-     case 0x200c:
-     case 0x200d:
-         return PlatformAgnostic::UnicodeText::CharacterTypeFlags::IdChar;
-     default:
-         break;
-     }
-     // Only code points that fit within the char16 range can be handed to the legacy lookup
-     if (ch <= 0xFFFF)
-     {
-         return PlatformAgnostic::UnicodeText::GetLegacyCharacterTypeFlags((char16)ch);
-     }
-     return UnknownChar;
- }
- /*
- * CharClassifier implementation
- */
- // ES6 character-type lookup for big characters (asserts ch > 0x7F).
- // U+FEFF is always whitespace; every other code point is mapped from its
- // Unicode general category class. Categories not listed below yield _C_UNK.
- // The instance parameter is not used by this implementation.
- CharTypes Js::CharClassifier::GetBigCharTypeES6(codepoint_t ch, const Js::CharClassifier *instance)
- {
-     using namespace PlatformAgnostic::UnicodeText;
-     Assert(ch > 0x7F);
-     if (ch == 0xFEFF)
-     {
-         return CharTypes::_C_WSP;
-     }
-     CharTypes mappedType = CharTypes::_C_UNK;
-     const UnicodeGeneralCategoryClass generalCategory = PlatformAgnostic::UnicodeText::GetGeneralCategoryClass(ch);
-     switch(generalCategory)
-     {
-     case UnicodeGeneralCategoryClass::CategoryClassLetter:
-         mappedType = CharTypes::_C_LET;
-         break;
-     case UnicodeGeneralCategoryClass::CategoryClassDigit:
-         mappedType = CharTypes::_C_DIG;
-         break;
-     case UnicodeGeneralCategoryClass::CategoryClassLineSeparator:
-     case UnicodeGeneralCategoryClass::CategoryClassParagraphSeparator:
-         mappedType = CharTypes::_C_NWL;
-         break;
-     // NOTE(review): combining marks and connector punctuation share the whitespace
-     // result with space separators here - looks surprising, confirm this is intended.
-     case UnicodeGeneralCategoryClass::CategoryClassSpaceSeparator:
-     case UnicodeGeneralCategoryClass::CategoryClassSpacingCombiningMark:
-     case UnicodeGeneralCategoryClass::CategoryClassNonSpacingMark:
-     case UnicodeGeneralCategoryClass::CategoryClassConnectorPunctuation:
-         mappedType = CharTypes::_C_WSP;
-         break;
-     default:
-         break;
-     }
-     return mappedType;
- }
- /*
- From Unicode 6.3 http://www.unicode.org/reports/tr31/tr31-19.html
- ID_Start:::
- Characters having the Unicode General_Category of uppercase letters (Lu), lowercase letters (Ll), titlecase letters (Lt), modifier letters (Lm), other letters (Lo), letter numbers (Nl), minus Pattern_Syntax and Pattern_White_Space code points, plus stability extensions. Note that "other letters" includes ideographs.
- In set notation, this is [[:L:][:Nl:]--[:Pattern_Syntax:]--[:Pattern_White_Space:]] plus stability extensions.
- ID_Continue:::
- All of the above, plus characters having the Unicode General_Category of nonspacing marks (Mn), spacing combining marks (Mc), decimal number (Nd), connector punctuations (Pc), plus stability extensions, minus Pattern_Syntax and Pattern_White_Space code points.
- In set notation, this is [[:L:][:Nl:][:Mn:][:Mc:][:Nd:][:Pc:]--[:Pattern_Syntax:]--[:Pattern_White_Space:]] plus stability extensions.
- These are also known simply as Identifier Characters, because they are a superset of the ID_Start characters.
- */
- // ES6 character-flags lookup for big characters (asserts ch > 0x7F).
- // The general category class picks a candidate flag set; for letters, marks, connector
- // punctuation and digits the result additionally depends on the ES6 IdStart / IdContinue
- // predicates. Categories not listed yield UnknownChar.
- PlatformAgnostic::UnicodeText::CharacterTypeFlags Js::CharClassifier::GetBigCharFlagsES6(codepoint_t ch, const Js::CharClassifier *instance)
- {
-     using namespace PlatformAgnostic::UnicodeText;
-     Assert(ch > 0x7F);
-     const UnicodeGeneralCategoryClass generalCategory = PlatformAgnostic::UnicodeText::GetGeneralCategoryClass(ch);
-     switch(generalCategory)
-     {
-     case UnicodeGeneralCategoryClass::CategoryClassLetter:
-         if (BigCharIsIdStartES6(ch, instance))
-         {
-             return CharacterTypeFlags::LetterCharGroup;
-         }
-         return CharacterTypeFlags::UnknownChar;
-     case UnicodeGeneralCategoryClass::CategoryClassSpacingCombiningMark:
-         // A spacing combining mark that is not an identifier part is reported as a space
-         if (BigCharIsIdContinueES6(ch, instance))
-         {
-             return CharacterTypeFlags::IdChar;
-         }
-         return CharacterTypeFlags::SpaceChar;
-     case UnicodeGeneralCategoryClass::CategoryClassNonSpacingMark:
-     case UnicodeGeneralCategoryClass::CategoryClassConnectorPunctuation:
-         if (BigCharIsIdContinueES6(ch, instance))
-         {
-             return CharacterTypeFlags::IdChar;
-         }
-         return CharacterTypeFlags::UnknownChar;
-     case UnicodeGeneralCategoryClass::CategoryClassDigit:
-         if (BigCharIsIdContinueES6(ch, instance))
-         {
-             return CharacterTypeFlags::DecimalCharGroup;
-         }
-         return CharacterTypeFlags::DecimalChar;
-     case UnicodeGeneralCategoryClass::CategoryClassLineSeparator:
-         return CharacterTypeFlags::LineFeedChar;
-     case UnicodeGeneralCategoryClass::CategoryClassParagraphSeparator:
-     case UnicodeGeneralCategoryClass::CategoryClassSpaceSeparator:
-         return CharacterTypeFlags::SpaceChar;
-     default:
-         return CharacterTypeFlags::UnknownChar;
-     }
- }
- // ES6 whitespace predicate for big characters (asserts ch > 0x7F).
- // U+FEFF is accepted directly; any other code point is decided by
- // PlatformAgnostic::UnicodeText::IsWhitespace. The instance parameter is not used.
- bool Js::CharClassifier::BigCharIsWhitespaceES6(codepoint_t ch, const CharClassifier *instance)
- {
-     Assert(ch > 0x7F);
-     return ch == 0xFEFF || PlatformAgnostic::UnicodeText::IsWhitespace(ch);
- }
- // ES6 identifier-start predicate for big characters (asserts codePoint > 0x7F).
- // Simply forwards to PlatformAgnostic::UnicodeText::IsIdStart; the instance parameter is not used.
- bool Js::CharClassifier::BigCharIsIdStartES6(codepoint_t codePoint, const CharClassifier *instance)
- {
-     Assert(codePoint > 0x7F);
-     const bool isIdStart = PlatformAgnostic::UnicodeText::IsIdStart(codePoint);
-     return isIdStart;
- }
- // ES6 identifier-continue predicate for big characters (asserts codePoint > 0x7F).
- // '$', '_', U+200C and U+200D are accepted directly; every other code point is decided by
- // PlatformAgnostic::UnicodeText::IsIdContinue. The instance parameter is not used.
- bool Js::CharClassifier::BigCharIsIdContinueES6(codepoint_t codePoint, const CharClassifier *instance)
- {
-     Assert(codePoint > 0x7F);
-     switch (codePoint)
-     {
-     case '$':
-     case '_':
-     case 0x200C: /* Zero-width non-joiner */
-     case 0x200D: /* Zero-width joiner */
-         return true;
-     default:
-         return PlatformAgnostic::UnicodeText::IsIdContinue(codePoint);
-     }
- }
- // Whitespace test where the caller already knows which range ch falls in.
- //   isBigChar == true  : ch > 0x7F, answered by the configured bigCharIsWhitespaceFunc.
- //   isBigChar == false : ch < 0x80, answered by the SpaceChar bit in the charFlags table.
- template <bool isBigChar>
- bool Js::CharClassifier::IsWhiteSpaceFast(codepoint_t ch) const
- {
-     using namespace PlatformAgnostic::UnicodeText;
-     Assert(isBigChar ? ch > 0x7F : ch < 0x80);
-     return !isBigChar
-         ? ((charFlags[ch] & CharacterTypeFlags::SpaceChar) != 0)
-         : this->bigCharIsWhitespaceFunc(ch, this);
- }
- // Reports whether ch is one of the directional formatting characters listed in
- // http://www.unicode.org/reports/tr9/#Directional_Formatting_Codes
- bool Js::CharClassifier::IsBiDirectionalChar(codepoint_t ch) const
- {
-     // U+061C ARABIC LETTER MARK
-     const bool isArabicLetterMark = (ch == 0x061C);
-     // U+200E LEFT-TO-RIGHT MARK, U+200F RIGHT-TO-LEFT MARK
-     const bool isDirectionalMark = (ch >= 0x200E && ch <= 0x200F);
-     // U+202A LRE, U+202B RLE, U+202C PDF, U+202D LRO, U+202E RLO
-     const bool isEmbeddingOrOverride = (ch >= 0x202A && ch <= 0x202E);
-     // U+2066 LRI, U+2067 RLI, U+2068 FSI, U+2069 PDI
-     const bool isIsolate = (ch >= 0x2066 && ch <= 0x2069);
-     return isArabicLetterMark || isDirectionalMark || isEmbeddingOrOverride || isIsolate;
- }
- // Identifier-start test where the caller already knows which range ch falls in.
- //   isBigChar == true  : ch > 0x7F, answered by the configured bigCharIsIdStartFunc.
- //   isBigChar == false : ch < 0x80, answered by the IdLeadChar bit in the charFlags table.
- template<bool isBigChar>
- bool Js::CharClassifier::IsIdStartFast(codepoint_t ch) const
- {
-     using namespace PlatformAgnostic::UnicodeText;
-     Assert(isBigChar ? ch > 0x7F : ch < 0x80);
-     return !isBigChar
-         ? ((charFlags[ch] & CharacterTypeFlags::IdLeadChar) != 0)
-         : this->bigCharIsIdStartFunc(ch, this);
- }
- // Identifier-continue test where the caller already knows which range ch falls in.
- //   isBigChar == true  : ch > 0x7F, answered by the configured bigCharIsIdContinueFunc.
- //   isBigChar == false : ch < 0x80, answered by the IdChar bit in the charFlags table.
- template<bool isBigChar>
- bool Js::CharClassifier::IsIdContinueFast(codepoint_t ch) const
- {
-     using namespace PlatformAgnostic::UnicodeText;
-     Assert(isBigChar ? ch > 0x7F : ch < 0x80);
-     return !isBigChar
-         ? ((charFlags[ch] & CharacterTypeFlags::IdChar) != 0)
-         : this->bigCharIsIdContinueFunc(ch, this);
- }
- // Wires up the classifier's function pointers.
- // With ENABLE_UNICODE_API, the ES5 (non-surrogate) implementations are selected unless both the
- // ES6Unicode config flag is on and an external Unicode library is available; in that case, and
- // always when ENABLE_UNICODE_API is off, the ES6 (surrogate-aware) implementations are used.
- // Under NTBUILD a missing external Unicode library is a fatal internal error.
- Js::CharClassifier::CharClassifier(void)
- {
-     bool es6UnicodeRequested = CONFIG_FLAG(ES6Unicode);
-     bool externalUnicodeLibraryPresent = PlatformAgnostic::UnicodeText::IsExternalUnicodeLibraryAvailable();
- #ifdef NTBUILD
-     AssertMsg(externalUnicodeLibraryPresent, "Windows.Globalization needs to present with IUnicodeCharacterStatics support for Chakra.dll to work");
-     if (!externalUnicodeLibraryPresent)
-     {
-         Js::Throw::FatalInternalError();
-     }
- #endif
- #if ENABLE_UNICODE_API
-     // Fall back to the ES5 versions, which don't need an external library, unless we're in
-     // ES6 mode and have full Unicode character classification support from an external library
-     if (!(es6UnicodeRequested && externalUnicodeLibraryPresent))
-     {
-         getBigCharTypeFunc = &CharClassifier::GetBigCharTypeES5;
-         getBigCharFlagsFunc = &CharClassifier::GetBigCharFlagsES5;
-         bigCharIsWhitespaceFunc = &CharClassifier::BigCharIsWhitespaceDefault;
-         bigCharIsIdStartFunc = &CharClassifier::BigCharIsIdStartDefault;
-         bigCharIsIdContinueFunc = &CharClassifier::BigCharIsIdContinueDefault;
-         skipWhiteSpaceFunc = &CharClassifier::SkipWhiteSpaceNonSurrogate;
-         skipWhiteSpaceStartEndFunc = &CharClassifier::SkipWhiteSpaceNonSurrogateStartEnd;
-         skipIdentifierFunc = &CharClassifier::SkipIdentifierNonSurrogate;
-         skipIdentifierStartEndFunc = &CharClassifier::SkipIdentifierNonSurrogateStartEnd;
-         return;
-     }
- #endif
-     // ES6/Surrogate pair supported versions of the functions
-     getBigCharTypeFunc = &CharClassifier::GetBigCharTypeES6;
-     getBigCharFlagsFunc = &CharClassifier::GetBigCharFlagsES6;
-     bigCharIsWhitespaceFunc = &CharClassifier::BigCharIsWhitespaceES6;
-     bigCharIsIdStartFunc = &CharClassifier::BigCharIsIdStartES6;
-     bigCharIsIdContinueFunc = &CharClassifier::BigCharIsIdContinueES6;
-     skipWhiteSpaceFunc = &CharClassifier::SkipWhiteSpaceSurrogate;
-     skipWhiteSpaceStartEndFunc = &CharClassifier::SkipWhiteSpaceSurrogateStartEnd;
-     skipIdentifierFunc = &CharClassifier::SkipIdentifierSurrogate;
-     skipIdentifierStartEndFunc = &CharClassifier::SkipIdentifierSurrogateStartEnd;
- }
- // Advances psz past every leading character that instance->IsWhiteSpace accepts and returns
- // the first position that is not whitespace. No end pointer is taken, so the scan stops only
- // at a character the classifier rejects.
- const OLECHAR* Js::CharClassifier::SkipWhiteSpaceNonSurrogate(LPCOLESTR psz, const CharClassifier *instance)
- {
-     while (instance->IsWhiteSpace(*psz))
-     {
-         psz++;
-     }
-     return psz;
- }
- // Advances pStr past leading whitespace within [pStr, pStrEnd) and returns the first position
- // that is either not whitespace or equal to pStrEnd.
- //   pStr     - start of the range to scan
- //   pStrEnd  - one past the last readable character
- //   instance - classifier used for the IsWhiteSpace test
- const OLECHAR* Js::CharClassifier::SkipWhiteSpaceNonSurrogateStartEnd(_In_reads_(pStrEnd - pStr) LPCOLESTR pStr, _In_ LPCOLESTR pStrEnd, const CharClassifier *instance)
- {
-     // The bounds check must come first: *pStr may only be read while pStr < pStrEnd,
-     // otherwise an all-whitespace range causes a read one past the end of the buffer.
-     while (pStr < pStrEnd && instance->IsWhiteSpace(*pStr))
-     {
-         pStr++;
-     }
-     return pStr;
- }
- // Skips one identifier starting at psz: a character accepted by IsIdStart followed by any
- // number of characters accepted by IsIdContinue. Returns the position just past the identifier,
- // or psz unchanged when the first character cannot start an identifier.
- const OLECHAR* Js::CharClassifier::SkipIdentifierNonSurrogate(LPCOLESTR psz, const CharClassifier *instance)
- {
-     if (instance->IsIdStart(*psz))
-     {
-         do
-         {
-             psz++;
-         } while (instance->IsIdContinue(*psz));
-     }
-     return psz;
- }
- // UTF-8 variant of identifier skipping. Decodes characters from psz (bounded by end) with
- // utf8::Decode and returns the position just past the last character accepted as part of the
- // identifier, or psz unchanged when the first decoded character is not an identifier start.
- const LPCUTF8 Js::CharClassifier::SkipIdentifierNonSurrogateStartEnd(LPCUTF8 psz, LPCUTF8 end, const CharClassifier *instance)
- {
-     utf8::DecodeOptions options = utf8::doAllowThreeByteSurrogates;
-     // cursor is advanced by every Decode call; accepted trails it and only moves forward
-     // once the character just decoded has been accepted
-     LPCUTF8 cursor = psz;
-     LPCUTF8 accepted = psz;
-     if (instance->IsIdStart(utf8::Decode(cursor, end, options)))
-     {
-         accepted = cursor;
-         for (;;)
-         {
-             if (!instance->IsIdContinue(utf8::Decode(cursor, end, options)))
-             {
-                 break;
-             }
-             accepted = cursor;
-         }
-     }
-     return accepted;
- }
- // Surrogate-aware whitespace skipping over a NUL-terminated string. Returns the first position
- // that is neither a whitespace character nor the start of a surrogate pair whose combined code
- // point is whitespace (or the terminating NUL).
- const OLECHAR* Js::CharClassifier::SkipWhiteSpaceSurrogate(LPCOLESTR psz, const CharClassifier *instance)
- {
-     // The surrogate check is the slow path, so it is only taken when the current character is
-     // not itself whitespace in [0, FFFF]. No whitespace currently lives outside that range, but
-     // that might change in the future, so pairs are still checked.
-     for (;;)
-     {
-         const char16 unit = *psz;
-         if (unit == '\0')
-         {
-             break;
-         }
-         if (instance->IsWhiteSpace(*psz))
-         {
-             psz++;
-             continue;
-         }
-         const bool startsPair = Js::NumberUtilities::IsSurrogateLowerPart(unit) && Js::NumberUtilities::IsSurrogateUpperPart(*(psz + 1));
-         if (!startsPair || !instance->IsWhiteSpace(Js::NumberUtilities::SurrogatePairAsCodePoint(unit, *(psz + 1))))
-         {
-             // Neither a plain nor a surrogate-pair whitespace: the run of whitespace ends here
-             break;
-         }
-         psz += 2;
-     }
-     return psz;
- }
- // Surrogate-aware whitespace skipping over the range [pStr, pStrEnd). Stops at pStrEnd, at a
- // NUL character, or at the first position that is neither whitespace nor the start of a
- // surrogate pair (fully inside the range) whose combined code point is whitespace.
- const OLECHAR* Js::CharClassifier::SkipWhiteSpaceSurrogateStartEnd(_In_reads_(pStrEnd - pStr) LPCOLESTR pStr, _In_ LPCOLESTR pStrEnd, const CharClassifier *instance)
- {
-     // The surrogate check is the slow path; it is only taken for characters that are not
-     // themselves whitespace.
-     while (pStr < pStrEnd)
-     {
-         const char16 unit = *pStr;
-         if (unit == '\0')
-         {
-             break;
-         }
-         if (instance->IsWhiteSpace(unit))
-         {
-             pStr++;
-             continue;
-         }
-         // The second half of the pair is only read when it lies inside the range
-         const bool startsPair = Js::NumberUtilities::IsSurrogateLowerPart(unit) && (pStr + 1) < pStrEnd && Js::NumberUtilities::IsSurrogateUpperPart(*(pStr + 1));
-         if (!startsPair || !instance->IsWhiteSpace(Js::NumberUtilities::SurrogatePairAsCodePoint(unit, *(pStr + 1))))
-         {
-             // Neither a plain nor a surrogate-pair whitespace: the run of whitespace ends here
-             break;
-         }
-         pStr += 2;
-     }
-     return pStr;
- }
- // Surrogate-aware identifier skipping over a NUL-terminated string. Returns the position just
- // past the identifier that starts at psz, or psz unchanged when neither the first character nor
- // the surrogate pair beginning there is an identifier start.
- const OLECHAR* Js::CharClassifier::SkipIdentifierSurrogate(LPCOLESTR psz, const CharClassifier *instance)
- {
-     // Surrogate identifiers do exist but are unlikely in code, so the pair check is only made
-     // when the single character is rejected.
-     const char16 first = *psz;
-     if (instance->IsIdStart(first))
-     {
-         psz += 1;
-     }
-     else if (Js::NumberUtilities::IsSurrogateLowerPart(first) && Js::NumberUtilities::IsSurrogateUpperPart(*(psz + 1))
-         && instance->IsIdStart(Js::NumberUtilities::SurrogatePairAsCodePoint(first, *(psz + 1))))
-     {
-         // Step over both halves of the pair
-         psz += 2;
-     }
-     else
-     {
-         return psz;
-     }
-     for (;;)
-     {
-         const char16 unit = *psz;
-         if (unit == '\0')
-         {
-             break;
-         }
-         if (instance->IsIdContinue(*psz))
-         {
-             psz++;
-             continue;
-         }
-         const bool startsPair = Js::NumberUtilities::IsSurrogateLowerPart(unit) && Js::NumberUtilities::IsSurrogateUpperPart(*(psz + 1));
-         if (!startsPair || !instance->IsIdContinue(Js::NumberUtilities::SurrogatePairAsCodePoint(unit, *(psz + 1))))
-         {
-             // Neither a plain nor a surrogate-pair IDContinue: the identifier ends here
-             break;
-         }
-         psz += 2;
-     }
-     return psz;
- }
- // Surrogate-aware UTF-8 variant of identifier skipping. Decodes characters from psz (bounded
- // by end); whenever the decoder sets utf8::doSecondSurrogatePair in options, a second Decode
- // call supplies the other half and the two are combined into one code point before testing.
- // Returns the position just past the last accepted character, or psz unchanged when the first
- // code point is not an identifier start.
- const LPCUTF8 Js::CharClassifier::SkipIdentifierSurrogateStartEnd(LPCUTF8 psz, LPCUTF8 end, const CharClassifier *instance)
- {
-     utf8::DecodeOptions options = utf8::doAllowThreeByteSurrogates;
-     // cursor is advanced by every Decode call; accepted only moves once a code point is accepted
-     LPCUTF8 cursor = psz;
-     LPCUTF8 accepted = psz;
-     codepoint_t decoded = utf8::Decode(cursor, end, options);
-     if (options & utf8::doSecondSurrogatePair)
-     {
-         decoded = Js::NumberUtilities::SurrogatePairAsCodePoint(decoded, utf8::Decode(cursor, end, options));
-     }
-     if (!instance->IsIdStart(decoded))
-     {
-         return accepted;
-     }
-     accepted = cursor;
-     // Keep consuming identifier-continue code points until a NUL or a rejected code point
-     for (;;)
-     {
-         decoded = utf8::Decode(cursor, end, options);
-         if (decoded == '\0')
-         {
-             break;
-         }
-         if (options & utf8::doSecondSurrogatePair)
-         {
-             decoded = Js::NumberUtilities::SurrogatePairAsCodePoint(decoded, utf8::Decode(cursor, end, options));
-         }
-         if (!instance->IsIdContinue(decoded))
-         {
-             break;
-         }
-         accepted = cursor;
-     }
-     return accepted;
- }
- // Returns the CharTypes classification of ch: big characters (per FBigChar) go through the
- // configured getBigCharTypeFunc, everything else is read from the charTypes table.
- CharTypes Js::CharClassifier::GetCharType(codepoint_t ch) const
- {
-     if (FBigChar(ch))
-     {
-         return getBigCharTypeFunc(ch, this);
-     }
-     return charTypes[ch];
- }
- #if ENABLE_UNICODE_API
- // Returns the CharacterTypeFlags of ch: with ENABLE_UNICODE_API, big characters (per FBigChar)
- // go through the configured getBigCharFlagsFunc and everything else is read from the charFlags
- // table. The #else branch performs a plain table lookup; it previously named a non-existent
- // namespace ("PlatformAgnostics") and would not have compiled if it were ever enabled.
- PlatformAgnostic::UnicodeText::CharacterTypeFlags Js::CharClassifier::GetCharFlags(codepoint_t ch) const
- {
- #if ENABLE_UNICODE_API
-     return FBigChar(ch) ? getBigCharFlagsFunc(ch, this) : PlatformAgnostic::UnicodeText::charFlags[ch];
- #else
-     return PlatformAgnostic::UnicodeText::charFlags[ch];
- #endif
- }
- #endif
- // Explicit instantiation of both the big-character (true) and ASCII (false) variants of each *Fast template defined in this file
- template bool Js::CharClassifier::IsIdStartFast<true>(codepoint_t) const;
- template bool Js::CharClassifier::IsIdStartFast<false>(codepoint_t) const;
- template bool Js::CharClassifier::IsIdContinueFast<true>(codepoint_t) const;
- template bool Js::CharClassifier::IsIdContinueFast<false>(codepoint_t) const;
- template bool Js::CharClassifier::IsWhiteSpaceFast<true>(codepoint_t) const;
- template bool Js::CharClassifier::IsWhiteSpaceFast<false>(codepoint_t) const;
|