//------------------------------------------------------------------------------------------------------- // Copyright (C) Microsoft. All rights reserved. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information. //------------------------------------------------------------------------------------------------------- #include "ParserPch.h" namespace PlatformAgnostic { namespace UnicodeText { // Technically, this is not specific to Unicode (in fact, it's used in the non-Unicode case) // But it's in this namespace for convenience static const CharacterTypeFlags charFlags[128] = { UnknownChar, /* 0x00 */ UnknownChar, /* 0x01 */ UnknownChar, /* 0x02 */ UnknownChar, /* 0x03 */ UnknownChar, /* 0x04 */ UnknownChar, /* 0x05 */ UnknownChar, /* 0x06 */ UnknownChar, /* 0x07 */ UnknownChar, /* 0x08 */ SpaceChar, /* 0x09 */ LineCharGroup, /* 0x0A */ SpaceChar, /* 0x0B */ SpaceChar, /* 0x0C */ LineCharGroup, /* 0x0D */ UnknownChar, /* 0x0E */ UnknownChar, /* 0x0F */ UnknownChar, /* 0x10 */ UnknownChar, /* 0x11 */ UnknownChar, /* 0x12 */ UnknownChar, /* 0x13 */ UnknownChar, /* 0x14 */ UnknownChar, /* 0x15 */ UnknownChar, /* 0x16 */ UnknownChar, /* 0x17 */ UnknownChar, /* 0x18 */ UnknownChar, /* 0x19 */ UnknownChar, /* 0x1A */ UnknownChar, /* 0x1B */ UnknownChar, /* 0x1C */ UnknownChar, /* 0x1D */ UnknownChar, /* 0x1E */ UnknownChar, /* 0x1F */ SpaceChar, /* 0x20 */ UnknownChar, /* 0x21 ! */ UnknownChar, /* 0x22 */ UnknownChar, /* 0x23 # */ LetterCharGroup, /* 0x24 $ */ UnknownChar, /* 0x25 % */ UnknownChar, /* 0x26 & */ UnknownChar, /* 0x27 */ UnknownChar, /* 0x28 */ UnknownChar, /* 0x29 */ UnknownChar, /* 0x2A */ UnknownChar, /* 0x2B */ UnknownChar, /* 0x2C */ UnknownChar, /* 0x2D */ UnknownChar, /* 0x2E */ UnknownChar, /* 0x2F */ DecimalCharGroup, /* 0x30 0 */ DecimalCharGroup, /* 0x31 1 */ DecimalCharGroup, /* 0x32 2 */ DecimalCharGroup, /* 0x33 3 */ DecimalCharGroup, /* 0x34 4 */ DecimalCharGroup, /* 0x35 5 */ DecimalCharGroup, /* 0x36 6 */ DecimalCharGroup, /* 0x37 7 */ DecimalCharGroup, /* 0x38 8 */ DecimalCharGroup, /* 0x39 9 */ UnknownChar, /* 0x3A */ UnknownChar, /* 0x3B */ UnknownChar, /* 0x3C < */ UnknownChar, /* 0x3D = */ UnknownChar, /* 0x3E > */ UnknownChar, /* 0x3F */ UnknownChar, /* 0x40 @ */ HexCharGroup, /* 0x41 A */ HexCharGroup, /* 0x42 B */ HexCharGroup, /* 0x43 C */ HexCharGroup, /* 0x44 D */ HexCharGroup, /* 0x45 E */ HexCharGroup, /* 0x46 F */ LetterCharGroup, /* 0x47 G */ LetterCharGroup, /* 0x48 H */ LetterCharGroup, /* 0x49 I */ LetterCharGroup, /* 0x4A J */ LetterCharGroup, /* 0x4B K */ LetterCharGroup, /* 0x4C L */ LetterCharGroup, /* 0x4D M */ LetterCharGroup, /* 0x4E N */ LetterCharGroup, /* 0x4F O */ LetterCharGroup, /* 0x50 P */ LetterCharGroup, /* 0x51 Q */ LetterCharGroup, /* 0x52 R */ LetterCharGroup, /* 0x53 S */ LetterCharGroup, /* 0x54 T */ LetterCharGroup, /* 0x55 U */ LetterCharGroup, /* 0x56 V */ LetterCharGroup, /* 0x57 W */ LetterCharGroup, /* 0x58 X */ LetterCharGroup, /* 0x59 Y */ LetterCharGroup, /* 0x5A Z */ UnknownChar, /* 0x5B */ UnknownChar, /* 0x5C */ UnknownChar, /* 0x5D */ UnknownChar, /* 0x5E */ LetterCharGroup, /* 0x5F _ */ UnknownChar, /* 0x60 */ HexCharGroup, /* 0x61 a */ HexCharGroup, /* 0x62 b */ HexCharGroup, /* 0x63 c */ HexCharGroup, /* 0x64 d */ HexCharGroup, /* 0x65 e */ HexCharGroup, /* 0x66 f */ LetterCharGroup, /* 0x67 g */ LetterCharGroup, /* 0x68 h */ LetterCharGroup, /* 0x69 i */ LetterCharGroup, /* 0x6A j */ LetterCharGroup, /* 0x6B k */ LetterCharGroup, /* 0x6C l */ LetterCharGroup, /* 0x6D m */ LetterCharGroup, /* 0x6E n */ LetterCharGroup, /* 0x6F o */ LetterCharGroup, /* 0x70 p */ LetterCharGroup, /* 0x71 q */ LetterCharGroup, /* 0x72 r */ LetterCharGroup, /* 0x73 s */ LetterCharGroup, /* 0x74 t */ LetterCharGroup, /* 0x75 u */ LetterCharGroup, /* 0x76 v */ LetterCharGroup, /* 0x77 w */ LetterCharGroup, /* 0x78 x */ LetterCharGroup, /* 0x79 y */ LetterCharGroup, /* 0x7A z */ UnknownChar, /* 0x7B */ UnknownChar, /* 0x7C */ UnknownChar, /* 0x7D */ UnknownChar, /* 0x7E */ UnknownChar /* 0x7F */ }; } }; /***************************************************************************** * * The _C_xxx enum and charTypes[] table are used to map a character to * simple classification values and flags. */ static const CharTypes charTypes[128] = { _C_NUL, _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, /* 00-07 */ _C_ERR, _C_WSP, _C_NWL, _C_WSP, _C_WSP, _C_NWL, _C_ERR, _C_ERR, /* 08-0F */ _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, /* 10-17 */ _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, /* 18-1F */ _C_WSP, _C_BNG, _C_QUO, _C_SHP, _C_DOL, _C_PCT, _C_AMP, _C_APO, /* 20-27 */ _C_LPR, _C_RPR, _C_MUL, _C_PLS, _C_CMA, _C_MIN, _C_DOT, _C_SLH, /* 28-2F */ _C_DIG, _C_DIG, _C_DIG, _C_DIG, _C_DIG, _C_DIG, _C_DIG, _C_DIG, /* 30-37 */ _C_DIG, _C_DIG, _C_COL, _C_SMC, _C_LT , _C_EQ , _C_GT , _C_QUE, /* 38-3F */ _C_AT , _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, /* 40-47 */ _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, /* 48-4F */ _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, /* 50-57 */ _C_LET, _C_LET, _C_LET, _C_LBR, _C_BSL, _C_RBR, _C_XOR, _C_USC, /* 58-5F */ _C_BKQ, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, /* 60-67 */ _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, /* 68-6F */ _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, /* 70-77 */ _C_LET, _C_LET, _C_LET, _C_LC , _C_BAR, _C_RC , _C_TIL, _C_ERR, /* 78-7F */ }; #if ENABLE_UNICODE_API bool Js::CharClassifier::BigCharIsWhitespaceDefault(codepoint_t ch, const Js::CharClassifier *instance) { return (instance->getBigCharFlagsFunc(ch, instance) & PlatformAgnostic::UnicodeText::CharacterTypeFlags::SpaceChar) != 0; } bool Js::CharClassifier::BigCharIsIdStartDefault(codepoint_t ch, const Js::CharClassifier *instance) { return (instance->getBigCharFlagsFunc(ch, instance) & PlatformAgnostic::UnicodeText::CharacterTypeFlags::IdLeadChar) != 0; } bool Js::CharClassifier::BigCharIsIdContinueDefault(codepoint_t ch, const Js::CharClassifier *instance) { return (instance->getBigCharFlagsFunc(ch, instance) & PlatformAgnostic::UnicodeText::CharacterTypeFlags::IdChar) != 0; } #endif CharTypes Js::CharClassifier::GetBigCharTypeES5(codepoint_t codepoint, const Js::CharClassifier *instance) { using namespace PlatformAgnostic::UnicodeText; if (codepoint > 0xFFFF) { return CharTypes::_C_ERR; } if (codepoint == kchLS || codepoint == kchPS) { return _C_NWL; } auto charType = GetLegacyCharacterClassificationType((char16)codepoint); if (charType == CharacterClassificationType::Letter) { return CharTypes::_C_LET; } else if (charType == CharacterClassificationType::Whitespace) { return CharTypes::_C_WSP; } return CharTypes::_C_ERR; } PlatformAgnostic::UnicodeText::CharacterTypeFlags Js::CharClassifier::GetBigCharFlagsES5(codepoint_t ch, const Js::CharClassifier *instance) { using namespace PlatformAgnostic::UnicodeText; //In ES5 the unicode and could be identifier parts if (ch == 0x200c || ch == 0x200d) { return PlatformAgnostic::UnicodeText::CharacterTypeFlags::IdChar; } // Make sure that the codepoint fits within the char16 range if (ch > 0xFFFF) { return UnknownChar; } return PlatformAgnostic::UnicodeText::GetLegacyCharacterTypeFlags((char16)ch); } /* * CharClassifier implementation */ CharTypes Js::CharClassifier::GetBigCharTypeES6(codepoint_t ch, const Js::CharClassifier *instance) { using namespace PlatformAgnostic::UnicodeText; Assert(ch > 0x7F); if (ch == 0xFEFF) { return CharTypes::_C_WSP; } UnicodeGeneralCategoryClass categoryClass = PlatformAgnostic::UnicodeText::GetGeneralCategoryClass(ch); switch(categoryClass) { case UnicodeGeneralCategoryClass::CategoryClassLetter: return CharTypes::_C_LET; case UnicodeGeneralCategoryClass::CategoryClassDigit: return CharTypes::_C_DIG; case UnicodeGeneralCategoryClass::CategoryClassLineSeparator: case UnicodeGeneralCategoryClass::CategoryClassParagraphSeparator: return CharTypes::_C_NWL; case UnicodeGeneralCategoryClass::CategoryClassSpaceSeparator: case UnicodeGeneralCategoryClass::CategoryClassSpacingCombiningMark: case UnicodeGeneralCategoryClass::CategoryClassNonSpacingMark: case UnicodeGeneralCategoryClass::CategoryClassConnectorPunctuation: return CharTypes::_C_WSP; default: break; } return CharTypes::_C_UNK; } /* From Unicode 6.3 http://www.unicode.org/reports/tr31/tr31-19.html ID_Start::: Characters having the Unicode General_Category of uppercase letters (Lu), lowercase letters (Ll), titlecase letters (Lt), modifier letters (Lm), other letters (Lo), letter numbers (Nl), minus Pattern_Syntax and Pattern_White_Space code points, plus stability extensions. Note that "other letters" includes ideographs. In set notation, this is [[:L:][:Nl:]--[:Pattern_Syntax:]--[:Pattern_White_Space:]] plus stability extensions. ID_Continue::: All of the above, plus characters having the Unicode General_Category of nonspacing marks (Mn), spacing combining marks (Mc), decimal number (Nd), connector punctuations (Pc), plus stability extensions, minus Pattern_Syntax and Pattern_White_Space code points. In set notation, this is [[:L:][:Nl:][:Mn:][:Mc:][:Nd:][:Pc:]--[:Pattern_Syntax:]--[:Pattern_White_Space:]] plus stability extensions. These are also known simply as Identifier Characters, because they are a superset of the ID_Start characters. */ PlatformAgnostic::UnicodeText::CharacterTypeFlags Js::CharClassifier::GetBigCharFlagsES6(codepoint_t ch, const Js::CharClassifier *instance) { using namespace PlatformAgnostic::UnicodeText; Assert(ch > 0x7F); UnicodeGeneralCategoryClass categoryClass = PlatformAgnostic::UnicodeText::GetGeneralCategoryClass(ch); switch(categoryClass) { case UnicodeGeneralCategoryClass::CategoryClassLetter: return BigCharIsIdStartES6(ch, instance) ? CharacterTypeFlags::LetterCharGroup : CharacterTypeFlags::UnknownChar; case UnicodeGeneralCategoryClass::CategoryClassSpacingCombiningMark: return BigCharIsIdContinueES6(ch, instance) ? CharacterTypeFlags::IdChar : CharacterTypeFlags::SpaceChar; case UnicodeGeneralCategoryClass::CategoryClassNonSpacingMark: case UnicodeGeneralCategoryClass::CategoryClassConnectorPunctuation: return BigCharIsIdContinueES6(ch, instance) ? CharacterTypeFlags::IdChar : CharacterTypeFlags::UnknownChar; case UnicodeGeneralCategoryClass::CategoryClassDigit: return BigCharIsIdContinueES6(ch, instance) ? CharacterTypeFlags::DecimalCharGroup : CharacterTypeFlags::DecimalChar; case UnicodeGeneralCategoryClass::CategoryClassLineSeparator: return CharacterTypeFlags::LineFeedChar; case UnicodeGeneralCategoryClass::CategoryClassParagraphSeparator: case UnicodeGeneralCategoryClass::CategoryClassSpaceSeparator: return CharacterTypeFlags::SpaceChar; default: break; } return CharacterTypeFlags::UnknownChar; } bool Js::CharClassifier::BigCharIsWhitespaceES6(codepoint_t ch, const CharClassifier *instance) { Assert(ch > 0x7F); if (ch == 0xFEFF) { return true; } return PlatformAgnostic::UnicodeText::IsWhitespace(ch); } bool Js::CharClassifier::BigCharIsIdStartES6(codepoint_t codePoint, const CharClassifier *instance) { Assert(codePoint > 0x7F); return PlatformAgnostic::UnicodeText::IsIdStart(codePoint); } bool Js::CharClassifier::BigCharIsIdContinueES6(codepoint_t codePoint, const CharClassifier *instance) { Assert(codePoint > 0x7F); if (codePoint == '$' || codePoint == '_' || codePoint == 0x200C /* Zero-width non-joiner */ || codePoint == 0x200D /* Zero-width joiner */) { return true; } return PlatformAgnostic::UnicodeText::IsIdContinue(codePoint); } template bool Js::CharClassifier::IsWhiteSpaceFast(codepoint_t ch) const { using namespace PlatformAgnostic::UnicodeText; Assert(isBigChar ? ch > 0x7F : ch < 0x80); return isBigChar ? this->bigCharIsWhitespaceFunc(ch, this) : (charFlags[ch] & CharacterTypeFlags::SpaceChar) != 0; } bool Js::CharClassifier::IsBiDirectionalChar(codepoint_t ch) const { //From http://www.unicode.org/reports/tr9/#Directional_Formatting_Codes switch (ch) { case 0x202A: //LEFT-TO-RIGHT EMBEDDING Treat the following text as embedded left-to-right case 0x202B: //RIGHT-TO-LEFT EMBEDDING Treat the following text as embedded right-to-left. case 0x202D: //LEFT-TO-RIGHT OVERRIDE Force following characters to be treated as strong left-to-right characters. case 0x202E: //RIGHT-TO-LEFT OVERRIDE Force following characters to be treated as strong right-to-left characters. case 0x202C: //POP DIRECTIONAL FORMATTING End the scope of the last LRE, RLE, RLO, or LRO. case 0x2066: //LEFT-TO-RIGHT ISOLATE Treat the following text as isolated and left-to-right. case 0x2067: //RIGHT-TO-LEFT ISOLATE Treat the following text as isolated and right-to-left. case 0x2068: //FIRST STRONG ISOLATE Treat the following text as isolated and in the direction of its first strong directional character that is not inside a nested isolate. case 0x2069: //POP DIRECTIONAL ISOLATE End the scope of the last LRI, RLI, or FSI. case 0x200E: //LEFT-TO-RIGHT MARK Left-to-right zero-width character case 0x200F: //RIGHT-TO-LEFT MARK Right-to-left zero-width non-Arabic character case 0x061C: //ARABIC LETTER MARK Right-to-left zero-width Arabic character return true; default: return false; } } template bool Js::CharClassifier::IsIdStartFast(codepoint_t ch) const { using namespace PlatformAgnostic::UnicodeText; Assert(isBigChar ? ch > 0x7F : ch < 0x80); return isBigChar ? this->bigCharIsIdStartFunc(ch, this) : (charFlags[ch] & CharacterTypeFlags::IdLeadChar) != 0; } template bool Js::CharClassifier::IsIdContinueFast(codepoint_t ch) const { using namespace PlatformAgnostic::UnicodeText; Assert(isBigChar ? ch > 0x7F : ch < 0x80); return isBigChar ? this->bigCharIsIdContinueFunc(ch, this) : (charFlags[ch] & CharacterTypeFlags::IdChar) != 0; } Js::CharClassifier::CharClassifier(void) { bool isES6UnicodeModeEnabled = CONFIG_FLAG(ES6Unicode); bool isFullUnicodeSupportAvailable = PlatformAgnostic::UnicodeText::IsExternalUnicodeLibraryAvailable(); // The following assertions are intentionally excluded from ChakraCore by guarding on NTBUILD. // This is to work around limitations of the i18n library downlevel (Win7, Win 8.0) // where CharClassifier functionality is not available. // TODO: Ideally, we would use the following guard instead to assert when an i18n library is available: // #if INTL_ICU || INTL_WINGLOB #ifdef NTBUILD AssertMsg(isFullUnicodeSupportAvailable, "Windows.Globalization needs to present with IUnicodeCharacterStatics support for Chakra.dll to work"); if (!isFullUnicodeSupportAvailable) { Js::Throw::FatalInternalGlobalizationError(); } #endif // If we're in ES6 mode, and we have full support for Unicode character classification // from an external library, then use the ES6/Surrogate pair supported versions of the functions // Otherwise, fallback to the ES5 versions which don't need an external library #if ENABLE_UNICODE_API if (isES6UnicodeModeEnabled && isFullUnicodeSupportAvailable) #endif { bigCharIsIdStartFunc = &CharClassifier::BigCharIsIdStartES6; bigCharIsIdContinueFunc = &CharClassifier::BigCharIsIdContinueES6; bigCharIsWhitespaceFunc = &CharClassifier::BigCharIsWhitespaceES6; skipWhiteSpaceFunc = &CharClassifier::SkipWhiteSpaceSurrogate; skipWhiteSpaceStartEndFunc = &CharClassifier::SkipWhiteSpaceSurrogateStartEnd; skipIdentifierFunc = &CharClassifier::SkipIdentifierSurrogate; skipIdentifierStartEndFunc = &CharClassifier::SkipIdentifierSurrogateStartEnd; getBigCharTypeFunc = &CharClassifier::GetBigCharTypeES6; getBigCharFlagsFunc = &CharClassifier::GetBigCharFlagsES6; } #if ENABLE_UNICODE_API else { bigCharIsIdStartFunc = &CharClassifier::BigCharIsIdStartDefault; bigCharIsIdContinueFunc = &CharClassifier::BigCharIsIdContinueDefault; bigCharIsWhitespaceFunc = &CharClassifier::BigCharIsWhitespaceDefault; skipWhiteSpaceFunc = &CharClassifier::SkipWhiteSpaceNonSurrogate; skipWhiteSpaceStartEndFunc = &CharClassifier::SkipWhiteSpaceNonSurrogateStartEnd; skipIdentifierFunc = &CharClassifier::SkipIdentifierNonSurrogate; skipIdentifierStartEndFunc = &CharClassifier::SkipIdentifierNonSurrogateStartEnd; getBigCharTypeFunc = &CharClassifier::GetBigCharTypeES5; getBigCharFlagsFunc = &CharClassifier::GetBigCharFlagsES5; } #endif } const OLECHAR* Js::CharClassifier::SkipWhiteSpaceNonSurrogate(LPCOLESTR psz, const CharClassifier *instance) { for ( ; instance->IsWhiteSpace(*psz); psz++) { } return psz; } const OLECHAR* Js::CharClassifier::SkipWhiteSpaceNonSurrogateStartEnd(_In_reads_(pStrEnd - pStr) LPCOLESTR pStr, _In_ LPCOLESTR pStrEnd, const CharClassifier *instance) { for ( ; instance->IsWhiteSpace(*pStr) && pStr < pStrEnd; pStr++) { } return pStr; } const OLECHAR* Js::CharClassifier::SkipIdentifierNonSurrogate(LPCOLESTR psz, const CharClassifier *instance) { if (!instance->IsIdStart(*psz)) { return psz; } for (psz++; instance->IsIdContinue(*psz); psz++) { } return psz; } const LPCUTF8 Js::CharClassifier::SkipIdentifierNonSurrogateStartEnd(LPCUTF8 psz, LPCUTF8 end, const CharClassifier *instance) { utf8::DecodeOptions options = utf8::doAllowThreeByteSurrogates; LPCUTF8 p = psz; if (!instance->IsIdStart(utf8::Decode(p, end, options))) { return psz; } psz = p; while (instance->IsIdContinue(utf8::Decode(p, end, options))) { psz = p; } return psz; } const OLECHAR* Js::CharClassifier::SkipWhiteSpaceSurrogate(LPCOLESTR psz, const CharClassifier *instance) { char16 currentChar = 0x0; // Slow path is to check for a surrogate each iteration. // There is no new surrogate whitespaces as of yet, however, might be in the future, so surrogates still need to be checked // So, based on that, best way is to hit the slow path if the current character is not a whitespace in [0, FFFF]; while((currentChar = *psz) != '\0') { if (!instance->IsWhiteSpace(*psz)) { if (Js::NumberUtilities::IsSurrogateLowerPart(currentChar) && Js::NumberUtilities::IsSurrogateUpperPart(*(psz + 1))) { if (instance->IsWhiteSpace(Js::NumberUtilities::SurrogatePairAsCodePoint(currentChar, *(psz + 1)))) { psz += 2; continue; } } // Above case failed, so we have reached the last whitespace return psz; } psz++; } return psz; } const OLECHAR* Js::CharClassifier::SkipWhiteSpaceSurrogateStartEnd(_In_reads_(pStrEnd - pStr) LPCOLESTR pStr, _In_ LPCOLESTR pStrEnd, const CharClassifier *instance) { char16 currentChar = 0x0; // Same reasoning as above while(pStr < pStrEnd && (currentChar = *pStr) != '\0') { if (!instance->IsWhiteSpace(currentChar)) { if (Js::NumberUtilities::IsSurrogateLowerPart(currentChar) && (pStr + 1) < pStrEnd && Js::NumberUtilities::IsSurrogateUpperPart(*(pStr + 1))) { if (instance->IsWhiteSpace(Js::NumberUtilities::SurrogatePairAsCodePoint(currentChar, *(pStr + 1)))) { pStr += 2; continue; } } // Above case failed, so we have reached the last whitespace return pStr; } pStr++; } return pStr; } const OLECHAR* Js::CharClassifier::SkipIdentifierSurrogate(LPCOLESTR psz, const CharClassifier *instance) { // Similar reasoning to above, however we do have surrogate identifiers, but less likely to occur in code. char16 currentChar = *psz; if (!instance->IsIdStart(currentChar)) { if (Js::NumberUtilities::IsSurrogateLowerPart(currentChar) && Js::NumberUtilities::IsSurrogateUpperPart(*(psz + 1)) && instance->IsIdStart(Js::NumberUtilities::SurrogatePairAsCodePoint(currentChar, *(psz + 1)))) { // For the extra surrogate char psz ++; } else { return psz; } } psz++; while((currentChar = *psz) != '\0') { if (!instance->IsIdContinue(*psz)) { if (Js::NumberUtilities::IsSurrogateLowerPart(currentChar) && Js::NumberUtilities::IsSurrogateUpperPart(*(psz + 1))) { if (instance->IsIdContinue(Js::NumberUtilities::SurrogatePairAsCodePoint(currentChar, *(psz + 1)))) { psz += 2; continue; } } // Above case failed, so we have reached the last IDContinue return psz; } psz++; } return psz; } const LPCUTF8 Js::CharClassifier::SkipIdentifierSurrogateStartEnd(LPCUTF8 psz, LPCUTF8 end, const CharClassifier *instance) { LPCUTF8 currentPosition = psz; utf8::DecodeOptions options = utf8::doAllowThreeByteSurrogates; // Similar reasoning to above, however we do have surrogate identifiers, but less likely to occur in code. codepoint_t currentChar = utf8::Decode(currentPosition, end, options); if (options & utf8::doSecondSurrogatePair) { currentChar = Js::NumberUtilities::SurrogatePairAsCodePoint(currentChar, utf8::Decode(currentPosition, end, options)); } if (!instance->IsIdStart(currentChar)) { return psz; } psz = currentPosition; // Slow path is to check for a surrogate each iteration. // There is no new surrogate whitespaces as of yet, however, might be in the future, so surrogates still need to be checked // So, based on that, best way is to hit the slow path if the current character is not a whitespace in [0, FFFF]; while((currentChar = utf8::Decode(currentPosition, end, options)) != '\0') { if (options & utf8::doSecondSurrogatePair) { currentChar = Js::NumberUtilities::SurrogatePairAsCodePoint(currentChar, utf8::Decode(currentPosition, end, options)); } if (!instance->IsIdContinue(currentChar)) { return psz; } psz = currentPosition; } return psz; } CharTypes Js::CharClassifier::GetCharType(codepoint_t ch) const { return FBigChar(ch) ? getBigCharTypeFunc(ch, this) : charTypes[ch]; } #if ENABLE_UNICODE_API PlatformAgnostic::UnicodeText::CharacterTypeFlags Js::CharClassifier::GetCharFlags(codepoint_t ch) const { #if ENABLE_UNICODE_API return FBigChar(ch) ? getBigCharFlagsFunc(ch, this) : PlatformAgnostic::UnicodeText::charFlags[ch]; #else return PlatformAgnostics::UnicodeText::charFlags[ch]; #endif } #endif // Explicit instantiation template bool Js::CharClassifier::IsIdStartFast(codepoint_t) const; template bool Js::CharClassifier::IsIdStartFast(codepoint_t) const; template bool Js::CharClassifier::IsIdContinueFast(codepoint_t) const; template bool Js::CharClassifier::IsIdContinueFast(codepoint_t) const; template bool Js::CharClassifier::IsWhiteSpaceFast(codepoint_t) const; template bool Js::CharClassifier::IsWhiteSpaceFast(codepoint_t) const;