CharClassifier.cpp 27 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673
  1. //-------------------------------------------------------------------------------------------------------
  2. // Copyright (C) Microsoft. All rights reserved.
  3. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
  4. //-------------------------------------------------------------------------------------------------------
  5. #include "ParserPch.h"
  6. namespace PlatformAgnostic
  7. {
  8. namespace UnicodeText
  9. {
  10. // Technically, this is not specific to Unicode (in fact, it's used in the non-Unicode case)
  11. // But it's in this namespace for convenience
  12. static const CharacterTypeFlags charFlags[128] =
  13. {
  14. UnknownChar, /* 0x00 */
  15. UnknownChar, /* 0x01 */
  16. UnknownChar, /* 0x02 */
  17. UnknownChar, /* 0x03 */
  18. UnknownChar, /* 0x04 */
  19. UnknownChar, /* 0x05 */
  20. UnknownChar, /* 0x06 */
  21. UnknownChar, /* 0x07 */
  22. UnknownChar, /* 0x08 */
  23. SpaceChar, /* 0x09 */
  24. LineCharGroup, /* 0x0A */
  25. SpaceChar, /* 0x0B */
  26. SpaceChar, /* 0x0C */
  27. LineCharGroup, /* 0x0D */
  28. UnknownChar, /* 0x0E */
  29. UnknownChar, /* 0x0F */
  30. UnknownChar, /* 0x10 */
  31. UnknownChar, /* 0x11 */
  32. UnknownChar, /* 0x12 */
  33. UnknownChar, /* 0x13 */
  34. UnknownChar, /* 0x14 */
  35. UnknownChar, /* 0x15 */
  36. UnknownChar, /* 0x16 */
  37. UnknownChar, /* 0x17 */
  38. UnknownChar, /* 0x18 */
  39. UnknownChar, /* 0x19 */
  40. UnknownChar, /* 0x1A */
  41. UnknownChar, /* 0x1B */
  42. UnknownChar, /* 0x1C */
  43. UnknownChar, /* 0x1D */
  44. UnknownChar, /* 0x1E */
  45. UnknownChar, /* 0x1F */
  46. SpaceChar, /* 0x20 */
  47. UnknownChar, /* 0x21 ! */
  48. UnknownChar, /* 0x22 */
  49. UnknownChar, /* 0x23 # */
  50. LetterCharGroup, /* 0x24 $ */
  51. UnknownChar, /* 0x25 % */
  52. UnknownChar, /* 0x26 & */
  53. UnknownChar, /* 0x27 */
  54. UnknownChar, /* 0x28 */
  55. UnknownChar, /* 0x29 */
  56. UnknownChar, /* 0x2A */
  57. UnknownChar, /* 0x2B */
  58. UnknownChar, /* 0x2C */
  59. UnknownChar, /* 0x2D */
  60. UnknownChar, /* 0x2E */
  61. UnknownChar, /* 0x2F */
  62. DecimalCharGroup, /* 0x30 0 */
  63. DecimalCharGroup, /* 0x31 1 */
  64. DecimalCharGroup, /* 0x32 2 */
  65. DecimalCharGroup, /* 0x33 3 */
  66. DecimalCharGroup, /* 0x34 4 */
  67. DecimalCharGroup, /* 0x35 5 */
  68. DecimalCharGroup, /* 0x36 6 */
  69. DecimalCharGroup, /* 0x37 7 */
  70. DecimalCharGroup, /* 0x38 8 */
  71. DecimalCharGroup, /* 0x39 9 */
  72. UnknownChar, /* 0x3A */
  73. UnknownChar, /* 0x3B */
  74. UnknownChar, /* 0x3C < */
  75. UnknownChar, /* 0x3D = */
  76. UnknownChar, /* 0x3E > */
  77. UnknownChar, /* 0x3F */
  78. UnknownChar, /* 0x40 @ */
  79. HexCharGroup, /* 0x41 A */
  80. HexCharGroup, /* 0x42 B */
  81. HexCharGroup, /* 0x43 C */
  82. HexCharGroup, /* 0x44 D */
  83. HexCharGroup, /* 0x45 E */
  84. HexCharGroup, /* 0x46 F */
  85. LetterCharGroup, /* 0x47 G */
  86. LetterCharGroup, /* 0x48 H */
  87. LetterCharGroup, /* 0x49 I */
  88. LetterCharGroup, /* 0x4A J */
  89. LetterCharGroup, /* 0x4B K */
  90. LetterCharGroup, /* 0x4C L */
  91. LetterCharGroup, /* 0x4D M */
  92. LetterCharGroup, /* 0x4E N */
  93. LetterCharGroup, /* 0x4F O */
  94. LetterCharGroup, /* 0x50 P */
  95. LetterCharGroup, /* 0x51 Q */
  96. LetterCharGroup, /* 0x52 R */
  97. LetterCharGroup, /* 0x53 S */
  98. LetterCharGroup, /* 0x54 T */
  99. LetterCharGroup, /* 0x55 U */
  100. LetterCharGroup, /* 0x56 V */
  101. LetterCharGroup, /* 0x57 W */
  102. LetterCharGroup, /* 0x58 X */
  103. LetterCharGroup, /* 0x59 Y */
  104. LetterCharGroup, /* 0x5A Z */
  105. UnknownChar, /* 0x5B */
  106. UnknownChar, /* 0x5C */
  107. UnknownChar, /* 0x5D */
  108. UnknownChar, /* 0x5E */
  109. LetterCharGroup, /* 0x5F _ */
  110. UnknownChar, /* 0x60 */
  111. HexCharGroup, /* 0x61 a */
  112. HexCharGroup, /* 0x62 b */
  113. HexCharGroup, /* 0x63 c */
  114. HexCharGroup, /* 0x64 d */
  115. HexCharGroup, /* 0x65 e */
  116. HexCharGroup, /* 0x66 f */
  117. LetterCharGroup, /* 0x67 g */
  118. LetterCharGroup, /* 0x68 h */
  119. LetterCharGroup, /* 0x69 i */
  120. LetterCharGroup, /* 0x6A j */
  121. LetterCharGroup, /* 0x6B k */
  122. LetterCharGroup, /* 0x6C l */
  123. LetterCharGroup, /* 0x6D m */
  124. LetterCharGroup, /* 0x6E n */
  125. LetterCharGroup, /* 0x6F o */
  126. LetterCharGroup, /* 0x70 p */
  127. LetterCharGroup, /* 0x71 q */
  128. LetterCharGroup, /* 0x72 r */
  129. LetterCharGroup, /* 0x73 s */
  130. LetterCharGroup, /* 0x74 t */
  131. LetterCharGroup, /* 0x75 u */
  132. LetterCharGroup, /* 0x76 v */
  133. LetterCharGroup, /* 0x77 w */
  134. LetterCharGroup, /* 0x78 x */
  135. LetterCharGroup, /* 0x79 y */
  136. LetterCharGroup, /* 0x7A z */
  137. UnknownChar, /* 0x7B */
  138. UnknownChar, /* 0x7C */
  139. UnknownChar, /* 0x7D */
  140. UnknownChar, /* 0x7E */
  141. UnknownChar /* 0x7F */
  142. };
  143. }
  144. };
  145. /*****************************************************************************
  146. *
  147. * The _C_xxx enum and charTypes[] table are used to map a character to
  148. * simple classification values and flags.
  149. */
  150. static const CharTypes charTypes[128] =
  151. {
  152. _C_NUL, _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, /* 00-07 */
  153. _C_ERR, _C_WSP, _C_NWL, _C_WSP, _C_WSP, _C_NWL, _C_ERR, _C_ERR, /* 08-0F */
  154. _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, /* 10-17 */
  155. _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, /* 18-1F */
  156. _C_WSP, _C_BNG, _C_QUO, _C_SHP, _C_DOL, _C_PCT, _C_AMP, _C_APO, /* 20-27 */
  157. _C_LPR, _C_RPR, _C_MUL, _C_PLS, _C_CMA, _C_MIN, _C_DOT, _C_SLH, /* 28-2F */
  158. _C_DIG, _C_DIG, _C_DIG, _C_DIG, _C_DIG, _C_DIG, _C_DIG, _C_DIG, /* 30-37 */
  159. _C_DIG, _C_DIG, _C_COL, _C_SMC, _C_LT , _C_EQ , _C_GT , _C_QUE, /* 38-3F */
  160. _C_AT , _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, /* 40-47 */
  161. _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, /* 48-4F */
  162. _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, /* 50-57 */
  163. _C_LET, _C_LET, _C_LET, _C_LBR, _C_BSL, _C_RBR, _C_XOR, _C_USC, /* 58-5F */
  164. _C_BKQ, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, /* 60-67 */
  165. _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, /* 68-6F */
  166. _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, /* 70-77 */
  167. _C_LET, _C_LET, _C_LET, _C_LC , _C_BAR, _C_RC , _C_TIL, _C_ERR, /* 78-7F */
  168. };
  169. #if ENABLE_UNICODE_API
  170. bool Js::CharClassifier::BigCharIsWhitespaceDefault(codepoint_t ch, const Js::CharClassifier *instance)
  171. {
  172. return (instance->getBigCharFlagsFunc(ch, instance) & PlatformAgnostic::UnicodeText::CharacterTypeFlags::SpaceChar) != 0;
  173. }
  174. bool Js::CharClassifier::BigCharIsIdStartDefault(codepoint_t ch, const Js::CharClassifier *instance)
  175. {
  176. return (instance->getBigCharFlagsFunc(ch, instance) & PlatformAgnostic::UnicodeText::CharacterTypeFlags::IdLeadChar) != 0;
  177. }
  178. bool Js::CharClassifier::BigCharIsIdContinueDefault(codepoint_t ch, const Js::CharClassifier *instance)
  179. {
  180. return (instance->getBigCharFlagsFunc(ch, instance) & PlatformAgnostic::UnicodeText::CharacterTypeFlags::IdChar) != 0;
  181. }
  182. #endif
  183. CharTypes Js::CharClassifier::GetBigCharTypeES5(codepoint_t codepoint, const Js::CharClassifier *instance)
  184. {
  185. using namespace PlatformAgnostic::UnicodeText;
  186. if (codepoint > 0xFFFF)
  187. {
  188. return CharTypes::_C_ERR;
  189. }
  190. if (codepoint == kchLS || codepoint == kchPS)
  191. {
  192. return _C_NWL;
  193. }
  194. auto charType = GetLegacyCharacterClassificationType((char16)codepoint);
  195. if (charType == CharacterClassificationType::Letter)
  196. {
  197. return CharTypes::_C_LET;
  198. }
  199. else if (charType == CharacterClassificationType::Whitespace)
  200. {
  201. return CharTypes::_C_WSP;
  202. }
  203. return CharTypes::_C_ERR;
  204. }
  205. PlatformAgnostic::UnicodeText::CharacterTypeFlags Js::CharClassifier::GetBigCharFlagsES5(codepoint_t ch, const Js::CharClassifier *instance)
  206. {
  207. using namespace PlatformAgnostic::UnicodeText;
  208. //In ES5 the unicode <ZWNJ> and <ZWJ> could be identifier parts
  209. if (ch == 0x200c || ch == 0x200d)
  210. {
  211. return PlatformAgnostic::UnicodeText::CharacterTypeFlags::IdChar;
  212. }
  213. // Make sure that the codepoint fits within the char16 range
  214. if (ch > 0xFFFF)
  215. {
  216. return UnknownChar;
  217. }
  218. return PlatformAgnostic::UnicodeText::GetLegacyCharacterTypeFlags((char16)ch);
  219. }
  220. /*
  221. * CharClassifier implementation
  222. */
  223. CharTypes Js::CharClassifier::GetBigCharTypeES6(codepoint_t ch, const Js::CharClassifier *instance)
  224. {
  225. using namespace PlatformAgnostic::UnicodeText;
  226. Assert(ch > 0x7F);
  227. if (ch == 0xFEFF)
  228. {
  229. return CharTypes::_C_WSP;
  230. }
  231. UnicodeGeneralCategoryClass categoryClass = PlatformAgnostic::UnicodeText::GetGeneralCategoryClass(ch);
  232. switch(categoryClass)
  233. {
  234. case UnicodeGeneralCategoryClass::CategoryClassLetter:
  235. return CharTypes::_C_LET;
  236. case UnicodeGeneralCategoryClass::CategoryClassDigit:
  237. return CharTypes::_C_DIG;
  238. case UnicodeGeneralCategoryClass::CategoryClassLineSeparator:
  239. case UnicodeGeneralCategoryClass::CategoryClassParagraphSeparator:
  240. return CharTypes::_C_NWL;
  241. case UnicodeGeneralCategoryClass::CategoryClassSpaceSeparator:
  242. case UnicodeGeneralCategoryClass::CategoryClassSpacingCombiningMark:
  243. case UnicodeGeneralCategoryClass::CategoryClassNonSpacingMark:
  244. case UnicodeGeneralCategoryClass::CategoryClassConnectorPunctuation:
  245. return CharTypes::_C_WSP;
  246. default:
  247. break;
  248. }
  249. return CharTypes::_C_UNK;
  250. }
  251. /*
  252. From Unicode 6.3 http://www.unicode.org/reports/tr31/tr31-19.html
  253. ID_Start:::
  254. Characters having the Unicode General_Category of uppercase letters (Lu), lowercase letters (Ll), titlecase letters (Lt), modifier letters (Lm), other letters (Lo), letter numbers (Nl), minus Pattern_Syntax and Pattern_White_Space code points, plus stability extensions. Note that "other letters" includes ideographs.
  255. In set notation, this is [[:L:][:Nl:]--[:Pattern_Syntax:]--[:Pattern_White_Space:]] plus stability extensions.
  256. ID_Continue:::
  257. All of the above, plus characters having the Unicode General_Category of nonspacing marks (Mn), spacing combining marks (Mc), decimal number (Nd), connector punctuations (Pc), plus stability extensions, minus Pattern_Syntax and Pattern_White_Space code points.
  258. In set notation, this is [[:L:][:Nl:][:Mn:][:Mc:][:Nd:][:Pc:]--[:Pattern_Syntax:]--[:Pattern_White_Space:]] plus stability extensions.
  259. These are also known simply as Identifier Characters, because they are a superset of the ID_Start characters.
  260. */
  261. PlatformAgnostic::UnicodeText::CharacterTypeFlags Js::CharClassifier::GetBigCharFlagsES6(codepoint_t ch, const Js::CharClassifier *instance)
  262. {
  263. using namespace PlatformAgnostic::UnicodeText;
  264. Assert(ch > 0x7F);
  265. UnicodeGeneralCategoryClass categoryClass = PlatformAgnostic::UnicodeText::GetGeneralCategoryClass(ch);
  266. switch(categoryClass)
  267. {
  268. case UnicodeGeneralCategoryClass::CategoryClassLetter:
  269. return BigCharIsIdStartES6(ch, instance) ? CharacterTypeFlags::LetterCharGroup : CharacterTypeFlags::UnknownChar;
  270. case UnicodeGeneralCategoryClass::CategoryClassSpacingCombiningMark:
  271. return BigCharIsIdContinueES6(ch, instance) ? CharacterTypeFlags::IdChar : CharacterTypeFlags::SpaceChar;
  272. case UnicodeGeneralCategoryClass::CategoryClassNonSpacingMark:
  273. case UnicodeGeneralCategoryClass::CategoryClassConnectorPunctuation:
  274. return BigCharIsIdContinueES6(ch, instance) ? CharacterTypeFlags::IdChar : CharacterTypeFlags::UnknownChar;
  275. case UnicodeGeneralCategoryClass::CategoryClassDigit:
  276. return BigCharIsIdContinueES6(ch, instance) ? CharacterTypeFlags::DecimalCharGroup : CharacterTypeFlags::DecimalChar;
  277. case UnicodeGeneralCategoryClass::CategoryClassLineSeparator:
  278. return CharacterTypeFlags::LineFeedChar;
  279. case UnicodeGeneralCategoryClass::CategoryClassParagraphSeparator:
  280. case UnicodeGeneralCategoryClass::CategoryClassSpaceSeparator:
  281. return CharacterTypeFlags::SpaceChar;
  282. default:
  283. break;
  284. }
  285. return CharacterTypeFlags::UnknownChar;
  286. }
  287. bool Js::CharClassifier::BigCharIsWhitespaceES6(codepoint_t ch, const CharClassifier *instance)
  288. {
  289. Assert(ch > 0x7F);
  290. if (ch == 0xFEFF)
  291. {
  292. return true;
  293. }
  294. return PlatformAgnostic::UnicodeText::IsWhitespace(ch);
  295. }
  296. bool Js::CharClassifier::BigCharIsIdStartES6(codepoint_t codePoint, const CharClassifier *instance)
  297. {
  298. Assert(codePoint > 0x7F);
  299. return PlatformAgnostic::UnicodeText::IsIdStart(codePoint);
  300. }
  301. bool Js::CharClassifier::BigCharIsIdContinueES6(codepoint_t codePoint, const CharClassifier *instance)
  302. {
  303. Assert(codePoint > 0x7F);
  304. if (codePoint == '$' || codePoint == '_' || codePoint == 0x200C /* Zero-width non-joiner */ || codePoint == 0x200D /* Zero-width joiner */)
  305. {
  306. return true;
  307. }
  308. return PlatformAgnostic::UnicodeText::IsIdContinue(codePoint);
  309. }
  310. template <bool isBigChar>
  311. bool Js::CharClassifier::IsWhiteSpaceFast(codepoint_t ch) const
  312. {
  313. using namespace PlatformAgnostic::UnicodeText;
  314. Assert(isBigChar ? ch > 0x7F : ch < 0x80);
  315. return isBigChar ? this->bigCharIsWhitespaceFunc(ch, this) : (charFlags[ch] & CharacterTypeFlags::SpaceChar) != 0;
  316. }
  317. bool Js::CharClassifier::IsBiDirectionalChar(codepoint_t ch) const
  318. {
  319. //From http://www.unicode.org/reports/tr9/#Directional_Formatting_Codes
  320. switch (ch)
  321. {
  322. case 0x202A: //LEFT-TO-RIGHT EMBEDDING Treat the following text as embedded left-to-right
  323. case 0x202B: //RIGHT-TO-LEFT EMBEDDING Treat the following text as embedded right-to-left.
  324. case 0x202D: //LEFT-TO-RIGHT OVERRIDE Force following characters to be treated as strong left-to-right characters.
  325. case 0x202E: //RIGHT-TO-LEFT OVERRIDE Force following characters to be treated as strong right-to-left characters.
  326. case 0x202C: //POP DIRECTIONAL FORMATTING End the scope of the last LRE, RLE, RLO, or LRO.
  327. case 0x2066: //LEFT-TO-RIGHT ISOLATE Treat the following text as isolated and left-to-right.
  328. case 0x2067: //RIGHT-TO-LEFT ISOLATE Treat the following text as isolated and right-to-left.
  329. case 0x2068: //FIRST STRONG ISOLATE Treat the following text as isolated and in the direction of its first strong directional character that is not inside a nested isolate.
  330. case 0x2069: //POP DIRECTIONAL ISOLATE End the scope of the last LRI, RLI, or FSI.
  331. case 0x200E: //LEFT-TO-RIGHT MARK Left-to-right zero-width character
  332. case 0x200F: //RIGHT-TO-LEFT MARK Right-to-left zero-width non-Arabic character
  333. case 0x061C: //ARABIC LETTER MARK Right-to-left zero-width Arabic character
  334. return true;
  335. default:
  336. return false;
  337. }
  338. }
  339. template<bool isBigChar>
  340. bool Js::CharClassifier::IsIdStartFast(codepoint_t ch) const
  341. {
  342. using namespace PlatformAgnostic::UnicodeText;
  343. Assert(isBigChar ? ch > 0x7F : ch < 0x80);
  344. return isBigChar ? this->bigCharIsIdStartFunc(ch, this) : (charFlags[ch] & CharacterTypeFlags::IdLeadChar) != 0;
  345. }
  346. template<bool isBigChar>
  347. bool Js::CharClassifier::IsIdContinueFast(codepoint_t ch) const
  348. {
  349. using namespace PlatformAgnostic::UnicodeText;
  350. Assert(isBigChar ? ch > 0x7F : ch < 0x80);
  351. return isBigChar ? this->bigCharIsIdContinueFunc(ch, this) : (charFlags[ch] & CharacterTypeFlags::IdChar) != 0;
  352. }
  353. Js::CharClassifier::CharClassifier(void)
  354. {
  355. bool isES6UnicodeModeEnabled = CONFIG_FLAG(ES6Unicode);
  356. bool isFullUnicodeSupportAvailable = PlatformAgnostic::UnicodeText::IsExternalUnicodeLibraryAvailable();
  357. #ifdef NTBUILD
  358. AssertMsg(isFullUnicodeSupportAvailable, "Windows.Globalization needs to present with IUnicodeCharacterStatics support for Chakra.dll to work");
  359. if (!isFullUnicodeSupportAvailable)
  360. {
  361. Js::Throw::FatalInternalError();
  362. }
  363. #endif
  364. // If we're in ES6 mode, and we have full support for Unicode character classification
  365. // from an external library, then use the ES6/Surrogate pair supported versions of the functions
  366. // Otherwise, fallback to the ES5 versions which don't need an external library
  367. #if ENABLE_UNICODE_API
  368. if (isES6UnicodeModeEnabled && isFullUnicodeSupportAvailable)
  369. #endif
  370. {
  371. bigCharIsIdStartFunc = &CharClassifier::BigCharIsIdStartES6;
  372. bigCharIsIdContinueFunc = &CharClassifier::BigCharIsIdContinueES6;
  373. bigCharIsWhitespaceFunc = &CharClassifier::BigCharIsWhitespaceES6;
  374. skipWhiteSpaceFunc = &CharClassifier::SkipWhiteSpaceSurrogate;
  375. skipWhiteSpaceStartEndFunc = &CharClassifier::SkipWhiteSpaceSurrogateStartEnd;
  376. skipIdentifierFunc = &CharClassifier::SkipIdentifierSurrogate;
  377. skipIdentifierStartEndFunc = &CharClassifier::SkipIdentifierSurrogateStartEnd;
  378. getBigCharTypeFunc = &CharClassifier::GetBigCharTypeES6;
  379. getBigCharFlagsFunc = &CharClassifier::GetBigCharFlagsES6;
  380. }
  381. #if ENABLE_UNICODE_API
  382. else
  383. {
  384. bigCharIsIdStartFunc = &CharClassifier::BigCharIsIdStartDefault;
  385. bigCharIsIdContinueFunc = &CharClassifier::BigCharIsIdContinueDefault;
  386. bigCharIsWhitespaceFunc = &CharClassifier::BigCharIsWhitespaceDefault;
  387. skipWhiteSpaceFunc = &CharClassifier::SkipWhiteSpaceNonSurrogate;
  388. skipWhiteSpaceStartEndFunc = &CharClassifier::SkipWhiteSpaceNonSurrogateStartEnd;
  389. skipIdentifierFunc = &CharClassifier::SkipIdentifierNonSurrogate;
  390. skipIdentifierStartEndFunc = &CharClassifier::SkipIdentifierNonSurrogateStartEnd;
  391. getBigCharTypeFunc = &CharClassifier::GetBigCharTypeES5;
  392. getBigCharFlagsFunc = &CharClassifier::GetBigCharFlagsES5;
  393. }
  394. #endif
  395. }
  396. const OLECHAR* Js::CharClassifier::SkipWhiteSpaceNonSurrogate(LPCOLESTR psz, const CharClassifier *instance)
  397. {
  398. for ( ; instance->IsWhiteSpace(*psz); psz++)
  399. {
  400. }
  401. return psz;
  402. }
  403. const OLECHAR* Js::CharClassifier::SkipWhiteSpaceNonSurrogateStartEnd(_In_reads_(pStrEnd - pStr) LPCOLESTR pStr, _In_ LPCOLESTR pStrEnd, const CharClassifier *instance)
  404. {
  405. for ( ; instance->IsWhiteSpace(*pStr) && pStr < pStrEnd; pStr++)
  406. {
  407. }
  408. return pStr;
  409. }
  410. const OLECHAR* Js::CharClassifier::SkipIdentifierNonSurrogate(LPCOLESTR psz, const CharClassifier *instance)
  411. {
  412. if (!instance->IsIdStart(*psz))
  413. {
  414. return psz;
  415. }
  416. for (psz++; instance->IsIdContinue(*psz); psz++)
  417. {
  418. }
  419. return psz;
  420. }
  421. const LPCUTF8 Js::CharClassifier::SkipIdentifierNonSurrogateStartEnd(LPCUTF8 psz, LPCUTF8 end, const CharClassifier *instance)
  422. {
  423. utf8::DecodeOptions options = utf8::doAllowThreeByteSurrogates;
  424. LPCUTF8 p = psz;
  425. if (!instance->IsIdStart(utf8::Decode(p, end, options)))
  426. {
  427. return psz;
  428. }
  429. psz = p;
  430. while (instance->IsIdContinue(utf8::Decode(p, end, options)))
  431. {
  432. psz = p;
  433. }
  434. return psz;
  435. }
  436. const OLECHAR* Js::CharClassifier::SkipWhiteSpaceSurrogate(LPCOLESTR psz, const CharClassifier *instance)
  437. {
  438. char16 currentChar = 0x0;
  439. // Slow path is to check for a surrogate each iteration.
  440. // There is no new surrogate whitespaces as of yet, however, might be in the future, so surrogates still need to be checked
  441. // So, based on that, best way is to hit the slow path if the current character is not a whitespace in [0, FFFF];
  442. while((currentChar = *psz) != '\0')
  443. {
  444. if (!instance->IsWhiteSpace(*psz))
  445. {
  446. if (Js::NumberUtilities::IsSurrogateLowerPart(currentChar) && Js::NumberUtilities::IsSurrogateUpperPart(*(psz + 1)))
  447. {
  448. if (instance->IsWhiteSpace(Js::NumberUtilities::SurrogatePairAsCodePoint(currentChar, *(psz + 1))))
  449. {
  450. psz += 2;
  451. continue;
  452. }
  453. }
  454. // Above case failed, so we have reached the last whitespace
  455. return psz;
  456. }
  457. psz++;
  458. }
  459. return psz;
  460. }
  461. const OLECHAR* Js::CharClassifier::SkipWhiteSpaceSurrogateStartEnd(_In_reads_(pStrEnd - pStr) LPCOLESTR pStr, _In_ LPCOLESTR pStrEnd, const CharClassifier *instance)
  462. {
  463. char16 currentChar = 0x0;
  464. // Same reasoning as above
  465. while(pStr < pStrEnd && (currentChar = *pStr) != '\0')
  466. {
  467. if (!instance->IsWhiteSpace(currentChar))
  468. {
  469. if (Js::NumberUtilities::IsSurrogateLowerPart(currentChar) && (pStr + 1) < pStrEnd && Js::NumberUtilities::IsSurrogateUpperPart(*(pStr + 1)))
  470. {
  471. if (instance->IsWhiteSpace(Js::NumberUtilities::SurrogatePairAsCodePoint(currentChar, *(pStr + 1))))
  472. {
  473. pStr += 2;
  474. continue;
  475. }
  476. }
  477. // Above case failed, so we have reached the last whitespace
  478. return pStr;
  479. }
  480. pStr++;
  481. }
  482. return pStr;
  483. }
  484. const OLECHAR* Js::CharClassifier::SkipIdentifierSurrogate(LPCOLESTR psz, const CharClassifier *instance)
  485. {
  486. // Similar reasoning to above, however we do have surrogate identifiers, but less likely to occur in code.
  487. char16 currentChar = *psz;
  488. if (!instance->IsIdStart(currentChar))
  489. {
  490. if (Js::NumberUtilities::IsSurrogateLowerPart(currentChar) && Js::NumberUtilities::IsSurrogateUpperPart(*(psz + 1))
  491. && instance->IsIdStart(Js::NumberUtilities::SurrogatePairAsCodePoint(currentChar, *(psz + 1))))
  492. {
  493. // For the extra surrogate char
  494. psz ++;
  495. }
  496. else
  497. {
  498. return psz;
  499. }
  500. }
  501. psz++;
  502. while((currentChar = *psz) != '\0')
  503. {
  504. if (!instance->IsIdContinue(*psz))
  505. {
  506. if (Js::NumberUtilities::IsSurrogateLowerPart(currentChar) && Js::NumberUtilities::IsSurrogateUpperPart(*(psz + 1)))
  507. {
  508. if (instance->IsIdContinue(Js::NumberUtilities::SurrogatePairAsCodePoint(currentChar, *(psz + 1))))
  509. {
  510. psz += 2;
  511. continue;
  512. }
  513. }
  514. // Above case failed, so we have reached the last IDContinue
  515. return psz;
  516. }
  517. psz++;
  518. }
  519. return psz;
  520. }
  521. const LPCUTF8 Js::CharClassifier::SkipIdentifierSurrogateStartEnd(LPCUTF8 psz, LPCUTF8 end, const CharClassifier *instance)
  522. {
  523. LPCUTF8 currentPosition = psz;
  524. utf8::DecodeOptions options = utf8::doAllowThreeByteSurrogates;
  525. // Similar reasoning to above, however we do have surrogate identifiers, but less likely to occur in code.
  526. codepoint_t currentChar = utf8::Decode(currentPosition, end, options);
  527. if (options & utf8::doSecondSurrogatePair)
  528. {
  529. currentChar = Js::NumberUtilities::SurrogatePairAsCodePoint(currentChar, utf8::Decode(currentPosition, end, options));
  530. }
  531. if (!instance->IsIdStart(currentChar))
  532. {
  533. return psz;
  534. }
  535. psz = currentPosition;
  536. // Slow path is to check for a surrogate each iteration.
  537. // There is no new surrogate whitespaces as of yet, however, might be in the future, so surrogates still need to be checked
  538. // So, based on that, best way is to hit the slow path if the current character is not a whitespace in [0, FFFF];
  539. while((currentChar = utf8::Decode(currentPosition, end, options)) != '\0')
  540. {
  541. if (options & utf8::doSecondSurrogatePair)
  542. {
  543. currentChar = Js::NumberUtilities::SurrogatePairAsCodePoint(currentChar, utf8::Decode(currentPosition, end, options));
  544. }
  545. if (!instance->IsIdContinue(currentChar))
  546. {
  547. return psz;
  548. }
  549. psz = currentPosition;
  550. }
  551. return psz;
  552. }
  553. CharTypes Js::CharClassifier::GetCharType(codepoint_t ch) const
  554. {
  555. return FBigChar(ch) ? getBigCharTypeFunc(ch, this) : charTypes[ch];
  556. }
  557. #if ENABLE_UNICODE_API
  558. PlatformAgnostic::UnicodeText::CharacterTypeFlags Js::CharClassifier::GetCharFlags(codepoint_t ch) const
  559. {
  560. #if ENABLE_UNICODE_API
  561. return FBigChar(ch) ? getBigCharFlagsFunc(ch, this) : PlatformAgnostic::UnicodeText::charFlags[ch];
  562. #else
  563. return PlatformAgnostics::UnicodeText::charFlags[ch];
  564. #endif
  565. }
  566. #endif
  567. // Explicit instantiation
  568. template bool Js::CharClassifier::IsIdStartFast<true>(codepoint_t) const;
  569. template bool Js::CharClassifier::IsIdStartFast<false>(codepoint_t) const;
  570. template bool Js::CharClassifier::IsIdContinueFast<true>(codepoint_t) const;
  571. template bool Js::CharClassifier::IsIdContinueFast<false>(codepoint_t) const;
  572. template bool Js::CharClassifier::IsWhiteSpaceFast<true>(codepoint_t) const;
  573. template bool Js::CharClassifier::IsWhiteSpaceFast<false>(codepoint_t) const;