CharClassifier.cpp 36 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893
  1. //-------------------------------------------------------------------------------------------------------
  2. // Copyright (C) Microsoft. All rights reserved.
  3. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
  4. //-------------------------------------------------------------------------------------------------------
  5. #include "ParserPch.h"
  6. #include "../Runtime/Base/WindowsGlobalizationAdapter.h"
  7. using namespace Windows::Data::Text;
  // Classification flags for the 128 seven-bit ASCII code points. The table is
  // indexed directly by character code; the trailing comment on each entry gives
  // the code (and, where printable, the character) it describes.
  // The *Fast helpers in this file read it for code points below 0x80, masking
  // an entry with SpaceChar, IdLeadChar or IdChar.
  8. static const CharTypeFlags charFlags[128] =
  9. {
  10. UnknownChar, /* 0x00 */
  11. UnknownChar, /* 0x01 */
  12. UnknownChar, /* 0x02 */
  13. UnknownChar, /* 0x03 */
  14. UnknownChar, /* 0x04 */
  15. UnknownChar, /* 0x05 */
  16. UnknownChar, /* 0x06 */
  17. UnknownChar, /* 0x07 */
  18. UnknownChar, /* 0x08 */
  19. SpaceChar, /* 0x09 */
  20. LineCharGroup, /* 0x0A */
  21. SpaceChar, /* 0x0B */
  22. SpaceChar, /* 0x0C */
  23. LineCharGroup, /* 0x0D */
  24. UnknownChar, /* 0x0E */
  25. UnknownChar, /* 0x0F */
  26. UnknownChar, /* 0x10 */
  27. UnknownChar, /* 0x11 */
  28. UnknownChar, /* 0x12 */
  29. UnknownChar, /* 0x13 */
  30. UnknownChar, /* 0x14 */
  31. UnknownChar, /* 0x15 */
  32. UnknownChar, /* 0x16 */
  33. UnknownChar, /* 0x17 */
  34. UnknownChar, /* 0x18 */
  35. UnknownChar, /* 0x19 */
  36. UnknownChar, /* 0x1A */
  37. UnknownChar, /* 0x1B */
  38. UnknownChar, /* 0x1C */
  39. UnknownChar, /* 0x1D */
  40. UnknownChar, /* 0x1E */
  41. UnknownChar, /* 0x1F */
  42. SpaceChar, /* 0x20 */
  43. UnknownChar, /* 0x21 ! */
  44. UnknownChar, /* 0x22 */
  45. UnknownChar, /* 0x23 # */
  46. LetterCharGroup, /* 0x24 $ */
  47. UnknownChar, /* 0x25 % */
  48. UnknownChar, /* 0x26 & */
  49. UnknownChar, /* 0x27 */
  50. UnknownChar, /* 0x28 */
  51. UnknownChar, /* 0x29 */
  52. UnknownChar, /* 0x2A */
  53. UnknownChar, /* 0x2B */
  54. UnknownChar, /* 0x2C */
  55. UnknownChar, /* 0x2D */
  56. UnknownChar, /* 0x2E */
  57. UnknownChar, /* 0x2F */
  58. DecimalCharGroup, /* 0x30 0 */
  59. DecimalCharGroup, /* 0x31 1 */
  60. DecimalCharGroup, /* 0x32 2 */
  61. DecimalCharGroup, /* 0x33 3 */
  62. DecimalCharGroup, /* 0x34 4 */
  63. DecimalCharGroup, /* 0x35 5 */
  64. DecimalCharGroup, /* 0x36 6 */
  65. DecimalCharGroup, /* 0x37 7 */
  66. DecimalCharGroup, /* 0x38 8 */
  67. DecimalCharGroup, /* 0x39 9 */
  68. UnknownChar, /* 0x3A */
  69. UnknownChar, /* 0x3B */
  70. UnknownChar, /* 0x3C < */
  71. UnknownChar, /* 0x3D = */
  72. UnknownChar, /* 0x3E > */
  73. UnknownChar, /* 0x3F */
  74. UnknownChar, /* 0x40 @ */
  75. HexCharGroup, /* 0x41 A */
  76. HexCharGroup, /* 0x42 B */
  77. HexCharGroup, /* 0x43 C */
  78. HexCharGroup, /* 0x44 D */
  79. HexCharGroup, /* 0x45 E */
  80. HexCharGroup, /* 0x46 F */
  81. LetterCharGroup, /* 0x47 G */
  82. LetterCharGroup, /* 0x48 H */
  83. LetterCharGroup, /* 0x49 I */
  84. LetterCharGroup, /* 0x4A J */
  85. LetterCharGroup, /* 0x4B K */
  86. LetterCharGroup, /* 0x4C L */
  87. LetterCharGroup, /* 0x4D M */
  88. LetterCharGroup, /* 0x4E N */
  89. LetterCharGroup, /* 0x4F O */
  90. LetterCharGroup, /* 0x50 P */
  91. LetterCharGroup, /* 0x51 Q */
  92. LetterCharGroup, /* 0x52 R */
  93. LetterCharGroup, /* 0x53 S */
  94. LetterCharGroup, /* 0x54 T */
  95. LetterCharGroup, /* 0x55 U */
  96. LetterCharGroup, /* 0x56 V */
  97. LetterCharGroup, /* 0x57 W */
  98. LetterCharGroup, /* 0x58 X */
  99. LetterCharGroup, /* 0x59 Y */
  100. LetterCharGroup, /* 0x5A Z */
  101. UnknownChar, /* 0x5B */
  102. UnknownChar, /* 0x5C */
  103. UnknownChar, /* 0x5D */
  104. UnknownChar, /* 0x5E */
  105. LetterCharGroup, /* 0x5F _ */
  106. UnknownChar, /* 0x60 */
  107. HexCharGroup, /* 0x61 a */
  108. HexCharGroup, /* 0x62 b */
  109. HexCharGroup, /* 0x63 c */
  110. HexCharGroup, /* 0x64 d */
  111. HexCharGroup, /* 0x65 e */
  112. HexCharGroup, /* 0x66 f */
  113. LetterCharGroup, /* 0x67 g */
  114. LetterCharGroup, /* 0x68 h */
  115. LetterCharGroup, /* 0x69 i */
  116. LetterCharGroup, /* 0x6A j */
  117. LetterCharGroup, /* 0x6B k */
  118. LetterCharGroup, /* 0x6C l */
  119. LetterCharGroup, /* 0x6D m */
  120. LetterCharGroup, /* 0x6E n */
  121. LetterCharGroup, /* 0x6F o */
  122. LetterCharGroup, /* 0x70 p */
  123. LetterCharGroup, /* 0x71 q */
  124. LetterCharGroup, /* 0x72 r */
  125. LetterCharGroup, /* 0x73 s */
  126. LetterCharGroup, /* 0x74 t */
  127. LetterCharGroup, /* 0x75 u */
  128. LetterCharGroup, /* 0x76 v */
  129. LetterCharGroup, /* 0x77 w */
  130. LetterCharGroup, /* 0x78 x */
  131. LetterCharGroup, /* 0x79 y */
  132. LetterCharGroup, /* 0x7A z */
  133. UnknownChar, /* 0x7B */
  134. UnknownChar, /* 0x7C */
  135. UnknownChar, /* 0x7D */
  136. UnknownChar, /* 0x7E */
  137. UnknownChar /* 0x7F */
  138. };
  139. /*****************************************************************************
  140. *
  141. * The _C_xxx enum and charTypes[] table are used to map a character to
  142. * simple classification values and flags.
  143. */
  // One CharTypes value per seven-bit ASCII code point, indexed by character
  // code. Each row holds eight consecutive entries; the trailing comment gives
  // the hexadecimal code range that row covers.
  144. static const CharTypes charTypes[128] =
  145. {
  146. _C_NUL, _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, /* 00-07 */
  147. _C_ERR, _C_WSP, _C_NWL, _C_WSP, _C_WSP, _C_NWL, _C_ERR, _C_ERR, /* 08-0F */
  148. _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, /* 10-17 */
  149. _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, _C_ERR, /* 18-1F */
  150. _C_WSP, _C_BNG, _C_QUO, _C_SHP, _C_DOL, _C_PCT, _C_AMP, _C_APO, /* 20-27 */
  151. _C_LPR, _C_RPR, _C_MUL, _C_PLS, _C_CMA, _C_MIN, _C_DOT, _C_SLH, /* 28-2F */
  152. _C_DIG, _C_DIG, _C_DIG, _C_DIG, _C_DIG, _C_DIG, _C_DIG, _C_DIG, /* 30-37 */
  153. _C_DIG, _C_DIG, _C_COL, _C_SMC, _C_LT , _C_EQ , _C_GT , _C_QUE, /* 38-3F */
  154. _C_AT , _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, /* 40-47 */
  155. _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, /* 48-4F */
  156. _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, /* 50-57 */
  157. _C_LET, _C_LET, _C_LET, _C_LBR, _C_BSL, _C_RBR, _C_XOR, _C_USC, /* 58-5F */
  158. _C_BKQ, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, /* 60-67 */
  159. _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, /* 68-6F */
  160. _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, _C_LET, /* 70-77 */
  161. _C_LET, _C_LET, _C_LET, _C_LC , _C_BAR, _C_RC , _C_TIL, _C_ERR, /* 78-7F */
  162. };
  163. typedef struct
  164. {
  165. OLECHAR chStart;
  166. OLECHAR chFinish;
  167. } oldCharTypesRangeStruct;
  // Legacy table of inclusive character ranges (decimal code values) for which
  // oFindOldCharType reports C1_DIGIT. The entries are in ascending order and do
  // not overlap, which doBinSearch relies on.
  // cOldDigits is the element count passed to doBinSearch; it must be kept equal
  // to the number of entries in oldDigits (156).
  168. static const int cOldDigits = 156;
  169. static const oldCharTypesRangeStruct oldDigits[] = {
  170. { 688, 734 }, { 736, 745 }, { 768, 837 }, { 864, 865 }, { 884, 885 },
  171. { 890, 890 }, { 900, 901 }, { 1154, 1158 }, { 1369, 1369 }, { 1425, 1441 },
  172. { 1443, 1465 }, { 1467, 1469 }, { 1471, 1471 }, { 1473, 1474 }, { 1476, 1476 },
  173. { 1600, 1600 }, { 1611, 1618 }, { 1648, 1648 }, { 1750, 1773 }, { 2305, 2307 },
  174. { 2364, 2381 }, { 2384, 2388 }, { 2402, 2403 }, { 2433, 2435 }, { 2492, 2492 },
  175. { 2494, 2500 }, { 2503, 2504 }, { 2507, 2509 }, { 2519, 2519 }, { 2530, 2531 },
  176. { 2546, 2554 }, { 2562, 2562 }, { 2620, 2620 }, { 2622, 2626 }, { 2631, 2632 },
  177. { 2635, 2637 }, { 2672, 2676 }, { 2689, 2691 }, { 2748, 2757 }, { 2759, 2761 },
  178. { 2763, 2765 }, { 2768, 2768 }, { 2817, 2819 }, { 2876, 2883 }, { 2887, 2888 },
  179. { 2891, 2893 }, { 2902, 2903 }, { 2928, 2928 }, { 2946, 2947 }, { 3006, 3010 },
  180. { 3014, 3016 }, { 3018, 3021 }, { 3031, 3031 }, { 3056, 3058 }, { 3073, 3075 },
  181. { 3134, 3140 }, { 3142, 3144 }, { 3146, 3149 }, { 3157, 3158 }, { 3202, 3203 },
  182. { 3262, 3268 }, { 3270, 3272 }, { 3274, 3277 }, { 3285, 3286 }, { 3330, 3331 },
  183. { 3390, 3395 }, { 3398, 3400 }, { 3402, 3405 }, { 3415, 3415 }, { 3647, 3647 },
  184. { 3759, 3769 }, { 3771, 3773 }, { 3776, 3780 }, { 3782, 3782 }, { 3784, 3789 },
  185. { 3840, 3843 }, { 3859, 3871 }, { 3882, 3897 }, { 3902, 3903 }, { 3953, 3972 },
  186. { 3974, 3979 }, { 8125, 8129 }, { 8141, 8143 }, { 8157, 8159 }, { 8173, 8175 },
  187. { 8189, 8190 }, { 8192, 8207 }, { 8232, 8238 }, { 8260, 8260 }, { 8298, 8304 },
  188. { 8308, 8316 }, { 8319, 8332 }, { 8352, 8364 }, { 8400, 8417 }, { 8448, 8504 },
  189. { 8531, 8578 }, { 8592, 8682 }, { 8704, 8945 }, { 8960, 8960 }, { 8962, 9000 },
  190. { 9003, 9082 }, { 9216, 9252 }, { 9280, 9290 }, { 9312, 9371 }, { 9450, 9450 },
  191. { 9472, 9621 }, { 9632, 9711 }, { 9728, 9747 }, { 9754, 9839 }, { 9985, 9988 },
  192. { 9990, 9993 }, { 9996, 10023 }, { 10025, 10059 }, { 10061, 10061 }, { 10063, 10066 },
  193. { 10070, 10070 }, { 10072, 10078 }, { 10081, 10087 }, { 10102, 10132 }, { 10136, 10159 },
  194. { 10161, 10174 }, { 12292, 12292 }, { 12294, 12294 }, { 12306, 12307 }, { 12320, 12335 },
  195. { 12337, 12343 }, { 12351, 12351 }, { 12441, 12442 }, { 12688, 12703 }, { 12800, 12828 },
  196. { 12832, 12867 }, { 12896, 12923 }, { 12927, 12976 }, { 12992, 13003 }, { 13008, 13054 },
  197. { 13056, 13174 }, { 13179, 13277 }, { 13280, 13310 }, { 64286, 64286 }, { 65056, 65059 },
  198. { 65122, 65122 }, { 65124, 65126 }, { 65129, 65129 }, { 65136, 65138 }, { 65140, 65140 },
  199. { 65142, 65151 }, { 65284, 65284 }, { 65291, 65291 }, { 65308, 65310 }, { 65342, 65342 },
  200. { 65344, 65344 }, { 65372, 65372 }, { 65374, 65374 }, { 65440, 65440 }, { 65504, 65510 },
  201. { 65512, 65518 }
  202. };
  // Legacy table of inclusive character ranges (decimal code values) for which
  // oFindOldCharType reports C1_ALPHA. The entries are in ascending order and do
  // not overlap, which doBinSearch relies on.
  // cOldAlphas is the element count passed to doBinSearch; it must be kept equal
  // to the number of entries in oldAlphas (11).
  203. static const int cOldAlphas = 11;
  204. static const oldCharTypesRangeStruct oldAlphas[] = {
  205. { 402, 402 }, { 9372, 9449 }, { 12293, 12293 }, { 12295, 12295 }, { 12443, 12446 },
  206. { 12540, 12542 }, { 64297, 64297 }, { 65152, 65276 }, { 65392, 65392 }, { 65438, 65439 },
  207. { 65533, 65533 }
  208. };
  209. CharTypes GetBigCharType(codepoint_t ch);
  210. CharTypes GetBigCharTypeES6(codepoint_t ch);
  211. CharTypeFlags GetBigCharFlags(codepoint_t ch, const Js::CharClassifier *instance);
  212. CharTypeFlags GetBigCharFlags5(codepoint_t ch, const Js::CharClassifier *instanceh);
  213. CharTypeFlags GetBigCharFlagsES6(codepoint_t ch, const Js::CharClassifier *instance);
  214. BOOL doBinSearch(OLECHAR ch, const oldCharTypesRangeStruct *pRanges, int cSize)
  215. {
  216. int lo = 0;
  217. int hi = cSize;
  218. int mid;
  219. while (lo != hi)
  220. {
  221. mid = lo + (hi - lo) / 2;
  222. if (pRanges[mid].chStart <= ch && ch <= pRanges[mid].chFinish)
  223. return true;
  224. if (ch < pRanges[mid].chStart)
  225. hi = mid;
  226. else
  227. lo = mid + 1;
  228. }
  229. return false;
  230. }
  231. WORD oFindOldCharType(OLECHAR ch)
  232. {
  233. if ((OLECHAR) 65279 == ch)
  234. return C1_SPACE;
  235. if (doBinSearch(ch, oldAlphas, cOldAlphas))
  236. return C1_ALPHA;
  237. if (doBinSearch(ch, oldDigits, cOldDigits))
  238. return C1_DIGIT;
  239. return 0;
  240. }
  241. BOOL oGetCharType( DWORD dwInfoType, OLECHAR ch, LPWORD lpwCharType )
  242. {
  243. BOOL res = GetStringTypeW( dwInfoType, &ch, 1, lpwCharType );
  244. // BOM ( 0xfeff) is recognized as GetStringTypeW as WS.
  245. if ((0x03FF & *lpwCharType) == 0x0200)
  246. {
  247. // Some of the char types changed for Whistler (Unicode 3.0).
  248. // They will return 0x0200 on Whistler, indicating a defined char
  249. // with no type attributes. We want to continue to support these
  250. // characters, so we return the Win2K (Unicode 2.1) attributes.
  251. // We only return the ones we care about - ALPHA for ALPHA, PUNCT
  252. // for PUNCT or DIGIT, and SPACE for SPACE or BLANK.
  253. WORD wOldCharType = oFindOldCharType(ch);
  254. if (0 == wOldCharType)
  255. return res;
  256. *lpwCharType = wOldCharType;
  257. return TRUE;
  258. }
  259. return res;
  260. }
  261. CharTypes GetBigCharType(codepoint_t ch, const Js::CharClassifier *instance)
  262. {
  263. if(ch > 0xFFFF)
  264. {
  265. return CharTypes::_C_ERR;
  266. }
  267. OLECHAR oCh = (OLECHAR)ch;
  268. WORD chType;
  269. Assert( oCh >= 128 );
  270. #if (_WIN32 || _WIN64) // We use the Win32 API function GetStringTypeW for Unicode char. classification
  271. if( oCh == 0x2028 || oCh == 0x2029 )
  272. {
  273. return _C_NWL;
  274. }
  275. if( oGetCharType( CT_CTYPE1, oCh, &chType) )
  276. {
  277. if( chType & C1_ALPHA )
  278. return _C_LET;
  279. else if( chType & (C1_SPACE|C1_BLANK) )
  280. return _C_WSP;
  281. }
  282. #else
  283. #warning No Unicode character support on this platform
  284. #endif
  285. return _C_ERR;
  286. }
  287. CharTypeFlags GetBigCharFlags(codepoint_t ch, const Js::CharClassifier *instance)
  288. {
  289. WORD chType;
  290. if(ch > 0xFFFF)
  291. {
  292. return CharTypeFlags::UnknownChar;
  293. }
  294. OLECHAR oCh = (OLECHAR)ch;
  295. Assert( oCh >= 128 );
  296. #if (_WIN32 || _WIN64) // We use the Win32 API function GetStringTypeW for Unicode char. classification
  297. if( oCh == kchLS || oCh == kchPS )
  298. {
  299. return LineCharGroup;
  300. }
  301. if( oGetCharType( CT_CTYPE1, oCh, &chType) )
  302. {
  303. if( chType & C1_ALPHA )
  304. return LetterCharGroup;
  305. else if ( chType & (C1_DIGIT|C1_PUNCT) )
  306. {
  307. // non-ANSI digits can be used in identifiers but not in numeric constants - hence we
  308. // return fChId instead of kgrfchDec
  309. return IdChar;
  310. }
  311. else if( chType & (C1_SPACE|C1_BLANK) )
  312. return SpaceChar;
  313. }
  314. #else
  315. #warning No Unicode character support on this platform
  316. #endif
  317. return UnknownChar;
  318. }
  319. CharTypeFlags GetBigCharFlags5(codepoint_t ch, const Js::CharClassifier *instance)
  320. {
  321. //In ES5 the unicode <ZWNJ> and <ZWJ> could be identifier parts
  322. if(ch == 0x200c || ch == 0x200d)
  323. {
  324. return IdChar;
  325. }
  326. return GetBigCharFlags(ch, instance);
  327. }
  328. /*
  329. * CharClassifier implementation
  330. */
  331. UnicodeGeneralCategory Js::CharClassifier::GetUnicodeCategoryFor(codepoint_t ch) const
  332. {
  333. UnicodeGeneralCategory category;
  334. AssertMsg(this->winGlobCharApi != nullptr, "ES6 Mode 'GetUnicodeCategoryFor' must mean winGlobCharApi is initialized.");
  335. if(FAILED(this->winGlobCharApi->GetGeneralCategory(ch, &category)))
  336. {
  337. AssertMsg(false, "Should not fail here!");
  338. return UnicodeGeneralCategory::UnicodeGeneralCategory_NotAssigned;
  339. }
  340. return category;
  341. }
  342. CharTypes Js::CharClassifier::GetBigCharTypeES6(codepoint_t ch, const Js::CharClassifier *instance)
  343. {
  344. Assert(ch > 0x7F);
  345. UnicodeGeneralCategory category = instance->GetUnicodeCategoryFor(ch);
  346. switch(category)
  347. {
  348. case UnicodeGeneralCategory::UnicodeGeneralCategory_LowercaseLetter:
  349. case UnicodeGeneralCategory::UnicodeGeneralCategory_UppercaseLetter:
  350. case UnicodeGeneralCategory::UnicodeGeneralCategory_TitlecaseLetter:
  351. case UnicodeGeneralCategory::UnicodeGeneralCategory_ModifierLetter:
  352. case UnicodeGeneralCategory::UnicodeGeneralCategory_OtherLetter:
  353. case UnicodeGeneralCategory::UnicodeGeneralCategory_LetterNumber:
  354. return CharTypes::_C_LET;
  355. case UnicodeGeneralCategory::UnicodeGeneralCategory_LineSeparator:
  356. case UnicodeGeneralCategory::UnicodeGeneralCategory_ParagraphSeparator:
  357. return CharTypes::_C_NWL;
  358. case UnicodeGeneralCategory::UnicodeGeneralCategory_SpaceSeparator:
  359. case UnicodeGeneralCategory::UnicodeGeneralCategory_SpacingCombiningMark:
  360. case UnicodeGeneralCategory::UnicodeGeneralCategory_NonspacingMark:
  361. case UnicodeGeneralCategory::UnicodeGeneralCategory_ConnectorPunctuation:
  362. return CharTypes::_C_WSP;
  363. case UnicodeGeneralCategory::UnicodeGeneralCategory_DecimalDigitNumber:
  364. return CharTypes::_C_DIG;
  365. case UnicodeGeneralCategory::UnicodeGeneralCategory_ClosePunctuation:
  366. case UnicodeGeneralCategory::UnicodeGeneralCategory_EnclosingMark:
  367. case UnicodeGeneralCategory::UnicodeGeneralCategory_Control:
  368. case UnicodeGeneralCategory::UnicodeGeneralCategory_Format:
  369. if (ch == 0xFEFF)
  370. {
  371. return CharTypes::_C_WSP;
  372. }
  373. // Fall through, otherwise
  374. case UnicodeGeneralCategory::UnicodeGeneralCategory_Surrogate:
  375. case UnicodeGeneralCategory::UnicodeGeneralCategory_PrivateUse:
  376. case UnicodeGeneralCategory::UnicodeGeneralCategory_DashPunctuation:
  377. case UnicodeGeneralCategory::UnicodeGeneralCategory_OpenPunctuation:
  378. case UnicodeGeneralCategory::UnicodeGeneralCategory_InitialQuotePunctuation:
  379. case UnicodeGeneralCategory::UnicodeGeneralCategory_FinalQuotePunctuation:
  380. case UnicodeGeneralCategory::UnicodeGeneralCategory_OtherPunctuation:
  381. case UnicodeGeneralCategory::UnicodeGeneralCategory_MathSymbol:
  382. case UnicodeGeneralCategory::UnicodeGeneralCategory_CurrencySymbol:
  383. case UnicodeGeneralCategory::UnicodeGeneralCategory_ModifierSymbol:
  384. case UnicodeGeneralCategory::UnicodeGeneralCategory_OtherSymbol:
  385. case UnicodeGeneralCategory::UnicodeGeneralCategory_NotAssigned:
  386. return CharTypes::_C_UNK;
  387. }
  388. return CharTypes::_C_UNK;
  389. }
  390. /*
  391. From Unicode 6.3 http://www.unicode.org/reports/tr31/tr31-19.html
  392. ID_Start:::
  393. Characters having the Unicode General_Category of uppercase letters (Lu), lowercase letters (Ll), titlecase letters (Lt), modifier letters (Lm), other letters (Lo), letter numbers (Nl), minus Pattern_Syntax and Pattern_White_Space code points, plus stability extensions. Note that "other letters" includes ideographs.
  394. In set notation, this is [[:L:][:Nl:]--[:Pattern_Syntax:]--[:Pattern_White_Space:]] plus stability extensions.
  395. ID_Continue:::
  396. All of the above, plus characters having the Unicode General_Category of nonspacing marks (Mn), spacing combining marks (Mc), decimal number (Nd), connector punctuations (Pc), plus stability extensions, minus Pattern_Syntax and Pattern_White_Space code points.
  397. In set notation, this is [[:L:][:Nl:][:Mn:][:Mc:][:Nd:][:Pc:]--[:Pattern_Syntax:]--[:Pattern_White_Space:]] plus stability extensions.
  398. These are also known simply as Identifier Characters, because they are a superset of the ID_Start characters.
  399. */
  400. CharTypeFlags Js::CharClassifier::GetBigCharFlagsES6(codepoint_t ch, const Js::CharClassifier *instance)
  401. {
  402. Assert(ch > 0x7F);
  403. UnicodeGeneralCategory category = instance->GetUnicodeCategoryFor(ch);
  404. switch(category)
  405. {
  406. case UnicodeGeneralCategory::UnicodeGeneralCategory_LowercaseLetter:
  407. case UnicodeGeneralCategory::UnicodeGeneralCategory_UppercaseLetter:
  408. case UnicodeGeneralCategory::UnicodeGeneralCategory_TitlecaseLetter:
  409. case UnicodeGeneralCategory::UnicodeGeneralCategory_ModifierLetter:
  410. case UnicodeGeneralCategory::UnicodeGeneralCategory_OtherLetter:
  411. case UnicodeGeneralCategory::UnicodeGeneralCategory_LetterNumber:
  412. return BigCharIsIdStartES6(ch, instance) ? CharTypeFlags::LetterCharGroup : CharTypeFlags::UnknownChar;
  413. case UnicodeGeneralCategory::UnicodeGeneralCategory_SpacingCombiningMark:
  414. return BigCharIsIdContinueES6(ch, instance) ? CharTypeFlags::IdChar : CharTypeFlags::SpaceChar;
  415. case UnicodeGeneralCategory::UnicodeGeneralCategory_NonspacingMark:
  416. case UnicodeGeneralCategory::UnicodeGeneralCategory_ConnectorPunctuation:
  417. return BigCharIsIdContinueES6(ch, instance) ? CharTypeFlags::IdChar : CharTypeFlags::UnknownChar;
  418. case UnicodeGeneralCategory::UnicodeGeneralCategory_DecimalDigitNumber:
  419. return BigCharIsIdContinueES6(ch, instance) ? CharTypeFlags::DecimalCharGroup : CharTypeFlags::DecimalChar;
  420. case UnicodeGeneralCategory::UnicodeGeneralCategory_LineSeparator:
  421. return CharTypeFlags::LineFeedChar;
  422. case UnicodeGeneralCategory::UnicodeGeneralCategory_ParagraphSeparator:
  423. case UnicodeGeneralCategory::UnicodeGeneralCategory_SpaceSeparator:
  424. return CharTypeFlags::SpaceChar;
  425. case UnicodeGeneralCategory::UnicodeGeneralCategory_ClosePunctuation:
  426. case UnicodeGeneralCategory::UnicodeGeneralCategory_EnclosingMark:
  427. case UnicodeGeneralCategory::UnicodeGeneralCategory_Control:
  428. case UnicodeGeneralCategory::UnicodeGeneralCategory_Format:
  429. case UnicodeGeneralCategory::UnicodeGeneralCategory_Surrogate:
  430. case UnicodeGeneralCategory::UnicodeGeneralCategory_PrivateUse:
  431. case UnicodeGeneralCategory::UnicodeGeneralCategory_DashPunctuation:
  432. case UnicodeGeneralCategory::UnicodeGeneralCategory_OpenPunctuation:
  433. case UnicodeGeneralCategory::UnicodeGeneralCategory_InitialQuotePunctuation:
  434. case UnicodeGeneralCategory::UnicodeGeneralCategory_FinalQuotePunctuation:
  435. case UnicodeGeneralCategory::UnicodeGeneralCategory_OtherPunctuation:
  436. case UnicodeGeneralCategory::UnicodeGeneralCategory_MathSymbol:
  437. case UnicodeGeneralCategory::UnicodeGeneralCategory_CurrencySymbol:
  438. case UnicodeGeneralCategory::UnicodeGeneralCategory_ModifierSymbol:
  439. case UnicodeGeneralCategory::UnicodeGeneralCategory_OtherSymbol:
  440. case UnicodeGeneralCategory::UnicodeGeneralCategory_NotAssigned:
  441. return CharTypeFlags::UnknownChar;
  442. }
  443. return CharTypeFlags::UnknownChar;
  444. }
  445. BOOL Js::CharClassifier::BigCharIsWhitespaceES6(codepoint_t ch, const CharClassifier *instance)
  446. {
  447. Assert(ch > 0x7F);
  448. if (ch == 0xFEFF)
  449. {
  450. return true;
  451. }
  452. boolean toReturn = false;
  453. AssertMsg(instance->winGlobCharApi != nullptr, "ES6 Mode 'BigCharIsWhitespaceES6' must mean winGlobCharApi is initialized.");
  454. if (FAILED(instance->winGlobCharApi->IsWhitespace(ch, &toReturn)))
  455. {
  456. AssertMsg(false, "Should not fail here!");
  457. return toReturn;
  458. }
  459. return toReturn;
  460. }
  461. BOOL Js::CharClassifier::BigCharIsIdStartES6(codepoint_t codePoint, const CharClassifier *instance)
  462. {
  463. Assert(codePoint > 0x7F);
  464. boolean toReturn = false;
  465. AssertMsg(instance->winGlobCharApi != nullptr, "ES6 Mode 'BigCharIsIdStartES6' must mean winGlobCharApi is initialized.");
  466. if (FAILED(instance->winGlobCharApi->IsIdStart(codePoint, &toReturn)))
  467. {
  468. AssertMsg(false, "Should not fail here!");
  469. return toReturn;
  470. }
  471. return toReturn;
  472. }
  473. BOOL Js::CharClassifier::BigCharIsIdContinueES6(codepoint_t codePoint, const CharClassifier *instance)
  474. {
  475. Assert(codePoint > 0x7F);
  476. if (codePoint == '$' || codePoint == '_' || codePoint == 0x200C /* Zero-width non-joiner */ || codePoint == 0x200D /* Zero-width joiner */)
  477. {
  478. return true;
  479. }
  480. boolean toReturn = false;
  481. AssertMsg(instance->winGlobCharApi != nullptr, "ES6 Mode 'BigCharIsIdContinueES6' must mean winGlobCharApi is initialized.");
  482. if (FAILED(instance->winGlobCharApi->IsIdContinue(codePoint, &toReturn)))
  483. {
  484. AssertMsg(false, "Should not fail here!");
  485. return toReturn;
  486. }
  487. return toReturn;
  488. }
  489. template <bool isBigChar>
  490. BOOL Js::CharClassifier::IsWhiteSpaceFast(codepoint_t ch) const
  491. {
  492. Assert(isBigChar ? ch > 0x7F : ch < 0x80);
  493. return isBigChar ? this->bigCharIsWhitespaceFunc(ch, this) : (charFlags[ch] & CharTypeFlags::SpaceChar);
  494. }
  495. BOOL Js::CharClassifier::IsBiDirectionalChar(codepoint_t ch) const
  496. {
  497. //From http://www.unicode.org/reports/tr9/#Directional_Formatting_Codes
  498. switch (ch)
  499. {
  500. case 0x202A: //LEFT-TO-RIGHT EMBEDDING Treat the following text as embedded left-to-right
  501. case 0x202B: //RIGHT-TO-LEFT EMBEDDING Treat the following text as embedded right-to-left.
  502. case 0x202D: //LEFT-TO-RIGHT OVERRIDE Force following characters to be treated as strong left-to-right characters.
  503. case 0x202E: //RIGHT-TO-LEFT OVERRIDE Force following characters to be treated as strong right-to-left characters.
  504. case 0x202C: //POP DIRECTIONAL FORMATTING End the scope of the last LRE, RLE, RLO, or LRO.
  505. case 0x2066: //LEFT-TO-RIGHT ISOLATE Treat the following text as isolated and left-to-right.
  506. case 0x2067: //RIGHT-TO-LEFT ISOLATE Treat the following text as isolated and right-to-left.
  507. case 0x2068: //FIRST STRONG ISOLATE Treat the following text as isolated and in the direction of its first strong directional character that is not inside a nested isolate.
  508. case 0x2069: //POP DIRECTIONAL ISOLATE End the scope of the last LRI, RLI, or FSI.
  509. case 0x200E: //LEFT-TO-RIGHT MARK Left-to-right zero-width character
  510. case 0x200F: //RIGHT-TO-LEFT MARK Right-to-left zero-width non-Arabic character
  511. case 0x061C: //ARABIC LETTER MARK Right-to-left zero-width Arabic character
  512. return TRUE;
  513. default:
  514. return FALSE;
  515. }
  516. }
  517. template<bool isBigChar>
  518. BOOL Js::CharClassifier::IsIdStartFast(codepoint_t ch) const
  519. {
  520. Assert(isBigChar ? ch > 0x7F : ch < 0x80);
  521. return isBigChar ? this->bigCharIsIdStartFunc(ch, this) : (charFlags[ch] & CharTypeFlags::IdLeadChar);
  522. }
  523. template<bool isBigChar>
  524. BOOL Js::CharClassifier::IsIdContinueFast(codepoint_t ch) const
  525. {
  526. Assert(isBigChar ? ch > 0x7F : ch < 0x80);
  527. return isBigChar ? this->bigCharIsIdContinueFunc(ch, this) : (charFlags[ch] & CharTypeFlags::IdChar);
  528. }
  529. Js::CharClassifier::CharClassifier(ScriptContext * scriptContext)
  530. {
  531. CharClassifierModes overallMode = (CONFIG_FLAG(ES6Unicode)) ? CharClassifierModes::ES6 : CharClassifierModes::ES5;
  532. bool codePointSupport = overallMode == CharClassifierModes::ES6;
  533. bool isES6UnicodeVerboseEnabled = scriptContext->GetConfig()->IsES6UnicodeVerboseEnabled();
  534. initClassifier(scriptContext, overallMode, overallMode, overallMode, codePointSupport, isES6UnicodeVerboseEnabled, CharClassifierModes::ES6); // no fallback for chk
  535. }
  536. void Js::CharClassifier::initClassifier(ScriptContext * scriptContext, CharClassifierModes identifierSupport,
  537. CharClassifierModes whiteSpaceSupport, CharClassifierModes generalCharClassificationSupport, bool codePointSupport, bool isES6UnicodeVerboseEnabled, CharClassifierModes es6FallbackMode)
  538. {
  539. bool es6Supported = true;
  540. bool es6ModeNeeded = identifierSupport == CharClassifierModes::ES6 || whiteSpaceSupport == CharClassifierModes::ES6 || generalCharClassificationSupport == CharClassifierModes::ES6;
  541. #ifdef ENABLE_ES6_CHAR_CLASSIFIER
  542. ThreadContext* threadContext = scriptContext->GetThreadContext();
  543. Js::WindowsGlobalizationAdapter* globalizationAdapter = threadContext->GetWindowsGlobalizationAdapter();
  544. Js::DelayLoadWindowsGlobalization* globLibrary = threadContext->GetWindowsGlobalizationLibrary();
  545. if (es6ModeNeeded)
  546. {
  547. HRESULT hr = globalizationAdapter->EnsureDataTextObjectsInitialized(globLibrary);
  548. // Failed to load windows.globalization.dll or jsintl.dll. No unicodeStatics support
  549. // in that case.
  550. if (FAILED(hr))
  551. {
  552. es6Supported = false;
  553. es6FallbackMode = CharClassifierModes::ES5;
  554. }
  555. else
  556. {
  557. this->winGlobCharApi = globalizationAdapter->GetUnicodeStatics();
  558. if (this->winGlobCharApi == nullptr)
  559. {
  560. // No fallback mode, then assert
  561. if (es6FallbackMode == CharClassifierModes::ES6)
  562. {
  563. AssertMsg(false, "Windows::Data::Text::IUnicodeCharactersStatics not initialized");
  564. //Fallback to ES5 just in case for fre builds.
  565. es6FallbackMode = CharClassifierModes::ES5;
  566. }
  567. if (isES6UnicodeVerboseEnabled)
  568. {
  569. Output::Print(_u("Windows::Data::Text::IUnicodeCharactersStatics not initialized\r\n"));
  570. }
  571. //Default to non-es6
  572. es6Supported = false;
  573. }
  574. }
  575. }
  576. #else
  577. es6Supported = false;
  578. es6FallbackMode = CharClassifierModes::ES5;
  579. #endif
  580. if (es6ModeNeeded && !es6Supported)
  581. {
  582. identifierSupport = identifierSupport == CharClassifierModes::ES6 ? es6FallbackMode : identifierSupport;
  583. whiteSpaceSupport = whiteSpaceSupport == CharClassifierModes::ES6 ? es6FallbackMode : whiteSpaceSupport;
  584. generalCharClassificationSupport = generalCharClassificationSupport == CharClassifierModes::ES6 ? es6FallbackMode : generalCharClassificationSupport;
  585. }
  586. bigCharIsIdStartFunc = identifierSupport == CharClassifierModes::ES6 ? &CharClassifier::BigCharIsIdStartES6 : &CharClassifier::BigCharIsIdStartDefault;
  587. bigCharIsIdContinueFunc = identifierSupport == CharClassifierModes::ES6 ? &CharClassifier::BigCharIsIdContinueES6 : &CharClassifier::BigCharIsIdContinueDefault;
  588. bigCharIsWhitespaceFunc = whiteSpaceSupport == CharClassifierModes::ES6 ? &CharClassifier::BigCharIsWhitespaceES6 : &CharClassifier::BigCharIsWhitespaceDefault;
  589. skipWhiteSpaceFunc = codePointSupport ? &CharClassifier::SkipWhiteSpaceSurrogate : &CharClassifier::SkipWhiteSpaceNonSurrogate;
  590. skipWhiteSpaceStartEndFunc = codePointSupport ? &CharClassifier::SkipWhiteSpaceSurrogateStartEnd : &CharClassifier::SkipWhiteSpaceNonSurrogateStartEnd;
  591. skipIdentifierFunc = codePointSupport ? &CharClassifier::SkipIdentifierSurrogate : &CharClassifier::SkipIdentifierNonSurrogate;
  592. skipIdentifierStartEndFunc = codePointSupport ? &CharClassifier::SkipIdentifierSurrogateStartEnd : &CharClassifier::SkipIdentifierNonSurrogateStartEnd;
  593. if (generalCharClassificationSupport == CharClassifierModes::ES6)
  594. {
  595. getBigCharTypeFunc = &CharClassifier::GetBigCharTypeES6;
  596. getBigCharFlagsFunc = &CharClassifier::GetBigCharFlagsES6;
  597. }
  598. else if (generalCharClassificationSupport == CharClassifierModes::ES5)
  599. {
  600. getBigCharTypeFunc = &GetBigCharType;
  601. getBigCharFlagsFunc = &GetBigCharFlags5;
  602. }
  603. else
  604. {
  605. getBigCharTypeFunc = &GetBigCharType;
  606. getBigCharFlagsFunc = &GetBigCharFlags;
  607. }
  608. }
  609. const OLECHAR* Js::CharClassifier::SkipWhiteSpaceNonSurrogate(LPCOLESTR psz, const CharClassifier *instance)
  610. {
  611. for ( ; instance->IsWhiteSpace(*psz); psz++)
  612. {
  613. }
  614. return psz;
  615. }
  616. const OLECHAR* Js::CharClassifier::SkipWhiteSpaceNonSurrogateStartEnd(_In_reads_(pStrEnd - pStr) LPCOLESTR pStr, _In_ LPCOLESTR pStrEnd, const CharClassifier *instance)
  617. {
  618. for ( ; instance->IsWhiteSpace(*pStr) && pStr < pStrEnd; pStr++)
  619. {
  620. }
  621. return pStr;
  622. }
  623. const OLECHAR* Js::CharClassifier::SkipIdentifierNonSurrogate(LPCOLESTR psz, const CharClassifier *instance)
  624. {
  625. if (!instance->IsIdStart(*psz))
  626. {
  627. return psz;
  628. }
  629. for (psz++; instance->IsIdContinue(*psz); psz++)
  630. {
  631. }
  632. return psz;
  633. }
  634. const LPCUTF8 Js::CharClassifier::SkipIdentifierNonSurrogateStartEnd(LPCUTF8 psz, LPCUTF8 end, const CharClassifier *instance)
  635. {
  636. utf8::DecodeOptions options = utf8::doAllowThreeByteSurrogates;
  637. LPCUTF8 p = psz;
  638. if (!instance->IsIdStart(utf8::Decode(p, end, options)))
  639. {
  640. return psz;
  641. }
  642. psz = p;
  643. while (instance->IsIdContinue(utf8::Decode(p, end, options)))
  644. {
  645. psz = p;
  646. }
  647. return psz;
  648. }
  649. const OLECHAR* Js::CharClassifier::SkipWhiteSpaceSurrogate(LPCOLESTR psz, const CharClassifier *instance)
  650. {
  651. char16 currentChar = 0x0;
  652. // Slow path is to check for a surrogate each iteration.
  653. // There is no new surrogate whitespaces as of yet, however, might be in the future, so surrogates still need to be checked
  654. // So, based on that, best way is to hit the slow path if the current character is not a whitespace in [0, FFFF];
  655. while((currentChar = *psz) != '\0')
  656. {
  657. if (!instance->IsWhiteSpace(*psz))
  658. {
  659. if (Js::NumberUtilities::IsSurrogateLowerPart(currentChar) && Js::NumberUtilities::IsSurrogateUpperPart(*(psz + 1)))
  660. {
  661. if (instance->IsWhiteSpace(Js::NumberUtilities::SurrogatePairAsCodePoint(currentChar, *(psz + 1))))
  662. {
  663. psz += 2;
  664. continue;
  665. }
  666. }
  667. // Above case failed, so we have reached the last whitespace
  668. return psz;
  669. }
  670. psz++;
  671. }
  672. return psz;
  673. }
  674. const OLECHAR* Js::CharClassifier::SkipWhiteSpaceSurrogateStartEnd(_In_reads_(pStrEnd - pStr) LPCOLESTR pStr, _In_ LPCOLESTR pStrEnd, const CharClassifier *instance)
  675. {
  676. char16 currentChar = 0x0;
  677. // Same reasoning as above
  678. while(pStr < pStrEnd && (currentChar = *pStr) != '\0')
  679. {
  680. if (!instance->IsWhiteSpace(currentChar))
  681. {
  682. if (Js::NumberUtilities::IsSurrogateLowerPart(currentChar) && (pStr + 1) < pStrEnd && Js::NumberUtilities::IsSurrogateUpperPart(*(pStr + 1)))
  683. {
  684. if (instance->IsWhiteSpace(Js::NumberUtilities::SurrogatePairAsCodePoint(currentChar, *(pStr + 1))))
  685. {
  686. pStr += 2;
  687. continue;
  688. }
  689. }
  690. // Above case failed, so we have reached the last whitespace
  691. return pStr;
  692. }
  693. pStr++;
  694. }
  695. return pStr;
  696. }
  697. const OLECHAR* Js::CharClassifier::SkipIdentifierSurrogate(LPCOLESTR psz, const CharClassifier *instance)
  698. {
  699. // Similar reasoning to above, however we do have surrogate identifiers, but less likely to occur in code.
  700. char16 currentChar = *psz;
  701. if (!instance->IsIdStart(currentChar))
  702. {
  703. if (Js::NumberUtilities::IsSurrogateLowerPart(currentChar) && Js::NumberUtilities::IsSurrogateUpperPart(*(psz + 1))
  704. && instance->IsIdStart(Js::NumberUtilities::SurrogatePairAsCodePoint(currentChar, *(psz + 1))))
  705. {
  706. // For the extra surrogate char
  707. psz ++;
  708. }
  709. else
  710. {
  711. return psz;
  712. }
  713. }
  714. psz++;
  715. while((currentChar = *psz) != '\0')
  716. {
  717. if (!instance->IsIdContinue(*psz))
  718. {
  719. if (Js::NumberUtilities::IsSurrogateLowerPart(currentChar) && Js::NumberUtilities::IsSurrogateUpperPart(*(psz + 1)))
  720. {
  721. if (instance->IsIdContinue(Js::NumberUtilities::SurrogatePairAsCodePoint(currentChar, *(psz + 1))))
  722. {
  723. psz += 2;
  724. continue;
  725. }
  726. }
  727. // Above case failed, so we have reached the last IDContinue
  728. return psz;
  729. }
  730. psz++;
  731. }
  732. return psz;
  733. }
  734. const LPCUTF8 Js::CharClassifier::SkipIdentifierSurrogateStartEnd(LPCUTF8 psz, LPCUTF8 end, const CharClassifier *instance)
  735. {
  736. LPCUTF8 currentPosition = psz;
  737. utf8::DecodeOptions options = utf8::doAllowThreeByteSurrogates;
  738. // Similar reasoning to above, however we do have surrogate identifiers, but less likely to occur in code.
  739. codepoint_t currentChar = utf8::Decode(currentPosition, end, options);
  740. if (options & utf8::doSecondSurrogatePair)
  741. {
  742. currentChar = Js::NumberUtilities::SurrogatePairAsCodePoint(currentChar, utf8::Decode(currentPosition, end, options));
  743. }
  744. if (!instance->IsIdStart(currentChar))
  745. {
  746. return psz;
  747. }
  748. psz = currentPosition;
  749. // Slow path is to check for a surrogate each iteration.
  750. // There is no new surrogate whitespaces as of yet, however, might be in the future, so surrogates still need to be checked
  751. // So, based on that, best way is to hit the slow path if the current character is not a whitespace in [0, FFFF];
  752. while((currentChar = utf8::Decode(currentPosition, end, options)) != '\0')
  753. {
  754. if (options & utf8::doSecondSurrogatePair)
  755. {
  756. currentChar = Js::NumberUtilities::SurrogatePairAsCodePoint(currentChar, utf8::Decode(currentPosition, end, options));
  757. }
  758. if (!instance->IsIdContinue(currentChar))
  759. {
  760. return psz;
  761. }
  762. psz = currentPosition;
  763. }
  764. return psz;
  765. }
  766. CharTypes Js::CharClassifier::GetCharType(codepoint_t ch) const
  767. {
  768. return FBigChar(ch) ? getBigCharTypeFunc(ch, this) : charTypes[ch];
  769. }
  770. CharTypeFlags Js::CharClassifier::GetCharFlags(codepoint_t ch) const
  771. {
  772. return FBigChar(ch) ? getBigCharFlagsFunc(ch, this) : charFlags[ch];
  773. }