CharClassifier.h 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301
  1. //-------------------------------------------------------------------------------------------------------
  2. // Copyright (C) Microsoft. All rights reserved.
  3. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
  4. //-------------------------------------------------------------------------------------------------------
  5. #pragma once
  6. #ifdef NTBUILD
  7. #include "Windows.Globalization.h"
  8. #else
  9. #include "Windows.Data.Text.h"
  10. using namespace ABI;
  11. #endif
  12. //Helpers
  13. static inline BOOL FBigChar(codepoint_t ch) { return ch >= 128u; }
  14. static inline BOOL BoolFromDbl(double dbl) { return !Js::NumberUtilities::IsNan(dbl) && (0 != dbl); }
  // Bit flags describing the properties of a character. The single-bit values
  // come first; the *Group values are combinations of those bits.
  enum CharTypeFlags : uint
  {
      UnknownChar      = 0u,        // no flag set

      // Individual bits.
      IdChar           = 1u << 0,
      IdLeadChar       = 1u << 1,
      HexChar          = 1u << 2,
      DecimalChar      = 1u << 3,
      SpaceChar        = 1u << 4,
      LineFeedChar     = 1u << 5,

      // Combined masks.
      LineCharGroup    = SpaceChar | LineFeedChar,
      LetterCharGroup  = IdChar | IdLeadChar,
      HexCharGroup     = LetterCharGroup | HexChar,
      DecimalCharGroup = IdChar | DecimalChar,
  };
  // Character categories. Values are consecutive, starting at zero, in the
  // order listed; the trailing comment names the character(s) of each category.
  enum CharTypes
  {
      _C_UNK = 0, // unknown grouping
      _C_ERR,     // illegal character
      _C_NUL,     // NUL character
      _C_LET,     // letter (A-Z, a-z)
      _C_DIG,     // digit (0-9)
      _C_WSP,     // white space
      _C_NWL,     // new line
      _C_DOL,     // $  dollar sign
      _C_BSL,     // \  backslash
      _C_BKQ,     // `  back quote
      _C_AT,      // @  at sign
      _C_SHP,     // #  sharp
      _C_BNG,     // !  bang
      _C_QUO,     // "  double quote
      _C_APO,     // '  apostrophe
      _C_PCT,     // %  percent
      _C_AMP,     // &  ampersand
      _C_LPR,     // (  left parenthesis
      _C_RPR,     // )  right parenthesis
      _C_PLS,     // +  plus
      _C_MIN,     // -  minus
      _C_MUL,     // *  asterisk
      _C_SLH,     // /  slash
      _C_XOR,     // ^  caret
      _C_CMA,     // ,  comma
      _C_DOT,     // .  dot
      _C_LT,      // <  less than
      _C_EQ,      // =  equals
      _C_GT,      // >  greater than
      _C_QUE,     // ?  question mark
      _C_LBR,     // [  left bracket
      _C_RBR,     // ]  right bracket
      _C_USC,     // _  underscore
      _C_LC,      // {  left curly brace
      _C_RC,      // }  right curly brace
      _C_BAR,     // |  vertical bar
      _C_TIL,     // ~  tilde
      _C_COL,     // :  colon
      _C_SMC,     // ;  semicolon
  };
  // Named constants for individual characters, listed in increasing code
  // point order. Every enumerator has an explicit value.
  enum
  {
      // Control characters.
      kchNUL = 0x00,
      kchNWL = 0x0A,
      kchRET = 0x0D,

      // Punctuation.
      kchBNG = '!',
      kchQUO = '"',
      kchSHP = '#',
      kchPCT = '%',
      kchAMP = '&',
      kchAPO = '\'',
      kchLPR = '(',
      kchRPR = ')',
      kchMUL = '*',
      kchPLS = '+',
      kchCMA = ',',
      kchMIN = '-',
      kchDOT = '.',
      kchSLH = '/',
      kchCOL = ':',
      kchSMC = ';',
      kchLT  = '<',
      kchEQ  = '=',
      kchGT  = '>',
      kchQUE = '?',
      kchLBR = '[',
      kchBSL = '\\',
      kchRBR = ']',
      kchXOR = '^',
      kchUSC = '_',
      kchLC  = '{',
      kchBAR = '|',
      kchRC  = '}',
      kchTIL = '~',

      // Characters above the single-byte range; both classify as new line.
      kchLS = 0x2028,
      kchPS = 0x2029
  };
  108. namespace Js
  109. {
  110. class WindowsGlobalizationAdapter;
  111. class DelayLoadWindowsGlobalization;
  112. typedef
  113. enum CharClassifierModes {
  114. ES5 = 1,
  115. ES6 = 2
  116. } CharClassifierModes;
  117. class CharClassifier
  118. {
  119. private:
  120. Windows::Data::Text::IUnicodeCharactersStatics* winGlobCharApi;
  121. static BOOL BigCharIsWhitespaceDefault(codepoint_t ch, const CharClassifier *instance)
  122. {
  123. return (instance->getBigCharFlagsFunc(ch, instance) & CharTypeFlags::SpaceChar);
  124. }
  125. static BOOL BigCharIsIdStartDefault(codepoint_t ch, const CharClassifier *instance)
  126. {
  127. return (instance->getBigCharFlagsFunc(ch, instance) & CharTypeFlags::IdLeadChar);
  128. }
  129. static BOOL BigCharIsIdContinueDefault(codepoint_t ch, const CharClassifier *instance)
  130. {
  131. return (instance->getBigCharFlagsFunc(ch, instance) & CharTypeFlags::IdChar);
  132. }
  133. static BOOL BigCharIsWhitespaceES6(codepoint_t ch, const CharClassifier *instance);
  134. static BOOL BigCharIsIdStartES6(codepoint_t codePoint, const CharClassifier *instance);
  135. static BOOL BigCharIsIdContinueES6(codepoint_t codePoint, const CharClassifier *instance);
  136. static CharTypes GetBigCharTypeES6(codepoint_t ch, const CharClassifier *instance);
  137. static CharTypeFlags GetBigCharFlagsES6(codepoint_t ch, const CharClassifier *instance);
  138. static const OLECHAR* SkipWhiteSpaceSurrogate(LPCOLESTR psz, const CharClassifier *instance);
  139. static const OLECHAR* SkipWhiteSpaceSurrogateStartEnd(_In_reads_(pStrEnd - pStr) LPCOLESTR pStr, _In_ LPCOLESTR pStrEnd, const CharClassifier *instance);
  140. static const OLECHAR* SkipIdentifierSurrogate(LPCOLESTR psz, const CharClassifier *instance);
  141. static const LPCUTF8 SkipIdentifierSurrogateStartEnd(LPCUTF8 psz, LPCUTF8 end, const CharClassifier *instance);
  142. static const OLECHAR* SkipWhiteSpaceNonSurrogate(LPCOLESTR psz, const CharClassifier *instance);
  143. static const OLECHAR* SkipWhiteSpaceNonSurrogateStartEnd(_In_reads_(pStrEnd - pStr) LPCOLESTR pStr, _In_ LPCOLESTR pStrEnd, const CharClassifier *instance);
  144. static const OLECHAR* SkipIdentifierNonSurrogate(LPCOLESTR psz, const CharClassifier *instance);
  145. static const LPCUTF8 SkipIdentifierNonSurrogateStartEnd(LPCUTF8 psz, LPCUTF8 end, const CharClassifier *instance);
  146. Windows::Data::Text::UnicodeGeneralCategory GetUnicodeCategoryFor(codepoint_t ch) const;
  147. CharTypes (*getBigCharTypeFunc)(codepoint_t ch, const CharClassifier *instance);
  148. CharTypeFlags (*getBigCharFlagsFunc)(codepoint_t ch, const CharClassifier *instance);
  149. BOOL (*bigCharIsWhitespaceFunc)(codepoint_t ch, const CharClassifier *instance);
  150. BOOL (*bigCharIsIdStartFunc)(codepoint_t ch, const CharClassifier *instance);
  151. BOOL (*bigCharIsIdContinueFunc)(codepoint_t ch, const CharClassifier *instance);
  152. const OLECHAR* (*skipWhiteSpaceFunc)(LPCOLESTR psz, const CharClassifier* instance);
  153. const OLECHAR* (*skipWhiteSpaceStartEndFunc)(LPCOLESTR pStr, LPCOLESTR pStrEnd, const CharClassifier* instance);
  154. const OLECHAR* (*skipIdentifierFunc)(LPCOLESTR pcz, const CharClassifier* instance);
  155. const LPCUTF8 (*skipIdentifierStartEndFunc)(LPCUTF8 psz, LPCUTF8 end, const CharClassifier* instance);
  156. void initClassifier(ScriptContext* scriptContext, CharClassifierModes identifierSupport,
  157. CharClassifierModes whiteSpaceSupport, CharClassifierModes generalCharClassificationSupport, bool codePointSupport, bool isES6UnicodeVerboseEnabled, CharClassifierModes fallbackMode = CharClassifierModes::ES5);
  158. public:
  159. CharTypes GetCharType(codepoint_t ch) const;
  160. CharTypeFlags GetCharFlags(codepoint_t ch) const;
  161. template <bool isBigChar>
  162. BOOL IsWhiteSpaceFast(codepoint_t ch) const;
  163. BOOL IsWhiteSpace(codepoint_t ch) const
  164. {
  165. return FBigChar(ch) ? IsWhiteSpaceFast<true>(ch) : IsWhiteSpaceFast<false>(ch);
  166. }
  167. BOOL IsBiDirectionalChar(codepoint_t ch) const;
  168. template<bool isBigChar>
  169. BOOL IsIdStartFast(codepoint_t ch) const;
  170. BOOL IsIdStart(codepoint_t ch) const
  171. {
  172. return FBigChar(ch) ? IsIdStartFast<true>(ch) : IsIdStartFast<false>(ch);
  173. }
  174. template<bool isBigChar>
  175. BOOL IsIdContinueFast(codepoint_t ch) const;
  176. BOOL IsIdContinue(codepoint_t ch) const
  177. {
  178. return FBigChar(ch) ? IsIdContinueFast<true>(ch) : IsIdContinueFast<false>(ch);
  179. }
  180. const size_t SkipBiDirectionalChars(_In_z_bytecount_(2 * length) LPCOLESTR psz, _In_ size_t startIndex, _In_ size_t length) const
  181. {
  182. size_t count = 0;
  183. while (startIndex < length)
  184. {
  185. if (!IsBiDirectionalChar(psz[startIndex + count]))
  186. {
  187. return count;
  188. }
  189. count++;
  190. }
  191. return count;
  192. }
  193. const wchar_t SkipBiDirectionalChars(_In_z_ wchar_t* &pszRef) const
  194. {
  195. while (*pszRef != '\0')
  196. {
  197. if (!IsBiDirectionalChar(*pszRef))
  198. {
  199. return *pszRef;
  200. }
  201. pszRef++;
  202. }
  203. return '\0';
  204. }
  205. const OLECHAR* SkipWhiteSpace(LPCOLESTR psz) const
  206. {
  207. // Fast path for the case in which first character is not space
  208. wchar_t firstChar = *psz;
  209. if (firstChar == 0)
  210. {
  211. return psz;
  212. }
  213. if (!this->IsWhiteSpace(firstChar) &&
  214. (skipWhiteSpaceFunc != &SkipWhiteSpaceSurrogate
  215. || !Js::NumberUtilities::IsSurrogateLowerPart(firstChar)))
  216. {
  217. return psz;
  218. }
  219. return skipWhiteSpaceFunc(psz, this);
  220. }
  221. const OLECHAR* SkipWhiteSpace(_In_reads_(pStrEnd - pStr) LPCOLESTR pStr, _In_ LPCOLESTR pStrEnd) const
  222. {
  223. // Fast path for the case in which first character is not space
  224. if (pStr == pStrEnd)
  225. {
  226. return pStr;
  227. }
  228. wchar_t firstChar = *pStr;
  229. if (!this->IsWhiteSpace(firstChar) &&
  230. (skipWhiteSpaceStartEndFunc != &SkipWhiteSpaceSurrogateStartEnd
  231. || !Js::NumberUtilities::IsSurrogateLowerPart(firstChar)))
  232. {
  233. return pStr;
  234. }
  235. return skipWhiteSpaceStartEndFunc(pStr, pStrEnd, this);
  236. }
  237. const OLECHAR* SkipIdentifier(LPCOLESTR psz) const
  238. {
  239. return skipIdentifierFunc(psz, this);
  240. }
  241. const LPCUTF8 SkipIdentifier(LPCUTF8 psz, LPCUTF8 end) const
  242. {
  243. return skipIdentifierStartEndFunc(psz, end, this);
  244. }
  245. const OLECHAR* SkipIdentifier(LPCOLESTR psz, LPCOLESTR end) const
  246. {
  247. return SkipIdentifier(psz);
  248. }
  249. CharClassifier(Js::ScriptContext* scriptContext);
  250. };
  251. };