CharClassifier.h 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268
  1. //-------------------------------------------------------------------------------------------------------
  2. // Copyright (C) Microsoft. All rights reserved.
  3. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
  4. //-------------------------------------------------------------------------------------------------------
  5. #pragma once
  6. //Helpers
  7. static inline BOOL FBigChar(codepoint_t ch) { return ch >= 128u; }
  8. static inline BOOL BoolFromDbl(double dbl) { return !Js::NumberUtilities::IsNan(dbl) && (0 != dbl); }
  // Character categories produced by the classifier.
  // Only the first enumerator is given a value; the rest follow in declaration order,
  // so the numbering is 0, 1, 2, ... and must not be reordered.
  enum CharTypes
  {
      // --- general groupings ---
      _C_UNK = 0, // Unknown grouping
      _C_ERR,     // illegal character
      _C_NUL,     // NUL character
      _C_LET,     // letter (A-Z,a-z)
      _C_DIG,     // digit (0-9)
      _C_WSP,     // white space
      _C_NWL,     // new line

      // --- individual punctuation characters ---
      _C_DOL,     // $
      _C_BSL,     // \ (backslash)
      _C_BKQ,     // `
      _C_AT,      // @
      _C_SHP,     // #
      _C_BNG,     // !
      _C_QUO,     // "
      _C_APO,     // '
      _C_PCT,     // %
      _C_AMP,     // &
      _C_LPR,     // (
      _C_RPR,     // )
      _C_PLS,     // +
      _C_MIN,     // -
      _C_MUL,     // *
      _C_SLH,     // /
      _C_XOR,     // ^
      _C_CMA,     // ,
      _C_DOT,     // .
      _C_LT,      // <
      _C_EQ,      // =
      _C_GT,      // >
      _C_QUE,     // ?
      _C_LBR,     // [
      _C_RBR,     // ]
      _C_USC,     // _
      _C_LC,      // {
      _C_RC,      // }
      _C_BAR,     // |
      _C_TIL,     // ~
      _C_COL,     // :
      _C_SMC,     // ;
  };
  // Named character-code constants (kch*).
  // Every enumerator has an explicit value, either a numeric code or a character literal.
  enum
  {
      // --- control characters ---
      kchNUL = 0x00,   // NUL
      kchNWL = 0x0A,   // line feed
      kchRET = 0x0D,   // carriage return

      // --- punctuation ---
      kchBSL = '\\',
      kchSHP = '#',
      kchBNG = '!',
      kchQUO = '"',
      kchAPO = '\'',
      kchPCT = '%',
      kchAMP = '&',
      kchLPR = '(',
      kchRPR = ')',
      kchPLS = '+',
      kchMIN = '-',
      kchMUL = '*',
      kchSLH = '/',
      kchXOR = '^',
      kchCMA = ',',
      kchDOT = '.',
      kchLT = '<',
      kchEQ = '=',
      kchGT = '>',
      kchQUE = '?',
      kchLBR = '[',
      kchRBR = ']',
      kchUSC = '_',
      kchLC = '{',
      kchRC = '}',
      kchBAR = '|',
      kchTIL = '~',
      kchCOL = ':',
      kchSMC = ';',

      // --- Unicode separators ---
      kchLS = 0x2028,  // U+2028 LINE SEPARATOR; classifies as new line
      kchPS = 0x2029   // U+2029 PARAGRAPH SEPARATOR; classifies as new line
  };
  88. namespace Js
  89. {
  90. class WindowsGlobalizationAdapter;
  91. class DelayLoadWindowsGlobalization;
  // Mode selector passed to CharClassifier::initClassifier for each classification
  // aspect. In C++ the enum name is already a type name, so no typedef is needed.
  enum CharClassifierModes
  {
      ES5 = 1,
      ES6 = 2
  };
  97. class CharClassifier
  98. {
  99. private:
  100. #if ENABLE_UNICODE_API
  101. static bool BigCharIsWhitespaceDefault(codepoint_t ch, const CharClassifier *instance);
  102. static bool BigCharIsIdStartDefault(codepoint_t ch, const CharClassifier *instance);
  103. static bool BigCharIsIdContinueDefault(codepoint_t ch, const CharClassifier *instance);
  104. #endif
  105. static bool BigCharIsWhitespaceES6(codepoint_t ch, const CharClassifier *instance);
  106. static bool BigCharIsIdStartES6(codepoint_t codePoint, const CharClassifier *instance);
  107. static bool BigCharIsIdContinueES6(codepoint_t codePoint, const CharClassifier *instance);
  108. static CharTypes GetBigCharTypeES6(codepoint_t ch, const CharClassifier *instance);
  109. static PlatformAgnostic::UnicodeText::CharacterTypeFlags GetBigCharFlagsES6(codepoint_t ch, const CharClassifier *instance);
  110. static CharTypes GetBigCharTypeES5(codepoint_t ch, const CharClassifier *instance);
  111. static PlatformAgnostic::UnicodeText::CharacterTypeFlags GetBigCharFlagsES5(codepoint_t ch, const CharClassifier *instance);
  112. static const OLECHAR* SkipWhiteSpaceSurrogate(LPCOLESTR psz, const CharClassifier *instance);
  113. static const OLECHAR* SkipWhiteSpaceSurrogateStartEnd(_In_reads_(pStrEnd - pStr) LPCOLESTR pStr, _In_ LPCOLESTR pStrEnd, const CharClassifier *instance);
  114. static const OLECHAR* SkipIdentifierSurrogate(LPCOLESTR psz, const CharClassifier *instance);
  115. static const LPCUTF8 SkipIdentifierSurrogateStartEnd(LPCUTF8 psz, LPCUTF8 end, const CharClassifier *instance);
  116. static const OLECHAR* SkipWhiteSpaceNonSurrogate(LPCOLESTR psz, const CharClassifier *instance);
  117. static const OLECHAR* SkipWhiteSpaceNonSurrogateStartEnd(_In_reads_(pStrEnd - pStr) LPCOLESTR pStr, _In_ LPCOLESTR pStrEnd, const CharClassifier *instance);
  118. static const OLECHAR* SkipIdentifierNonSurrogate(LPCOLESTR psz, const CharClassifier *instance);
  119. static const LPCUTF8 SkipIdentifierNonSurrogateStartEnd(LPCUTF8 psz, LPCUTF8 end, const CharClassifier *instance);
  120. CharTypes (*getBigCharTypeFunc)(codepoint_t ch, const CharClassifier *instance);
  121. PlatformAgnostic::UnicodeText::CharacterTypeFlags (*getBigCharFlagsFunc)(codepoint_t ch, const CharClassifier *instance);
  122. bool (*bigCharIsWhitespaceFunc)(codepoint_t ch, const CharClassifier *instance);
  123. bool (*bigCharIsIdStartFunc)(codepoint_t ch, const CharClassifier *instance);
  124. bool (*bigCharIsIdContinueFunc)(codepoint_t ch, const CharClassifier *instance);
  125. const OLECHAR* (*skipWhiteSpaceFunc)(LPCOLESTR psz, const CharClassifier* instance);
  126. const OLECHAR* (*skipWhiteSpaceStartEndFunc)(LPCOLESTR pStr, LPCOLESTR pStrEnd, const CharClassifier* instance);
  127. const OLECHAR* (*skipIdentifierFunc)(LPCOLESTR pcz, const CharClassifier* instance);
  128. const LPCUTF8 (*skipIdentifierStartEndFunc)(LPCUTF8 psz, LPCUTF8 end, const CharClassifier* instance);
  129. void initClassifier(ScriptContext* scriptContext, CharClassifierModes identifierSupport,
  130. CharClassifierModes whiteSpaceSupport, CharClassifierModes generalCharClassificationSupport, bool codePointSupport, bool isES6UnicodeVerboseEnabled, CharClassifierModes fallbackMode = CharClassifierModes::ES5);
  131. public:
  132. CharTypes GetCharType(codepoint_t ch) const;
  133. #if ENABLE_UNICODE_API
  134. PlatformAgnostic::UnicodeText::CharacterTypeFlags GetCharFlags(codepoint_t ch) const;
  135. #endif
  136. template <bool isBigChar>
  137. bool IsWhiteSpaceFast(codepoint_t ch) const;
  138. bool IsWhiteSpace(codepoint_t ch) const
  139. {
  140. return FBigChar(ch) ? IsWhiteSpaceFast<true>(ch) : IsWhiteSpaceFast<false>(ch);
  141. }
  142. bool IsBiDirectionalChar(codepoint_t ch) const;
  143. template<bool isBigChar>
  144. bool IsIdStartFast(codepoint_t ch) const;
  145. bool IsIdStart(codepoint_t ch) const
  146. {
  147. return FBigChar(ch) ? IsIdStartFast<true>(ch) : IsIdStartFast<false>(ch);
  148. }
  149. template<bool isBigChar>
  150. bool IsIdContinueFast(codepoint_t ch) const;
  151. bool IsIdContinue(codepoint_t ch) const
  152. {
  153. return FBigChar(ch) ? IsIdContinueFast<true>(ch) : IsIdContinueFast<false>(ch);
  154. }
  155. const size_t SkipBiDirectionalChars(_In_z_bytecount_(2 * length) LPCOLESTR psz, _In_ size_t startIndex, _In_ size_t length) const
  156. {
  157. size_t count = 0;
  158. while (startIndex < length)
  159. {
  160. if (!IsBiDirectionalChar(psz[startIndex + count]))
  161. {
  162. return count;
  163. }
  164. count++;
  165. }
  166. return count;
  167. }
  168. const char16 SkipBiDirectionalChars(_In_z_ char16* &pszRef) const
  169. {
  170. while (*pszRef != '\0')
  171. {
  172. if (!IsBiDirectionalChar(*pszRef))
  173. {
  174. return *pszRef;
  175. }
  176. pszRef++;
  177. }
  178. return '\0';
  179. }
  180. const OLECHAR* SkipWhiteSpace(LPCOLESTR psz) const
  181. {
  182. // Fast path for the case in which first character is not space
  183. char16 firstChar = *psz;
  184. if (firstChar == 0)
  185. {
  186. return psz;
  187. }
  188. if (!this->IsWhiteSpace(firstChar) &&
  189. (skipWhiteSpaceFunc != &SkipWhiteSpaceSurrogate
  190. || !Js::NumberUtilities::IsSurrogateLowerPart(firstChar)))
  191. {
  192. return psz;
  193. }
  194. return skipWhiteSpaceFunc(psz, this);
  195. }
  196. const OLECHAR* SkipWhiteSpace(_In_reads_(pStrEnd - pStr) LPCOLESTR pStr, _In_ LPCOLESTR pStrEnd) const
  197. {
  198. // Fast path for the case in which first character is not space
  199. if (pStr == pStrEnd)
  200. {
  201. return pStr;
  202. }
  203. char16 firstChar = *pStr;
  204. if (!this->IsWhiteSpace(firstChar) &&
  205. (skipWhiteSpaceStartEndFunc != &SkipWhiteSpaceSurrogateStartEnd
  206. || !Js::NumberUtilities::IsSurrogateLowerPart(firstChar)))
  207. {
  208. return pStr;
  209. }
  210. return skipWhiteSpaceStartEndFunc(pStr, pStrEnd, this);
  211. }
  212. const OLECHAR* SkipIdentifier(LPCOLESTR psz) const
  213. {
  214. return skipIdentifierFunc(psz, this);
  215. }
  216. const LPCUTF8 SkipIdentifier(LPCUTF8 psz, LPCUTF8 end) const
  217. {
  218. return skipIdentifierStartEndFunc(psz, end, this);
  219. }
  220. const OLECHAR* SkipIdentifier(LPCOLESTR psz, LPCOLESTR end) const
  221. {
  222. return SkipIdentifier(psz);
  223. }
  224. CharClassifier(Js::ScriptContext* scriptContext);
  225. };
  226. };