RegexParser.h 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283
  1. //-------------------------------------------------------------------------------------------------------
  2. // Copyright (C) Microsoft. All rights reserved.
  3. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
  4. //-------------------------------------------------------------------------------------------------------
  5. #pragma once
  6. namespace UnifiedRegex
  7. {
  8. struct ParseError
  9. {
  10. bool isBody;
  11. CharCount pos; // Position in unicode characters
  12. CharCount encodedPos; // Position in underlying characters (eg utf-8 bytes)
  13. HRESULT error;
  14. ParseError(bool isBody, CharCount pos, CharCount encodedPos, HRESULT error);
  15. };
  16. template <typename EncodingPolicy, const bool IsLiteral>
  17. class Parser : private EncodingPolicy, private Chars<char16>
  18. {
  19. private:
  20. typedef typename EncodingPolicy::EncodedChar EncodedChar;
  21. // A linked list node to track indices of surrogate pairs.
  22. struct SurrogatePairTracker
  23. {
  24. const EncodedChar* location;
  25. // If this surrogate pair is inside a range, then rangeLocation isn't null.
  26. const EncodedChar* rangeLocation;
  27. codepoint_t value;
  28. uint32 length;
  29. size_t multiUnits;
  30. SurrogatePairTracker* next;
  31. SurrogatePairTracker(const EncodedChar* location, codepoint_t value, uint32 length, size_t multiUnits)
  32. : location(location)
  33. , next(nullptr)
  34. , value(value)
  35. , length(length)
  36. , multiUnits(multiUnits)
  37. , rangeLocation(nullptr)
  38. {
  39. }
  40. SurrogatePairTracker(const EncodedChar* location, const EncodedChar* rangeLocation, codepoint_t value, uint32 length, size_t multiUnits)
  41. : location(location)
  42. , next(nullptr)
  43. , value(value)
  44. , length(length)
  45. , multiUnits(multiUnits)
  46. , rangeLocation(rangeLocation)
  47. {
  48. }
  49. bool IsInsideRange() const
  50. {
  51. return this->rangeLocation != nullptr;
  52. }
  53. };
  54. static const CharCount initLitbufSize = 16;
  55. Js::ScriptContext* scriptContext;
  56. // Arena for nodes and items needed only during compilation
  57. ArenaAllocator* ctAllocator;
  58. // Standard characters using raw encoding character representation (eg char for utf-8)
  59. StandardChars<EncodedChar>* standardEncodedChars;
  60. // Standard characters using final character representation (eg char16 for Unicode)
  61. StandardChars<Char>* standardChars;
  62. #if ENABLE_REGEX_CONFIG_OPTIONS
  63. DebugWriter* w;
  64. #endif
  65. const EncodedChar* input;
  66. const EncodedChar* inputLim;
  67. const EncodedChar* next;
  68. bool inBody;
  69. // Maximum number of capturing groups allowed, including the entire regexp, which is always
  70. // considered a capturing group. Using INT16_MAX allows us to pass one value for each
  71. // group, plus a few additional values, to a JavaScript function without overflowing the
  72. // number of arguments. This is important, for example, in the implementation of
  73. // String.prototype.replace, where the second argument is a function.
  74. static const uint16 MAX_NUM_GROUPS = INT16_MAX;
  75. uint16 numGroups; // determined in first parse
  76. // Keeps track of how many capturing groups we've seen during parsing. We use an int, rather than
  77. // a uint16, to be sure we don't overflow during parsing and only check it against MAX_NUM_GROUPS at
  78. // the end. (We know we can't overflow an int because strings and regex literals are limited to
  79. // 2G characters and therefore to 1G pairs of parentheses, which can fit into an int. I'd prefer
  80. // to use size_t here, but making that change would go down a serious rabbit hole changing the
  81. // interface to UnifiedRegex::Node::AccumDefineGroups.)
  82. int nextGroupId;
  83. // Buffer accumulating all literals.
  84. // In compile-time allocator, must be transferred to runtime allocator when build program
  85. Char* litbuf;
  86. CharCount litbufLen;
  87. CharCount litbufNext;
  88. // During pass 0, if /u option for regex is provided, a linked list will be built up to
  89. // track positions of surrogate pairs in the buffer. During pass 1, these linked lists will be used
  90. // to figure out when to output a surrogate pair node.
  91. SurrogatePairTracker* surrogatePairList;
  92. SurrogatePairTracker* currentSurrogatePairNode;
  93. bool unicodeFlagPresent;
  94. bool caseInsensitiveFlagPresent;
  95. // The following two variables are used to determine if the the surrogate pair has been encountered
  96. // First holds the temporary location, second holds the value of the codepoint
  97. const EncodedChar* tempLocationOfSurrogatePair;
  98. // This will be set to a location when we are parsing a range in TermPass0, and cleared when we are out of it.
  99. const EncodedChar* tempLocationOfRange;
  100. codepoint_t codePointAtTempLocation;
  101. // When a surrogate is added for tracking, this will be updated.
  102. const EncodedChar* positionAfterLastSurrogate;
  103. codepoint_t valueOfLastSurrogate;
  104. // deferred error state.
  105. ParseError* deferredIfNotUnicodeError;
  106. ParseError* deferredIfUnicodeError;
  107. private:
  108. //
  109. // Input buffer management
  110. //
  111. void SetPosition(const EncodedChar* input, const EncodedChar* inputLim, bool inBody);
  112. // Current position in number of logical characters, regardless of underlying character encoding
  113. inline CharCount Pos();
  114. inline bool IsEOF();
  115. inline bool ECCanConsume(CharCount n = 1);
  116. inline EncodedChar ECLookahead(CharCount n = 0);
  117. inline EncodedChar ECLookback(CharCount n = 0);
  118. inline void ECConsume(CharCount n = 1);
  119. inline void ECConsumeMultiUnit(CharCount n = 1);
  120. inline void ECRevert(CharCount n = 1);
  121. //
  122. // Helpers
  123. //
  124. int TryParseExtendedUnicodeEscape(Char& c, bool& previousSurrogatePart, bool trackSurrogatePair = false);
  125. void TrackIfSurrogatePair(codepoint_t codePoint, const EncodedChar* location, uint32 consumptionLength);
  126. Node* CreateSurrogatePairAtom(char16 lower, char16 upper);
  127. AltNode* AppendSurrogateRangeToDisjunction(codepoint_t lowerCodePoint, codepoint_t upperCodePoint, AltNode *lastAlttNode);
  128. AltNode* AppendSurrogatePairToDisjunction(codepoint_t codePoint, AltNode *lastAlttNode);
  129. //
  130. // Errors
  131. //
  132. void Fail(HRESULT error);
  133. void DeferredFailIfUnicode(HRESULT error);
  134. void DeferredFailIfNotUnicode(HRESULT error);
  135. inline void ECMust(EncodedChar ec, HRESULT error);
  136. inline Char NextChar();
  137. //
  138. // Patterns/Disjunctions/Alternatives
  139. //
  140. void PatternPass0();
  141. Node* PatternPass1();
  142. Node* UnionNodes(Node* prev, Node* curr);
  143. void DisjunctionPass0(int depth);
  144. Node* DisjunctionPass1();
  145. bool IsEndOfAlternative();
  146. void EnsureLitbuf(CharCount size);
  147. void AccumLiteral(MatchLiteralNode* deferredLiteralNode, Node* charOrLiteralNode);
  148. Node* FinalTerm(Node* node, MatchLiteralNode* deferredLiteralNode);
  149. void AlternativePass0(int depth);
  150. Node* AlternativePass1();
  151. //
  152. // Terms
  153. //
  154. Node* NewLoopNode(CharCount lower, CharCountOrFlag upper, bool isGreedy, Node* body);
  155. bool AtQuantifier();
  156. bool OptNonGreedy();
  157. CharCount RepeatCount();
  158. void TermPass0(int depth);
  159. Node* TermPass1(MatchCharNode* deferredCharNode, bool& previousSurrogatePart);
  160. bool AtomEscapePass0();
  161. bool AtomEscapePass1(Node*& node, MatchCharNode* deferredCharNode, bool& previousSurrogatePart);
  162. bool SurrogatePairPass1(Node*& node, MatchCharNode* deferredCharNode, bool& previousSurrogatePart);
  163. //
  164. // Classes
  165. //
  166. bool AtSecondSingletonClassAtom();
  167. void CharacterClassPass0();
  168. template <bool containsSurrogates>
  169. Node* CharacterClassPass1();
  170. bool ClassEscapePass0(Char& singleton, bool& previousSurrogatePart);
  171. Node* ClassEscapePass1(MatchCharNode* deferredCharNode, MatchSetNode* deferredSetNode, bool& previousSurrogatePart);
  172. Node* GetNodeWithValidCharacterSet(EncodedChar ch);
  173. //
  174. // Options
  175. //
  176. void Options(RegexFlags& flags);
  177. public:
  178. Parser
  179. ( Js::ScriptContext* scriptContext
  180. , ArenaAllocator* ctAllocator
  181. , StandardChars<EncodedChar>* standardEncodedChars
  182. , StandardChars<Char>* standardChars
  183. , bool isUtf8
  184. #if ENABLE_REGEX_CONFIG_OPTIONS
  185. , DebugWriter* w
  186. #endif
  187. );
  188. //
  189. // Entry points
  190. //
  191. Node* ParseDynamic
  192. ( const EncodedChar* body // non null, null terminated (may contain embedded nulls)
  193. , const EncodedChar* bodyLim // points to terminating null of above
  194. , const EncodedChar* opts // may be null if no options, otherwise null terminated
  195. , const EncodedChar* optsLim // if above non-null, points to terminating null of above
  196. , RegexFlags& flags );
  197. // (*) For ParseLiteral:
  198. // - input string must be null terminated
  199. // - inputLim may point to the terminating null in above or before it
  200. // - if the later, input is known to be syntactically well-formed so that the parser
  201. // will find the natural end of the regex literal before passing inputLim
  202. // - input may contain nulls before the inputLim
  203. Node* ParseLiteral
  204. ( const EncodedChar* input // non null, null terminated (may contain embedded nulls)
  205. , const EncodedChar* inputLim // see (*) above
  206. , CharCount& outBodyEncodedChars // in encoded characters, not including trailing '/'
  207. , CharCount& outTotalEncodedChars // in encoded characters, including trailing '/' and any options
  208. , CharCount& outBodyChars // in unicode characters, not including ttrailing '/'
  209. , CharCount& outTotalChars // in unicode characters, including trailing '/' and any options
  210. , RegexFlags& flags );
  211. void ParseLiteralNoAST
  212. ( const EncodedChar* input // non null, null terminated
  213. , const EncodedChar* inputLim // see (*) above
  214. , CharCount& outBodyEncodedChars
  215. , CharCount& outTotalEncodedChars
  216. , CharCount& outBodyChars
  217. , CharCount& outTotalChars );
  218. template<const bool buildAST>
  219. RegexPattern* CompileProgram
  220. ( Node* root,
  221. const EncodedChar*& currentCharacter,
  222. const CharCount totalLen,
  223. const CharCount bodyChars,
  224. const CharCount bodyEncodedChars,
  225. const CharCount totalChars,
  226. const RegexFlags flags );
  227. static void CaptureEmptySourceAndNoGroups(Program* program);
  228. // bodyChars is number of unicode characters in program body, which may be less than the number
  229. // of underlying UTF-8 characters
  230. void CaptureSourceAndGroups(Recycler* recycler, Program* program, const EncodedChar* body, CharCount bodyChars, CharCount bodyEncodedChars);
  231. inline const Char* GetLitbuf() { return litbuf; }
  232. void FreeBody();
  233. size_t GetMultiUnits() { return this->m_cMultiUnits; }
  234. };
  235. }