RegexParser.h 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278
  1. //-------------------------------------------------------------------------------------------------------
  2. // Copyright (C) Microsoft. All rights reserved.
  3. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
  4. //-------------------------------------------------------------------------------------------------------
  5. #pragma once
  6. namespace UnifiedRegex
  7. {
  8. struct ParseError
  9. {
  10. bool isBody;
  11. CharCount pos; // Position in unicode characters
  12. CharCount encodedPos; // Position in underlying characters (eg utf-8 bytes)
  13. HRESULT error;
  14. ParseError(bool isBody, CharCount pos, CharCount encodedPos, HRESULT error);
  15. };
  16. template <typename EncodingPolicy, const bool IsLiteral>
  17. class Parser : private EncodingPolicy, private Chars<char16>
  18. {
  19. private:
  20. typedef typename EncodingPolicy::EncodedChar EncodedChar;
  21. // A linked list node to track indices of surrogate pairs.
  22. struct SurrogatePairTracker
  23. {
  24. const EncodedChar* location;
  25. // If this surrogate pair is inside a range, then rangeLocation isn't null.
  26. const EncodedChar* rangeLocation;
  27. codepoint_t value;
  28. uint32 length;
  29. size_t multiUnits;
  30. SurrogatePairTracker* next;
  31. SurrogatePairTracker(const EncodedChar* location, codepoint_t value, uint32 length, size_t multiUnits)
  32. : location(location)
  33. , next(nullptr)
  34. , value(value)
  35. , length(length)
  36. , multiUnits(multiUnits)
  37. , rangeLocation(nullptr)
  38. {
  39. }
  40. SurrogatePairTracker(const EncodedChar* location, const EncodedChar* rangeLocation, codepoint_t value, uint32 length, size_t multiUnits)
  41. : location(location)
  42. , next(nullptr)
  43. , value(value)
  44. , length(length)
  45. , multiUnits(multiUnits)
  46. , rangeLocation(rangeLocation)
  47. {
  48. }
  49. bool IsInsideRange() const
  50. {
  51. return this->rangeLocation != nullptr;
  52. }
  53. };
  54. static const CharCount initLitbufSize = 16;
  55. Js::ScriptContext* scriptContext;
  56. // Arena for nodes and items needed only during compilation
  57. ArenaAllocator* ctAllocator;
  58. // Standard characters using raw encoding character representation (eg char for utf-8)
  59. StandardChars<EncodedChar>* standardEncodedChars;
  60. // Standard characters using final character representation (eg char16 for Unicode)
  61. StandardChars<Char>* standardChars;
  62. #if ENABLE_REGEX_CONFIG_OPTIONS
  63. DebugWriter* w;
  64. #endif
  65. const EncodedChar* input;
  66. const EncodedChar* inputLim;
  67. const EncodedChar* next;
  68. bool inBody;
  69. // Maximum number of capturing groups allowed, including the entire regexp, which is always
  70. // considered a capturing group. Using INT16_MAX allows us to pass one value for each
  71. // group, plus a few additional values, to a JavaScript function without overflowing the
  72. // number of arguments. This is important, for example, in the implementation of
  73. // String.prototype.replace, where the second argument is a function.
  74. //
  75. // This should really be an unsigned, but we compare it against numGroups, so make it
  76. // an int for now to avoid a bunch of compiler warnings until we can go back and clean this up.
  77. static const int MAX_NUM_GROUPS = INT16_MAX;
  78. int numGroups; // determined in first parse
  79. int nextGroupId;
  80. // Buffer accumulating all literals.
  81. // In compile-time allocator, must be transferred to runtime allocator when build program
  82. Char* litbuf;
  83. CharCount litbufLen;
  84. CharCount litbufNext;
  85. // During pass 0, if /u option for regex is provided, a linked list will be built up to
  86. // track positions of surrogate pairs in the buffer. During pass 1, these linked lists will be used
  87. // to figure out when to output a surrogate pair node.
  88. SurrogatePairTracker* surrogatePairList;
  89. SurrogatePairTracker* currentSurrogatePairNode;
  90. bool unicodeFlagPresent;
  91. bool caseInsensitiveFlagPresent;
  92. // The following two variables are used to determine if the the surrogate pair has been encountered
  93. // First holds the temporary location, second holds the value of the codepoint
  94. const EncodedChar* tempLocationOfSurrogatePair;
  95. // This will be set to a location when we are parsing a range in TermPass0, and cleared when we are out of it.
  96. const EncodedChar* tempLocationOfRange;
  97. codepoint_t codePointAtTempLocation;
  98. // When a surrogate is added for tracking, this will be updated.
  99. const EncodedChar* positionAfterLastSurrogate;
  100. codepoint_t valueOfLastSurrogate;
  101. // deferred error state.
  102. ParseError* deferredIfNotUnicodeError;
  103. ParseError* deferredIfUnicodeError;
  104. private:
  105. //
  106. // Input buffer management
  107. //
  108. void SetPosition(const EncodedChar* input, const EncodedChar* inputLim, bool inBody);
  109. // Current position in number of logical characters, regardless of underlying character encoding
  110. inline CharCount Pos();
  111. inline bool IsEOF();
  112. inline bool ECCanConsume(CharCount n = 1);
  113. inline EncodedChar ECLookahead(CharCount n = 0);
  114. inline EncodedChar ECLookback(CharCount n = 0);
  115. inline void ECConsume(CharCount n = 1);
  116. inline void ECConsumeMultiUnit(CharCount n = 1);
  117. inline void ECRevert(CharCount n = 1);
  118. //
  119. // Helpers
  120. //
  121. int TryParseExtendedUnicodeEscape(Char& c, bool& previousSurrogatePart, bool trackSurrogatePair = false);
  122. void TrackIfSurrogatePair(codepoint_t codePoint, const EncodedChar* location, uint32 consumptionLength);
  123. Node* CreateSurrogatePairAtom(char16 lower, char16 upper);
  124. AltNode* AppendSurrogateRangeToDisjunction(codepoint_t lowerCodePoint, codepoint_t upperCodePoint, AltNode *lastAlttNode);
  125. AltNode* AppendSurrogatePairToDisjunction(codepoint_t codePoint, AltNode *lastAlttNode);
  126. //
  127. // Errors
  128. //
  129. void Fail(HRESULT error);
  130. void DeferredFailIfUnicode(HRESULT error);
  131. void DeferredFailIfNotUnicode(HRESULT error);
  132. inline void ECMust(EncodedChar ec, HRESULT error);
  133. inline Char NextChar();
  134. //
  135. // Patterns/Disjunctions/Alternatives
  136. //
  137. void PatternPass0();
  138. Node* PatternPass1();
  139. Node* UnionNodes(Node* prev, Node* curr);
  140. void DisjunctionPass0(int depth);
  141. Node* DisjunctionPass1();
  142. bool IsEndOfAlternative();
  143. void EnsureLitbuf(CharCount size);
  144. void AccumLiteral(MatchLiteralNode* deferredLiteralNode, Node* charOrLiteralNode);
  145. Node* FinalTerm(Node* node, MatchLiteralNode* deferredLiteralNode);
  146. void AlternativePass0(int depth);
  147. Node* AlternativePass1();
  148. //
  149. // Terms
  150. //
  151. Node* NewLoopNode(CharCount lower, CharCountOrFlag upper, bool isGreedy, Node* body);
  152. bool AtQuantifier();
  153. bool OptNonGreedy();
  154. CharCount RepeatCount();
  155. void TermPass0(int depth);
  156. Node* TermPass1(MatchCharNode* deferredCharNode, bool& previousSurrogatePart);
  157. bool AtomEscapePass0();
  158. bool AtomEscapePass1(Node*& node, MatchCharNode* deferredCharNode, bool& previousSurrogatePart);
  159. bool SurrogatePairPass1(Node*& node, MatchCharNode* deferredCharNode, bool& previousSurrogatePart);
  160. //
  161. // Classes
  162. //
  163. bool AtSecondSingletonClassAtom();
  164. void CharacterClassPass0();
  165. template <bool containsSurrogates>
  166. Node* CharacterClassPass1();
  167. bool ClassEscapePass0(Char& singleton, bool& previousSurrogatePart);
  168. Node* ClassEscapePass1(MatchCharNode* deferredCharNode, MatchSetNode* deferredSetNode, bool& previousSurrogatePart);
  169. Node* GetNodeWithValidCharacterSet(EncodedChar ch);
  170. //
  171. // Options
  172. //
  173. void Options(RegexFlags& flags);
  174. public:
  175. Parser
  176. ( Js::ScriptContext* scriptContext
  177. , ArenaAllocator* ctAllocator
  178. , StandardChars<EncodedChar>* standardEncodedChars
  179. , StandardChars<Char>* standardChars
  180. , bool isFromExternalSource
  181. #if ENABLE_REGEX_CONFIG_OPTIONS
  182. , DebugWriter* w
  183. #endif
  184. );
  185. //
  186. // Entry points
  187. //
  188. Node* ParseDynamic
  189. ( const EncodedChar* body // non null, null terminated (may contain embedded nulls)
  190. , const EncodedChar* bodyLim // points to terminating null of above
  191. , const EncodedChar* opts // may be null if no options, otherwise null terminated
  192. , const EncodedChar* optsLim // if above non-null, points to terminating null of above
  193. , RegexFlags& flags );
  194. // (*) For ParseLiteral:
  195. // - input string must be null terminated
  196. // - inputLim may point to the terminating null in above or before it
  197. // - if the later, input is known to be syntactically well-formed so that the parser
  198. // will find the natural end of the regex literal before passing inputLim
  199. // - input may contain nulls before the inputLim
  200. Node* ParseLiteral
  201. ( const EncodedChar* input // non null, null terminated (may contain embedded nulls)
  202. , const EncodedChar* inputLim // see (*) above
  203. , CharCount& outBodyEncodedChars // in encoded characters, not including trailing '/'
  204. , CharCount& outTotalEncodedChars // in encoded characters, including trailing '/' and any options
  205. , CharCount& outBodyChars // in unicode characters, not including ttrailing '/'
  206. , CharCount& outTotalChars // in unicode characters, including trailing '/' and any options
  207. , RegexFlags& flags );
  208. void ParseLiteralNoAST
  209. ( const EncodedChar* input // non null, null terminated
  210. , const EncodedChar* inputLim // see (*) above
  211. , CharCount& outBodyEncodedChars
  212. , CharCount& outTotalEncodedChars
  213. , CharCount& outBodyChars
  214. , CharCount& outTotalChars );
  215. template<const bool buildAST>
  216. RegexPattern* CompileProgram
  217. ( Node* root,
  218. const EncodedChar*& currentCharacter,
  219. const CharCount totalLen,
  220. const CharCount bodyChars,
  221. const CharCount bodyEncodedChars,
  222. const CharCount totalChars,
  223. const RegexFlags flags );
  224. static void CaptureEmptySourceAndNoGroups(Program* program);
  225. // bodyChars is number of unicode characters in program body, which may be less than the number
  226. // of underlying UTF-8 characters
  227. void CaptureSourceAndGroups(Recycler* recycler, Program* program, const EncodedChar* body, CharCount bodyChars, CharCount bodyEncodedChars);
  228. inline const Char* GetLitbuf() { return litbuf; }
  229. void FreeBody();
  230. size_t GetMultiUnits() { return this->m_cMultiUnits; }
  231. };
  232. }