RegexParser.h 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267
  1. //-------------------------------------------------------------------------------------------------------
  2. // Copyright (C) Microsoft. All rights reserved.
  3. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
  4. //-------------------------------------------------------------------------------------------------------
  5. #pragma once
  6. namespace UnifiedRegex
  7. {
  8. struct ParseError
  9. {
  10. bool isBody;
  11. CharCount pos; // Position in unicode characters
  12. CharCount encodedPos; // Position in underlying characters (eg utf-8 bytes)
  13. HRESULT error;
  14. ParseError(bool isBody, CharCount pos, CharCount encodedPos, HRESULT error);
  15. };
  16. template <typename EncodingPolicy, const bool IsLiteral>
  17. class Parser : private EncodingPolicy, private Chars<char16>
  18. {
  19. private:
  20. typedef typename EncodingPolicy::EncodedChar EncodedChar;
  21. // A linked list node to track indices of surrogate pairs.
  22. struct SurrogatePairTracker
  23. {
  24. const EncodedChar* location;
  25. // If this surrogate pair is inside a range, then rangeLocation isn't null.
  26. const EncodedChar* rangeLocation;
  27. codepoint_t value;
  28. uint32 length;
  29. size_t multiUnits;
  30. SurrogatePairTracker* next;
  31. SurrogatePairTracker(const EncodedChar* location, codepoint_t value, uint32 length, size_t multiUnits)
  32. : location(location)
  33. , next(nullptr)
  34. , value(value)
  35. , length(length)
  36. , multiUnits(multiUnits)
  37. , rangeLocation(nullptr)
  38. {
  39. }
  40. SurrogatePairTracker(const EncodedChar* location, const EncodedChar* rangeLocation, codepoint_t value, uint32 length, size_t multiUnits)
  41. : location(location)
  42. , next(nullptr)
  43. , value(value)
  44. , length(length)
  45. , multiUnits(multiUnits)
  46. , rangeLocation(rangeLocation)
  47. {
  48. }
  49. bool IsInsideRange() const
  50. {
  51. return this->rangeLocation != nullptr;
  52. }
  53. };
  54. static const CharCount initLitbufSize = 16;
  55. Js::ScriptContext* scriptContext;
  56. // Arena for nodes and items needed only during compilation
  57. ArenaAllocator* ctAllocator;
  58. // Standard characters using raw encoding character representation (eg char for utf-8)
  59. StandardChars<EncodedChar>* standardEncodedChars;
  60. // Standard characters using final character representation (eg char16 for Unicode)
  61. StandardChars<Char>* standardChars;
  62. #if ENABLE_REGEX_CONFIG_OPTIONS
  63. DebugWriter* w;
  64. #endif
  65. const EncodedChar* input;
  66. const EncodedChar* inputLim;
  67. const EncodedChar* next;
  68. bool inBody;
  69. int numGroups; // determined in first parse
  70. int nextGroupId;
  71. // Buffer accumulating all literals.
  72. // In compile-time allocator, must be transferred to runtime allocator when build program
  73. Char* litbuf;
  74. CharCount litbufLen;
  75. CharCount litbufNext;
  76. // During pass 0, if /u option for regex is provided, a linked list will be built up to
  77. // track positions of surrogate pairs in the buffer. During pass 1, these linked lists will be used
  78. // to figure out when to output a surrogate pair node.
  79. SurrogatePairTracker* surrogatePairList;
  80. SurrogatePairTracker* currentSurrogatePairNode;
  81. bool unicodeFlagPresent;
  82. bool caseInsensitiveFlagPresent;
  83. // The following two variables are used to determine if the the surrogate pair has been encountered
  84. // First holds the temporary location, second holds the value of the codepoint
  85. const EncodedChar* tempLocationOfSurrogatePair;
  86. // This will be set to a location when we are parsing a range in TermPass0, and cleared when we are out of it.
  87. const EncodedChar* tempLocationOfRange;
  88. codepoint_t codePointAtTempLocation;
  89. // When a surrogate is added for tracking, this will be updated.
  90. const EncodedChar* positionAfterLastSurrogate;
  91. codepoint_t valueOfLastSurrogate;
  92. // deferred error state.
  93. ParseError* deferredIfNotUnicodeError;
  94. ParseError* deferredIfUnicodeError;
  95. private:
  96. //
  97. // Input buffer management
  98. //
  99. void SetPosition(const EncodedChar* input, const EncodedChar* inputLim, bool inBody);
  100. // Current position in number of logical characters, regardless of underlying character encoding
  101. inline CharCount Pos();
  102. inline bool IsEOF();
  103. inline bool ECCanConsume(CharCount n);
  104. inline EncodedChar ECLookahead(CharCount n = 0);
  105. inline EncodedChar ECLookback(CharCount n = 0);
  106. inline void ECConsume(CharCount n = 1);
  107. inline void ECConsumeMultiUnit(CharCount n = 1);
  108. inline void ECRevert(CharCount n = 1);
  109. //
  110. // Helpers
  111. //
  112. int TryParseExtendedUnicodeEscape(Char& c, bool& previousSurrogatePart, bool trackSurrogatePair = false);
  113. void TrackIfSurrogatePair(codepoint_t codePoint, const EncodedChar* location, uint32 consumptionLength);
  114. Node* CreateSurrogatePairAtom(char16 lower, char16 upper);
  115. AltNode* AppendSurrogateRangeToDisjunction(codepoint_t lowerCodePoint, codepoint_t upperCodePoint, AltNode *lastAlttNode);
  116. AltNode* AppendSurrogatePairToDisjunction(codepoint_t codePoint, AltNode *lastAlttNode);
  117. //
  118. // Errors
  119. //
  120. void Fail(HRESULT error);
  121. void DeferredFailIfUnicode(HRESULT error);
  122. void DeferredFailIfNotUnicode(HRESULT error);
  123. inline void ECMust(EncodedChar ec, HRESULT error);
  124. inline Char NextChar();
  125. //
  126. // Patterns/Disjunctions/Alternatives
  127. //
  128. void PatternPass0();
  129. Node* PatternPass1();
  130. Node* UnionNodes(Node* prev, Node* curr);
  131. void DisjunctionPass0(int depth);
  132. Node* DisjunctionPass1();
  133. bool IsEndOfAlternative();
  134. void EnsureLitbuf(CharCount size);
  135. void AccumLiteral(MatchLiteralNode* deferredLiteralNode, Node* charOrLiteralNode);
  136. Node* FinalTerm(Node* node, MatchLiteralNode* deferredLiteralNode);
  137. void AlternativePass0(int depth);
  138. Node* AlternativePass1();
  139. //
  140. // Terms
  141. //
  142. Node* NewLoopNode(CharCount lower, CharCountOrFlag upper, bool isGreedy, Node* body);
  143. bool AtQuantifier();
  144. bool OptNonGreedy();
  145. CharCount RepeatCount();
  146. void TermPass0(int depth);
  147. Node* TermPass1(MatchCharNode* deferredCharNode, bool& previousSurrogatePart);
  148. bool AtomEscapePass0();
  149. bool AtomEscapePass1(Node*& node, MatchCharNode* deferredCharNode, bool& previousSurrogatePart);
  150. bool SurrogatePairPass1(Node*& node, MatchCharNode* deferredCharNode, bool& previousSurrogatePart);
  151. //
  152. // Classes
  153. //
  154. bool AtSecondSingletonClassAtom();
  155. void CharacterClassPass0();
  156. template <bool containsSurrogates>
  157. Node* CharacterClassPass1();
  158. bool ClassEscapePass0(Char& singleton, bool& previousSurrogatePart);
  159. Node* ClassEscapePass1(MatchCharNode* deferredCharNode, MatchSetNode* deferredSetNode, bool& previousSurrogatePart);
  160. Node* GetNodeWithValidCharacterSet(EncodedChar ch);
  161. //
  162. // Options
  163. //
  164. void Options(RegexFlags& flags);
  165. public:
  166. Parser
  167. ( Js::ScriptContext* scriptContext
  168. , ArenaAllocator* ctAllocator
  169. , StandardChars<EncodedChar>* standardEncodedChars
  170. , StandardChars<Char>* standardChars
  171. , bool isFromExternalSource
  172. #if ENABLE_REGEX_CONFIG_OPTIONS
  173. , DebugWriter* w
  174. #endif
  175. );
  176. //
  177. // Entry points
  178. //
  179. Node* ParseDynamic
  180. ( const EncodedChar* body // non null, null terminated (may contain embedded nulls)
  181. , const EncodedChar* bodyLim // points to terminating null of above
  182. , const EncodedChar* opts // may be null if no options, otherwise null terminated
  183. , const EncodedChar* optsLim // if above non-null, points to terminating null of above
  184. , RegexFlags& flags );
  185. // (*) For ParseLiteral:
  186. // - input string must be null terminated
  187. // - inputLim may point to the terminating null in above or before it
  188. // - if the later, input is known to be syntactically well-formed so that the parser
  189. // will find the natural end of the regex literal before passing inputLim
  190. // - input may contain nulls before the inputLim
  191. Node* ParseLiteral
  192. ( const EncodedChar* input // non null, null terminated (may contain embedded nulls)
  193. , const EncodedChar* inputLim // see (*) above
  194. , CharCount& outBodyEncodedChars // in encoded characters, not including trailing '/'
  195. , CharCount& outTotalEncodedChars // in encoded characters, including trailing '/' and any options
  196. , CharCount& outBodyChars // in unicode characters, not including ttrailing '/'
  197. , CharCount& outTotalChars // in unicode characters, including trailing '/' and any options
  198. , RegexFlags& flags );
  199. void ParseLiteralNoAST
  200. ( const EncodedChar* input // non null, null terminated
  201. , const EncodedChar* inputLim // see (*) above
  202. , CharCount& outBodyEncodedChars
  203. , CharCount& outTotalEncodedChars
  204. , CharCount& outBodyChars
  205. , CharCount& outTotalChars );
  206. template<const bool buildAST>
  207. RegexPattern* CompileProgram
  208. ( Node* root,
  209. const EncodedChar*& currentCharacter,
  210. const CharCount totalLen,
  211. const CharCount bodyChars,
  212. const CharCount totalChars,
  213. const RegexFlags flags );
  214. static void CaptureEmptySourceAndNoGroups(Program* program);
  215. // bodyChars is number of unicode characters in program body, which may be less than the number
  216. // of underlying UTF-8 characters
  217. void CaptureSourceAndGroups(Recycler* recycler, Program* program, const EncodedChar* body, CharCount bodyChars);
  218. inline const Char* GetLitbuf() { return litbuf; }
  219. void FreeBody();
  220. size_t GetMultiUnits() { return this->m_cMultiUnits; }
  221. };
  222. }