RegexCompileTime.h 28 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720
  1. //-------------------------------------------------------------------------------------------------------
  2. // Copyright (C) Microsoft. All rights reserved.
  3. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
  4. //-------------------------------------------------------------------------------------------------------
  5. //
  6. // Regex parsing and AST-to-AST transformations/analysis
  7. //
  8. #pragma once
  9. namespace UnifiedRegex
  10. {
  11. // FORWARD
  12. class Compiler;
  13. // ----------------------------------------------------------------------
  14. // Node
  15. // ----------------------------------------------------------------------
  16. struct Node : protected Chars<char16>
  17. {
  18. // Optimization heuristics
  19. static const int maxSyncToSetSize = 256;
  20. static const int preferredMinSyncToLiteralLength = 3;
  21. static const int maxNumSyncLiterals = ScannersMixin::MaxNumSyncLiterals;
  22. static const int minRemainLengthForTest = 4;
  23. static const int maxCharsForConditionalTry = 20;
  24. static const int maxTrieArmExpansion = 16;
  25. enum NodeTag : uint16
  26. {
  27. // SimpleNode
  28. Empty, // (|...), etc
  29. BOL, // ^
  30. EOL, // $
  31. // WordBoundaryNode
  32. WordBoundary, // \b, \B
  33. // MatchLiteralNode
  34. MatchLiteral, // abc, non-empty
  35. // MatchCharNode
  36. MatchChar, // a
  37. // ConcatNode
  38. Concat, // e e
  39. // AltNode
  40. Alt, // e | e
  41. // DefineGroupNode
  42. DefineGroup, // (e)
  43. // MatchGroupNode
  44. MatchGroup, // \1
  45. // LoopNode
  46. Loop, // e*, e+, e{n,m}, e*?, e+?, e{n,m}?
  47. // MatchSetNode
  48. MatchSet, // [...], [^...], \b, \B, \d, \D, \s, \S, \w, \W, .
  49. // AssertionNode
  50. Assertion // (?=e), (?!e)
  51. };
  52. enum Features : uint16
  53. {
  54. HasEmpty = 1 << Empty,
  55. HasBOL = 1 << BOL,
  56. HasEOL = 1 << EOL,
  57. HasWordBoundary = 1 << WordBoundary,
  58. HasMatchLiteral = 1 << MatchLiteral,
  59. HasMatchChar = 1 << MatchChar,
  60. HasConcat = 1 << Concat,
  61. HasAlt = 1 << Alt,
  62. HasDefineGroup = 1 << DefineGroup,
  63. HasMatchGroup = 1 << MatchGroup,
  64. HasLoop = 1 << Loop,
  65. HasMatchSet = 1 << MatchSet,
  66. HasAssertion = 1 << Assertion
  67. };
  68. NodeTag tag;
  69. // Features of this and all child nodes
  70. uint16 features;
  71. // See comment for firstSet
  72. bool isFirstExact : 1;
  73. // True if pattern can never fail
  74. bool isThisIrrefutable : 1;
  75. // True if following patterns can never fail
  76. bool isFollowIrrefutable : 1;
  77. // True if pattern matches one or more word characters
  78. bool isWord : 1;
  79. // True if pattern will not consume more characters on backtracking
  80. bool isThisWillNotProgress : 1;
  81. // True if pattern will not consume fewer characters on backtracking
  82. bool isThisWillNotRegress : 1;
  83. // True if previous patterns will not consume more characters on backtracking
  84. bool isPrevWillNotProgress : 1;
  85. // True if previous patterns will not consume fewer characters on backtracking
  86. bool isPrevWillNotRegress : 1;
  87. // True if $ always follows pattern (and we are not in multi-line mode)
  88. bool isFollowEOL : 1;
  89. // True if pattern is deterministic (ie will never push a choicepoint during execution)
  90. // Determined in pass 4.
  91. bool isDeterministic : 1;
  92. // True if pattern is not in a loop context
  93. bool isNotInLoop : 1;
  94. // True if pattern will be matched against at least one segment of the input (ie will be executed at least once)
  95. bool isAtLeastOnce : 1;
  96. // True if pattern does not appear in an assertion
  97. bool isNotSpeculative : 1;
  98. // True if known to not be in a negative assertion context
  99. // (We do not play any games with double-negation)
  100. bool isNotNegated : 1;
  101. // True if this contains a hard fail BOI
  102. bool hasInitialHardFailBOI : 1;
  103. uint dummy : 17;
  104. // NOTE: The bodies of the following sets are allocated in the compile-time allocator and must be cloned
  105. // into the run-time allocator if they end up being used by an instruction.
  106. // NOTE: Sets may be aliased between nodes, and may be one of the standard sets.
  107. // Upper bound of FIRST characters of this pattern.
  108. // - Pattern will *never* match first characters not in this set
  109. // - If isFirstExact, pattern will *always* match first characters in this set (but may fail on later characters)
  110. // - If !isFirstExact, pattern *may* match first characters in this set, or may fail.
  111. CharSet<Char> *firstSet;
  112. // Upper bound of FOLLOW characters of this pattern.
  113. CharSet<Char> *followSet;
  114. // Range of number of characters already consumed before this pattern
  115. CountDomain prevConsumes;
  116. // Range of number of characters consumed by this pattern
  117. CountDomain thisConsumes;
  118. // Range of number of character consumed after this pattern
  119. CountDomain followConsumes;
  120. inline Node(NodeTag tag)
  121. : tag(tag)
  122. , features(0)
  123. , firstSet(0)
  124. , isFirstExact(false)
  125. , followSet(0)
  126. , isThisIrrefutable(false)
  127. , isFollowIrrefutable(false)
  128. , isWord(false)
  129. , isThisWillNotProgress(false)
  130. , isThisWillNotRegress(false)
  131. , isPrevWillNotProgress(false)
  132. , isPrevWillNotRegress(false)
  133. , isFollowEOL(false)
  134. , isDeterministic(false)
  135. , isNotInLoop(false)
  136. , isAtLeastOnce(false)
  137. , isNotSpeculative(false)
  138. , isNotNegated(false)
  139. , hasInitialHardFailBOI(false)
  140. {
  141. }
  142. //
  143. // Parse-time helpers
  144. //
  145. virtual CharCount LiteralLength() const = 0;
  146. virtual void AppendLiteral(CharCount& litbufNext, CharCount litbufLen, __inout_ecount(litbufLen) Char* litbuf) const;
  147. virtual bool IsCharOrPositiveSet() const = 0;
  148. // Transfer pass 0:
  149. // - synthesize the total number of characters required to store all literals, including case-invariant
  150. // expansions where required
  151. // - adjust match char nodes to account for case invariance if necessary
  152. virtual CharCount TransferPass0(Compiler& compiler, const Char* litbuf) = 0;
  153. // Transfer pass 1:
  154. // - transfer literals from given litbuf into newLitbuf, advancing nextLit as we go
  155. // - adjust set nodes to account for case invariance if necessary
  156. virtual void TransferPass1(Compiler& compiler, const Char* litbuf) = 0;
  157. //
  158. // Compile-time helpers
  159. //
  160. // True if firstSet of this node can be used as the followSet of a previous node, even though this node may
  161. // accept empty. True only for simple assertions.
  162. virtual bool IsRefiningAssertion(Compiler& compiler) = 0;
  163. // Annotation pass 0:
  164. // - bottom-up: isWord
  165. // - refine WordBoundary nodes where possible
  166. virtual void AnnotatePass0(Compiler& compiler) = 0;
  167. // Annotation pass 1:
  168. // - top-down: isNotInLoop, isAtLeastOnce, isNotSpeculative, isNotNegated
  169. // - bottom-up: features, thisConsumes, firstSet, isFirstExact, isThisIrrefutable, isThisWillNotProgress, isThisWillNotRegress
  170. virtual void AnnotatePass1(Compiler& compiler, bool parentNotInLoop, bool parentAtLeastOnce, bool parentNotSpeculative, bool parentNotNegated) = 0;
  171. // Annotation pass 2
  172. // - left-to-right: prevConsumes, isPrevWillNotProgress, isPrevWillNotRegress.
  173. virtual void AnnotatePass2(Compiler& compiler, CountDomain accumConsumes, bool accumPrevWillNotProgress, bool accumPrevWillNotRegress) = 0;
  174. // Annotation pass 3
  175. // - right-to-left: followConsumes, followSet, isFollowIrrefutable, isFollowEOL
  176. virtual void AnnotatePass3(Compiler& compiler, CountDomain accumConsumes, CharSet<Char>* accumFollow, bool accumFollowIrrefutable, bool accumFollowEOL) = 0;
  177. // Annotation pass 4
  178. // - possibly simplify the node in-place
  179. // - decide on the compilation scheme for each node, possibly recording it within node-specific fields
  180. // - bottom-up: isDeterministic
  181. virtual void AnnotatePass4(Compiler& compiler) = 0;
  182. // Return true if pattern can be complied assuming some fixed-length prefix of a matching input string has already been consumed
  183. virtual bool SupportsPrefixSkipping(Compiler& compiler) const = 0;
  184. // Return the Match(Char|Literal|Set) at the start of pattern, or 0 if no such unique node
  185. virtual Node* HeadSyncronizingNode(Compiler& compiler) = 0;
  186. // Count how many literals are in pattern and return their minimum length. Returns 0
  187. // if pattern not in a form which can be used by a SyncToLiterals instruction.
  188. virtual CharCount MinSyncronizingLiteralLength(Compiler& compiler, int& numLiterals) const = 0;
  189. // Collect the literals counted by above and build scanners for them.
  190. virtual void CollectSyncronizingLiterals(Compiler& compiler, ScannersMixin& scanners) const = 0;
  191. // Find a MatchLiteral or Alt of MatchLiterals which must appear at least once in input string for pattern
  192. // to match, and which has the shortest prevConsumes.
  193. virtual void BestSyncronizingNode(Compiler& compiler, Node*& bestNode) = 0;
  194. // Accumulate the range of groups definitions in pattern.
  195. // NOTE: minGroup must be > largest group, and maxGroup must be < 0 on topmost call
  196. virtual void AccumDefineGroups(Js::ScriptContext* scriptContext, int& minGroup, int& maxGroup) = 0;
  197. // Emit code to consume this pattern. The first skipped characters of pattern have been consumed by context.
  198. virtual void Emit(Compiler& compiler, CharCount& skipped) = 0;
  199. // Emit code to scan forward for the first occurence of pattern, or hard fail if no such occurence.
  200. // - if isHeadSyncronizingNode, also consume the occurence and leave input pointer at first char after it
  201. // - otherwise, leave input pointer at the latest point of input which could match the overall pattern
  202. // (ie rewind from start of occurence accerding to the prevConsumes range)
  203. // - may actually do nothing if nothing worthwhile to scan to
  204. // Return number of characters consumed.
  205. virtual CharCount EmitScan(Compiler& compiler, bool isHeadSyncronizingNode) = 0;
  206. CharCount EmitScanFirstSet(Compiler& compiler);
  207. inline bool IsObviouslyDeterministic() { return (features & (HasAlt | HasLoop)) == 0; }
  208. inline bool ContainsAssertion() { return (features & (HasBOL | HasEOL | HasWordBoundary | HasAssertion)) != 0; }
  209. inline bool ContainsDefineGroup() { return (features & HasDefineGroup) != 0; }
  210. inline bool ContainsMatchGroup() { return (features & HasMatchGroup) != 0; }
  211. inline bool IsSimple() { return !ContainsAssertion() && !ContainsDefineGroup(); }
  212. inline bool IsSimpleOneChar() { return IsSimple() && !isThisIrrefutable && isFirstExact && thisConsumes.IsExact(1); }
  213. inline bool IsEmptyOnly() { return IsSimple() && isThisIrrefutable && thisConsumes.IsExact(0); }
  214. static bool IsBetterSyncronizingNode(Compiler& compiler, Node* curr, Node* proposed);
  215. //
  216. // Recognizers
  217. //
  218. // Is regex c
  219. bool IsSingleChar(Compiler& compiler, Char& outChar) const;
  220. // Is regex \b\w+\b?
  221. bool IsBoundedWord(Compiler& compiler) const;
  222. // Is regex ^\s*|\s*$
  223. bool IsLeadingTrailingSpaces(Compiler& compiler, CharCount& leftMinMatch, CharCount& rightMinMatch) const;
  224. // Is regex ^literal
  225. bool IsBOILiteral2(Compiler& compiler) const;
  226. // Can this regex be recognized by an Octoquad/Megamatch matcher? Ie is in grammar:
  227. // octoquad ::= atom{8} '|' atom{8}
  228. // atom ::= A | '['...charset drawn from A's...']'
  229. // and A is a set of exactly four ASCII characters
  230. virtual bool IsOctoquad(Compiler& compiler, OctoquadIdentifier* oi) = 0;
  231. // Can this regex be recognized by a CharTrie structure? Ie is in grammar:
  232. // triearm ::= atom*
  233. // atom ::= c | '[' ... ']'
  234. // and factoring out sets does not exceed arm limit
  235. virtual bool IsCharTrieArm(Compiler& compiler, uint& accNumAlts) const = 0;
  236. // Assuming above returned true, expand 'trie' node to include all literals recognized in this regex, and
  237. // continue expanding from each leaf using given 'cont' regex. Return false if any trie node has too many
  238. // children.
  239. // - If isAcceptFirst is true, ignore any literals which are proper extensions of a literal already in
  240. // the trie, but return false if any later literal is a prefix of an earlier literal.
  241. // (If the follow of the alt we are turning into a trie is irrefutable, we can simply stop at the
  242. // first shortest match).
  243. // - Otherwise, return false if any literal is a proper prefix of any other literal, irrespective of order.
  244. virtual bool BuildCharTrie(Compiler& compiler, CharTrie* trie, Node* cont, bool isAcceptFirst) const = 0;
  245. #if ENABLE_REGEX_CONFIG_OPTIONS
  246. virtual void Print(DebugWriter* w, const Char* litbuf) const = 0;
  247. void PrintAnnotations(DebugWriter* w) const;
  248. #endif
  249. };
  250. #if ENABLE_REGEX_CONFIG_OPTIONS
  251. #define NODE_PRINT void Print(DebugWriter* w, const Char* litbuf) const override;
  252. #else
  253. #define NODE_PRINT
  254. #endif
  255. #define NODE_DECL CharCount LiteralLength() const override; \
  256. bool IsCharOrPositiveSet() const override; \
  257. CharCount TransferPass0(Compiler& compiler, const Char* litbuf) override; \
  258. void TransferPass1(Compiler& compiler, const Char* litbuf) override; \
  259. bool IsRefiningAssertion(Compiler& compiler) override; \
  260. void AnnotatePass0(Compiler& compiler) override; \
  261. void AnnotatePass1(Compiler& compiler, bool parentNotInLoop, bool parentAtLeastOnce, bool parentNotSpeculative, bool parentNotNegated) override; \
  262. void AnnotatePass2(Compiler& compiler, CountDomain accumConsumes, bool accumPrevWillNotProgress, bool accumPrevWillNotRegress) override; \
  263. void AnnotatePass3(Compiler& compiler, CountDomain accumConsumes, CharSet<Char>* accumFollow, bool accumFollowIrrefutable, bool accumFollowEOL) override; \
  264. void AnnotatePass4(Compiler& compiler) override; \
  265. bool SupportsPrefixSkipping(Compiler& compiler) const override; \
  266. Node* HeadSyncronizingNode(Compiler& compiler) override; \
  267. CharCount MinSyncronizingLiteralLength(Compiler& compiler, int& numLiterals) const override; \
  268. void CollectSyncronizingLiterals(Compiler& compiler, ScannersMixin& scanners) const override; \
  269. void BestSyncronizingNode(Compiler& compiler, Node*& bestNode) override; \
  270. void Emit(Compiler& compiler, CharCount& skipped) override; \
  271. CharCount EmitScan(Compiler& compiler, bool isHeadSyncronizingNode) override; \
  272. void AccumDefineGroups(Js::ScriptContext* scriptContext, int& minGroup, int& maxGroup) override; \
  273. bool IsOctoquad(Compiler& compiler, OctoquadIdentifier* oi) override; \
  274. bool IsCharTrieArm(Compiler& compiler, uint& accNumAlts) const override; \
  275. bool BuildCharTrie(Compiler& compiler, CharTrie* trie, Node* cont, bool isAcceptFirst) const override; \
  276. NODE_PRINT
  277. struct SimpleNode : Node
  278. {
  279. inline SimpleNode(NodeTag tag)
  280. : Node(tag)
  281. {
  282. }
  283. NODE_DECL
  284. };
  285. struct WordBoundaryNode : Node
  286. {
  287. bool isNegation;
  288. bool mustIncludeEntering; // isNegation => false
  289. bool mustIncludeLeaving; // isNegation => false
  290. inline WordBoundaryNode(bool isNegation)
  291. : Node(WordBoundary)
  292. , isNegation(isNegation)
  293. , mustIncludeEntering(false)
  294. , mustIncludeLeaving(false)
  295. {
  296. }
  297. NODE_DECL
  298. };
  299. struct MatchLiteralNode : Node
  300. {
  301. CharCount offset; // into literal buffer (initially parser's, then program's)
  302. CharCount length; // always > 1
  303. bool isEquivClass; // True if each consecutive triplet of characters of literal represents equivalence
  304. // class of characters to match against. Actual literal length will be 3 times the length
  305. // given above
  306. inline MatchLiteralNode(CharCount offset, CharCount length)
  307. : Node(MatchLiteral)
  308. , offset(offset)
  309. , length(length)
  310. , isEquivClass(false)
  311. {
  312. }
  313. NODE_DECL
  314. void AppendLiteral(CharCount& litbufNext, CharCount litbufLen, __inout_ecount(litbufLen) Char* litbuf) const override;
  315. };
  316. struct MatchCharNode : Node
  317. {
  318. Char cs[CaseInsensitive::EquivClassSize];
  319. bool isEquivClass; // true if above characters represent equivalence class of characters, otherwise only
  320. // first character is significant
  321. inline MatchCharNode(Char c)
  322. : Node(MatchChar)
  323. , isEquivClass(false)
  324. {
  325. cs[0] = c;
  326. #if DBG
  327. for (int i = 1; i < CaseInsensitive::EquivClassSize; i++)
  328. cs[i] = (Char)-1;
  329. #endif
  330. }
  331. NODE_DECL
  332. void AppendLiteral(CharCount& litbufNext, CharCount litbufLen, __inout_ecount(litbufLen) Char* litbuf) const override;
  333. CompileAssert(CaseInsensitive::EquivClassSize == 4);
  334. static void Emit(Compiler& compiler, __in_ecount(4) Char * cs, bool isEquivClass);
  335. private:
  336. CompileAssert(CaseInsensitive::EquivClassSize == 4);
  337. static CharCount FindUniqueEquivs(
  338. const Char equivs[CaseInsensitive::EquivClassSize],
  339. __out_ecount(4) Char uniqueEquivs[CaseInsensitive::EquivClassSize]);
  340. };
  341. struct MatchSetNode : Node
  342. {
  343. bool isNegation;
  344. bool needsEquivClass;
  345. CharSet<Char> set; // contents always owned by compile-time allocator
  346. // Set must be filled in
  347. inline MatchSetNode(bool isNegation, bool needsEquivClass = true)
  348. : Node(MatchSet)
  349. , isNegation(isNegation)
  350. , needsEquivClass(true)
  351. {
  352. }
  353. NODE_DECL
  354. };
  355. struct ConcatNode : Node
  356. {
  357. Node* head; // never a concat node
  358. ConcatNode* tail; // null for end, overall always length > 1, never consecutive literals/chars
  359. inline ConcatNode(Node* head, ConcatNode* tail)
  360. : Node(Concat)
  361. , head(head)
  362. , tail(tail)
  363. {
  364. }
  365. NODE_DECL
  366. };
  367. struct AltNode sealed : Node
  368. {
  369. Node* head; // never an alt node
  370. AltNode* tail; // null for end, overall always length > 1, never consecutive chars/sets
  371. enum CompilationScheme
  372. {
  373. Try, // Push choicepoint, try item, backtrack to next item on failure
  374. None, // No items (deterministic)
  375. Trie, // Match using trie of literals (deterministic)
  376. Switch, // Switch using 1 char lookahead (deterministic)
  377. Chain, // Chain of JumpIfNot(Char|Set) using 1 char lookahead (deterministic)
  378. Set // (Opt?)Match(Char|Set) (deterministic)
  379. };
  380. // Following determined in pass 4
  381. RuntimeCharTrie* runtimeTrie; // significant only if scheme == Trie
  382. CompilationScheme scheme;
  383. bool isOptional; // significant only if scheme == None|Switch|Chain|Set
  384. int switchSize; // significant only if scheme == Switch
  385. inline AltNode(Node* head, AltNode* tail)
  386. : Node(Alt)
  387. , head(head)
  388. , tail(tail)
  389. , scheme(Try)
  390. , runtimeTrie(0)
  391. , isOptional(false)
  392. , switchSize(0)
  393. {
  394. }
  395. NODE_DECL
  396. };
  397. struct DefineGroupNode : Node
  398. {
  399. Node* body;
  400. int groupId;
  401. enum CompilationScheme
  402. {
  403. Chomp, // Chomp matching characters and capture all into a group at the end
  404. Fixed, // Capture fixed-length group at end
  405. BeginEnd // Wrap by begin/end instructions
  406. };
  407. // Following determined in pass 4
  408. CompilationScheme scheme;
  409. bool noNeedToSave;
  410. inline DefineGroupNode(int groupId, Node* body)
  411. : Node(DefineGroup)
  412. , groupId(groupId)
  413. , body(body)
  414. , scheme(BeginEnd)
  415. , noNeedToSave(false)
  416. {
  417. }
  418. NODE_DECL
  419. };
  420. struct MatchGroupNode : Node
  421. {
  422. int groupId;
  423. inline MatchGroupNode(int groupId)
  424. : Node(MatchGroup)
  425. , groupId(groupId)
  426. {
  427. }
  428. NODE_DECL
  429. };
  430. struct LoopNode : Node
  431. {
  432. Node* body;
  433. CountDomain repeats;
  434. bool isGreedy;
  435. enum CompilationScheme
  436. {
  437. BeginEnd, // Push choicepoints for each unravelling (greedy) or deferred unravelling (non-greedy) of body
  438. None, // Loop matches empty only (deterministic)
  439. Chomp, // Chomp matching characters (deterministic)
  440. ChompGroupLastChar, // Chomp matching characters and bind the last char to group (deterministic)
  441. Guarded, // Use 1 char lookahead to control loop repeats (deterministic)
  442. Fixed, // Loop body is non-zero fixed width, deterministic, group-free
  443. FixedSet, // Loop body is MatchSet
  444. FixedGroupLastIteration, // Loop body is non-zero fixed width, deterministic, loop body has one outer group
  445. GreedyNoBacktrack, // Can keep track of backtracking info in constant space (deterministic)
  446. Set, // Treat r? as r|<empty>, emit as for AltNode::Set
  447. Chain, // Treat r? as r|<empty>, emit as for AltNode::Chain
  448. Try // Treat r? as r|<empty>, emit as for AltNode::Try
  449. };
  450. // Following determined in pass 4
  451. bool noNeedToSave; // defined for ChompGroupLastChar/FixedGroupLastIteration only
  452. CompilationScheme scheme;
  453. inline LoopNode(CharCount lower, CharCountOrFlag upper, bool isGreedy, Node* body)
  454. : Node(Loop)
  455. , repeats(lower, upper)
  456. , isGreedy(isGreedy)
  457. , body(body)
  458. , scheme(BeginEnd)
  459. {
  460. }
  461. NODE_DECL
  462. };
  463. struct AssertionNode : Node
  464. {
  465. Node* body;
  466. bool isNegation;
  467. enum CompilationScheme
  468. {
  469. BeginEnd, // Protect assertion with begin/end instructions
  470. Succ, // Assertion will always succeed, without binding groups
  471. Fail // Assertion will always fail
  472. };
  473. // Following determined in pass 4
  474. CompilationScheme scheme;
  475. inline AssertionNode(bool isNegation, Node* body)
  476. : Node(Assertion)
  477. , isNegation(isNegation)
  478. , body(body)
  479. , scheme(BeginEnd)
  480. {
  481. }
  482. NODE_DECL
  483. };
  484. // ----------------------------------------------------------------------
  485. // Compiler
  486. // ----------------------------------------------------------------------
  487. class Compiler : private Chars<char16>
  488. {
  489. friend Node;
  490. friend SimpleNode;
  491. friend WordBoundaryNode;
  492. friend MatchLiteralNode;
  493. friend MatchCharNode;
  494. friend ConcatNode;
  495. friend AltNode;
  496. friend DefineGroupNode;
  497. friend MatchGroupNode;
  498. friend LoopNode;
  499. friend MatchSetNode;
  500. friend AssertionNode;
  501. private:
  502. static const CharCount initInstBufSize = 128;
  503. Js::ScriptContext* scriptContext;
  504. // Arena for nodes and items needed only during compilation
  505. ArenaAllocator* ctAllocator;
  506. // Arena for literals, sets and items needed during runtime
  507. ArenaAllocator* rtAllocator;
  508. StandardChars<Char>* standardChars;
  509. #if ENABLE_REGEX_CONFIG_OPTIONS
  510. DebugWriter* w;
  511. RegexStats* stats;
  512. #endif
  513. Program* program;
  514. uint8* instBuf; // in compile-time allocator, owned by compiler
  515. CharCount instLen; // size of instBuf in bytes
  516. CharCount instNext; // offset to emit next instruction into
  517. int nextLoopId;
  518. private:
  519. uint8* Emit(size_t size);
  520. template <typename T>
  521. T* Emit();
  522. // The instruction buffer may move, so we need to remember label fixup's relative to the instruction base
  523. // rather than as machine addresses
  524. inline Label Compiler::GetFixup(Label* pLabel)
  525. {
  526. Assert((uint8*)pLabel >= instBuf && (uint8*)pLabel < instBuf + instNext);
  527. return (Label)((uint8*)pLabel - instBuf);
  528. }
  529. inline void Compiler::DoFixup(Label fixup, Label label)
  530. {
  531. Assert(fixup < instNext);
  532. Assert(label <= instNext);
  533. *(Label*)(instBuf + fixup) = label;
  534. }
  535. inline Label Compiler::CurrentLabel()
  536. {
  537. return instNext;
  538. }
  539. template <typename T>
  540. inline T* LabelToInstPointer(Inst::InstTag tag, Label label)
  541. {
  542. Assert(label + sizeof(T) <= instNext);
  543. Assert(((Inst*)(instBuf + label))->tag == tag);
  544. return (T*)(instBuf + label);
  545. }
  546. inline int Compiler::NextLoopId()
  547. {
  548. return nextLoopId++;
  549. }
  550. inline Js::ScriptContext *GetScriptContext() const
  551. {
  552. return scriptContext;
  553. }
  554. inline Program *GetProgram() const
  555. {
  556. return program;
  557. }
  558. void SetBOIInstructionsProgramTag()
  559. {
  560. Assert(this->program->tag == Program::InstructionsTag
  561. || this->program->tag == Program::BOIInstructionsTag);
  562. Assert(this->CurrentLabel() == 0);
  563. this->program->tag = Program::BOIInstructionsTag;
  564. }
  565. void SetBOIInstructionsProgramForStickyFlagTag()
  566. {
  567. Assert(this->program->tag == Program::InstructionsTag
  568. || this->program->tag == Program::BOIInstructionsForStickyFlagTag);
  569. Assert(this->CurrentLabel() == 0);
  570. AssertMsg((this->program->flags & StickyRegexFlag) != 0, "Shouldn't set BOIInstructionsForStickyFlagTag, if sticky is false.");
  571. this->program->tag = Program::BOIInstructionsForStickyFlagTag;
  572. }
  573. static void CaptureNoLiterals(Program* program);
  574. void CaptureLiterals(Node* root, const Char *litbuf);
  575. static void EmitAndCaptureSuccInst(Recycler* recycler, Program* program);
  576. void CaptureInsts();
  577. void FreeBody();
  578. Compiler
  579. ( Js::ScriptContext* scriptContext
  580. , ArenaAllocator* ctAllocator
  581. , ArenaAllocator* rtAllocator
  582. , StandardChars<Char>* standardChars
  583. , Program* program
  584. #if ENABLE_REGEX_CONFIG_OPTIONS
  585. , DebugWriter* w
  586. , RegexStats* stats
  587. #endif
  588. );
  589. public:
  590. static void CompileEmptyRegex
  591. ( Program* program
  592. , RegexPattern* pattern
  593. #if ENABLE_REGEX_CONFIG_OPTIONS
  594. , DebugWriter* w
  595. , RegexStats* stats
  596. #endif
  597. );
  598. static void Compile
  599. ( Js::ScriptContext* scriptContext
  600. , ArenaAllocator* ctAllocator
  601. , ArenaAllocator* rtAllocator
  602. , StandardChars<Char>* standardChars
  603. , Program* program
  604. , Node* root
  605. , const Char* litbuf
  606. , RegexPattern* pattern
  607. #if ENABLE_REGEX_CONFIG_OPTIONS
  608. , DebugWriter* w
  609. , RegexStats* stats
  610. #endif
  611. );
  612. };
  613. }