RegexCompileTime.h 29 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726
  1. //-------------------------------------------------------------------------------------------------------
  2. // Copyright (C) Microsoft. All rights reserved.
  3. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
  4. //-------------------------------------------------------------------------------------------------------
  5. //
  6. // Regex parsing and AST-to-AST transformations/analysis
  7. //
  8. #pragma once
  9. #include "RuntimeCommon.h"
  10. namespace UnifiedRegex
  11. {
  12. // FORWARD
  13. class Compiler;
  14. // ----------------------------------------------------------------------
  15. // Node
  16. // ----------------------------------------------------------------------
  17. struct Node : protected Chars<char16>
  18. {
  19. // Optimization heuristics
  20. static const int maxSyncToSetSize = 256;
  21. static const int preferredMinSyncToLiteralLength = 3;
  22. static const int maxNumSyncLiterals = ScannersMixin::MaxNumSyncLiterals;
  23. static const int minRemainLengthForTest = 4;
  24. static const int maxCharsForConditionalTry = 20;
  25. static const int maxTrieArmExpansion = 16;
  26. enum NodeTag : uint16
  27. {
  28. // SimpleNode
  29. Empty, // (|...), etc
  30. BOL, // ^
  31. EOL, // $
  32. // WordBoundaryNode
  33. WordBoundary, // \b, \B
  34. // MatchLiteralNode
  35. MatchLiteral, // abc, non-empty
  36. // MatchCharNode
  37. MatchChar, // a
  38. // ConcatNode
  39. Concat, // e e
  40. // AltNode
  41. Alt, // e | e
  42. // DefineGroupNode
  43. DefineGroup, // (e)
  44. // MatchGroupNode
  45. MatchGroup, // \1
  46. // LoopNode
  47. Loop, // e*, e+, e{n,m}, e*?, e+?, e{n,m}?
  48. // MatchSetNode
  49. MatchSet, // [...], [^...], \b, \B, \d, \D, \s, \S, \w, \W, .
  50. // AssertionNode
  51. Assertion // (?=e), (?!e)
  52. };
  53. enum Features : uint16
  54. {
  55. HasEmpty = 1 << Empty,
  56. HasBOL = 1 << BOL,
  57. HasEOL = 1 << EOL,
  58. HasWordBoundary = 1 << WordBoundary,
  59. HasMatchLiteral = 1 << MatchLiteral,
  60. HasMatchChar = 1 << MatchChar,
  61. HasConcat = 1 << Concat,
  62. HasAlt = 1 << Alt,
  63. HasDefineGroup = 1 << DefineGroup,
  64. HasMatchGroup = 1 << MatchGroup,
  65. HasLoop = 1 << Loop,
  66. HasMatchSet = 1 << MatchSet,
  67. HasAssertion = 1 << Assertion
  68. };
  69. NodeTag tag;
  70. // Features of this and all child nodes
  71. uint16 features;
  72. // See comment for firstSet
  73. bool isFirstExact : 1;
  74. // True if pattern can never fail
  75. bool isThisIrrefutable : 1;
  76. // True if following patterns can never fail
  77. bool isFollowIrrefutable : 1;
  78. // True if pattern matches one or more word characters
  79. bool isWord : 1;
  80. // True if pattern will not consume more characters on backtracking
  81. bool isThisWillNotProgress : 1;
  82. // True if pattern will not consume fewer characters on backtracking
  83. bool isThisWillNotRegress : 1;
  84. // True if previous patterns will not consume more characters on backtracking
  85. bool isPrevWillNotProgress : 1;
  86. // True if previous patterns will not consume fewer characters on backtracking
  87. bool isPrevWillNotRegress : 1;
  88. // True if $ always follows pattern (and we are not in multi-line mode)
  89. bool isFollowEOL : 1;
  90. // True if pattern is deterministic (ie will never push a choicepoint during execution)
  91. // Determined in pass 4.
  92. bool isDeterministic : 1;
  93. // True if pattern is not in a loop context
  94. bool isNotInLoop : 1;
  95. // True if pattern will be matched against at least one segment of the input (ie will be executed at least once)
  96. bool isAtLeastOnce : 1;
  97. // True if pattern does not appear in an assertion
  98. bool isNotSpeculative : 1;
  99. // True if known to not be in a negative assertion context
  100. // (We do not play any games with double-negation)
  101. bool isNotNegated : 1;
  102. // True if this contains a hard fail BOI
  103. bool hasInitialHardFailBOI : 1;
  104. uint dummy : 17;
  105. // NOTE: The bodies of the following sets are allocated in the compile-time allocator and must be cloned
  106. // into the run-time allocator if they end up being used by an instruction.
  107. // NOTE: Sets may be aliased between nodes, and may be one of the standard sets.
  108. // Upper bound of FIRST characters of this pattern.
  109. // - Pattern will *never* match first characters not in this set
  110. // - If isFirstExact, pattern will *always* match first characters in this set (but may fail on later characters)
  111. // - If !isFirstExact, pattern *may* match first characters in this set, or may fail.
  112. CharSet<Char> *firstSet;
  113. // Upper bound of FOLLOW characters of this pattern.
  114. CharSet<Char> *followSet;
  115. // Range of number of characters already consumed before this pattern
  116. CountDomain prevConsumes;
  117. // Range of number of characters consumed by this pattern
  118. CountDomain thisConsumes;
  119. // Range of number of character consumed after this pattern
  120. CountDomain followConsumes;
  121. inline Node(NodeTag tag)
  122. : tag(tag)
  123. , features(0)
  124. , firstSet(0)
  125. , isFirstExact(false)
  126. , followSet(0)
  127. , isThisIrrefutable(false)
  128. , isFollowIrrefutable(false)
  129. , isWord(false)
  130. , isThisWillNotProgress(false)
  131. , isThisWillNotRegress(false)
  132. , isPrevWillNotProgress(false)
  133. , isPrevWillNotRegress(false)
  134. , isFollowEOL(false)
  135. , isDeterministic(false)
  136. , isNotInLoop(false)
  137. , isAtLeastOnce(false)
  138. , isNotSpeculative(false)
  139. , isNotNegated(false)
  140. , hasInitialHardFailBOI(false)
  141. {
  142. }
  143. //
  144. // Parse-time helpers
  145. //
  146. virtual CharCount LiteralLength() const = 0;
  147. virtual void AppendLiteral(CharCount& litbufNext, CharCount litbufLen, __inout_ecount(litbufLen) Char* litbuf) const;
  148. virtual bool IsCharOrPositiveSet() const = 0;
  149. // Transfer pass 0:
  150. // - synthesize the total number of characters required to store all literals, including case-invariant
  151. // expansions where required
  152. // - adjust match char nodes to account for case invariance if necessary
  153. virtual CharCount TransferPass0(Compiler& compiler, const Char* litbuf) = 0;
  154. // Transfer pass 1:
  155. // - transfer literals from given litbuf into newLitbuf, advancing nextLit as we go
  156. // - adjust set nodes to account for case invariance if necessary
  157. virtual void TransferPass1(Compiler& compiler, const Char* litbuf) = 0;
  158. //
  159. // Compile-time helpers
  160. //
  161. // True if firstSet of this node can be used as the followSet of a previous node, even though this node may
  162. // accept empty. True only for simple assertions.
  163. virtual bool IsRefiningAssertion(Compiler& compiler) = 0;
  164. // Annotation pass 0:
  165. // - bottom-up: isWord
  166. // - refine WordBoundary nodes where possible
  167. virtual void AnnotatePass0(Compiler& compiler) = 0;
  168. // Annotation pass 1:
  169. // - top-down: isNotInLoop, isAtLeastOnce, isNotSpeculative, isNotNegated
  170. // - bottom-up: features, thisConsumes, firstSet, isFirstExact, isThisIrrefutable, isThisWillNotProgress, isThisWillNotRegress
  171. virtual void AnnotatePass1(Compiler& compiler, bool parentNotInLoop, bool parentAtLeastOnce, bool parentNotSpeculative, bool parentNotNegated) = 0;
  172. // Annotation pass 2
  173. // - left-to-right: prevConsumes, isPrevWillNotProgress, isPrevWillNotRegress.
  174. virtual void AnnotatePass2(Compiler& compiler, CountDomain accumConsumes, bool accumPrevWillNotProgress, bool accumPrevWillNotRegress) = 0;
  175. // Annotation pass 3
  176. // - right-to-left: followConsumes, followSet, isFollowIrrefutable, isFollowEOL
  177. virtual void AnnotatePass3(Compiler& compiler, CountDomain accumConsumes, CharSet<Char>* accumFollow, bool accumFollowIrrefutable, bool accumFollowEOL) = 0;
  178. // Annotation pass 4
  179. // - possibly simplify the node in-place
  180. // - decide on the compilation scheme for each node, possibly recording it within node-specific fields
  181. // - bottom-up: isDeterministic
  182. virtual void AnnotatePass4(Compiler& compiler) = 0;
  183. // Return true if pattern can be complied assuming some fixed-length prefix of a matching input string has already been consumed
  184. virtual bool SupportsPrefixSkipping(Compiler& compiler) const = 0;
  185. // Return the Match(Char|Literal|Set) at the start of pattern, or 0 if no such unique node
  186. virtual Node* HeadSyncronizingNode(Compiler& compiler) = 0;
  187. // Count how many literals are in pattern and return their minimum length. Returns 0
  188. // if pattern not in a form which can be used by a SyncToLiterals instruction.
  189. virtual CharCount MinSyncronizingLiteralLength(Compiler& compiler, int& numLiterals) const = 0;
  190. // Collect the literals counted by above and build scanners for them.
  191. virtual void CollectSyncronizingLiterals(Compiler& compiler, ScannersMixin& scanners) const = 0;
  192. // Find a MatchLiteral or Alt of MatchLiterals which must appear at least once in input string for pattern
  193. // to match, and which has the shortest prevConsumes.
  194. virtual void BestSyncronizingNode(Compiler& compiler, Node*& bestNode) = 0;
  195. // Accumulate the range of groups definitions in pattern.
  196. // NOTE: minGroup must be > largest group, and maxGroup must be < 0 on topmost call
  197. virtual void AccumDefineGroups(Js::ScriptContext* scriptContext, int& minGroup, int& maxGroup) = 0;
  198. // Emit code to consume this pattern. The first skipped characters of pattern have been consumed by context.
  199. virtual void Emit(Compiler& compiler, CharCount& skipped) = 0;
  200. // Emit code to scan forward for the first occurence of pattern, or hard fail if no such occurence.
  201. // - if isHeadSyncronizingNode, also consume the occurence and leave input pointer at first char after it
  202. // - otherwise, leave input pointer at the latest point of input which could match the overall pattern
  203. // (ie rewind from start of occurence accerding to the prevConsumes range)
  204. // - may actually do nothing if nothing worthwhile to scan to
  205. // Return number of characters consumed.
  206. virtual CharCount EmitScan(Compiler& compiler, bool isHeadSyncronizingNode) = 0;
  207. CharCount EmitScanFirstSet(Compiler& compiler);
  208. inline bool IsObviouslyDeterministic() { return (features & (HasAlt | HasLoop)) == 0; }
  209. inline bool ContainsAssertion() { return (features & (HasBOL | HasEOL | HasWordBoundary | HasAssertion)) != 0; }
  210. inline bool ContainsDefineGroup() { return (features & HasDefineGroup) != 0; }
  211. inline bool ContainsMatchGroup() { return (features & HasMatchGroup) != 0; }
  212. inline bool IsSimple() { return !ContainsAssertion() && !ContainsDefineGroup(); }
  213. inline bool IsSimpleOneChar() { return IsSimple() && !isThisIrrefutable && isFirstExact && thisConsumes.IsExact(1); }
  214. inline bool IsEmptyOnly() { return IsSimple() && isThisIrrefutable && thisConsumes.IsExact(0); }
  215. static bool IsBetterSyncronizingNode(Compiler& compiler, Node* curr, Node* proposed);
  216. //
  217. // Recognizers
  218. //
  219. // Is regex c
  220. bool IsSingleChar(Compiler& compiler, Char& outChar) const;
  221. // Is regex \b\w+\b?
  222. bool IsBoundedWord(Compiler& compiler) const;
  223. // Is regex ^\s*|\s*$
  224. bool IsLeadingTrailingSpaces(Compiler& compiler, CharCount& leftMinMatch, CharCount& rightMinMatch) const;
  225. // Is regex ^literal
  226. bool IsBOILiteral2(Compiler& compiler) const;
  227. // Can this regex be recognized by an Octoquad/Megamatch matcher? Ie is in grammar:
  228. // octoquad ::= atom{8} '|' atom{8}
  229. // atom ::= A | '['...charset drawn from A's...']'
  230. // and A is a set of exactly four ASCII characters
  231. virtual bool IsOctoquad(Compiler& compiler, OctoquadIdentifier* oi) = 0;
  232. // Can this regex be recognized by a CharTrie structure? Ie is in grammar:
  233. // triearm ::= atom*
  234. // atom ::= c | '[' ... ']'
  235. // and factoring out sets does not exceed arm limit
  236. virtual bool IsCharTrieArm(Compiler& compiler, uint& accNumAlts) const = 0;
  237. // Assuming above returned true, expand 'trie' node to include all literals recognized in this regex, and
  238. // continue expanding from each leaf using given 'cont' regex. Return false if any trie node has too many
  239. // children.
  240. // - If isAcceptFirst is true, ignore any literals which are proper extensions of a literal already in
  241. // the trie, but return false if any later literal is a prefix of an earlier literal.
  242. // (If the follow of the alt we are turning into a trie is irrefutable, we can simply stop at the
  243. // first shortest match).
  244. // - Otherwise, return false if any literal is a proper prefix of any other literal, irrespective of order.
  245. virtual bool BuildCharTrie(Compiler& compiler, CharTrie* trie, Node* cont, bool isAcceptFirst) const = 0;
  246. #if ENABLE_REGEX_CONFIG_OPTIONS
  247. virtual void Print(DebugWriter* w, const Char* litbuf) const = 0;
  248. void PrintAnnotations(DebugWriter* w) const;
  249. #endif
  250. };
  251. #if ENABLE_REGEX_CONFIG_OPTIONS
  252. #define NODE_PRINT void Print(DebugWriter* w, const Char* litbuf) const override;
  253. #else
  254. #define NODE_PRINT
  255. #endif
  256. #define NODE_DECL CharCount LiteralLength() const override; \
  257. bool IsCharOrPositiveSet() const override; \
  258. CharCount TransferPass0(Compiler& compiler, const Char* litbuf) override; \
  259. void TransferPass1(Compiler& compiler, const Char* litbuf) override; \
  260. bool IsRefiningAssertion(Compiler& compiler) override; \
  261. void AnnotatePass0(Compiler& compiler) override; \
  262. void AnnotatePass1(Compiler& compiler, bool parentNotInLoop, bool parentAtLeastOnce, bool parentNotSpeculative, bool parentNotNegated) override; \
  263. void AnnotatePass2(Compiler& compiler, CountDomain accumConsumes, bool accumPrevWillNotProgress, bool accumPrevWillNotRegress) override; \
  264. void AnnotatePass3(Compiler& compiler, CountDomain accumConsumes, CharSet<Char>* accumFollow, bool accumFollowIrrefutable, bool accumFollowEOL) override; \
  265. void AnnotatePass4(Compiler& compiler) override; \
  266. bool SupportsPrefixSkipping(Compiler& compiler) const override; \
  267. Node* HeadSyncronizingNode(Compiler& compiler) override; \
  268. CharCount MinSyncronizingLiteralLength(Compiler& compiler, int& numLiterals) const override; \
  269. void CollectSyncronizingLiterals(Compiler& compiler, ScannersMixin& scanners) const override; \
  270. void BestSyncronizingNode(Compiler& compiler, Node*& bestNode) override; \
  271. void Emit(Compiler& compiler, CharCount& skipped) override; \
  272. CharCount EmitScan(Compiler& compiler, bool isHeadSyncronizingNode) override; \
  273. void AccumDefineGroups(Js::ScriptContext* scriptContext, int& minGroup, int& maxGroup) override; \
  274. bool IsOctoquad(Compiler& compiler, OctoquadIdentifier* oi) override; \
  275. bool IsCharTrieArm(Compiler& compiler, uint& accNumAlts) const override; \
  276. bool BuildCharTrie(Compiler& compiler, CharTrie* trie, Node* cont, bool isAcceptFirst) const override; \
  277. NODE_PRINT
  278. struct SimpleNode : Node
  279. {
  280. inline SimpleNode(NodeTag tag)
  281. : Node(tag)
  282. {
  283. }
  284. NODE_DECL
  285. };
  286. struct WordBoundaryNode : Node
  287. {
  288. bool isNegation;
  289. bool mustIncludeEntering; // isNegation => false
  290. bool mustIncludeLeaving; // isNegation => false
  291. inline WordBoundaryNode(bool isNegation)
  292. : Node(WordBoundary)
  293. , isNegation(isNegation)
  294. , mustIncludeEntering(false)
  295. , mustIncludeLeaving(false)
  296. {
  297. }
  298. NODE_DECL
  299. };
  300. struct MatchLiteralNode : Node
  301. {
  302. CharCount offset; // into literal buffer (initially parser's, then program's)
  303. CharCount length; // always > 1
  304. bool isEquivClass; // True if each consecutive triplet of characters of literal represents equivalence
  305. // class of characters to match against. Actual literal length will be 3 times the length
  306. // given above
  307. inline MatchLiteralNode(CharCount offset, CharCount length)
  308. : Node(MatchLiteral)
  309. , offset(offset)
  310. , length(length)
  311. , isEquivClass(false)
  312. {
  313. }
  314. NODE_DECL
  315. void AppendLiteral(CharCount& litbufNext, CharCount litbufLen, __inout_ecount(litbufLen) Char* litbuf) const override;
  316. };
  317. struct MatchCharNode : Node
  318. {
  319. Char cs[CaseInsensitive::EquivClassSize];
  320. bool isEquivClass; // true if above characters represent equivalence class of characters, otherwise only
  321. // first character is significant
  322. inline MatchCharNode(Char c)
  323. : Node(MatchChar)
  324. , isEquivClass(false)
  325. {
  326. cs[0] = c;
  327. for (int i = 1; i < CaseInsensitive::EquivClassSize; i++)
  328. cs[i] = c;
  329. }
  330. NODE_DECL
  331. void AppendLiteral(CharCount& litbufNext, CharCount litbufLen, __inout_ecount(litbufLen) Char* litbuf) const override;
  332. CompileAssert(CaseInsensitive::EquivClassSize == 4);
  333. static void Emit(Compiler& compiler, __in_ecount(4) Char * cs, bool isEquivClass);
  334. private:
  335. CompileAssert(CaseInsensitive::EquivClassSize == 4);
  336. static CharCount FindUniqueEquivs(
  337. const Char equivs[CaseInsensitive::EquivClassSize],
  338. __out_ecount(4) Char uniqueEquivs[CaseInsensitive::EquivClassSize]);
  339. };
  340. struct MatchSetNode : Node
  341. {
  342. bool isNegation;
  343. bool needsEquivClass;
  344. CharSet<Char> set; // contents always owned by compile-time allocator
  345. // Set must be filled in
  346. inline MatchSetNode(bool isNegation, bool needsEquivClass = true)
  347. : Node(MatchSet)
  348. , isNegation(isNegation)
  349. , needsEquivClass(true)
  350. {
  351. }
  352. NODE_DECL
  353. };
  354. struct ConcatNode : Node
  355. {
  356. Node* head; // never a concat node
  357. ConcatNode* tail; // null for end, overall always length > 1, never consecutive literals/chars
  358. inline ConcatNode(Node* head, ConcatNode* tail)
  359. : Node(Concat)
  360. , head(head)
  361. , tail(tail)
  362. {
  363. }
  364. NODE_DECL
  365. };
  366. struct AltNode sealed : Node
  367. {
  368. Node* head; // never an alt node
  369. AltNode* tail; // null for end, overall always length > 1, never consecutive chars/sets
  370. enum CompilationScheme
  371. {
  372. Try, // Push choicepoint, try item, backtrack to next item on failure
  373. None, // No items (deterministic)
  374. Trie, // Match using trie of literals (deterministic)
  375. Switch, // Switch using 1 char lookahead (deterministic)
  376. Chain, // Chain of JumpIfNot(Char|Set) using 1 char lookahead (deterministic)
  377. Set // (Opt?)Match(Char|Set) (deterministic)
  378. };
  379. // Following determined in pass 4
  380. RuntimeCharTrie* runtimeTrie; // significant only if scheme == Trie
  381. CompilationScheme scheme;
  382. bool isOptional; // significant only if scheme == None|Switch|Chain|Set
  383. int switchSize; // significant only if scheme == Switch
  384. inline AltNode(Node* head, AltNode* tail)
  385. : Node(Alt)
  386. , head(head)
  387. , tail(tail)
  388. , scheme(Try)
  389. , runtimeTrie(0)
  390. , isOptional(false)
  391. , switchSize(0)
  392. {
  393. }
  394. NODE_DECL
  395. };
  396. struct DefineGroupNode : Node
  397. {
  398. Node* body;
  399. int groupId;
  400. enum CompilationScheme
  401. {
  402. Chomp, // Chomp matching characters and capture all into a group at the end
  403. Fixed, // Capture fixed-length group at end
  404. BeginEnd // Wrap by begin/end instructions
  405. };
  406. // Following determined in pass 4
  407. CompilationScheme scheme;
  408. bool noNeedToSave;
  409. inline DefineGroupNode(int groupId, Node* body)
  410. : Node(DefineGroup)
  411. , groupId(groupId)
  412. , body(body)
  413. , scheme(BeginEnd)
  414. , noNeedToSave(false)
  415. {
  416. }
  417. NODE_DECL
  418. };
  419. struct MatchGroupNode : Node
  420. {
  421. int groupId;
  422. inline MatchGroupNode(int groupId)
  423. : Node(MatchGroup)
  424. , groupId(groupId)
  425. {
  426. }
  427. NODE_DECL
  428. };
  429. struct LoopNode : Node
  430. {
  431. Node* body;
  432. CountDomain repeats;
  433. // If set and not equal to MaxChar, followFirst is the deterministic first character of the follow set of this node.
  434. // Could be expanded to encompass the entire firstSet of the next node.
  435. Char followFirst;
  436. bool isGreedy;
  437. enum CompilationScheme
  438. {
  439. BeginEnd, // Push choicepoints for each unravelling (greedy) or deferred unravelling (non-greedy) of body
  440. None, // Loop matches empty only (deterministic)
  441. Chomp, // Chomp matching characters (deterministic)
  442. ChompGroupLastChar, // Chomp matching characters and bind the last char to group (deterministic)
  443. Guarded, // Use 1 char lookahead to control loop repeats (deterministic)
  444. Fixed, // Loop body is non-zero fixed width, deterministic, group-free
  445. FixedSet, // Loop body is MatchSet
  446. FixedGroupLastIteration, // Loop body is non-zero fixed width, deterministic, loop body has one outer group
  447. GreedyNoBacktrack, // Can keep track of backtracking info in constant space (deterministic)
  448. Set, // Treat r? as r|<empty>, emit as for AltNode::Set
  449. Chain, // Treat r? as r|<empty>, emit as for AltNode::Chain
  450. Try // Treat r? as r|<empty>, emit as for AltNode::Try
  451. };
  452. // Following determined in pass 4
  453. bool noNeedToSave; // defined for ChompGroupLastChar/FixedGroupLastIteration only
  454. CompilationScheme scheme;
  455. inline LoopNode(CharCount lower, CharCountOrFlag upper, bool isGreedy, Node* body)
  456. : Node(Loop)
  457. , repeats(lower, upper)
  458. , isGreedy(isGreedy)
  459. , followFirst(MaxChar)
  460. , body(body)
  461. , scheme(BeginEnd)
  462. {
  463. }
  464. NODE_DECL
  465. };
  466. struct AssertionNode : Node
  467. {
  468. Node* body;
  469. bool isNegation;
  470. enum CompilationScheme
  471. {
  472. BeginEnd, // Protect assertion with begin/end instructions
  473. Succ, // Assertion will always succeed, without binding groups
  474. Fail // Assertion will always fail
  475. };
  476. // Following determined in pass 4
  477. CompilationScheme scheme;
  478. inline AssertionNode(bool isNegation, Node* body)
  479. : Node(Assertion)
  480. , isNegation(isNegation)
  481. , body(body)
  482. , scheme(BeginEnd)
  483. {
  484. }
  485. NODE_DECL
  486. };
  487. // ----------------------------------------------------------------------
  488. // Compiler
  489. // ----------------------------------------------------------------------
  490. class Compiler : private Chars<char16>
  491. {
  492. friend Node;
  493. friend SimpleNode;
  494. friend WordBoundaryNode;
  495. friend MatchLiteralNode;
  496. friend MatchCharNode;
  497. friend ConcatNode;
  498. friend AltNode;
  499. friend DefineGroupNode;
  500. friend MatchGroupNode;
  501. friend LoopNode;
  502. friend MatchSetNode;
  503. friend AssertionNode;
  504. private:
  505. static const CharCount initInstBufSize = 128;
  506. Js::ScriptContext* scriptContext;
  507. // Arena for nodes and items needed only during compilation
  508. ArenaAllocator* ctAllocator;
  509. // Arena for literals, sets and items needed during runtime
  510. ArenaAllocator* rtAllocator;
  511. StandardChars<Char>* standardChars;
  512. #if ENABLE_REGEX_CONFIG_OPTIONS
  513. DebugWriter* w;
  514. RegexStats* stats;
  515. #endif
  516. Program* program;
  517. uint8* instBuf; // in compile-time allocator, owned by compiler
  518. CharCount instLen; // size of instBuf in bytes
  519. CharCount instNext; // offset to emit next instruction into
  520. int nextLoopId;
  521. private:
  522. uint8* Emit(size_t size);
  523. template <typename T>
  524. T* Emit();
  525. // The instruction buffer may move, so we need to remember label fixup's relative to the instruction base
  526. // rather than as machine addresses.
  527. // NOTE: pLabel is declared unaligned because Inst structs are unaligned in the inst buffer (thanks to #pragma pack(1)).
  528. inline Label GetFixup(unaligned Label* pLabel)
  529. {
  530. Assert((uint8*)pLabel >= instBuf && (uint8*)pLabel < instBuf + instNext);
  531. return (Label)((uint8*)pLabel - instBuf);
  532. }
  533. inline void DoFixup(Label fixup, Label label)
  534. {
  535. Assert(fixup < instNext);
  536. Assert(label <= instNext);
  537. *(Label*)(instBuf + fixup) = label;
  538. }
  539. inline Label CurrentLabel()
  540. {
  541. return instNext;
  542. }
  543. template <typename T>
  544. inline T* LabelToInstPointer(Inst::InstTag tag, Label label)
  545. {
  546. Assert(label + sizeof(T) <= instNext);
  547. Assert(((Inst*)(instBuf + label))->tag == tag);
  548. return (T*)(instBuf + label);
  549. }
  550. inline int NextLoopId()
  551. {
  552. return nextLoopId++;
  553. }
  554. inline Js::ScriptContext *GetScriptContext() const
  555. {
  556. return scriptContext;
  557. }
  558. inline Program *GetProgram() const
  559. {
  560. return program;
  561. }
  562. void SetBOIInstructionsProgramTag()
  563. {
  564. Assert(this->program->tag == Program::ProgramTag::InstructionsTag
  565. || this->program->tag == Program::ProgramTag::BOIInstructionsTag);
  566. Assert(this->CurrentLabel() == 0);
  567. this->program->tag = Program::ProgramTag::BOIInstructionsTag;
  568. }
  569. void SetBOIInstructionsProgramForStickyFlagTag()
  570. {
  571. Assert(this->program->tag == Program::ProgramTag::InstructionsTag
  572. || this->program->tag == Program::ProgramTag::BOIInstructionsForStickyFlagTag);
  573. Assert(this->CurrentLabel() == 0);
  574. AssertMsg((this->program->flags & StickyRegexFlag) != 0, "Shouldn't set BOIInstructionsForStickyFlagTag, if sticky is false.");
  575. this->program->tag = Program::ProgramTag::BOIInstructionsForStickyFlagTag;
  576. }
  577. static void CaptureNoLiterals(Program* program);
  578. void CaptureLiterals(Node* root, const Char *litbuf);
  579. static void EmitAndCaptureSuccInst(Recycler* recycler, Program* program);
  580. void CaptureInsts();
  581. void FreeBody();
  582. Compiler
  583. ( Js::ScriptContext* scriptContext
  584. , ArenaAllocator* ctAllocator
  585. , ArenaAllocator* rtAllocator
  586. , StandardChars<Char>* standardChars
  587. , Program* program
  588. #if ENABLE_REGEX_CONFIG_OPTIONS
  589. , DebugWriter* w
  590. , RegexStats* stats
  591. #endif
  592. );
  593. public:
  594. static void CompileEmptyRegex
  595. ( Program* program
  596. , RegexPattern* pattern
  597. #if ENABLE_REGEX_CONFIG_OPTIONS
  598. , DebugWriter* w
  599. , RegexStats* stats
  600. #endif
  601. );
  602. static void Compile
  603. ( Js::ScriptContext* scriptContext
  604. , ArenaAllocator* ctAllocator
  605. , ArenaAllocator* rtAllocator
  606. , StandardChars<Char>* standardChars
  607. , Program* program
  608. , Node* root
  609. , const Char* litbuf
  610. , RegexPattern* pattern
  611. #if ENABLE_REGEX_CONFIG_OPTIONS
  612. , DebugWriter* w
  613. , RegexStats* stats
  614. #endif
  615. );
  616. };
  617. }