пре 10 година · ad56276a65
--- a/lib/Parser/RegexRuntime.cpp
+++ b/lib/Parser/RegexRuntime.cpp
@@ -2128,9 +2128,28 @@ namespace UnifiedRegex
 
				         int besti = -1;
			
 
				         CharCount bestMatchOffset = 0;
			
 
				 
			
 
				+        if (matcher.literalNextSyncInputOffsets == nullptr)
			
 
				+        {
			
 
				+            matcher.literalNextSyncInputOffsets =
			
 
				+                RecyclerNewArrayLeaf(matcher.recycler, CharCount, ScannersMixin::MaxNumSyncLiterals);
			
 
				+        }
			
 
				+        CharCount* literalNextSyncInputOffsets = matcher.literalNextSyncInputOffsets;
			
 
				+
			
 
				+        if (firstIteration)
			
 
				+        {
			
 
				+            for (int i = 0; i < numLiterals; i++)
			
 
				+            {
			
 
				+                literalNextSyncInputOffsets[i] = inputOffset;
			
 
				+            }
			
 
				+        }
			
 
				+
			
 
				         for (int i = 0; i < numLiterals; i++)
			
 
				         {
			
 
				-            CharCount thisMatchOffset = inputOffset;
			
 
				+            CharCount thisMatchOffset = literalNextSyncInputOffsets[i];
			
 
				+            if (inputOffset > thisMatchOffset)
			
 
				+            {
			
 
				+                thisMatchOffset = inputOffset;
			
 
				+            }
			
 
				 
			
 
				             if (infos[i]->isEquivClass ?
			
 
				                     (infos[i]->scanner.Match<CaseInsensitive::EquivClassSize>
			
@@ -2159,6 +2178,12 @@ namespace UnifiedRegex
 
				                     besti = i;
			
 
				                     bestMatchOffset = thisMatchOffset;
			
 
				                 }
			
 
				+
			
 
				+                literalNextSyncInputOffsets[i] = thisMatchOffset;
			
 
				+            }
			
 
				+            else
			
 
				+            {
			
 
				+                literalNextSyncInputOffsets[i] = inputLength;
			
 
				             }
			
 
				         }
			
 
				 
			
@@ -4198,14 +4223,14 @@ namespace UnifiedRegex
 
				         , program(pattern->rep.unified.program)
			
 
				         , groupInfos(nullptr)
			
 
				         , loopInfos(nullptr)
			
 
				+        , literalNextSyncInputOffsets(nullptr)
			
 
				+        , recycler(scriptContext->GetRecycler())
			
 
				         , previousQcTime(0)
			
 
				 #if ENABLE_REGEX_CONFIG_OPTIONS
			
 
				         , stats(0)
			
 
				         , w(0)
			
 
				 #endif
			
 
				     {
			
 
				-        const auto recycler = scriptContext->GetRecycler();
			
 
				-
			
 
				         // Don't need to zero out - the constructor for GroupInfo should take care of it
			
 
				         groupInfos = RecyclerNewArrayLeaf(recycler, GroupInfo, program->numGroups);
			
 
				 
			
@@ -4369,7 +4394,7 @@ namespace UnifiedRegex
 
				     const uint32 maxInstTag = instTags[(sizeof(instTags) / sizeof(uint32)) - 1];
			
 
				 #endif
			
 
				 
			
 
				-    __inline void Matcher::Run(const Char* const input, const CharCount inputLength, CharCount &matchStart, CharCount &nextSyncInputOffset, ContStack &contStack, AssertionStack &assertionStack, uint &qcTicks)
			
 
				+    __inline void Matcher::Run(const Char* const input, const CharCount inputLength, CharCount &matchStart, CharCount &nextSyncInputOffset, ContStack &contStack, AssertionStack &assertionStack, uint &qcTicks, bool firstIteration)
			
 
				     {
			
 
				         CharCount inputOffset = matchStart;
			
 
				         const uint8 *instPointer = program->rep.insts.insts;
			
@@ -4391,7 +4416,7 @@ namespace UnifiedRegex
 
				             {
			
 
				 #define MBase(TagName, ClassName) \
			
 
				                 case Inst::TagName: \
			
 
				-                    if (((const ClassName *)inst)->Exec(*this, input, inputLength, matchStart, inputOffset, nextSyncInputOffset, instPointer, contStack, assertionStack, qcTicks)) \
			
 
				+                    if (((const ClassName *)inst)->Exec(*this, input, inputLength, matchStart, inputOffset, nextSyncInputOffset, instPointer, contStack, assertionStack, qcTicks, firstIteration)) \
			
 
				                         return; \
			
 
				                     break;
			
 
				 #define M(TagName) MBase(TagName, TagName##Inst)
			
@@ -4415,7 +4440,7 @@ namespace UnifiedRegex
 
				     }
			
 
				 #endif
			
 
				 
			
 
				-    __inline bool Matcher::MatchHere(const Char* const input, const CharCount inputLength, CharCount &matchStart, CharCount &nextSyncInputOffset, ContStack &contStack, AssertionStack &assertionStack, uint &qcTicks)
			
 
				+    __inline bool Matcher::MatchHere(const Char* const input, const CharCount inputLength, CharCount &matchStart, CharCount &nextSyncInputOffset, ContStack &contStack, AssertionStack &assertionStack, uint &qcTicks, bool firstIteration)
			
 
				     {
			
 
				         // Reset the continuation and assertion stacks ready for fresh run
			
 
				         // NOTE: We used to do this after the Run, but it's safer to do it here in case unusual control flow exits
			
@@ -4432,7 +4457,7 @@ namespace UnifiedRegex
 
				         ResetLoopInfos();
			
 
				 #endif
			
 
				 
			
 
				-        Run(input, inputLength, matchStart, nextSyncInputOffset, contStack, assertionStack, qcTicks);
			
 
				+        Run(input, inputLength, matchStart, nextSyncInputOffset, contStack, assertionStack, qcTicks, firstIteration);
			
 
				 
			
 
				         // Leave the continuation and assertion stack memory in place so we don't have to alloc next time
			
 
				 
			
@@ -4769,11 +4794,13 @@ namespace UnifiedRegex
 
				 
			
 
				                 // Need to continue matching even if matchStart == inputLim since some patterns may match an empty string at the end
			
 
				                 // of the input. For instance: /a*$/.exec("b")
			
 
				+                bool firstIteration = true;
			
 
				                 do
			
 
				                 {
			
 
				                     // Let there be only one call to MatchHere(), as that call expands the interpreter loop in-place. Having
			
 
				                     // multiple calls to MatchHere() would bloat the code.
			
 
				-                    res = MatchHere(input, inputLength, offset, nextSyncInputOffset, regexStacks->contStack, regexStacks->assertionStack, qcTicks);
			
 
				+                    res = MatchHere(input, inputLength, offset, nextSyncInputOffset, regexStacks->contStack, regexStacks->assertionStack, qcTicks, firstIteration);
			
 
				+                    firstIteration = false;
			
 
				                 } while(!res && loopMatchHere && ++offset <= inputLength);
			
 
				 
			
 
				                 break;
			
--- a/lib/Parser/RegexRuntime.h
+++ b/lib/Parser/RegexRuntime.h
@@ -650,7 +650,7 @@ namespace UnifiedRegex
 
				 #define INST_BODY_PRINT
			
 
				 #endif
			
 
				 
			
 
				-#define REGEX_INST_EXEC_PARAMETERS Matcher& matcher, const Char* const input, const CharCount inputLength, CharCount &matchStart, CharCount& inputOffset, CharCount &nextSyncInputOffset, const uint8*& instPointer, ContStack &contStack, AssertionStack &assertionStack, uint &qcTicks
			
 
				+#define REGEX_INST_EXEC_PARAMETERS Matcher& matcher, const Char* const input, const CharCount inputLength, CharCount &matchStart, CharCount& inputOffset, CharCount &nextSyncInputOffset, const uint8*& instPointer, ContStack &contStack, AssertionStack &assertionStack, uint &qcTicks, bool firstIteration
			
 
				 #define INST_BODY bool Exec(REGEX_INST_EXEC_PARAMETERS) const; \
			
 
				                   INST_BODY_PRINT
			
 
				 
			
@@ -1702,6 +1702,22 @@ namespace UnifiedRegex
 
				         GroupInfo* groupInfos;
			
 
				         LoopInfo* loopInfos;
			
 
				 
			
 
				+        // Furthest offsets in the input string, one per sync literal, that we tried to match during a scan.
			
 
				+        // This is used to prevent unnecessary retraversal of the input string.
			
 
				+        //
			
 
				+        // Assume we have the RegExp /<(foo|bar)/ and the input string "<0bar<0bar<0bar".
			
 
				+        // When we try to match the string, we first scan it fully for "foo", but can't
			
 
				+        // find it. Then we scan for "bar" and find it at index 2. However, since there
			
 
				+        // is no '<' right before it, we continue with our search. We do the same thing
			
 
				+        // two more times starting at indexes 7 and 12 (since those are where the "bar"s
			
 
				+        // are), each time scanning the rest of the string fully for "foo".
			
 
				+        //
			
 
				+        // However, if we cache the furthest offsets we tried, we can skip the searches
			
 
				+        // for "foo" after the first time.
			
 
				+        CharCount* literalNextSyncInputOffsets;
			
 
				+
			
 
				+        Recycler* recycler;
			
 
				+
			
 
				         uint previousQcTime;
			
 
				 
			
 
				 #if ENABLE_REGEX_CONFIG_OPTIONS
			
@@ -1766,8 +1782,8 @@ namespace UnifiedRegex
 
				         // As above, but control whether to try backtracking or later matches
			
 
				         __inline bool HardFail(const Char* const input, const CharCount inputLength, CharCount &matchStart, CharCount &inputOffset, const uint8 *&instPointer, ContStack &contStack, AssertionStack &assertionStack, uint &qcTicks, HardFailMode mode);
			
 
				 
			
 
				-        __inline void Run(const Char* const input, const CharCount inputLength, CharCount &matchStart, CharCount &nextSyncInputOffset, ContStack &contStack, AssertionStack &assertionStack, uint &qcTicks);
			
 
				-        __inline bool MatchHere(const Char* const input, const CharCount inputLength, CharCount &matchStart, CharCount &nextSyncInputOffset, ContStack &contStack, AssertionStack &assertionStack, uint &qcTicks);
			
 
				+        __inline void Run(const Char* const input, const CharCount inputLength, CharCount &matchStart, CharCount &nextSyncInputOffset, ContStack &contStack, AssertionStack &assertionStack, uint &qcTicks, bool firstIteration);
			
 
				+        __inline bool MatchHere(const Char* const input, const CharCount inputLength, CharCount &matchStart, CharCount &nextSyncInputOffset, ContStack &contStack, AssertionStack &assertionStack, uint &qcTicks, bool firstIteration);
			
 
				 
			
 
				         // Return true if assertion succeeded
			
 
				         __inline bool PopAssertion(CharCount &inputOffset, const uint8 *&instPointer, ContStack &contStack, AssertionStack &assertionStack, bool isFailed);
			
--- a/test/UnifiedRegex/rlexe.xml
+++ b/test/UnifiedRegex/rlexe.xml
@@ -163,4 +163,10 @@
 
				       <baseline>SourceToString.baseline</baseline>
			
 
				     </default>
			
 
				   </test>
			
 
				+  <test>
			
 
				+    <default>
			
 
				+      <files>scanner.js</files>
			
 
				+      <compile-flags>-args summary -endargs</compile-flags>
			
 
				+    </default>
			
 
				+  </test>
			
 
				 </regress-exe>
			
--- a/test/UnifiedRegex/scanner.js
+++ b/test/UnifiedRegex/scanner.js
@@ -0,0 +1,28 @@
 
				+//-------------------------------------------------------------------------------------------------------
			
 
				+// Copyright (C) Microsoft. All rights reserved.
			
 
				+// Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
			
 
				+//-------------------------------------------------------------------------------------------------------
			
 
				+
			
 
				+WScript.LoadScriptFile("..\\UnitTestFramework\\UnitTestFramework.js");
			
 
				+
			
 
				+var tests = [
			
 
				+    {
			
 
				+        name: "SyncToLiteralsAndBackupInst should continue scanning for a literal from the furthest point scanned in the previous iteration",
			
 
				+        body: function () {
			
 
				+            var re = /<(foo|bar)/;
			
 
				+            var string = "0bar1<1<foo";
			
 
				+
			
 
				+            // We first find "foo" at index 8, but then find "bar" at index 1.
			
 
				+            // Since the index of the "bar" is lower, we try to match at that
			
 
				+            // position. However, the "bar" isn't preceded by a '<', so we
			
 
				+            // retry. In the second iteration, we're supposed to match
			
 
				+            // the string at index 7, right before the "foo".
			
 
				+            var result = re.exec(string);
			
 
				+
			
 
				+            assert.areNotEqual(null, result, "result");
			
 
				+            assert.areEqual(7, result.index, "result.index");
			
 
				+        }
			
 
				+    },
			
 
				+];
			
 
				+
			
 
				+testRunner.runTests(tests, { verbose: WScript.Arguments[0] != 'summary' });