Преглед изворни кода

Do not unnecessarily rescan a string for a literal in a RegExp disjunction

Assume we have the RegExp /<(foo|bar)/ and the input string "<0bar<0bar<0bar".
When we try to match the string, we first scan it fully for "foo", but can't
find it. Then we scan for "bar" and find it at index 2. However, since there is
no '<' right before it, we continue with our search. We currently do the same
thing two more times starting at indexes 7 and 12 (since those are where the
"bar"s are), each time scanning the rest of the string fully for "foo".

However, if we cache, for each literal, the furthest offset its scan has already
reached, we can skip the repeated searches for "foo" after the first one.
Gorkem Yakin пре 10 година
родитељ
комит
ad56276a65
4 измењених фајлова са 88 додато и 11 уклоњено
  1. 35 8
      lib/Parser/RegexRuntime.cpp
  2. 19 3
      lib/Parser/RegexRuntime.h
  3. 6 0
      test/UnifiedRegex/rlexe.xml
  4. 28 0
      test/UnifiedRegex/scanner.js

+ 35 - 8
lib/Parser/RegexRuntime.cpp

@@ -2128,9 +2128,28 @@ namespace UnifiedRegex
         int besti = -1;
         CharCount bestMatchOffset = 0;
 
+        if (matcher.literalNextSyncInputOffsets == nullptr)
+        {
+            matcher.literalNextSyncInputOffsets =
+                RecyclerNewArrayLeaf(matcher.recycler, CharCount, ScannersMixin::MaxNumSyncLiterals);
+        }
+        CharCount* literalNextSyncInputOffsets = matcher.literalNextSyncInputOffsets;
+
+        if (firstIteration)
+        {
+            for (int i = 0; i < numLiterals; i++)
+            {
+                literalNextSyncInputOffsets[i] = inputOffset;
+            }
+        }
+
         for (int i = 0; i < numLiterals; i++)
         {
-            CharCount thisMatchOffset = inputOffset;
+            CharCount thisMatchOffset = literalNextSyncInputOffsets[i];
+            if (inputOffset > thisMatchOffset)
+            {
+                thisMatchOffset = inputOffset;
+            }
 
             if (infos[i]->isEquivClass ?
                     (infos[i]->scanner.Match<CaseInsensitive::EquivClassSize>
@@ -2159,6 +2178,12 @@ namespace UnifiedRegex
                     besti = i;
                     bestMatchOffset = thisMatchOffset;
                 }
+
+                literalNextSyncInputOffsets[i] = thisMatchOffset;
+            }
+            else
+            {
+                literalNextSyncInputOffsets[i] = inputLength;
             }
         }
 
@@ -4198,14 +4223,14 @@ namespace UnifiedRegex
         , program(pattern->rep.unified.program)
         , groupInfos(nullptr)
         , loopInfos(nullptr)
+        , literalNextSyncInputOffsets(nullptr)
+        , recycler(scriptContext->GetRecycler())
         , previousQcTime(0)
 #if ENABLE_REGEX_CONFIG_OPTIONS
         , stats(0)
         , w(0)
 #endif
     {
-        const auto recycler = scriptContext->GetRecycler();
-
         // Don't need to zero out - the constructor for GroupInfo should take care of it
         groupInfos = RecyclerNewArrayLeaf(recycler, GroupInfo, program->numGroups);
 
@@ -4369,7 +4394,7 @@ namespace UnifiedRegex
     const uint32 maxInstTag = instTags[(sizeof(instTags) / sizeof(uint32)) - 1];
 #endif
 
-    __inline void Matcher::Run(const Char* const input, const CharCount inputLength, CharCount &matchStart, CharCount &nextSyncInputOffset, ContStack &contStack, AssertionStack &assertionStack, uint &qcTicks)
+    __inline void Matcher::Run(const Char* const input, const CharCount inputLength, CharCount &matchStart, CharCount &nextSyncInputOffset, ContStack &contStack, AssertionStack &assertionStack, uint &qcTicks, bool firstIteration)
     {
         CharCount inputOffset = matchStart;
         const uint8 *instPointer = program->rep.insts.insts;
@@ -4391,7 +4416,7 @@ namespace UnifiedRegex
             {
 #define MBase(TagName, ClassName) \
                 case Inst::TagName: \
-                    if (((const ClassName *)inst)->Exec(*this, input, inputLength, matchStart, inputOffset, nextSyncInputOffset, instPointer, contStack, assertionStack, qcTicks)) \
+                    if (((const ClassName *)inst)->Exec(*this, input, inputLength, matchStart, inputOffset, nextSyncInputOffset, instPointer, contStack, assertionStack, qcTicks, firstIteration)) \
                         return; \
                     break;
 #define M(TagName) MBase(TagName, TagName##Inst)
@@ -4415,7 +4440,7 @@ namespace UnifiedRegex
     }
 #endif
 
-    __inline bool Matcher::MatchHere(const Char* const input, const CharCount inputLength, CharCount &matchStart, CharCount &nextSyncInputOffset, ContStack &contStack, AssertionStack &assertionStack, uint &qcTicks)
+    __inline bool Matcher::MatchHere(const Char* const input, const CharCount inputLength, CharCount &matchStart, CharCount &nextSyncInputOffset, ContStack &contStack, AssertionStack &assertionStack, uint &qcTicks, bool firstIteration)
     {
         // Reset the continuation and assertion stacks ready for fresh run
         // NOTE: We used to do this after the Run, but it's safer to do it here in case unusual control flow exits
@@ -4432,7 +4457,7 @@ namespace UnifiedRegex
         ResetLoopInfos();
 #endif
 
-        Run(input, inputLength, matchStart, nextSyncInputOffset, contStack, assertionStack, qcTicks);
+        Run(input, inputLength, matchStart, nextSyncInputOffset, contStack, assertionStack, qcTicks, firstIteration);
 
         // Leave the continuation and assertion stack memory in place so we don't have to alloc next time
 
@@ -4769,11 +4794,13 @@ namespace UnifiedRegex
 
                 // Need to continue matching even if matchStart == inputLim since some patterns may match an empty string at the end
                 // of the input. For instance: /a*$/.exec("b")
+                bool firstIteration = true;
                 do
                 {
                     // Let there be only one call to MatchHere(), as that call expands the interpreter loop in-place. Having
                     // multiple calls to MatchHere() would bloat the code.
-                    res = MatchHere(input, inputLength, offset, nextSyncInputOffset, regexStacks->contStack, regexStacks->assertionStack, qcTicks);
+                    res = MatchHere(input, inputLength, offset, nextSyncInputOffset, regexStacks->contStack, regexStacks->assertionStack, qcTicks, firstIteration);
+                    firstIteration = false;
                 } while(!res && loopMatchHere && ++offset <= inputLength);
 
                 break;

+ 19 - 3
lib/Parser/RegexRuntime.h

@@ -650,7 +650,7 @@ namespace UnifiedRegex
 #define INST_BODY_PRINT
 #endif
 
-#define REGEX_INST_EXEC_PARAMETERS Matcher& matcher, const Char* const input, const CharCount inputLength, CharCount &matchStart, CharCount& inputOffset, CharCount &nextSyncInputOffset, const uint8*& instPointer, ContStack &contStack, AssertionStack &assertionStack, uint &qcTicks
+#define REGEX_INST_EXEC_PARAMETERS Matcher& matcher, const Char* const input, const CharCount inputLength, CharCount &matchStart, CharCount& inputOffset, CharCount &nextSyncInputOffset, const uint8*& instPointer, ContStack &contStack, AssertionStack &assertionStack, uint &qcTicks, bool firstIteration
 #define INST_BODY bool Exec(REGEX_INST_EXEC_PARAMETERS) const; \
                   INST_BODY_PRINT
 
@@ -1702,6 +1702,22 @@ namespace UnifiedRegex
         GroupInfo* groupInfos;
         LoopInfo* loopInfos;
 
+        // Per-literal input offsets from which the next scan for each literal should resume (its last
+        // found position, or the input length if it wasn't found). Prevents retraversing the input string.
+        //
+        // Assume we have the RegExp /<(foo|bar)/ and the input string "<0bar<0bar<0bar".
+        // When we try to match the string, we first scan it fully for "foo", but can't
+        // find it. Then we scan for "bar" and find it at index 2. However, since there
+        // is no '<' right before it, we continue with our search. We do the same thing
+        // two more times starting at indexes 7 and 12 (since those are where the "bar"s
+        // are), each time scanning the rest of the string fully for "foo".
+        //
+        // However, if we cache the furthest offsets we tried, we can skip the searches
+        // for "foo" after the first time.
+        CharCount* literalNextSyncInputOffsets;
+
+        Recycler* recycler;
+
         uint previousQcTime;
 
 #if ENABLE_REGEX_CONFIG_OPTIONS
@@ -1766,8 +1782,8 @@ namespace UnifiedRegex
         // As above, but control whether to try backtracking or later matches
         __inline bool HardFail(const Char* const input, const CharCount inputLength, CharCount &matchStart, CharCount &inputOffset, const uint8 *&instPointer, ContStack &contStack, AssertionStack &assertionStack, uint &qcTicks, HardFailMode mode);
 
-        __inline void Run(const Char* const input, const CharCount inputLength, CharCount &matchStart, CharCount &nextSyncInputOffset, ContStack &contStack, AssertionStack &assertionStack, uint &qcTicks);
-        __inline bool MatchHere(const Char* const input, const CharCount inputLength, CharCount &matchStart, CharCount &nextSyncInputOffset, ContStack &contStack, AssertionStack &assertionStack, uint &qcTicks);
+        __inline void Run(const Char* const input, const CharCount inputLength, CharCount &matchStart, CharCount &nextSyncInputOffset, ContStack &contStack, AssertionStack &assertionStack, uint &qcTicks, bool firstIteration);
+        __inline bool MatchHere(const Char* const input, const CharCount inputLength, CharCount &matchStart, CharCount &nextSyncInputOffset, ContStack &contStack, AssertionStack &assertionStack, uint &qcTicks, bool firstIteration);
 
         // Return true if assertion succeeded
         __inline bool PopAssertion(CharCount &inputOffset, const uint8 *&instPointer, ContStack &contStack, AssertionStack &assertionStack, bool isFailed);

+ 6 - 0
test/UnifiedRegex/rlexe.xml

@@ -163,4 +163,10 @@
       <baseline>SourceToString.baseline</baseline>
     </default>
   </test>
+  <test>
+    <default>
+      <files>scanner.js</files>
+      <compile-flags>-args summary -endargs</compile-flags>
+    </default>
+  </test>
 </regress-exe>

+ 28 - 0
test/UnifiedRegex/scanner.js

@@ -0,0 +1,28 @@
+//-------------------------------------------------------------------------------------------------------
+// Copyright (C) Microsoft. All rights reserved.
+// Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
+//-------------------------------------------------------------------------------------------------------
+
+WScript.LoadScriptFile("..\\UnitTestFramework\\UnitTestFramework.js");
+
+var tests = [
+    {
+        name: "SyncToLiteralsAndBackupInst should continue scanning for a literal from the furthest point scanned in the previous iteration",
+        body: function () {
+            var re = /<(foo|bar)/;
+            var string = "0bar1<1<foo";
+
+            // We first find "foo" at index 8, but then find "bar" at index 1.
+            // Since the index of the "bar" is lower, we try to match at that
+            // position. However, the "bar" isn't preceded by a '<', so we
+            // retry. In the second iteration, we're supposed to match the
+            // string at index 7, right before the "foo".
+            var result = re.exec(string);
+
+            assert.areNotEqual(null, result, "result");
+            assert.areEqual(7, result.index, "result.index");
+        }
+    },
+];
+
+testRunner.runTests(tests, { verbose: WScript.Arguments[0] != 'summary' });