Ver Fonte

Regex backtracking improvements

Rajat Dua há 9 anos atrás
pai
commit
6b4151a44b

+ 1 - 0
lib/Common/ConfigFlagsList.h

@@ -277,6 +277,7 @@ PHASE(All)
         PHASE(Host)
         PHASE(BailOut)
         PHASE(RegexQc)
+        PHASE(RegexOptBT)
         PHASE(InlineCache)
         PHASE(PolymorphicInlineCache)
         PHASE(MissingPropertyCache)

+ 14 - 1
lib/Parser/RegexCompileTime.cpp

@@ -3460,6 +3460,12 @@ namespace UnifiedRegex
         innerFollow->UnionInPlace(compiler.ctAllocator, *accumFollow);
         innerFollow->UnionInPlace(compiler.ctAllocator, *body->firstSet);
 
+        if (followSet->IsSingleton())
+        {
+            Assert(followSet->IsCompact());
+            followFirst = followSet->GetCompactChar(0);
+        }
+
         /*
         All of the following must be true for the loop body's follow to be irrefutable:
 
@@ -3883,7 +3889,14 @@ namespace UnifiedRegex
                 //   LoopSet
                 //
                 Assert(body->IsSimpleOneChar());
-                EMIT(compiler, LoopSetInst, compiler.NextLoopId(), repeats, !isNotInLoop)->set.CloneFrom(compiler.rtAllocator, *body->firstSet);
+                if (followFirst == MaxChar)
+                {
+                    EMIT(compiler, LoopSetInst, compiler.NextLoopId(), repeats, !isNotInLoop)->set.CloneFrom(compiler.rtAllocator, *body->firstSet);
+                }
+                else
+                {
+                    EMIT(compiler, LoopSetWithFollowFirstInst, compiler.NextLoopId(), repeats, !isNotInLoop, followFirst)->set.CloneFrom(compiler.rtAllocator, *body->firstSet);
+                }
                 break;
             }
 

+ 5 - 0
lib/Parser/RegexCompileTime.h

@@ -508,6 +508,10 @@ namespace UnifiedRegex
     {
         Node* body;
         CountDomain repeats;
+
+        // If set and not equal to MaxChar, followFirst is the deterministic first character of the follow set of this node.
+        // Could be expanded to encompass the entire firstSet of the next node.
+        Char followFirst;
         bool isGreedy;
 
         enum CompilationScheme
@@ -534,6 +538,7 @@ namespace UnifiedRegex
             : Node(Loop)
             , repeats(lower, upper)
             , isGreedy(isGreedy)
+            , followFirst(MaxChar)
             , body(body)
             , scheme(BeginEnd)
         {

+ 1 - 0
lib/Parser/RegexContcodes.h

@@ -11,4 +11,5 @@ M(RepeatLoop)
 M(PopAssertion)
 M(RewindLoopFixed)
 M(RewindLoopSet)
+M(RewindLoopSetWithFollowFirst)
 M(RewindLoopFixedGroupLastIteration)

+ 1 - 0
lib/Parser/RegexOpCodes.h

@@ -73,6 +73,7 @@ M(RepeatLoopIfSet)
 M(BeginLoopFixed)
 M(RepeatLoopFixed)
 M(LoopSet)
+M(LoopSetWithFollowFirst)
 M(BeginLoopFixedGroupLastIteration)
 M(RepeatLoopFixedGroupLastIteration)
 M(BeginGreedyLoopNoBacktrack)

+ 163 - 5
lib/Parser/RegexRuntime.cpp

@@ -3052,6 +3052,80 @@ namespace UnifiedRegex
     }
 #endif
 
+    // Executes a greedy loop over a single character set whose follow is known to start with the
+    // character followFirst. Consumes as many set members as repeats.upper and the remaining input
+    // allow. Unless the RegexOptBT phase is off, it also records the loop-relative offsets of consumed
+    // characters equal to followFirst, which RewindLoopSetWithFollowFirstCont uses as rewind targets.
+    // Fails the match when fewer than repeats.lower characters were consumed; otherwise advances
+    // instPointer past this instruction and returns false so that the follow runs next.
+    inline bool LoopSetWithFollowFirstInst::Exec(REGEX_INST_EXEC_PARAMETERS) const
+    {
+        LoopInfo* loopInfo = matcher.LoopIdToLoopInfo(loopId);
+        Assert(PHASE_OFF1(Js::RegexOptBTPhase) || !loopInfo->offsetsOfFollowFirst || loopInfo->offsetsOfFollowFirst->Empty());
+
+        // If this loop is contained in an outer loop, the continuation stack may already have a rewind
+        // entry for this loop. We must make sure its state is preserved on backtrack.
+        if (hasOuterLoops)
+        {
+            PUSH(contStack, RestoreLoopCont, loopId, *loopInfo);
+#if ENABLE_REGEX_CONFIG_OPTIONS
+            matcher.PushStats(contStack, input);
+#endif
+        }
+
+        // startInputOffset stays fixed for all iterations; together with loopInfo->number (the
+        // iteration count) it determines where in the input to rewind to
+        loopInfo->startInputOffset = inputOffset;
+
+        // Consume as many elements of set as possible
+        const RuntimeCharSet<Char>& matchSet = this->set;
+        const CharCount loopMatchStart = inputOffset;
+        const CharCountOrFlag repeatsUpper = repeats.upper;
+        const CharCount inputEndOffset =
+            static_cast<CharCount>(repeatsUpper) >= inputLength - inputOffset
+            ? inputLength
+            : inputOffset + static_cast<CharCount>(repeatsUpper);
+        // The phase check cannot change while the loop runs, so evaluate it once
+        const bool recordFollowFirst = !PHASE_OFF1(Js::RegexOptBTPhase);
+#if ENABLE_REGEX_CONFIG_OPTIONS
+        matcher.CompStats();
+#endif
+        while (inputOffset < inputEndOffset && matchSet.Get(input[inputOffset]))
+        {
+#if ENABLE_REGEX_CONFIG_OPTIONS
+            matcher.CompStats();
+#endif
+            // Offset of the current character relative to the start of the loop; rewinding to it
+            // means the loop has consumed exactly offsetInLoop characters.
+            const CharCount offsetInLoop = inputOffset - loopMatchStart;
+
+            // Only offsets at or beyond repeats.lower are legal rewind targets. Recording a smaller
+            // offset would let the rewind continuation set loopInfo->number below the minimum
+            // repeat count.
+            if (recordFollowFirst && offsetInLoop >= repeats.lower && input[inputOffset] == this->followFirst)
+            {
+                loopInfo->EnsureOffsetsOfFollowFirst(matcher);
+                loopInfo->offsetsOfFollowFirst->Push(offsetInLoop);
+            }
+            inputOffset++;
+        }
+
+        loopInfo->number = inputOffset - loopMatchStart;
+        if (loopInfo->number < repeats.lower)
+            return matcher.Fail(FAIL_PARAMETERS);
+        if (loopInfo->number > repeats.lower)
+        {
+            // CHOICEPOINT: If follow fails, retry it with fewer characters consumed
+            Assert(instPointer == (uint8*)this);
+            PUSH(contStack, RewindLoopSetWithFollowFirstCont, matcher.InstPointerToLabel(instPointer));
+#if ENABLE_REGEX_CONFIG_OPTIONS
+            matcher.PushStats(contStack, input);
+#endif
+        }
+        // else: failure of follow signals failure of entire loop
+
+        // Continue with follow
+        instPointer += sizeof(*this);
+        return false;
+    }
+
+#if ENABLE_REGEX_CONFIG_OPTIONS
+    // Debug dump of this instruction: label, loop id, followFirst, repeat bounds, the hasOuterLoops
+    // flag and the character set. Returns sizeof(*this).
+    int LoopSetWithFollowFirstInst::Print(DebugWriter* w, Label label, const Char* litbuf) const
+    {
+        // Print this instruction's own opcode name so that dumps distinguish it from plain LoopSet
+        w->Print(_u("L%04x: LoopSetWithFollowFirst(loopId: %d, followFirst: %c, "), label, loopId, followFirst);
+        repeats.Print(w);
+        w->Print(_u(", hasOuterLoops: %s, "), hasOuterLoops ? _u("true") : _u("false"));
+        SetMixin::Print(w, litbuf);
+        w->PrintEOL(_u(")"));
+        return sizeof(*this);
+    }
+#endif
+
     // ----------------------------------------------------------------------
     // BeginLoopFixedGroupLastIterationInst (optimized instruction)
     // ----------------------------------------------------------------------
@@ -3879,6 +3953,14 @@ namespace UnifiedRegex
     }
 #endif
 
+    // Lazily creates offsetsOfFollowFirst. If the list already exists nothing happens; otherwise it
+    // is allocated from, and constructed with, the script context's regex allocator.
+    void LoopInfo::EnsureOffsetsOfFollowFirst(Matcher& matcher)
+    {
+        if (this->offsetsOfFollowFirst != nullptr)
+        {
+            return;
+        }
+
+        auto regexAllocator = matcher.pattern->library->GetScriptContext()->RegexAllocator();
+        this->offsetsOfFollowFirst = Anew(regexAllocator, SList<CharCount>, regexAllocator);
+    }
+
 #if ENABLE_REGEX_CONFIG_OPTIONS
     void GroupInfo::Print(DebugWriter* w, const Char* const input) const
     {
@@ -4116,12 +4198,12 @@ namespace UnifiedRegex
 
         LoopSetInst* begin = matcher.L2I(LoopSet, beginLabel);
         LoopInfo* loopInfo = matcher.LoopIdToLoopInfo(begin->loopId);
-
-        // >loopInfonumber is the number of iterations completed before trying follow
+        
+        // loopInfo->number is the number of iterations completed before trying follow
         Assert(loopInfo->number > begin->repeats.lower);
-        // Try follow with one fewer iteration
+        // Try follow with fewer iterations
         loopInfo->number--;
-
+        
         // Rewind input
         inputOffset = loopInfo->startInputOffset + loopInfo->number;
 
@@ -4147,6 +4229,83 @@ namespace UnifiedRegex
     }
 #endif
 
+    // ----------------------------------------------------------------------
+    // RewindLoopSetWithFollowFirstCont
+    // ----------------------------------------------------------------------
+
+    // Continuation executed when the follow of a LoopSetWithFollowFirst loop has failed. Picks a
+    // smaller iteration count for the loop, rewinds the input to match, and resumes at the
+    // instruction after the loop. With the RegexOptBT phase enabled the new count is taken from the
+    // recorded followFirst offsets instead of stepping back one character at a time. The
+    // continuation stays on the stack only while the count is still above repeats.lower.
+    inline bool RewindLoopSetWithFollowFirstCont::Exec(REGEX_CONT_EXEC_PARAMETERS)
+    {
+        matcher.QueryContinue(qcTicks);
+
+        LoopSetWithFollowFirstInst* begin = matcher.L2I(LoopSetWithFollowFirst, beginLabel);
+        LoopInfo* loopInfo = matcher.LoopIdToLoopInfo(begin->loopId);
+
+        // loopInfo->number is the number of iterations completed before trying follow
+        Assert(loopInfo->number > begin->repeats.lower);
+        // Try follow with fewer iterations
+        if (!PHASE_OFF1(Js::RegexOptBTPhase))
+        {
+            if (loopInfo->offsetsOfFollowFirst == nullptr)
+            {
+                if (begin->followFirst != MaxUChar)
+                {
+                    // We determined the first character in the follow set at compile time,
+                    // but didn't find a single match for it in the last iteration of the loop.
+                    // So, there is no benefit in backtracking.
+                    loopInfo->number = begin->repeats.lower; // stop backtracking
+                }
+                else
+                {
+                    // We couldn't determine the first character in the follow set at compile time;
+                    // fall back to backtracking by one character at a time.
+                    loopInfo->number--;
+                }
+            }
+            else
+            {
+                if (loopInfo->offsetsOfFollowFirst->Empty())
+                {
+                    // We have already backtracked to the first offset where we matched the LoopSet's followFirst;
+                    // no point in backtracking more.
+                    loopInfo->number = begin->repeats.lower; // stop backtracking
+                }
+                else
+                {
+                    // Backtrack to the previous offset where we matched the LoopSet's followFirst
+                    loopInfo->number = loopInfo->offsetsOfFollowFirst->Pop();
+                }
+            }
+        }
+        else
+        {
+            loopInfo->number--;
+        }
+
+        // A popped offset may lie below the loop's minimum repeat count. Never rewind further than
+        // repeats.lower: fewer iterations would not be a valid match of the loop, and landing on
+        // repeats.lower also ends backtracking because the continuation is not un-popped below.
+        if (loopInfo->number < begin->repeats.lower)
+        {
+            loopInfo->number = begin->repeats.lower;
+        }
+
+        // Rewind input
+        inputOffset = loopInfo->startInputOffset + loopInfo->number;
+
+        if (loopInfo->number > begin->repeats.lower)
+        {
+            // Un-pop the continuation ready for next time
+            contStack.UnPop<RewindLoopSetWithFollowFirstCont>();
+#if ENABLE_REGEX_CONFIG_OPTIONS
+            matcher.UnPopStats(contStack, input);
+#endif
+        }
+        // else: Can't try any fewer iterations if follow fails, so leave continuation as popped and let failure propagate
+
+        instPointer = matcher.LabelToInstPointer(beginLabel + sizeof(LoopSetWithFollowFirstInst));
+        return true; // STOP BACKTRACKING
+    }
+
+#if ENABLE_REGEX_CONFIG_OPTIONS
+    // Debug dump of this continuation: its name and the label of the loop instruction it rewinds.
+    // Returns sizeof(*this).
+    int RewindLoopSetWithFollowFirstCont::Print(DebugWriter* w, const Char* const input) const
+    {
+        w->PrintEOL(_u("RewindLoopSetWithFollowFirst(beginLabel: L%04x)"), beginLabel);
+
+        return static_cast<int>(sizeof(*this));
+    }
+#endif
+
     // ----------------------------------------------------------------------
     // RewindLoopFixedGroupLastIterationCont
     // ----------------------------------------------------------------------
@@ -4473,7 +4632,6 @@ namespace UnifiedRegex
 #endif
 
         Run(input, inputLength, matchStart, nextSyncInputOffset, contStack, assertionStack, qcTicks, firstIteration);
-
         // Leave the continuation and assertion stack memory in place so we don't have to alloc next time
 
         return WasLastMatchSuccessful();

+ 30 - 1
lib/Parser/RegexRuntime.h

@@ -1230,10 +1230,24 @@ namespace UnifiedRegex
         inline LoopSetInst(int loopId, const CountDomain& repeats, bool hasOuterLoops)
             : Inst(LoopSet), loopId(loopId), repeats(repeats), hasOuterLoops(hasOuterLoops) {}
 
+        // Tag-taking overload: passes the caller-supplied InstTag through to Inst so that a derived
+        // instruction can identify itself with its own tag.
+        inline LoopSetInst(InstTag tag, int loopId, const CountDomain& repeats, bool hasOuterLoops)
+            : Inst(tag)
+            , loopId(loopId)
+            , repeats(repeats)
+            , hasOuterLoops(hasOuterLoops)
+        {
+        }
+        
         INST_BODY
         INST_BODY_FREE(SetMixin)
     };
 
+    // Greedy loop whose body is a single MatchSet and whose follow set has a known first character.
+    // Extends LoopSetInst with that character and identifies itself with the LoopSetWithFollowFirst tag.
+    struct LoopSetWithFollowFirstInst : LoopSetInst
+    {
+        // First character of the loop's follow set, as determined at compile time
+        Char followFirst;
+
+        LoopSetWithFollowFirstInst(int loopId, const CountDomain& repeats, bool hasOuterLoops, Char followFirst)
+            : LoopSetInst(InstTag::LoopSetWithFollowFirst, loopId, repeats, hasOuterLoops)
+            , followFirst(followFirst)
+        {
+        }
+
+        INST_BODY
+    };
+
     // Loop is greedy, fixed width, deterministic body, one outermost group
     struct BeginLoopFixedGroupLastIterationInst : Inst, BeginLoopMixin, FixedLengthMixin, GroupMixin, NoNeedToSaveMixin
     {
@@ -1425,16 +1439,23 @@ namespace UnifiedRegex
     {
         CharCount number;            // current iteration number
         CharCount startInputOffset;  // input offset where the iteration started
-
+        SList<CharCount>* offsetsOfFollowFirst; // list of offsets from startInputOffset where we matched with followFirst
+        
         inline void Reset()
         {
 #if DBG
             // So debug prints will look nice
             number = 0;
             startInputOffset = 0;
+            if (offsetsOfFollowFirst)
+            {
+                offsetsOfFollowFirst->Clear();
+            }
 #endif
         }
 
+        inline void EnsureOffsetsOfFollowFirst(Matcher& matcher);
+
 #if ENABLE_REGEX_CONFIG_OPTIONS
         void Print(DebugWriter* w) const;
 #endif
@@ -1604,6 +1625,14 @@ namespace UnifiedRegex
         CONT_BODY
     };
 
+    // Continuation that rewinds a LoopSetWithFollowFirst loop when its follow fails. It stores only
+    // the label of the loop instruction it belongs to.
+    struct RewindLoopSetWithFollowFirstCont : Cont
+    {
+        // Label of the LoopSetWithFollowFirst instruction
+        Label beginLabel;
+
+        RewindLoopSetWithFollowFirstCont(Label beginLabel)
+            : Cont(RewindLoopSetWithFollowFirst)
+            , beginLabel(beginLabel)
+        {
+        }
+
+        CONT_BODY
+    };
 
     struct RewindLoopFixedGroupLastIterationCont : Cont
     {