Переглянути джерело

Adding failure for character classes in range if unicode flag set.
Adding test cases for RegExp
Fixing bug where backtick '\' added to set if set ends with escaped character class

Kenji Fukuda 7 роки тому
батько
коміт
3daa80d6c3

+ 32 - 19
lib/Parser/RegexParser.cpp

@@ -2126,8 +2126,8 @@ namespace UnifiedRegex
         EncodedChar nextChar = ECLookahead();
         bool previousWasASurrogate = false;
         bool currIsACharSet = false;
-        bool prevWasACharSet = false;
-        bool prevprevWasACharSet = false;
+        bool prevWasACharSetAndPartOfRange = false;
+        bool prevprevWasACharSetAndPartOfRange = false;
 
         while(nextChar != ']')
         {
@@ -2138,6 +2138,7 @@ namespace UnifiedRegex
             {
                 ECConsume();
             }
+
             // These if-blocks are the logical ClassAtomPass1, they weren't grouped into a method to simplify dealing with multiple out parameters.
             if (containsSurrogates && this->currentSurrogatePairNode != nullptr && this->currentSurrogatePairNode->location == this->next)
             {
@@ -2156,7 +2157,12 @@ namespace UnifiedRegex
 
                 if (returnedNode->tag == Node::MatchSet)
                 {
-                    pendingCodePoint = nextChar;
+                    if (unicodeFlagPresent && pendingRangeStart != INVALID_CODEPOINT)
+                    {
+                        Fail(JSERR_UnicodeRegExpRangeContainsCharClass);
+                    }
+
+                    pendingCodePoint = INVALID_CODEPOINT;
                     if (pendingRangeStart != INVALID_CODEPOINT)
                     {
                         codePointSet.Set(ctAllocator, '-');
@@ -2173,9 +2179,9 @@ namespace UnifiedRegex
             }
             else if (nextChar == '-')
             {
-                if ((!prevWasACharSet && (pendingRangeStart != INVALID_CODEPOINT || pendingCodePoint == INVALID_CODEPOINT)) ||  ECLookahead(1) == ']')
+                if (pendingRangeStart != INVALID_CODEPOINT || pendingCodePoint == INVALID_CODEPOINT || ECLookahead(1) == ']')
                 {
-                    // - is just a char, or end of a range. If the previous char of the RegExp was a charset we want to treat it as the beginning of a range.
+                    // - is just a char, or end of a range.
                     codePointToSet = pendingCodePoint;
                     pendingCodePoint = '-';
                     ECConsume();
@@ -2193,26 +2199,33 @@ namespace UnifiedRegex
                 pendingCodePoint = NextChar();
             }
 
-            if (codePointToSet != INVALID_CODEPOINT)
+            if (codePointToSet != INVALID_CODEPOINT || prevprevWasACharSetAndPartOfRange)
             {
-                if (pendingRangeStart != INVALID_CODEPOINT)
+                if (unicodeFlagPresent && prevprevWasACharSetAndPartOfRange)
+                {
+                    Fail(JSERR_UnicodeRegExpRangeContainsCharClass);
+                }
+                else if (prevprevWasACharSetAndPartOfRange)
+                {
+                    if (pendingCodePoint != INVALID_CODEPOINT)
+                    {
+                        codePointSet.Set(ctAllocator, pendingCodePoint);
+                    }
+
+                    codePointSet.Set(ctAllocator, '-');
+                    pendingRangeStart = pendingCodePoint = INVALID_CODEPOINT;
+                }
+                else if (pendingRangeStart != INVALID_CODEPOINT)
                 {
-                    if (pendingRangeStart > pendingCodePoint && !prevprevWasACharSet)
+                    if (pendingRangeStart > pendingCodePoint)
                     {
                         //We have no unicodeFlag, but current range contains surrogates, thus we may end up having to throw a "Syntax" error here
                         //This breaks the notion of Pass0 check for valid syntax, because we don't know if we have a unicode option
                         Assert(!unicodeFlagPresent);
                         Fail(JSERR_RegExpBadRange);
                     }
-                    if (prevprevWasACharSet)
-                    {
-                        codePointSet.Set(ctAllocator, '-');
-                        codePointSet.Set(ctAllocator, pendingCodePoint);
-                    }
-                    else
-                    {
-                        codePointSet.SetRange(ctAllocator, pendingRangeStart, pendingCodePoint);
-                    }
+                    
+                    codePointSet.SetRange(ctAllocator, pendingRangeStart, pendingCodePoint);
                     pendingRangeStart = pendingCodePoint = INVALID_CODEPOINT;
                 }
                 else
@@ -2222,8 +2235,8 @@ namespace UnifiedRegex
             }
 
             nextChar = ECLookahead();
-            prevprevWasACharSet = prevWasACharSet;
-            prevWasACharSet = currIsACharSet;
+            prevprevWasACharSetAndPartOfRange = prevWasACharSetAndPartOfRange;
+            prevWasACharSetAndPartOfRange = currIsACharSet && nextChar == '-';
             currIsACharSet = false;
         }
 

+ 1 - 0
lib/Parser/rterrors.h

@@ -366,6 +366,7 @@ RT_ERROR_MSG(JSERR_NoAccessors, 5673, "Invalid property descriptor: accessors no
 RT_ERROR_MSG(JSERR_RegExpInvalidEscape, 5674, "", "Invalid regular expression: invalid escape in unicode pattern", kjstSyntaxError, 0)
 RT_ERROR_MSG(JSERR_RegExpTooManyCapturingGroups, 5675, "", "Regular expression cannot have more than 32,767 capturing groups", kjstRangeError, 0)
 RT_ERROR_MSG(JSERR_ProxyHandlerReturnedFalse, 5676, "Proxy %s handler returned false", "Proxy handler returned false", kjstTypeError, 0)
+RT_ERROR_MSG(JSERR_UnicodeRegExpRangeContainsCharClass, 5677, "", "Character classes not allowed in class ranges", kjstSyntaxError, 0)
 
 //Host errors
 RT_ERROR_MSG(JSERR_HostMaybeMissingPromiseContinuationCallback, 5700, "", "Host may not have set any promise continuation callback. Promises may not be executed.", kjstTypeError, 0)

+ 79 - 110
test/Regex/characterclass_with_range.js

@@ -5,159 +5,128 @@
 
 WScript.LoadScriptFile("..\\UnitTestFramework\\UnitTestFramework.js");
 
-let re = /^[\s-a-z]$/;
-let reIgnoreCase = /^[\s-a-z]$/i;
-let reUnicode = /^[\s-z]$/u;
-let reNoCharClass = /^[a-c-z]$/;
+function testRegExp(str, regexp, expectedResult)
+{
+    actualResult = regexp.test(str);
+    errorMsg = "Expected result of test for match between string: '" + str + "' and regular expression: " + regexp + " to be " + 
+                    expectedResult + " but was " + actualResult;
+    assert.areEqual(expectedResult, actualResult, errorMsg); 
+}
 
 var tests = [
-    /*No Flag RegExp Tests begin*/
-    {
-        name : "Ensure 'a-z' not counted as range",
-        body : function () 
-        {
-            assert.isFalse(re.test("b"));
-        }
-    },
-    {
-        name : "Ensure 'a' included in set",
-        body : function () 
-        {
-            assert.isTrue(re.test("a"));
-        }
-    },
-    {
-        name : "Ensure ' ' included in set",
-        body : function () 
-        {
-            assert.isTrue(re.test(" "));
-        }
-    },
-    {
-        name : "Ensure 'z' included in set",
-        body : function () 
-        {
-            assert.isTrue(re.test("z"));
-        }
-    },
-    {
-        name : "Ensure '\t' included in set",
-        body : function () 
-        {
-            assert.isTrue(re.test("\t"));
-        }
-    },
-    {
-        name : "Ensure 'a-z' not counted as range",
-        body : function () 
-        {
-            assert.isFalse(re.test("q"));
-        }
-    },
-    {
-        name : "Ensure '\' not counted in set",
-        body : function () 
-        {
-            assert.isFalse(re.test("\\"));
-        }
-    },
-    /*No Flag RegExp Tests End*/
-    /*IgnoreCase Flag RegExp Tests Begin*/
     {
-        name : "Ensure 'O' not included in set",
+        name : "RegExp tests with no flags",
         body : function () 
         {
-            assert.isFalse(reIgnoreCase.test("O"));
+            let re = /[\s-a-z]/;
+            testRegExp("b", re, false);
+            testRegExp("a", re, true);
+            testRegExp(" ", re, true);
+            testRegExp("z", re, true);
+            testRegExp("\t", re, true);
+            testRegExp("q", re, false);
+            testRegExp("\\", re, false);
+            testRegExp("\u2028", re, true);
+            testRegExp("\u2009", re, true);
         }
     },
     {
-        name : "Ensure 'A' included in set",
+        name : "RegExp tests with IgnoreCase flag set",
         body : function () 
         {
-            assert.isTrue(reIgnoreCase.test("A"));
+            let reIgnoreCase = /^[\s-a-z]$/i;
+            testRegExp("O", reIgnoreCase, false);
+            testRegExp("A", reIgnoreCase, true);
+            testRegExp(" ", reIgnoreCase, true);
+            testRegExp("z", reIgnoreCase, true);
+            testRegExp("\t", reIgnoreCase, true);
+            testRegExp("\u2028", reIgnoreCase, true);
+            testRegExp("\u2009", reIgnoreCase, true);
         }
     },
     {
-        name : "Ensure ' ' included in set",
+        name : "RegExp tests with Unicode flag set",
         body : function () 
         {
-            assert.isTrue(reIgnoreCase.test(" "));
+            let reUnicode = /^[a-d]$/u;
+            testRegExp("a", reUnicode, true);
+            testRegExp("c", reUnicode, true);
+            testRegExp("d", reUnicode, true);
+            testRegExp("C", reUnicode, false);
+            testRegExp("g", reUnicode, false);
+            testRegExp("\u2028", reUnicode, false);
+            testRegExp("\u2009", reUnicode, false);
+            assert.throws(() => eval("/^[\\s-z]$/u.exec(\"-\")"), SyntaxError, "Expected an error due to character sets not being allowed in ranges when unicode flag is set.", "Character classes not allowed in class ranges");
         }
     },
     {
-        name : "Ensure 'z' included in set",
+        name : "Non-character class tests",
         body : function () 
         {
-            assert.isTrue(reIgnoreCase.test("z"));
+            let reNoCharClass = /^[a-c-z]$/;
+            testRegExp("b", reNoCharClass, true);
+            testRegExp("-", reNoCharClass, true);
+            testRegExp("z", reNoCharClass, true);
+            testRegExp("y", reNoCharClass, false);
         }
     },
     {
-        name : "Ensure '\t' included in set",
+        name : "Regression tests from bugFixRegression",
         body : function () 
         {
-            assert.isTrue(reIgnoreCase.test("\t"));
+            assert.areEqual(" -a", /[\s-a-c]*/.exec(" -abc")[0]);
+            assert.areEqual(" -abc", /[\s\-a-c]*/.exec(" -abc")[0]);
+            assert.areEqual(" -ab", /[a-\s-b]*/.exec(" -ab")[0]);
+            assert.areEqual(" -ab", /[a\-\s\-b]*/.exec(" -ab")[0]);
         }
     },
-    /*IgnoreCase Flag RegExp Tests End*/
-    /*Unicode Flag RegExp Tests Begin*/
     {
-        name : "'-' included in set since \s-z treated as union of three types, not range",
+        name : "Special character tests",
         body : function () 
         {
-            assert.isTrue(reUnicode.test("-"));
+                let re = /^[\s][a\sb][\s--c-f]$/;
+                testRegExp('  \\', re, false);
+                testRegExp(' \\ ', re, false);
+                testRegExp('\\  ', re, false);
+                re = /[-][\d\-]/;
+                testRegExp('--', re, true);
+                testRegExp('-9', re, true);
+                testRegExp('  ', re, false);
         }
     },
     {
-        name : "' ' in set from \s character set",
+        name : "Negation character set tests",
         body : function () 
         {
-            assert.isTrue(reUnicode.test(" "));
+                let reNegationCharSet = /[\D-\s]+/;
+                testRegExp('555686', reNegationCharSet, false);
+                testRegExp('555-686', reNegationCharSet, true);
+                testRegExp('alphabet-123', reNegationCharSet, true);
         }
     },
     {
-        name : "b not included in '\s-z'",
+        name : "Non-range tests",
         body : function () 
         {
-            assert.isFalse(reUnicode.test("b"));
+                let reNonRange = /[-\w]/
+                testRegExp('-', reNonRange, true);
+                testRegExp('g', reNonRange, true);
+                testRegExp('5', reNonRange, true);
+                testRegExp(' ', reNonRange, false);
+                testRegExp('\t', reNonRange, false);
+                testRegExp('\u2028', reNonRange, false);
+                
+                reNonRange = /[\w-]/
+                testRegExp('-', reNonRange, true);
+                testRegExp('g', reNonRange, true);
+                testRegExp('5', reNonRange, true);
+                testRegExp(' ', reNonRange, false);
+                testRegExp('\t', reNonRange, false);
+                testRegExp('\u2028', reNonRange, false);
         }
-    },
-    /*Unicode Flag RegExp Tests End*/
-    /*Non-character class tests Begin*/
-    {
-        name : "First range is used",
-        body : function () 
-        {
-            assert.isTrue(reNoCharClass.test("b"));
-        }
-    },
-    {
-        name : "'-' included in set from 2nd dash",
-        body : function () 
-        {
-            assert.isTrue(reNoCharClass.test("-"));
-        }
-    },
-    {
-        name : "z included in set",
-        body : function () 
-        {
-            assert.isTrue(reNoCharClass.test("z"));
-        }
-    },
-    {
-        name : "'c-z' not viewed as range",
-        body : function () 
-        {
-            assert.isFalse(reNoCharClass.test("y"));
-        }
-    }    
-    /*Non-character class tests End*/
+    }
 ];
 
-if (typeof modifyTests !== "undefined") {
-    tests = modifyTests(tests);
-}
-
 testRunner.runTests(tests, {
     verbose : WScript.Arguments[0] != "summary"
 });

+ 0 - 20
test/UnifiedRegex/bugFixRegression.baseline

@@ -632,26 +632,6 @@ exec(/(?:a||b)?/ /*lastIndex=0*/ , "b");
 ["b"] /*input="b", index=0*/ 
 r.lastIndex=0
 RegExp.${_,1,...,9}=["b","","","","","","","","",""]
-exec(/[\s-a-c]*/ /*lastIndex=0*/ , " -abc");
-[" -abc"] /*input=" -abc", index=0*/ 
-r.lastIndex=0
-RegExp.${_,1,...,9}=[" -abc","","","","","","","","",""]
-exec(/[\s\-a-c]*/ /*lastIndex=0*/ , " -abc");
-[" -abc"] /*input=" -abc", index=0*/ 
-r.lastIndex=0
-RegExp.${_,1,...,9}=[" -abc","","","","","","","","",""]
-exec(/[a-\s-b]*/ /*lastIndex=0*/ , " -ab");
-[" -ab"] /*input=" -ab", index=0*/ 
-r.lastIndex=0
-RegExp.${_,1,...,9}=[" -ab","","","","","","","","",""]
-exec(/[a\-\s\-b]*/ /*lastIndex=0*/ , " -ab");
-[" -ab"] /*input=" -ab", index=0*/ 
-r.lastIndex=0
-RegExp.${_,1,...,9}=[" -ab","","","","","","","","",""]
-exec(/[\s--c-!]*/ /*lastIndex=0*/ , " -./0Abc!");
-[" -./0Abc!"] /*input=" -./0Abc!", index=0*/ 
-r.lastIndex=0
-RegExp.${_,1,...,9}=[" -./0Abc!","","","","","","","","",""]
 EXCEPTION
 exec(/x*(?:(?=x(y*)+)y|\1x)/ /*lastIndex=0*/ , "xxy");
 ["xx",undefined] /*input="xxy", index=0*/ 

+ 0 - 7
test/UnifiedRegex/bugFixRegression.js

@@ -501,13 +501,6 @@ exec(/(?:a*)?/, "");
 exec(/(?:a+)?/, "");
 exec(/(?:a||b)?/, "b");
 
-// WOOB1145588
-exec(/[\s-a-c]*/, " -abc");
-exec(/[\s\-a-c]*/, " -abc");
-exec(/[a-\s-b]*/, " -ab");
-exec(/[a\-\s\-b]*/, " -ab");
-exec(/[\s--c-!]*/, " -./0Abc!");
-
 try {
     var r = new RegExp("[\\s-c-a]*", "");
     exec(r, " -abc");