7 tahun lalu · 288d7fec0d
--- a/lib/Parser/DebugWriter.cpp
+++ b/lib/Parser/DebugWriter.cpp
@@ -72,6 +72,8 @@ namespace UnifiedRegex
 
				         CheckForNewline();
			
 
				         if (c > 0xff)
			
 
				             Output::Print(_u("\\u%lc%lc%lc%lc"), hex[c >> 12], hex[(c >> 8) & 0xf], hex[(c >> 4) & 0xf], hex[c & 0xf]);
			
 
				+        else if (c == '-')
			
 
				+            Output::Print(_u("\\x2d"));
			
 
				         else if (c < ' ' || c > '~')
			
 
				             Output::Print(_u("\\x%lc%lc"), hex[c >> 4], hex[c & 0xf]);
			
 
				         else
			
--- a/lib/Parser/RegexParser.cpp
+++ b/lib/Parser/RegexParser.cpp
@@ -1931,6 +1931,7 @@ namespace UnifiedRegex
 
				         codepoint_t pendingRangeStart = INVALID_CODEPOINT;
			
 
				         codepoint_t pendingRangeEnd = INVALID_CODEPOINT;
			
 
				         bool previousSurrogatePart = false;
			
 
				+
			
 
				         while(nextChar != ']')
			
 
				         {
			
 
				             current = next;
			
@@ -2034,7 +2035,7 @@ namespace UnifiedRegex
 
				 
			
 
				                     lastCodepoint = INVALID_CODEPOINT;
			
 
				                 }
			
 
				-                // If we the next character is the end of range ']', then we can't have a surrogate pair.
			
 
				+                // If the next character is the end of range ']', then we can't have a surrogate pair.
			
 
				                 // The current character is the range end, if we don't already have a candidate.
			
 
				                 else if (ECLookahead() == ']' && pendingRangeEnd == INVALID_CODEPOINT)
			
 
				                 {
			
@@ -2124,6 +2125,10 @@ namespace UnifiedRegex
 
				         codepoint_t pendingRangeStart = INVALID_CODEPOINT;
			
 
				         EncodedChar nextChar = ECLookahead();
			
 
				         bool previousWasASurrogate = false;
			
 
				+        bool currIsACharSet = false;
			
 
				+        bool prevWasACharSetAndPartOfRange = false;
			
 
				+        bool prevprevWasACharSetAndPartOfRange = false;
			
 
				+
			
 
				         while(nextChar != ']')
			
 
				         {
			
 
				             codepoint_t codePointToSet = INVALID_CODEPOINT;
			
@@ -2133,6 +2138,7 @@ namespace UnifiedRegex
 
				             {
			
 
				                 ECConsume();
			
 
				             }
			
 
				+
			
 
				             // These if-blocks are the logical ClassAtomPass1, they weren't grouped into a method to simplify dealing with multiple out parameters.
			
 
				             if (containsSurrogates && this->currentSurrogatePairNode != nullptr && this->currentSurrogatePairNode->location == this->next)
			
 
				             {
			
@@ -2147,22 +2153,30 @@ namespace UnifiedRegex
 
				             else if (nextChar == '\\')
			
 
				             {
			
 
				                 Node* returnedNode = ClassEscapePass1(&deferredCharNode, &deferredSetNode, previousWasASurrogate);
			
 
				+                codePointToSet = pendingCodePoint;
			
 
				 
			
 
				                 if (returnedNode->tag == Node::MatchSet)
			
 
				                 {
			
 
				-                    codePointToSet = pendingCodePoint;
			
 
				-                    pendingCodePoint = INVALID_CODEPOINT;
			
 
				                     if (pendingRangeStart != INVALID_CODEPOINT)
			
 
				                     {
			
 
				+                        if (unicodeFlagPresent)
			
 
				+                        {
			
 
				+                            //We a range containing a character class and the unicode flag is present, thus we end up having to throw a "Syntax" error here
			
 
				+                            //This breaks the notion of Pass0 check for valid syntax, because during that time, the unicode flag is unknown.
			
 
				+                            Fail(JSERR_UnicodeRegExpRangeContainsCharClass); //From #sec-patterns-static-semantics-early-errors-annexb
			
 
				+                        }
			
 
				+
			
 
				                         codePointSet.Set(ctAllocator, '-');
			
 
				                     }
			
 
				+
			
 
				+                    pendingCodePoint = INVALID_CODEPOINT;
			
 
				                     pendingRangeStart = INVALID_CODEPOINT;
			
 
				                     codePointSet.UnionInPlace(ctAllocator, deferredSetNode.set);
			
 
				+                    currIsACharSet = true;
			
 
				                 }
			
 
				                 else
			
 
				                 {
			
 
				                     // Just a character
			
 
				-                    codePointToSet = pendingCodePoint;
			
 
				                     pendingCodePoint = deferredCharNode.cs[0];
			
 
				                 }
			
 
				             }
			
@@ -2188,9 +2202,26 @@ namespace UnifiedRegex
 
				                 pendingCodePoint = NextChar();
			
 
				             }
			
 
				 
			
 
				-            if (codePointToSet != INVALID_CODEPOINT)
			
 
				+            if (codePointToSet != INVALID_CODEPOINT || prevprevWasACharSetAndPartOfRange)
			
 
				             {
			
 
				-                if (pendingRangeStart != INVALID_CODEPOINT)
			
 
				+                if (prevprevWasACharSetAndPartOfRange)
			
 
				+                {
			
 
				+                    //We a range containing a character class and the unicode flag is present, thus we end up having to throw a "Syntax" error here
			
 
				+                    //This breaks the notion of Pass0 check for valid syntax, because during that time, the unicode flag is unknown.
			
 
				+                    if (unicodeFlagPresent)
			
 
				+                    {
			
 
				+                        Fail(JSERR_UnicodeRegExpRangeContainsCharClass);
			
 
				+                    }
			
 
				+
			
 
				+                    if (pendingCodePoint != INVALID_CODEPOINT)
			
 
				+                    {
			
 
				+                        codePointSet.Set(ctAllocator, pendingCodePoint);
			
 
				+                    }
			
 
				+
			
 
				+                    codePointSet.Set(ctAllocator, '-'); //Add '-' to set because a range was detected but turned out to be a union of character set with '-' and another atom.
			
 
				+                    pendingRangeStart = pendingCodePoint = INVALID_CODEPOINT;
			
 
				+                }
			
 
				+                else if (pendingRangeStart != INVALID_CODEPOINT)
			
 
				                 {
			
 
				                     if (pendingRangeStart > pendingCodePoint)
			
 
				                     {
			
@@ -2199,6 +2230,7 @@ namespace UnifiedRegex
 
				                         Assert(!unicodeFlagPresent);
			
 
				                         Fail(JSERR_RegExpBadRange);
			
 
				                     }
			
 
				+                    
			
 
				                     codePointSet.SetRange(ctAllocator, pendingRangeStart, pendingCodePoint);
			
 
				                     pendingRangeStart = pendingCodePoint = INVALID_CODEPOINT;
			
 
				                 }
			
@@ -2209,6 +2241,9 @@ namespace UnifiedRegex
 
				             }
			
 
				 
			
 
				             nextChar = ECLookahead();
			
 
				+            prevprevWasACharSetAndPartOfRange = prevWasACharSetAndPartOfRange;
			
 
				+            prevWasACharSetAndPartOfRange = currIsACharSet && nextChar == '-';
			
 
				+            currIsACharSet = false;
			
 
				         }
			
 
				 
			
 
				         if (pendingCodePoint != INVALID_CODEPOINT)
			
--- a/lib/Parser/rterrors.h
+++ b/lib/Parser/rterrors.h
@@ -366,6 +366,7 @@ RT_ERROR_MSG(JSERR_NoAccessors, 5673, "Invalid property descriptor: accessors no
 
				 RT_ERROR_MSG(JSERR_RegExpInvalidEscape, 5674, "", "Invalid regular expression: invalid escape in unicode pattern", kjstSyntaxError, 0)
			
 
				 RT_ERROR_MSG(JSERR_RegExpTooManyCapturingGroups, 5675, "", "Regular expression cannot have more than 32,767 capturing groups", kjstRangeError, 0)
			
 
				 RT_ERROR_MSG(JSERR_ProxyHandlerReturnedFalse, 5676, "Proxy %s handler returned false", "Proxy handler returned false", kjstTypeError, 0)
			
 
				+RT_ERROR_MSG(JSERR_UnicodeRegExpRangeContainsCharClass, 5677, "%s", "Character classes not allowed in a RegExp class range.", kjstSyntaxError, 0)
			
 
				 
			
 
				 //Host errors
			
 
				 RT_ERROR_MSG(JSERR_HostMaybeMissingPromiseContinuationCallback, 5700, "", "Host may not have set any promise continuation callback. Promises may not be executed.", kjstTypeError, 0)
			
--- a/test/Regex/characterclass_with_range.js
+++ b/test/Regex/characterclass_with_range.js
@@ -0,0 +1,140 @@
 
				+//-------------------------------------------------------------------------------------------------------

			
 
				+// Copyright (C) Microsoft. All rights reserved.

			
 
				+// Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.

			
 
				+//-------------------------------------------------------------------------------------------------------

			
 
				+

			
 
				+WScript.LoadScriptFile("..\\UnitTestFramework\\UnitTestFramework.js");

			
 
				+

			
 
				+function matchRegExp(str, regexp, expectedResult)

			
 
				+{

			
 
				+    matchResult = str.match(regexp);//regexp.test(str);

			
 
				+    errorMsg = "Expected result of match between string: '" + str + "' and regular expression: " + regexp + " to be " + 

			
 
				+                    expectedResult + " but was " + matchResult;

			
 
				+

			
 
				+    actualResult = matchResult == null ? null : matchResult[0];

			
 
				+    assert.areEqual(expectedResult, actualResult, errorMsg); 

			
 
				+}

			
 
				+

			
 
				+var tests = [

			
 
				+    {

			
 
				+        name : "RegExp tests with no flags",

			
 
				+        body : function () 

			
 
				+        {

			
 
				+            let re = /[\s-a-z]/;

			
 
				+            matchRegExp("b", re, null);

			
 
				+            matchRegExp("a", re, "a");

			
 
				+            matchRegExp(" ", re, " ");

			
 
				+            matchRegExp("z", re, "z");

			
 
				+            matchRegExp("\t", re, "\t");

			
 
				+            matchRegExp("q", re, null);

			
 
				+            matchRegExp("\\", re, null);

			
 
				+            matchRegExp("\u2028", re, "\u2028");

			
 
				+            matchRegExp("\u2009", re, "\u2009");

			
 
				+        }

			
 
				+    },

			
 
				+    {

			
 
				+        name : "RegExp tests with IgnoreCase flag set",

			
 
				+        body : function () 

			
 
				+        {

			
 
				+            let reIgnoreCase = /^[\s-a-z]$/i;

			
 
				+            matchRegExp("O", reIgnoreCase, null);

			
 
				+            matchRegExp("A", reIgnoreCase, "A");

			
 
				+            matchRegExp(" ", reIgnoreCase, " ");

			
 
				+            matchRegExp("z", reIgnoreCase, "z");

			
 
				+            matchRegExp("\t", reIgnoreCase, "\t");

			
 
				+            matchRegExp("\u2028", reIgnoreCase, "\u2028");

			
 
				+            matchRegExp("\u2009", reIgnoreCase, "\u2009");

			
 
				+        }

			
 
				+    },

			
 
				+    {

			
 
				+        name : "RegExp tests with Unicode flag set",

			
 
				+        body : function () 

			
 
				+        {

			
 
				+            let reUnicode = /^[a-d]$/u;

			
 
				+            matchRegExp("a", reUnicode, "a");

			
 
				+            matchRegExp("c", reUnicode, "c");

			
 
				+            matchRegExp("d", reUnicode, "d");

			
 
				+            matchRegExp("C", reUnicode, null);

			
 
				+            matchRegExp("g", reUnicode, null);

			
 
				+            matchRegExp("\u2028", reUnicode, null);

			
 
				+            matchRegExp("\u2009", reUnicode, null);

			
 
				+            assert.throws(() => eval("/^[\\s-z]$/u.exec(\"-\")"), SyntaxError, "Expected an error due to character sets not being allowed in ranges when unicode flag is set.", "Character classes not allowed in a RegExp class range.");

			
 
				+            assert.throws(() => eval("/^[z-\\s]$/u.exec(\"-\")"), SyntaxError, "Expected an error due to character sets not being allowed in ranges when unicode flag is set.", "Character classes not allowed in a RegExp class range.");

			
 
				+        

			
 
				+        }

			
 
				+    },

			
 
				+    {

			
 
				+        name : "Non-character class tests",

			
 
				+        body : function () 

			
 
				+        {

			
 
				+            let reNoCharClass = /^[a-c-z]$/;

			
 
				+            matchRegExp("b", reNoCharClass, "b");

			
 
				+            matchRegExp("-", reNoCharClass, "-");

			
 
				+            matchRegExp("z", reNoCharClass, "z");

			
 
				+            matchRegExp("y", reNoCharClass, null);

			
 
				+        }

			
 
				+    },

			
 
				+    {

			
 
				+        name : "Regression tests from bugFixRegression",

			
 
				+        body : function () 

			
 
				+        {

			
 
				+            matchRegExp(" -abc", /[\s-a-c]*/, " -a");

			
 
				+            matchRegExp(" -abc", /[\s\-a-c]*/, " -abc");

			
 
				+            matchRegExp(" -ab", /[a-\s-b]*/, " -ab");

			
 
				+            matchRegExp(" -ab", /[a\-\s\-b]*/, " -ab");

			
 
				+            assert.throws(() => eval("/^[\\s--c-!]$/.exec(\"-./0Abc!\")"), SyntaxError, "Expected an error due to 'c-!' being an invalid range.", "Invalid range in character set");

			
 
				+        }

			
 
				+    },

			
 
				+    {

			
 
				+        name : "Special character tests",

			
 
				+        body : function () 

			
 
				+        {

			
 
				+                let re = /^[\s][a\sb][\s--c-f]$/;

			
 
				+                matchRegExp('  \\', re, null);

			
 
				+                matchRegExp(' \\ ', re, null);

			
 
				+                matchRegExp('\\  ', re, null);

			
 
				+                re = /[-][\d\-]/;

			
 
				+                matchRegExp('--', re, '--');

			
 
				+                matchRegExp('-9', re, '-9');

			
 
				+                matchRegExp('  ', re, null);

			
 
				+                matchRegExp('-\\', re, null);

			
 
				+        }

			
 
				+    },

			
 
				+    {

			
 
				+        name : "Negation character set tests",

			
 
				+        body : function () 

			
 
				+        {

			
 
				+                let reNegationCharSet = /[\D-\s]+/;

			
 
				+                matchRegExp('555686', reNegationCharSet, null);

			
 
				+                matchRegExp('555-686', reNegationCharSet, '-');

			
 
				+                matchRegExp('alphabet-123', reNegationCharSet, 'alphabet-');

			
 
				+        }

			
 
				+    },

			
 
				+    {

			
 
				+        name : "Non-range tests",

			
 
				+        body : function () 

			
 
				+        {

			
 
				+                let reNonRange = /[-\w]/

			
 
				+                matchRegExp('-', reNonRange, '-');

			
 
				+                matchRegExp('g', reNonRange, 'g');

			
 
				+                matchRegExp('5', reNonRange, '5');

			
 
				+                matchRegExp(' ', reNonRange, null);

			
 
				+                matchRegExp('\t', reNonRange, null);

			
 
				+                matchRegExp('\u2028', reNonRange, null);

			
 
				+                matchRegExp('\\', reNonRange, null);

			
 
				+                

			
 
				+                reNonRange = /[\w-]/

			
 
				+                matchRegExp('-', reNonRange, '-');

			
 
				+                matchRegExp('g', reNonRange, 'g');

			
 
				+                matchRegExp('5', reNonRange, '5');

			
 
				+                matchRegExp(' ', reNonRange, null);

			
 
				+                matchRegExp('\t', reNonRange, null);

			
 
				+                matchRegExp('\u2028', reNonRange, null);

			
 
				+                matchRegExp('\\', reNonRange, null); 

			
 
				+        }

			
 
				+    }

			
 
				+];

			
 
				+

			
 
				+testRunner.runTests(tests, {

			
 
				+    verbose : WScript.Arguments[0] != "summary"

			
 
				+});

			
--- a/test/Regex/rlexe.xml
+++ b/test/Regex/rlexe.xml
@@ -229,4 +229,10 @@
 
				       <compile-flags>-args summary -endargs</compile-flags>
			
 
				     </default>
			
 
				   </test>
			
 
				+    <test>
			
 
				+    <default>
			
 
				+      <files>characterclass_with_range.js</files>
			
 
				+      <compile-flags>-args summary -endargs</compile-flags>
			
 
				+    </default>
			
 
				+  </test>
			
 
				 </regress-exe>
			
--- a/test/UnifiedRegex/bugFixRegression.baseline
+++ b/test/UnifiedRegex/bugFixRegression.baseline
@@ -632,26 +632,6 @@ exec(/(?:a||b)?/ /*lastIndex=0*/ , "b");
 
				 ["b"] /*input="b", index=0*/ 

			
 
				 r.lastIndex=0

			
 
				 RegExp.${_,1,...,9}=["b","","","","","","","","",""]

			
 
				-exec(/[\s-a-c]*/ /*lastIndex=0*/ , " -abc");

			
 
				-[" -abc"] /*input=" -abc", index=0*/ 

			
 
				-r.lastIndex=0

			
 
				-RegExp.${_,1,...,9}=[" -abc","","","","","","","","",""]

			
 
				-exec(/[\s\-a-c]*/ /*lastIndex=0*/ , " -abc");

			
 
				-[" -abc"] /*input=" -abc", index=0*/ 

			
 
				-r.lastIndex=0

			
 
				-RegExp.${_,1,...,9}=[" -abc","","","","","","","","",""]

			
 
				-exec(/[a-\s-b]*/ /*lastIndex=0*/ , " -ab");

			
 
				-[" -ab"] /*input=" -ab", index=0*/ 

			
 
				-r.lastIndex=0

			
 
				-RegExp.${_,1,...,9}=[" -ab","","","","","","","","",""]

			
 
				-exec(/[a\-\s\-b]*/ /*lastIndex=0*/ , " -ab");

			
 
				-[" -ab"] /*input=" -ab", index=0*/ 

			
 
				-r.lastIndex=0

			
 
				-RegExp.${_,1,...,9}=[" -ab","","","","","","","","",""]

			
 
				-exec(/[\s--c-!]*/ /*lastIndex=0*/ , " -./0Abc!");

			
 
				-[" -./0Abc!"] /*input=" -./0Abc!", index=0*/ 

			
 
				-r.lastIndex=0

			
 
				-RegExp.${_,1,...,9}=[" -./0Abc!","","","","","","","","",""]

			
 
				 EXCEPTION

			
 
				 exec(/x*(?:(?=x(y*)+)y|\1x)/ /*lastIndex=0*/ , "xxy");

			
 
				 ["xx",undefined] /*input="xxy", index=0*/ 

			
--- a/test/UnifiedRegex/bugFixRegression.js
+++ b/test/UnifiedRegex/bugFixRegression.js
@@ -501,13 +501,6 @@ exec(/(?:a*)?/, "");
 
				 exec(/(?:a+)?/, "");

			
 
				 exec(/(?:a||b)?/, "b");

			
 
				 

			
 
				-// WOOB1145588

			
 
				-exec(/[\s-a-c]*/, " -abc");

			
 
				-exec(/[\s\-a-c]*/, " -abc");

			
 
				-exec(/[a-\s-b]*/, " -ab");

			
 
				-exec(/[a\-\s\-b]*/, " -ab");

			
 
				-exec(/[\s--c-!]*/, " -./0Abc!");

			
 
				-

			
 
				 try {

			
 
				     var r = new RegExp("[\\s-c-a]*", "");

			
 
				     exec(r, " -abc");