Bladeren bron

Change parsing of \c to reflect spec
Fixes #2973

Kenji Fukuda 7 jaren geleden
bovenliggende
commit
b15fa2c1de
3 gewijzigde bestanden met toevoegingen van 222 en 21 verwijderingen
  1. 14 20
      lib/Parser/RegexParser.cpp
  2. 201 0
      test/Regex/control_character_escapes.js
  3. 7 1
      test/Regex/rlexe.xml

+ 14 - 20
lib/Parser/RegexParser.cpp

@@ -1529,7 +1529,13 @@ namespace UnifiedRegex
         else if (ECLookahead() == 'c')
         {
             if (standardEncodedChars->IsLetter(ECLookahead(1))) // terminating 0 is not a letter
+            {
                 ECConsume(2);
+            }
+            else
+            {
+                DeferredFailIfUnicode(JSERR_RegExpInvalidEscape);
+            }
             return false;
         }
         else
@@ -2494,6 +2500,8 @@ namespace UnifiedRegex
                 }
                 else
                 {
+                    DeferredFailIfUnicode(JSERR_RegExpInvalidEscape); // Fail in unicode mode for non-letter escaped control characters according to 262 Annex-B RegExp grammar SPEC #prod-annexB-Term 
+
                     if (!IsEOF())
                     {
                         EncodedChar ecLookahead = ECLookahead();
@@ -2625,7 +2633,7 @@ namespace UnifiedRegex
                 standardChars->SetNonWordChars(ctAllocator, deferredSetNode->set);
                 return deferredSetNode;
             case 'c':
-                if (standardEncodedChars->IsLetter(ECLookahead())) // terminating 0 is not a letter
+                if (standardEncodedChars->IsWord(ECLookahead())) // terminating 0 is not a word
                 {
                     c = UTC(Chars<EncodedChar>::CTU(ECLookahead()) % 32);
                     ECConsume();
@@ -2633,25 +2641,11 @@ namespace UnifiedRegex
                 }
                 else
                 {
-                    // SPEC DEVIATION: For non-letters, still take lower 5 bits, e.g. [\c1] == [\x11].
-                    //                 However, '-', ']', and EOF make the \c just a 'c'.
-                    if (!IsEOF())
-                    {
-                        EncodedChar ec = ECLookahead();
-                        switch (ec)
-                        {
-                        case '-':
-                        case ']':
-                            // fall-through for identity escape with 'c'
-                            break;
-                        default:
-                            c = UTC(Chars<EncodedChar>::CTU(ec) % 32);
-                            ECConsume();
-                            // fall-through for identity escape
-                            break;
-                        }
-                    }
-                    // else: fall-through for identity escape with 'c'
+                    // If the lookahead is not a word character (anything outside [A-Za-z0-9_], including '-'), treat '\' and 'c' as two separate literal characters.
+                    // See ECMA-262 #sec-regular-expression-patterns-semantics.
+                    ECRevert(1); //Put cursor back at 'c' and treat it as a non-escaped character.
+                    deferredCharNode->cs[0] = '\\';
+                    return deferredCharNode;
                 }
                 break;
             case 'x':

+ 201 - 0
test/Regex/control_character_escapes.js

@@ -0,0 +1,201 @@
+//-------------------------------------------------------------------------------------------------------
+// Copyright (C) Microsoft. All rights reserved.
+// Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
+//-------------------------------------------------------------------------------------------------------
+
+WScript.LoadScriptFile("..\\UnitTestFramework\\UnitTestFramework.js");
+
+function matchRegExp(str, regexpLiteral, expectedResult)
+{
+    matchResultLiteral = str.match(regexpLiteral);
+    errorMsgBase = "Expected result of match between string: '" + str + "' and regular expression: " + regexpLiteral + " to be " + 
+                    expectedResult + " but was "
+
+    actualResultLiteral = matchResultLiteral == null ? null : matchResultLiteral[0];
+    assert.areEqual(expectedResult, actualResultLiteral, errorMsgBase + actualResultLiteral); 
+    
+    regexpConstructor = new RegExp(regexpLiteral);
+    matchResultConstructor = str.match(regexpConstructor);
+
+    actualResultConstructor = matchResultConstructor == null ? null : matchResultConstructor[0];
+    assert.areEqual(expectedResult, actualResultConstructor, errorMsgBase + actualResultConstructor); 
+}
+
+var tests = [
+    {
+        name : "Control characters followed by a word character ([A-Za-z0-9_])",
+        body : function () 
+        {
+            re = /[\c6]+/; //'6' = ascii x36
+            matchRegExp("6", re, null);
+            matchRegExp("\\", re, null);
+            matchRegExp("\\c6", re, null);
+            matchRegExp("c", re, null);
+            matchRegExp("\x16", re, "\x16");
+            
+            re = /\c6/; //'6' = ascii x36
+            matchRegExp("\\c6", re, "\\c6");
+            matchRegExp("\\", re, null);
+            matchRegExp("6", re, null);
+            matchRegExp("c", re, null);
+            matchRegExp("\x16", re, null);
+            
+            re = /\c6[\c6]+/; //'6' = ascii x36
+            matchRegExp("\\c6\x16", re, "\\c6\x16");
+            matchRegExp("\\", re, null);
+            matchRegExp("c", re, null);
+            matchRegExp("\x16", re, null);
+            
+            re = /[\ca]+/; //'a' = ascii x61
+            matchRegExp("a", re, null);
+            matchRegExp("\\", re, null);
+            matchRegExp("c", re, null);
+            matchRegExp("00xyzabc123\x01qrst", re, "\x01");
+	    
+            re = /[\c_]+/; //'_' = ascii 0x5F
+            matchRegExp("\x1F\x1F\x05", re, "\x1F\x1F");
+            matchRegExp("\\\\\\", re, null);
+            matchRegExp("////", re, null);
+            matchRegExp("ccc_", re, null);
+            
+            re = /[\cG]*/; //'G' = ascii x47
+            matchRegExp("\x07\x06\x05", re, "\x07");
+            matchRegExp("\\\\", re, "");
+            matchRegExp("////", re, "");
+            matchRegExp("cccG", re, "");
+            
+            re = /\cG\cf/; //'G' = ascii x47, 'f' = ascii x66
+            matchRegExp("\x00\x03\x07\x06\x07\x08", re, "\x07\x06");
+            matchRegExp("\\", re, null);
+            matchRegExp("/", re, null);
+            matchRegExp("\\cG\\c6\\cf", re, null);
+            
+            re = /[\cG\c6\cf]+/; //'G' = ascii x47, '6' = ascii x36, 'f' = ascii x66
+            matchRegExp("\x00\x03\x07\x06\x16\x07\x08", re, "\x07\x06\x16\x07");
+            matchRegExp("\\\\", re, null);
+            matchRegExp("////", re, null);
+            matchRegExp("cfG6", re, null);
+            
+            re = /\cG\cf/; //'G' = ascii x47, 'f' = ascii x66
+            matchRegExp("\x00\x03\x07\x06\x16\x07\x08", re, "\x07\x06");
+            matchRegExp("\\", re, null);
+            matchRegExp("/", re, null);
+            matchRegExp("\\cG\\c6\\cf", re, null);
+            
+            re = /[\cz\cZ]+/; //'z' = ascii x7A, 'Z' = ascii x5A, have the same lowest 5 bits
+            matchRegExp("\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + 
+                        "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f", re, "\x1a");
+            matchRegExp("\\\\", re, null);
+            matchRegExp("////", re, null);
+            matchRegExp("ccczZ", re, null);
+        }
+    },
+    {
+        name : "Control characters followed by a non-word character ([^A-Za-z0-9_])",
+        body : function () 
+        {
+            re = /[\c*]+/; //'*' = ascii 42
+            matchRegExp("\x0a\x09\x08", re, null);
+            matchRegExp("a*c*b*d*", re, "*c*");
+            matchRegExp("\\\\", re, "\\\\");
+            matchRegExp("////", re, null);
+            matchRegExp("ccc", re, "ccc");
+            
+            re = /[\c}]*/; //'}' = ascii 125
+            matchRegExp("\x1d\x7d\x3d", re, "");
+            matchRegExp("}c}}cd*c*b*d*", re, "}c}}c");
+            matchRegExp("\\\\", re, "\\\\");
+            matchRegExp("////", re, "");
+            matchRegExp("ccc", re, "ccc");
+            
+            re = /[\c;]+/; //';' = ascii 59
+            matchRegExp("\x1b\x1c", re, null);
+            matchRegExp("d;c;d;*", re, ";c;");
+            matchRegExp("\\\\", re, "\\\\");
+            matchRegExp("////", re, null);
+            matchRegExp("ccc", re, "ccc");
+            
+            re = /\c%/; //'%' = ascii x25
+            matchRegExp("\\", re, null);
+            matchRegExp("\\", re, null);
+            matchRegExp("\\c%", re, "\\c%");
+            matchRegExp("\x05", re, null);
+        }
+    },
+    {
+        name : "Control Character tests with unicode flag present",
+        body : function () 
+        {
+            re = /[\cAg]+/u; //'A' = ascii x41
+            matchRegExp("abcdefghi", re, "g");
+            matchRegExp("\\\\", re, null);
+            matchRegExp("////", re, null);
+            matchRegExp("\x01\x01gg\x02\x04ggg", re, "\x01\x01gg");            
+            
+            re = /[\czA]+/u;  //'z' = ascii x7A
+            matchRegExp("abcdefghi", re, null);
+            matchRegExp("\\\\", re, null);
+            matchRegExp("////", re, null);
+            matchRegExp("YZA\x1aABC", re, "A\x1aA");    
+            
+            assert.throws(() => eval("\"\".match(/[\\c]/u)"), SyntaxError, "Expected an error due to non-letters being disallowed from control character when unicode flag present", 
+                        "Invalid regular expression: invalid escape in unicode pattern");
+            assert.throws(() => eval("\"\".match(/[\\c-d]/u)"), SyntaxError, "Expected an error due to non-letters being disallowed from control character when unicode flag present", 
+                        "Invalid regular expression: invalid escape in unicode pattern");
+            assert.throws(() => eval("\"\".match(/[ab\\c_$]/u)"), SyntaxError, "Expected an error due to non-letters being disallowed from control character when unicode flag present", 
+                        "Invalid regular expression: invalid escape in unicode pattern");
+            assert.throws(() => eval("\"\".match(/[ab\\c\\d]/u)"), SyntaxError, "Expected an error due to non-letters being disallowed from control character when unicode flag present", 
+                        "Invalid regular expression: invalid escape in unicode pattern");
+            assert.throws(() => eval("\"\".match(/[ab\\c3]/u)"), SyntaxError, "Expected an error due to non-letters being disallowed from control character when unicode flag present", 
+                        "Invalid regular expression: invalid escape in unicode pattern");
+                        
+            re = /\cAg/u;  //'A' = ascii x41
+            matchRegExp("abcdefghi", re, null);
+            matchRegExp("\\\\", re, null);
+            matchRegExp("////", re, null);
+            matchRegExp("\x01\x01gg\x02\x04ggg", re, "\x01g");            
+            
+            re = /\czA/u;  //'z' = ascii x7A
+            matchRegExp("abcdefghi", re, null);
+            matchRegExp("\\\\", re, null);
+            matchRegExp("////", re, null);
+            matchRegExp("YZA\x1aABC", re, "\x1aA");   
+            
+            assert.throws(() => eval("\"\".match(/\\c/u)"), SyntaxError, "Expected an error due to non-letters being disallowed from control character when unicode flag present", 
+                        "Invalid regular expression: invalid escape in unicode pattern");
+            assert.throws(() => eval("\"\".match(/\\c-d/u)"), SyntaxError, "Expected an error due to non-letters being disallowed from control character when unicode flag present", 
+                        "Invalid regular expression: invalid escape in unicode pattern");
+            assert.throws(() => eval("\"\".match(/ab\\c_$/u)"), SyntaxError, "Expected an error due to non-letters being disallowed from control character when unicode flag present", 
+                        "Invalid regular expression: invalid escape in unicode pattern");
+            assert.throws(() => eval("\"\".match(/ab\\c\\d/u)"), SyntaxError, "Expected an error due to non-letters being disallowed from control character when unicode flag present", 
+                        "Invalid regular expression: invalid escape in unicode pattern");
+            assert.throws(() => eval("\"\".match(/ab\\c3/u)"), SyntaxError, "Expected an error due to non-letters being disallowed from control character when unicode flag present", 
+                        "Invalid regular expression: invalid escape in unicode pattern");
+        }
+    },
+    {
+        name : "Control character edge cases",
+        body : function () 
+        {
+            re = /[\c-g]+/; //'-' = ascii x2D
+            matchRegExp("abcdefghi", re, "cdefg");
+            matchRegExp("\\\\", re, "\\\\");
+            matchRegExp("////", re, null);
+            matchRegExp("\x0d", re, null);
+            matchRegExp("aobd\\f\\d", re, "d\\f\\d");            
+            
+            re = /[\c-]+/; //'-' = ascii x2D
+            matchRegExp("abcdefghi", re, "c");
+            matchRegExp("\x0d", re, null);
+            matchRegExp("\\\\", re, "\\\\");
+            matchRegExp("////", re, null);
+            matchRegExp("aobd\\f\\d", re, "\\");  
+            
+            assert.throws(() => eval("\"\".match(/[\\c-a]/)"), SyntaxError, "Expected an error due to 'c-a' being an invalid range.", "Invalid range in character set");
+        }
+    }
+];
+
+testRunner.runTests(tests, {
+    verbose : WScript.Arguments[0] != "summary"
+});

+ 7 - 1
test/Regex/rlexe.xml

@@ -229,10 +229,16 @@
       <compile-flags>-args summary -endargs</compile-flags>
     </default>
   </test>
-    <test>
+  <test>
     <default>
       <files>characterclass_with_range.js</files>
       <compile-flags>-args summary -endargs</compile-flags>
     </default>
   </test>
+  <test>
+    <default>
+      <files>control_character_escapes.js</files>
+      <compile-flags>-args summary -endargs</compile-flags>
+    </default>
+  </test>
 </regress-exe>