regex-unicode.js 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104
  1. //-------------------------------------------------------------------------------------------------------
  2. // Copyright (C) Microsoft. All rights reserved.
  3. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
  4. //-------------------------------------------------------------------------------------------------------
  5. WScript.LoadScriptFile("..\\UnitTestFramework\\UnitTestFramework.js");
  /**
   * Tests `string` against `re` and reports the outcome through `asserter`.
   *
   * @param {function(boolean, string)} asserter - Called with the boolean result of
   *     `re.test(string)` and with `message`.
   * @param {RegExp} re - Pattern under test.
   * @param {string} string - Input the pattern is tested against.
   * @param {string} message - Description passed through to `asserter`.
   */
  function assertTest(asserter, re, string, message) {
      var isMatch = re.test(string);
      asserter(isMatch, message);
  }
  /**
   * Asserts that `re` matches `string`.
   *
   * The parameters are declared explicitly (instead of forwarding the implicit
   * `arguments` object) so the signature is visible and mirrors `assertTest`.
   *
   * @param {RegExp} re - Pattern under test.
   * @param {string} string - Input expected to match.
   * @param {string} message - Description reported if the assertion fails.
   */
  function assertMatches(re, string, message) {
      assertTest(assert.isTrue, re, string, message);
  }
  /**
   * Asserts that `re` does NOT match `string`.
   *
   * Forwards the declared parameters directly rather than the implicit
   * `arguments` object, so the named parameters are actually used.
   *
   * @param {RegExp} re - Pattern under test.
   * @param {string} string - Input expected not to match.
   * @param {string} message - Description reported if the assertion fails.
   */
  function assertDoesNotMatch(re, string, message) {
      assertTest(assert.isFalse, re, string, message);
  }
  // TODO: RegExp functions currently process strings as a list of code units as
  // opposed to a list of code points. This causes a RegExp to match just
  // the high surrogate. For example, /[^\ud800\udc00]/ matches
  // "\ud800\udc00". This is due to "\ud800" being in the negated set and
  // matching the first code unit in the string.
  //
  // Some of the patterns below have the format "^...$" to force the RegExp
  // to match the string fully. Once the bug is fixed, the '^'s and '$'s
  // can be removed. The bug # is 3679792.
  // Test cases for Unicode-mode (/u) character sets that mix basic-plane
  // characters, supplementary-plane characters and lone surrogates. Each entry
  // has a `name` and a `body` that performs its assertions through
  // assertMatches / assertDoesNotMatch.
  var tests = [
      {
          name: "A character set containing a negated character from a supplementary plane shouldn't match the character itself",
          body: function () {
              assertDoesNotMatch(/^[^\ud800\udc00]$/u, "\ud800\udc00", "Surrogate pair in RegExp and surrogate pair in string to test");
              assertDoesNotMatch(/^[^\ud800\udc00]$/u, "\u{10000}", "Surrogate pair in RegExp and code point in string to test");
              assertDoesNotMatch(/^[^\u{10000}]$/u, "\ud800\udc00", "Code point in RegExp and surrogate pair in string to test");
              assertDoesNotMatch(/^[^\u{10000}]$/u, "\u{10000}", "Code point in RegExp and code point in string to test");
          }
      },
      {
          name: "A character set containing a negated character from a supplementary plane should match other characters",
          body: function () {
              assertMatches(/^[^\ud800\udc00]$/u, "\ud801\udc01", "Surrogate pair in RegExp and surrogate pair in string to test");
              assertMatches(/^[^\u{10000}]$/u, "\ud801\udc01", "Code point in RegExp and surrogate pair in string to test");
              assertMatches(/^[^\ud800\udc00]$/u, "\u{10101}", "Surrogate pair in RegExp and code point in string to test");
              // The braces are required: under the 'u' flag "\u10000" would be
              // parsed as U+1000 followed by the digit '0', not as U+10000.
              assertMatches(/^[^\u{10000}]$/u, "\u{10101}", "Code point in RegExp and code point in string to test");
              assertMatches(/^[^\u{10000}]$/u, "\u0345", "Code point in RegExp and code unit in string to test");
              assertMatches(/^[^\ud800\udc00]$/u, "\u0345", "Surrogate pair in RegExp and code unit in string to test");
          }
      },
      {
          name: "A character set containing a negated character from the basic plane should match characters from supplementary planes",
          body: function () {
              // A single negated basic-plane character (U+0345), not the digits 0, 3, 4 and 5.
              assertMatches(/^[^\u0345]$/u, "\ud800\udc00", "Surrogate pair");
              assertMatches(/^[^\u0345]$/u, "\u{10000}", "Code point");
          }
      },
      {
          name: "A character set containing a range spanning multiple planes should match characters from all those planes",
          body: function () {
              const re = /^[\u0000-\u{10FFFF}]$/u;
              const numberOfPlanes = 17;
              const planeSize = 0x10000;
              for (let plane = 0; plane < numberOfPlanes; ++plane) {
                  // First and last code point of the current plane.
                  const planeStart = plane * planeSize;
                  const planeEnd = planeStart + 0xFFFF;
                  assertMatches(re, String.fromCodePoint(planeStart), "First character in plane #" + plane);
                  assertMatches(re, String.fromCodePoint(planeEnd), "Last character in plane #" + plane);
              }
          }
      },
      {
          name: "A dash character and a non-dash character following a full one shouldn't be interpreted as a range",
          body: function () {
              // Parsed as: range \ud800-\u{10FC00}, a literal '-', then \u{10FC02}.
              const re = /^[\ud800-\udbff\udc00-\udbff\udc02]$/u;
              assertDoesNotMatch(re, "\udbff\udc01", "Shouldn't be in the second range");
              assertMatches(re, "-", "Second '-' treated as a normal character");
          }
      },
      {
          name: "Reserved characters shouldn't be ignored when they are in a character set together with characters from a supplementary plane",
          body: function () {
              assertMatches(/^[\ud800\udc00 \ud800]$/u, "\ud800", "Start of the reserved character range (\\ud800)");
              assertMatches(/^[\ud800\udc00 \udfff]$/u, "\udfff", "End of the reserved character range (\\udfff)");
          }
      },
      {
          name: "A high and a low surrogate part with a '-' between should be interpreted as a range",
          body: function () {
              assertMatches(/^[\ud800-\udfff]$/u, "\ud800", "Range start");
              assertMatches(/^[\ud800-\udfff]$/u, "\udfff", "Range end");
              // We had a bug where we interpreted the character set below as [\ud800\udfff] and omitted '-'.
              assertDoesNotMatch(/^[\ud800-\udfff]$/u, "\ud800\udfff", "Not a surrogate pair");
          }
      }
  ];
  // Run every case in `tests`; output is verbose unless the first command-line
  // argument is "summary".
  testRunner.runTests(tests, { verbose: !(WScript.Arguments[0] == "summary") });