OctoquadIdentifier.h 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161
  1. //-------------------------------------------------------------------------------------------------------
  2. // Copyright (C) Microsoft. All rights reserved.
  3. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
  4. //-------------------------------------------------------------------------------------------------------
  5. //
  6. // Matchers for pattern of form:
  7. // pattern ::= atom{8} '|' atom{8}
  8. // atom ::= A | [...charset drawn from A's...]
  9. // where:
  10. // - A is a set of exactly four ASCII characters
  11. // - The pattern ignores case
  12. // - The pattern includes the global flag
  13. // An example pattern would be "ABCdABCd|aDcAbBcD".
  14. #pragma once
  15. namespace UnifiedRegex
  16. {
  17. // ----------------------------------------------------------------------
  18. // Trigrams
  19. // ----------------------------------------------------------------------
  20. struct TrigramInfo {
  21. static const int PatternLength=8;
  22. static const int MaxResults=32;
  23. bool isTrigramPattern;
  24. bool hasCachedResultString;
  25. int triPat1;
  26. int triPat2;
  27. int resultCount;
  28. int offsets[MaxResults];
  29. Js::JavascriptString * cachedResult[MaxResults];
  30. TrigramInfo(__in_ecount(PatternLength) char* pat1,__in_ecount(PatternLength) char* pat2, Recycler* recycler);
  31. };
  32. struct PatternTri {
  33. RegexPattern* pattern;
  34. int encodedPattern;
  35. };
  36. struct TrigramStart {
  37. static const int MaxPatPerStart=12;
  38. int count;
  39. PatternTri patterns[MaxPatPerStart];
  40. };
  41. struct TrigramAlphabet {
  42. static const int AlphaCount=4;
  43. static const int AsciiTableSize=128;
  44. static const int BitsNotInAlpha=4;
  45. static const int TrigramMapSize=221;
  46. static const int TrigramNotInPattern=65;
  47. static const char LowerCaseBit=0x20;
  48. static const char UpperCaseMask=0x5f;
  49. static const int TrigramCount=64;
  50. static const int MaxCachedStarts=48;
  51. TrigramStart trigramStarts[TrigramCount];
  52. char alpha[AlphaCount];
  53. char alphaBits[AsciiTableSize];
  54. char trigramMap[TrigramMapSize];
  55. const char16* input;
  56. int inputLen;
  57. void InitTrigramMap();
  58. bool AddStarts(__in_xcount(TrigramInfo::PatternLength) char* pat1,__in_xcount(TrigramInfo::PatternLength) char* pat2, RegexPattern* pattern);
  59. void MegaMatch(__in_ecount(inputLen) const char16* input,int inputLen);
  60. };
  61. // ----------------------------------------------------------------------
  62. // OctoquadIdentifier
  63. // ----------------------------------------------------------------------
  64. class OctoquadIdentifier : private Chars<char16>
  65. {
  66. friend class OctoquadMatcher;
  67. public:
  68. static const int NumPatterns = 2;
  69. private:
  70. // Number of characters in the alphabet encountered so far
  71. int numCodes;
  72. // Maps a character code to the character
  73. char (&codeToChar)[TrigramAlphabet::AlphaCount];
  74. // Maps a character to its code 0-3. This array is passed into the constructor and only indexes for characters in the
  75. // alphabet are updated.
  76. char (&charToCode)[TrigramAlphabet::AsciiTableSize];
  77. // For each octoquad pattern, each byte contains a 4-bit pattern. One character will be represented as 0x1, 0x2, 0x4, or
  78. // 0x8 since it's a quad alphabet. A character class in the pattern can cause the bit pattern to be a combination of the
  79. // character bits.
  80. char patternBits[NumPatterns][TrigramInfo::PatternLength];
  81. int currPatternLength;
  82. int currPatternNum;
  83. void SetTrigramAlphabet(Js::ScriptContext * scriptContext,
  84. __in_xcount(regex::TrigramAlphabet::AlphaCount) char* alpha,
  85. __in_xcount(regex::TrigramAlphabet::AsciiTableSize) char* alphaBits);
  86. public:
  87. static bool Qualifies(const Program *const program);
  88. OctoquadIdentifier(
  89. const int numCodes,
  90. char (&codeToChar)[TrigramAlphabet::AlphaCount],
  91. char (&charToCode)[TrigramAlphabet::AsciiTableSize]);
  92. // Returns -1 if character not in quad alphabet and the alphabet is full
  93. int GetOrAddCharCode(const Char c);
  94. bool BeginConcat();
  95. bool CouldAppend(const CharCount n) const;
  96. bool AppendChar(Char c);
  97. bool BeginUnions();
  98. bool UnionChar(Char c);
  99. void EndUnions();
  100. bool IsOctoquad();
  101. void InitializeTrigramInfo(Js::ScriptContext* scriptContext, RegexPattern* const pattern);
  102. };
  103. // ----------------------------------------------------------------------
  104. // OctoquadMatcher
  105. // ----------------------------------------------------------------------
  106. class OctoquadMatcher : private Chars<char16>
  107. {
  108. private:
  109. OctoquadMatcher(const StandardChars<Char>* standardChars, CaseInsensitive::MappingSource mappingSource, OctoquadIdentifier* identifier);
  110. Char codeToChar[TrigramAlphabet::AlphaCount];
  111. // Maps characters (0..AsciTableSize-1) to 0 if not in alphabet, or 0x1, 0x2, 0x4 or 0x8.
  112. // Allocated and filled only if invoke Match below.
  113. uint8 charToBits[TrigramAlphabet::AsciiTableSize];
  114. uint32 patterns[OctoquadIdentifier::NumPatterns];
  115. public:
  116. static OctoquadMatcher *New(Recycler* recycler, const StandardChars<Char>* standardChars, CaseInsensitive::MappingSource mappingSource, OctoquadIdentifier* identifier);
  117. bool Match
  118. ( const Char* const input
  119. , const CharCount inputLength
  120. , CharCount& offset
  121. #if ENABLE_REGEX_CONFIG_OPTIONS
  122. , RegexStats* stats
  123. #endif
  124. );
  125. #if ENABLE_REGEX_CONFIG_OPTIONS
  126. void Print(DebugWriter* w) const;
  127. #endif
  128. };
  129. }