TextbookBoyerMoore.h 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163
  1. //-------------------------------------------------------------------------------------------------------
  2. // Copyright (C) Microsoft. All rights reserved.
  3. // Copyright (c) 2021 ChakraCore Project Contributors. All rights reserved.
  4. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
  5. //-------------------------------------------------------------------------------------------------------
  6. // From Cormen, Leiserson and Rivest, ch 34.
  7. #pragma once
  8. namespace UnifiedRegex
  9. {
  10. template <typename C>
  11. struct TextbookBoyerMooreSetup : private Chars<C>
  12. {
  13. private:
  14. typedef typename Chars<C>::Char Char;
  15. template <typename T>
  16. friend class TextbookBoyerMoore;
  17. template <typename T>
  18. friend class TextbookBoyerMooreWithLinearMap;
  19. public:
  20. enum Scheme
  21. {
  22. DefaultScheme,
  23. LinearScheme
  24. };
  25. TextbookBoyerMooreSetup(Char const * pat, CharCount patLen) : pat(pat), patLen(patLen) { Init(); }
  26. Scheme GetScheme() const { return scheme; }
  27. static int32 * GetGoodSuffix(ArenaAllocator* allocator, const Char * pat, CharCount patLen, int skip = 1);
  28. private:
  29. void Init();
  30. Scheme scheme;
  31. Char const * const pat;
  32. CharCount const patLen;
  33. uint numLinearChars;
  34. Char linearChar[MaxCharMapLinearChars];
  35. int32 lastOcc[MaxCharMapLinearChars];
  36. };
  37. template <typename C>
  38. class TextbookBoyerMooreWithLinearMap : private Chars<C>
  39. {
  40. template <typename T>
  41. friend struct TextbookBoyerMooreSetup;
  42. typedef typename Chars<C>::Char Char;
  43. private:
  44. typedef CharMap<Char, int32, CharMapScheme_Linear> LastOccMap;
  45. // NOTE: We don't store the actual pattern here since it may be moved between
  46. // constructing the scanner and running it.
  47. LastOccMap lastOccurrence;
  48. int32 *goodSuffix;
  49. public:
  50. inline TextbookBoyerMooreWithLinearMap() : lastOccurrence(-1), goodSuffix(0) {}
  51. // Construct Boyer-Moore tables for pattern pat:
  52. // - pat must be of length patLen * skip
  53. // - if skip is > 1, then each consecutive skip characters of pattern are assumed to represent
  54. // an equivalence class of characters in canonical order (i.e. all chars in a class are represented
  55. // by the same sequence of skip chars)
  56. // - otherwise this is a regular exact-match pattern
  57. void Setup(ArenaAllocator* allocator, TextbookBoyerMooreSetup<C> const& setup);
  58. void FreeBody(ArenaAllocator* allocator, CharCount patLen);
  59. // NOTE: In the following pat and patLen must be the same as passed to Setup above
  60. // For skip = 1
  61. template <uint equivClassSize>
  62. bool Match
  63. ( const Char *const input
  64. , const CharCount inputLength
  65. , CharCount& inputOffset
  66. , const Char* pat
  67. , const CharCount patLen
  68. #if ENABLE_REGEX_CONFIG_OPTIONS
  69. , RegexStats* stats
  70. #endif
  71. ) const;
  72. #if ENABLE_REGEX_CONFIG_OPTIONS
  73. static char16 const * GetName() { return _u("linear map Boyer-Moore"); }
  74. #endif
  75. };
  76. template <typename C>
  77. class TextbookBoyerMoore : private Chars<C>
  78. {
  79. template <typename T>
  80. friend struct TextbookBoyerMooreSetup;
  81. typedef typename Chars<C>::Char Char;
  82. private:
  83. typedef CharMap<Char, int32> LastOccMap;
  84. // NOTE: We don't store the actual pattern here since it may be moved between
  85. // constructing the scanner and running it.
  86. Field(LastOccMap) lastOccurrence;
  87. Field(int32 *) goodSuffix;
  88. public:
  89. inline TextbookBoyerMoore() : lastOccurrence(-1), goodSuffix(nullptr) {}
  90. // Construct Boyer-Moore tables for pattern pat:
  91. // - pat must be of length patLen * skip
  92. // - if skip is > 1, then each consecutive skip characters of pattern are assumed to represent
  93. // an equivalence class of characters in canonical order (i.e. all chars in a class are represented
  94. // by the same sequence of skip chars)
  95. // - otherwise this is a regular exact-match pattern
  96. void Setup(ArenaAllocator* allocator, TextbookBoyerMooreSetup<C> const& setup);
  97. void Setup(ArenaAllocator * allocator, const Char * pat, CharCount patLen, int skip);
  98. void FreeBody(ArenaAllocator* allocator, CharCount patLen);
  99. // NOTE: In the following pat and patLen must be the same as passed to Setup above
  100. template <uint equivClassSize>
  101. inline bool Match
  102. ( const Char *const input
  103. , const CharCount inputLength
  104. , CharCount& inputOffset
  105. , const Char* pat
  106. , const CharCount patLen
  107. #if ENABLE_REGEX_CONFIG_OPTIONS
  108. , RegexStats* stats
  109. #endif
  110. ) const
  111. {
  112. return Match<equivClassSize, equivClassSize>(input, inputLength, inputOffset, pat, patLen
  113. #if ENABLE_REGEX_CONFIG_OPTIONS
  114. , stats
  115. #endif
  116. );
  117. }
  118. template <uint equivClassSize, uint lastPatCharEquivClass>
  119. bool Match
  120. ( const Char *const input
  121. , const CharCount inputLength
  122. , CharCount& inputOffset
  123. , const Char* pat
  124. , const CharCount patLen
  125. #if ENABLE_REGEX_CONFIG_OPTIONS
  126. , RegexStats* stats
  127. #endif
  128. ) const;
  129. #if ENABLE_REGEX_CONFIG_OPTIONS
  130. static char16 const * GetName() { return _u("full map Boyer-Moore"); }
  131. #endif
  132. };
  133. }