TextbookBoyerMoore.h 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162
  1. //-------------------------------------------------------------------------------------------------------
  2. // Copyright (C) Microsoft. All rights reserved.
  3. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
  4. //-------------------------------------------------------------------------------------------------------
  5. // From Cormen, Leiserson and Rivest, ch 34.
  6. #pragma once
  7. namespace UnifiedRegex
  8. {
  9. template <typename C>
  10. struct TextbookBoyerMooreSetup : private Chars<C>
  11. {
  12. private:
  13. typedef typename Chars<C>::Char Char;
  14. template <typename C>
  15. friend class TextbookBoyerMoore;
  16. template <typename C>
  17. friend class TextbookBoyerMooreWithLinearMap;
  18. public:
  19. enum Scheme
  20. {
  21. DefaultScheme,
  22. LinearScheme
  23. };
  24. TextbookBoyerMooreSetup(Char const * pat, CharCount patLen) : pat(pat), patLen(patLen) { Init(); }
  25. Scheme GetScheme() const { return scheme; }
  26. static int32 * GetGoodSuffix(ArenaAllocator* allocator, const Char * pat, CharCount patLen, int skip = 1);
  27. private:
  28. void Init();
  29. Scheme scheme;
  30. Char const * const pat;
  31. CharCount const patLen;
  32. uint numLinearChars;
  33. Char linearChar[MaxCharMapLinearChars];
  34. int32 lastOcc[MaxCharMapLinearChars];
  35. };
  36. template <typename C>
  37. class TextbookBoyerMooreWithLinearMap : private Chars<C>
  38. {
  39. template <typename C>
  40. friend struct TextbookBoyerMooreSetup;
  41. typedef typename Chars<C>::Char Char;
  42. private:
  43. typedef CharMap<Char, int32, CharMapScheme_Linear> LastOccMap;
  44. // NOTE: We don't store the actual pattern here since it may be moved between
  45. // constructing the scanner and running it.
  46. LastOccMap lastOccurrence;
  47. int32 *goodSuffix;
  48. public:
  49. inline TextbookBoyerMooreWithLinearMap() : lastOccurrence(-1), goodSuffix(0) {}
  50. // Construct Boyer-Moore tables for pattern pat:
  51. // - pat must be of length patLen * skip
  52. // - if skip is > 1, then each consecutive skip characters of pattern are assumed to represent
  53. // an equivalence class of characters in canonical order (i.e. all chars in a class are represented
  54. // by the same sequence of skip chars)
  55. // - otherwise this is a regular exact-match pattern
  56. void Setup(ArenaAllocator* allocator, TextbookBoyerMooreSetup<C> const& setup);
  57. void FreeBody(ArenaAllocator* allocator, CharCount patLen);
  58. // NOTE: In the following pat and patLen must be the same as passed to Setup above
  59. // For skip = 1
  60. template <uint equivClassSize>
  61. bool Match
  62. ( const Char *const input
  63. , const CharCount inputLength
  64. , CharCount& inputOffset
  65. , const Char* pat
  66. , const CharCount patLen
  67. #if ENABLE_REGEX_CONFIG_OPTIONS
  68. , RegexStats* stats
  69. #endif
  70. ) const;
  71. #if ENABLE_REGEX_CONFIG_OPTIONS
  72. static char16 const * GetName() { return _u("linear map Boyer-Moore"); }
  73. #endif
  74. };
  75. template <typename C>
  76. class TextbookBoyerMoore : private Chars<C>
  77. {
  78. template <typename C>
  79. friend struct TextbookBoyerMooreSetup;
  80. typedef typename Chars<C>::Char Char;
  81. private:
  82. typedef CharMap<Char, int32> LastOccMap;
  83. // NOTE: We don't store the actual pattern here since it may be moved between
  84. // constructing the scanner and running it.
  85. Field(LastOccMap) lastOccurrence;
  86. Field(int32 *) goodSuffix;
  87. public:
  88. inline TextbookBoyerMoore() : lastOccurrence(-1), goodSuffix(nullptr) {}
  89. // Construct Boyer-Moore tables for pattern pat:
  90. // - pat must be of length patLen * skip
  91. // - if skip is > 1, then each consecutive skip characters of pattern are assumed to represent
  92. // an equivalence class of characters in canonical order (i.e. all chars in a class are represented
  93. // by the same sequence of skip chars)
  94. // - otherwise this is a regular exact-match pattern
  95. void Setup(ArenaAllocator* allocator, TextbookBoyerMooreSetup<C> const& setup);
  96. void Setup(ArenaAllocator * allocator, const Char * pat, CharCount patLen, int skip);
  97. void FreeBody(ArenaAllocator* allocator, CharCount patLen);
  98. // NOTE: In the following pat and patLen must be the same as passed to Setup above
  99. template <uint equivClassSize>
  100. inline bool Match
  101. ( const Char *const input
  102. , const CharCount inputLength
  103. , CharCount& inputOffset
  104. , const Char* pat
  105. , const CharCount patLen
  106. #if ENABLE_REGEX_CONFIG_OPTIONS
  107. , RegexStats* stats
  108. #endif
  109. ) const
  110. {
  111. return Match<equivClassSize, equivClassSize>(input, inputLength, inputOffset, pat, patLen
  112. #if ENABLE_REGEX_CONFIG_OPTIONS
  113. , stats
  114. #endif
  115. );
  116. }
  117. template <uint equivClassSize, uint lastPatCharEquivClass>
  118. bool Match
  119. ( const Char *const input
  120. , const CharCount inputLength
  121. , CharCount& inputOffset
  122. , const Char* pat
  123. , const CharCount patLen
  124. #if ENABLE_REGEX_CONFIG_OPTIONS
  125. , RegexStats* stats
  126. #endif
  127. ) const;
  128. #if ENABLE_REGEX_CONFIG_OPTIONS
  129. static char16 const * GetName() { return _u("full map Boyer-Moore"); }
  130. #endif
  131. };
  132. }