TextbookBoyerMoore.cpp 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473
  1. //-------------------------------------------------------------------------------------------------------
  2. // Copyright (C) Microsoft. All rights reserved.
  3. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
  4. //-------------------------------------------------------------------------------------------------------
  5. #include "ParserPch.h"
  6. namespace UnifiedRegex
  7. {
  8. template <typename C>
  9. void TextbookBoyerMooreSetup<C>::Init()
  10. {
  11. Assert(patLen > 0);
  12. for (uint i = 0; i < MaxCharMapLinearChars; i++)
  13. {
  14. lastOcc[i] = -1;
  15. }
  16. numLinearChars = 1;
  17. // Always put the last character in the first index
  18. linearChar[0] = pat[patLen - 1];
  19. for (CharCount i = 0; i < patLen; i++)
  20. {
  21. if (numLinearChars <= MaxCharMapLinearChars)
  22. {
  23. uint j = 0;
  24. for (; j < numLinearChars; j++)
  25. {
  26. if (linearChar[j] == pat[i])
  27. {
  28. lastOcc[j] = i;
  29. break;
  30. }
  31. }
  32. if (j == numLinearChars)
  33. {
  34. if (numLinearChars < MaxCharMapLinearChars)
  35. {
  36. linearChar[numLinearChars] = pat[i];
  37. lastOcc[numLinearChars] = i;
  38. }
  39. numLinearChars++;
  40. }
  41. }
  42. if (numLinearChars > MaxCharMapLinearChars)
  43. {
  44. break;
  45. }
  46. }
  47. if (numLinearChars <= MaxCharMapLinearChars)
  48. {
  49. scheme = LinearScheme;
  50. }
  51. else
  52. {
  53. scheme = DefaultScheme;
  54. }
  55. }
  56. template <typename C>
  57. void TextbookBoyerMoore<C>::Setup(ArenaAllocator* allocator, TextbookBoyerMooreSetup<C> const& info)
  58. {
  59. Assert(info.GetScheme() == TextbookBoyerMooreSetup<C>::DefaultScheme);
  60. this->Setup(allocator, info.pat, info.patLen, 1);
  61. }
  62. template <typename C>
  63. void TextbookBoyerMoore<C>::Setup(ArenaAllocator * allocator, const Char * pat, CharCount patLen, int skip)
  64. {
  65. // character c |-> index of last occurrence of c in pat, otherwise -1
  66. for (CharCount i = 0; i < patLen; i++)
  67. {
  68. for (int j = 0; j < skip; j++)
  69. lastOccurrence.Set(allocator, pat[i * skip + j], i);
  70. }
  71. goodSuffix = TextbookBoyerMooreSetup<C>::GetGoodSuffix(allocator, pat, patLen, skip);
  72. }
  73. template <typename C>
  74. int32 * TextbookBoyerMooreSetup<C>::GetGoodSuffix(ArenaAllocator* allocator, const Char * pat, CharCount patLen, int skip)
  75. {
  76. // pat offset q |-> longest prefix of pat which is a proper suffix of pat[0..q]
  77. // (thanks to equivalence classes being in canonical order we only need to look at the first
  78. // character of each skip grouping in the pattern)
  79. int32* prefix = AnewArray(allocator, int32, patLen);
  80. prefix[0] = 0;
  81. int32 k = 0;
  82. for (CharCount q = 1; q < patLen; q++)
  83. {
  84. while (k > 0 && pat[k * skip] != pat[q * skip])
  85. k = prefix[k - 1];
  86. if (pat[k * skip] == pat[q * skip])
  87. k++;
  88. prefix[q] = k;
  89. }
  90. // As above, but for rev(pat)
  91. int32* revPrefix = AnewArray(allocator, int32, patLen);
  92. revPrefix[0] = 0;
  93. k = 0;
  94. for (CharCount q = 1; q < patLen; q++)
  95. {
  96. while (k > 0 && pat[(patLen - k - 1) * skip] != pat[(patLen - q - 1) * skip])
  97. k = revPrefix[k - 1];
  98. if (pat[(patLen - k - 1) * skip] == pat[(patLen - q - 1) * skip])
  99. k++;
  100. revPrefix[q] = k;
  101. }
  102. // pat prefix length l |-> least shift s.t. pat[0..l-1] is not mismatched
  103. int32 * goodSuffix = AnewArray(allocator, int32, patLen + 1);
  104. for (CharCount j = 0; j <= patLen; j++)
  105. goodSuffix[j] = patLen - prefix[patLen - 1];
  106. for (CharCount l = 1; l <= patLen; l++)
  107. {
  108. CharCount j = patLen - revPrefix[l - 1];
  109. int32 s = l - revPrefix[l - 1];
  110. if (goodSuffix[j] > s)
  111. goodSuffix[j] = s;
  112. }
  113. // shift above one to the left
  114. for (CharCount j = 0; j < patLen; j++)
  115. goodSuffix[j] = goodSuffix[j + 1];
  116. AdeleteArray(allocator, patLen, prefix);
  117. AdeleteArray(allocator, patLen, revPrefix);
  118. return goodSuffix;
  119. }
  120. template <typename C>
  121. void TextbookBoyerMoore<C>::FreeBody(ArenaAllocator* allocator, CharCount patLen)
  122. {
  123. if(goodSuffix)
  124. {
  125. AdeleteArray(allocator, patLen + 1, goodSuffix);
  126. #if DBG
  127. goodSuffix = 0;
  128. #endif
  129. }
  130. lastOccurrence.FreeBody(allocator);
  131. }
  132. template <uint equivClassSize, uint compareCount>
  133. static bool MatchPatternAt(uint inputChar, char16 const * pat, CharCount index);
  134. template <>
  135. static bool MatchPatternAt<1, 1>(uint inputChar, char16 const* pat, CharCount index)
  136. {
  137. return inputChar == pat[index];
  138. }
  139. template <>
  140. static bool MatchPatternAt<CaseInsensitive::EquivClassSize, CaseInsensitive::EquivClassSize>(uint inputChar, char16 const * pat, CharCount index)
  141. {
  142. CompileAssert(CaseInsensitive::EquivClassSize == 4);
  143. return inputChar == pat[index * CaseInsensitive::EquivClassSize]
  144. || inputChar == pat[index * CaseInsensitive::EquivClassSize + 1]
  145. || inputChar == pat[index * CaseInsensitive::EquivClassSize + 2]
  146. || inputChar == pat[index * CaseInsensitive::EquivClassSize + 3];
  147. }
  148. template <>
  149. static bool MatchPatternAt<CaseInsensitive::EquivClassSize, 1>(uint inputChar, char16 const * pat, CharCount index)
  150. {
  151. CompileAssert(CaseInsensitive::EquivClassSize == 4);
  152. return inputChar == pat[index * 4];
  153. }
  154. template <typename C>
  155. template <uint equivClassSize, uint lastPatCharEquivClass>
  156. bool TextbookBoyerMoore<C>::Match
  157. ( const Char *const input
  158. , const CharCount inputLength
  159. , CharCount& inputOffset
  160. , const Char* pat
  161. , const CharCount patLen
  162. #if ENABLE_REGEX_CONFIG_OPTIONS
  163. , RegexStats* stats
  164. #endif
  165. ) const
  166. {
  167. Assert(input != 0);
  168. Assert(inputOffset <= inputLength);
  169. if (inputLength < patLen)
  170. return false;
  171. CharCount offset = inputOffset;
  172. const CharCount endOffset = inputLength - (patLen - 1);
  173. const int32* const localGoodSuffix = goodSuffix;
  174. const LastOccMap* const localLastOccurrence = &lastOccurrence;
  175. const CharCount lastPatCharIndex = (patLen - 1);
  176. while (offset < endOffset)
  177. {
  178. // A separate tight loop to find the last character
  179. while (true)
  180. {
  181. uint inputChar = Chars<Char>::CTU(input[offset + lastPatCharIndex]);
  182. if (MatchPatternAt<equivClassSize, lastPatCharEquivClass>(inputChar, pat, lastPatCharIndex))
  183. {
  184. // Found a match. Break out of this loop and go to the match pattern loop
  185. break;
  186. }
  187. // Negative case is more common,
  188. // Write the checks so that we have a super tight loop
  189. int lastOcc;
  190. if (inputChar < localLastOccurrence->GetDirectMapSize())
  191. {
  192. if (!localLastOccurrence->IsInDirectMap(inputChar))
  193. {
  194. offset += patLen;
  195. if (offset >= endOffset)
  196. {
  197. return false;
  198. }
  199. continue;
  200. }
  201. lastOcc = localLastOccurrence->GetDirectMap(inputChar);
  202. }
  203. else if (!localLastOccurrence->GetNonDirect(inputChar, lastOcc))
  204. {
  205. offset += patLen;
  206. if (offset >= endOffset)
  207. {
  208. return false;
  209. }
  210. continue;
  211. }
  212. Assert((int)lastPatCharIndex - lastOcc >= localGoodSuffix[lastPatCharIndex]);
  213. offset += lastPatCharIndex - lastOcc;
  214. if (offset >= endOffset)
  215. {
  216. return false;
  217. }
  218. }
  219. // CONSIDER: we can remove this check if we stop using TextbookBoyerMoore for one char pattern
  220. if (lastPatCharIndex == 0)
  221. {
  222. inputOffset = offset;
  223. return true;
  224. }
  225. // Match the rest of the pattern
  226. int32 j = lastPatCharIndex - 1;
  227. while (true)
  228. {
  229. #if ENABLE_REGEX_CONFIG_OPTIONS
  230. if (stats != 0)
  231. stats->numCompares++;
  232. #endif
  233. uint inputChar = Chars<Char>::CTU(input[offset + j]);
  234. if (!MatchPatternAt<equivClassSize, equivClassSize>(inputChar, pat, j))
  235. {
  236. const int32 e = j - localLastOccurrence->Get((Char)inputChar);
  237. offset += e > localGoodSuffix[j] ? e : localGoodSuffix[j];
  238. break;
  239. }
  240. if (--j < 0)
  241. {
  242. inputOffset = offset;
  243. return true;
  244. }
  245. }
  246. }
  247. return false;
  248. }
  249. // Specialized linear char map version
  250. template <typename C>
  251. void TextbookBoyerMooreWithLinearMap<C>::Setup(ArenaAllocator * allocator, TextbookBoyerMooreSetup<C> const& setup)
  252. {
  253. Assert(setup.GetScheme() == TextbookBoyerMooreSetup<C>::LinearScheme);
  254. lastOccurrence.Set(setup.numLinearChars, setup.linearChar, setup.lastOcc);
  255. goodSuffix = TextbookBoyerMooreSetup<C>::GetGoodSuffix(allocator, setup.pat, setup.patLen);
  256. }
  257. template <typename C>
  258. void TextbookBoyerMooreWithLinearMap<C>::FreeBody(ArenaAllocator* allocator, CharCount patLen)
  259. {
  260. if(goodSuffix)
  261. {
  262. AdeleteArray(allocator, patLen + 1, goodSuffix);
  263. #if DBG
  264. goodSuffix = 0;
  265. #endif
  266. }
  267. }
  268. template <typename C>
  269. template <uint equivClassSize>
  270. bool TextbookBoyerMooreWithLinearMap<C>::Match
  271. ( const Char *const input
  272. , const CharCount inputLength
  273. , CharCount& inputOffset
  274. , const Char* pat
  275. , const CharCount patLen
  276. #if ENABLE_REGEX_CONFIG_OPTIONS
  277. , RegexStats* stats
  278. #endif
  279. ) const
  280. {
  281. CompileAssert(equivClassSize == 1);
  282. Assert(input != 0);
  283. Assert(inputOffset <= inputLength);
  284. if (inputLength < patLen)
  285. return false;
  286. const int32* const localGoodSuffix = goodSuffix;
  287. const LastOccMap* const localLastOccurrence = &lastOccurrence;
  288. CharCount offset = inputOffset;
  289. const CharCount lastPatCharIndex = (patLen - 1);
  290. const CharCount endOffset = inputLength - lastPatCharIndex;
  291. // Using int size instead of Char value is faster
  292. const uint lastPatChar = pat[lastPatCharIndex];
  293. Assert(lastPatChar == localLastOccurrence->GetChar(0));
  294. while (offset < endOffset)
  295. {
  296. // A separate tight loop to find the last character
  297. while (true)
  298. {
  299. #if ENABLE_REGEX_CONFIG_OPTIONS
  300. if (stats != 0)
  301. stats->numCompares++;
  302. #endif
  303. uint inputChar = Chars<Char>::CTU(input[offset + lastPatCharIndex]);
  304. if (inputChar == lastPatChar)
  305. {
  306. // Found a match. Break out of this loop and go to the match pattern loop
  307. break;
  308. }
  309. // Negative case is more common,
  310. // Write the checks so that we have a super tight loop
  311. Assert(inputChar != localLastOccurrence->GetChar(0));
  312. int32 lastOcc;
  313. if (localLastOccurrence->GetChar(1) != inputChar)
  314. {
  315. if (localLastOccurrence->GetChar(2) != inputChar)
  316. {
  317. if (localLastOccurrence->GetChar(3) != inputChar)
  318. {
  319. offset += patLen;
  320. if (offset >= endOffset)
  321. {
  322. return false;
  323. }
  324. continue;
  325. }
  326. lastOcc = localLastOccurrence->GetLastOcc(3);
  327. }
  328. else
  329. {
  330. lastOcc = localLastOccurrence->GetLastOcc(2);
  331. }
  332. }
  333. else
  334. {
  335. lastOcc = localLastOccurrence->GetLastOcc(1);
  336. }
  337. Assert((int)lastPatCharIndex - lastOcc >= localGoodSuffix[lastPatCharIndex]);
  338. offset += lastPatCharIndex - lastOcc;
  339. if (offset >= endOffset)
  340. {
  341. return false;
  342. }
  343. }
  344. // CONSIDER: we can remove this check if we stop using
  345. // TextbookBoyerMoore for one char pattern
  346. if (lastPatCharIndex == 0)
  347. {
  348. inputOffset = offset;
  349. return true;
  350. }
  351. // Match the rest of the pattern
  352. int32 j = lastPatCharIndex - 1;
  353. while (true)
  354. {
  355. #if ENABLE_REGEX_CONFIG_OPTIONS
  356. if (stats != 0)
  357. stats->numCompares++;
  358. #endif
  359. uint inputChar = Chars<Char>::CTU(input[offset + j]);
  360. if (inputChar != pat[j])
  361. {
  362. int goodSuffix = localGoodSuffix[j];
  363. Assert(patLen <= MaxCharCount);
  364. if (goodSuffix == (int)patLen)
  365. {
  366. offset += patLen;
  367. }
  368. else
  369. {
  370. const int32 e = j - localLastOccurrence->Get(inputChar);
  371. offset += e > goodSuffix ? e : goodSuffix;
  372. }
  373. break;
  374. }
  375. if (--j < 0)
  376. {
  377. inputOffset = offset;
  378. return true;
  379. }
  380. }
  381. }
  382. return false;
  383. }
  384. // explicit instantiation
  385. template struct TextbookBoyerMooreSetup<char16>;
  386. template class TextbookBoyerMoore<char16>;
  387. template class TextbookBoyerMooreWithLinearMap<char16>;
  388. template
  389. bool TextbookBoyerMoore<char16>::Match<1>
  390. ( const Char *const input
  391. , const CharCount inputLength
  392. , CharCount& inputOffset
  393. , const Char* pat
  394. , const CharCount patLen
  395. #if ENABLE_REGEX_CONFIG_OPTIONS
  396. , RegexStats* stats
  397. #endif
  398. ) const;
  399. template
  400. bool TextbookBoyerMoore<char16>::Match<CaseInsensitive::EquivClassSize>
  401. ( const Char *const input
  402. , const CharCount inputLength
  403. , CharCount& inputOffset
  404. , const Char* pat
  405. , const CharCount patLen
  406. #if ENABLE_REGEX_CONFIG_OPTIONS
  407. , RegexStats* stats
  408. #endif
  409. ) const;
  410. template
  411. bool TextbookBoyerMoore<char16>::Match<CaseInsensitive::EquivClassSize, 1>
  412. ( const Char *const input
  413. , const CharCount inputLength
  414. , CharCount& inputOffset
  415. , const Char* pat
  416. , const CharCount patLen
  417. #if ENABLE_REGEX_CONFIG_OPTIONS
  418. , RegexStats* stats
  419. #endif
  420. ) const;
  421. template
  422. bool TextbookBoyerMooreWithLinearMap<char16>::Match<1>
  423. ( const Char *const input
  424. , const CharCount inputLength
  425. , CharCount& inputOffset
  426. , const Char* pat
  427. , const CharCount patLen
  428. #if ENABLE_REGEX_CONFIG_OPTIONS
  429. , RegexStats* stats
  430. #endif
  431. ) const;
  432. }