| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473 |
- //-------------------------------------------------------------------------------------------------------
- // Copyright (C) Microsoft. All rights reserved.
- // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
- //-------------------------------------------------------------------------------------------------------
- #include "ParserPch.h"
- namespace UnifiedRegex
- {
- template <typename C>
- void TextbookBoyerMooreSetup<C>::Init()
- {
- Assert(patLen > 0);
- for (uint i = 0; i < MaxCharMapLinearChars; i++)
- {
- lastOcc[i] = -1;
- }
- numLinearChars = 1;
- // Always put the last character in the first index
- linearChar[0] = pat[patLen - 1];
- for (CharCount i = 0; i < patLen; i++)
- {
- if (numLinearChars <= MaxCharMapLinearChars)
- {
- uint j = 0;
- for (; j < numLinearChars; j++)
- {
- if (linearChar[j] == pat[i])
- {
- lastOcc[j] = i;
- break;
- }
- }
- if (j == numLinearChars)
- {
- if (numLinearChars < MaxCharMapLinearChars)
- {
- linearChar[numLinearChars] = pat[i];
- lastOcc[numLinearChars] = i;
- }
- numLinearChars++;
- }
- }
- if (numLinearChars > MaxCharMapLinearChars)
- {
- break;
- }
- }
- if (numLinearChars <= MaxCharMapLinearChars)
- {
- scheme = LinearScheme;
- }
- else
- {
- scheme = DefaultScheme;
- }
- }
- template <typename C>
- void TextbookBoyerMoore<C>::Setup(ArenaAllocator* allocator, TextbookBoyerMooreSetup<C> const& info)
- {
- Assert(info.GetScheme() == TextbookBoyerMooreSetup<C>::DefaultScheme);
- this->Setup(allocator, info.pat, info.patLen, 1);
- }
- template <typename C>
- void TextbookBoyerMoore<C>::Setup(ArenaAllocator * allocator, const Char * pat, CharCount patLen, int skip)
- {
- // character c |-> index of last occurrence of c in pat, otherwise -1
- for (CharCount i = 0; i < patLen; i++)
- {
- for (int j = 0; j < skip; j++)
- lastOccurrence.Set(allocator, pat[i * skip + j], i);
- }
- goodSuffix = TextbookBoyerMooreSetup<C>::GetGoodSuffix(allocator, pat, patLen, skip);
- }
- template <typename C>
- int32 * TextbookBoyerMooreSetup<C>::GetGoodSuffix(ArenaAllocator* allocator, const Char * pat, CharCount patLen, int skip)
- {
- // pat offset q |-> longest prefix of pat which is a proper suffix of pat[0..q]
- // (thanks to equivalence classes being in canonical order we only need to look at the first
- // character of each skip grouping in the pattern)
- int32* prefix = AnewArray(allocator, int32, patLen);
- prefix[0] = 0;
- int32 k = 0;
- for (CharCount q = 1; q < patLen; q++)
- {
- while (k > 0 && pat[k * skip] != pat[q * skip])
- k = prefix[k - 1];
- if (pat[k * skip] == pat[q * skip])
- k++;
- prefix[q] = k;
- }
- // As above, but for rev(pat)
- int32* revPrefix = AnewArray(allocator, int32, patLen);
- revPrefix[0] = 0;
- k = 0;
- for (CharCount q = 1; q < patLen; q++)
- {
- while (k > 0 && pat[(patLen - k - 1) * skip] != pat[(patLen - q - 1) * skip])
- k = revPrefix[k - 1];
- if (pat[(patLen - k - 1) * skip] == pat[(patLen - q - 1) * skip])
- k++;
- revPrefix[q] = k;
- }
- // pat prefix length l |-> least shift s.t. pat[0..l-1] is not mismatched
- int32 * goodSuffix = AnewArray(allocator, int32, patLen + 1);
- for (CharCount j = 0; j <= patLen; j++)
- goodSuffix[j] = patLen - prefix[patLen - 1];
- for (CharCount l = 1; l <= patLen; l++)
- {
- CharCount j = patLen - revPrefix[l - 1];
- int32 s = l - revPrefix[l - 1];
- if (goodSuffix[j] > s)
- goodSuffix[j] = s;
- }
- // shift above one to the left
- for (CharCount j = 0; j < patLen; j++)
- goodSuffix[j] = goodSuffix[j + 1];
- AdeleteArray(allocator, patLen, prefix);
- AdeleteArray(allocator, patLen, revPrefix);
- return goodSuffix;
- }
- template <typename C>
- void TextbookBoyerMoore<C>::FreeBody(ArenaAllocator* allocator, CharCount patLen)
- {
- if(goodSuffix)
- {
- AdeleteArray(allocator, patLen + 1, PointerValue(goodSuffix));
- #if DBG
- goodSuffix = nullptr;
- #endif
- }
- lastOccurrence.FreeBody(allocator);
- }
- template <uint equivClassSize, uint compareCount>
- static bool MatchPatternAt(uint inputChar, char16 const * pat, CharCount index);
- template <>
- bool MatchPatternAt<1, 1>(uint inputChar, char16 const* pat, CharCount index)
- {
- return inputChar == pat[index];
- }
- template <>
- bool MatchPatternAt<CaseInsensitive::EquivClassSize, CaseInsensitive::EquivClassSize>(uint inputChar, char16 const * pat, CharCount index)
- {
- CompileAssert(CaseInsensitive::EquivClassSize == 4);
- return inputChar == pat[index * CaseInsensitive::EquivClassSize]
- || inputChar == pat[index * CaseInsensitive::EquivClassSize + 1]
- || inputChar == pat[index * CaseInsensitive::EquivClassSize + 2]
- || inputChar == pat[index * CaseInsensitive::EquivClassSize + 3];
- }
- template <>
- bool MatchPatternAt<CaseInsensitive::EquivClassSize, 1>(uint inputChar, char16 const * pat, CharCount index)
- {
- CompileAssert(CaseInsensitive::EquivClassSize == 4);
- return inputChar == pat[index * 4];
- }
- template <typename C>
- template <uint equivClassSize, uint lastPatCharEquivClass>
- bool TextbookBoyerMoore<C>::Match
- ( const Char *const input
- , const CharCount inputLength
- , CharCount& inputOffset
- , const Char* pat
- , const CharCount patLen
- #if ENABLE_REGEX_CONFIG_OPTIONS
- , RegexStats* stats
- #endif
- ) const
- {
- Assert(input != 0);
- Assert(inputOffset <= inputLength);
- if (inputLength < patLen)
- return false;
- CharCount offset = inputOffset;
- const CharCount endOffset = inputLength - (patLen - 1);
- const int32* const localGoodSuffix = goodSuffix;
- const LastOccMap* const localLastOccurrence = &lastOccurrence;
- const CharCount lastPatCharIndex = (patLen - 1);
- while (offset < endOffset)
- {
- // A separate tight loop to find the last character
- while (true)
- {
- uint inputChar = Chars<Char>::CTU(input[offset + lastPatCharIndex]);
- if (MatchPatternAt<equivClassSize, lastPatCharEquivClass>(inputChar, pat, lastPatCharIndex))
- {
- // Found a match. Break out of this loop and go to the match pattern loop
- break;
- }
- // Negative case is more common,
- // Write the checks so that we have a super tight loop
- int lastOcc;
- if (inputChar < localLastOccurrence->GetDirectMapSize())
- {
- if (!localLastOccurrence->IsInDirectMap(inputChar))
- {
- offset += patLen;
- if (offset >= endOffset)
- {
- return false;
- }
- continue;
- }
- lastOcc = localLastOccurrence->GetDirectMap(inputChar);
- }
- else if (!localLastOccurrence->GetNonDirect(inputChar, lastOcc))
- {
- offset += patLen;
- if (offset >= endOffset)
- {
- return false;
- }
- continue;
- }
- Assert((int)lastPatCharIndex - lastOcc >= localGoodSuffix[lastPatCharIndex]);
- offset += lastPatCharIndex - lastOcc;
- if (offset >= endOffset)
- {
- return false;
- }
- }
- // CONSIDER: we can remove this check if we stop using TextbookBoyerMoore for one char pattern
- if (lastPatCharIndex == 0)
- {
- inputOffset = offset;
- return true;
- }
- // Match the rest of the pattern
- int32 j = lastPatCharIndex - 1;
- while (true)
- {
- #if ENABLE_REGEX_CONFIG_OPTIONS
- if (stats != 0)
- stats->numCompares++;
- #endif
- uint inputChar = Chars<Char>::CTU(input[offset + j]);
- if (!MatchPatternAt<equivClassSize, equivClassSize>(inputChar, pat, j))
- {
- const int32 e = j - localLastOccurrence->Get((Char)inputChar);
- offset += e > localGoodSuffix[j] ? e : localGoodSuffix[j];
- break;
- }
- if (--j < 0)
- {
- inputOffset = offset;
- return true;
- }
- }
- }
- return false;
- }
- // Specialized linear char map version
- template <typename C>
- void TextbookBoyerMooreWithLinearMap<C>::Setup(ArenaAllocator * allocator, TextbookBoyerMooreSetup<C> const& setup)
- {
- Assert(setup.GetScheme() == TextbookBoyerMooreSetup<C>::LinearScheme);
- lastOccurrence.Set(setup.numLinearChars, setup.linearChar, setup.lastOcc);
- goodSuffix = TextbookBoyerMooreSetup<C>::GetGoodSuffix(allocator, setup.pat, setup.patLen);
- }
- template <typename C>
- void TextbookBoyerMooreWithLinearMap<C>::FreeBody(ArenaAllocator* allocator, CharCount patLen)
- {
- if(goodSuffix)
- {
- AdeleteArray(allocator, patLen + 1, goodSuffix);
- #if DBG
- goodSuffix = 0;
- #endif
- }
- }
- template <typename C>
- template <uint equivClassSize>
- bool TextbookBoyerMooreWithLinearMap<C>::Match
- ( const Char *const input
- , const CharCount inputLength
- , CharCount& inputOffset
- , const Char* pat
- , const CharCount patLen
- #if ENABLE_REGEX_CONFIG_OPTIONS
- , RegexStats* stats
- #endif
- ) const
- {
- CompileAssert(equivClassSize == 1);
- Assert(input != 0);
- Assert(inputOffset <= inputLength);
- if (inputLength < patLen)
- return false;
- const int32* const localGoodSuffix = goodSuffix;
- const LastOccMap* const localLastOccurrence = &lastOccurrence;
- CharCount offset = inputOffset;
- const CharCount lastPatCharIndex = (patLen - 1);
- const CharCount endOffset = inputLength - lastPatCharIndex;
- // Using int size instead of Char value is faster
- const uint lastPatChar = pat[lastPatCharIndex];
- Assert(lastPatChar == localLastOccurrence->GetChar(0));
- while (offset < endOffset)
- {
- // A separate tight loop to find the last character
- while (true)
- {
- #if ENABLE_REGEX_CONFIG_OPTIONS
- if (stats != 0)
- stats->numCompares++;
- #endif
- uint inputChar = Chars<Char>::CTU(input[offset + lastPatCharIndex]);
- if (inputChar == lastPatChar)
- {
- // Found a match. Break out of this loop and go to the match pattern loop
- break;
- }
- // Negative case is more common,
- // Write the checks so that we have a super tight loop
- Assert(inputChar != localLastOccurrence->GetChar(0));
- int32 lastOcc;
- if (localLastOccurrence->GetChar(1) != inputChar)
- {
- if (localLastOccurrence->GetChar(2) != inputChar)
- {
- if (localLastOccurrence->GetChar(3) != inputChar)
- {
- offset += patLen;
- if (offset >= endOffset)
- {
- return false;
- }
- continue;
- }
- lastOcc = localLastOccurrence->GetLastOcc(3);
- }
- else
- {
- lastOcc = localLastOccurrence->GetLastOcc(2);
- }
- }
- else
- {
- lastOcc = localLastOccurrence->GetLastOcc(1);
- }
- Assert((int)lastPatCharIndex - lastOcc >= localGoodSuffix[lastPatCharIndex]);
- offset += lastPatCharIndex - lastOcc;
- if (offset >= endOffset)
- {
- return false;
- }
- }
- // CONSIDER: we can remove this check if we stop using
- // TextbookBoyerMoore for one char pattern
- if (lastPatCharIndex == 0)
- {
- inputOffset = offset;
- return true;
- }
- // Match the rest of the pattern
- int32 j = lastPatCharIndex - 1;
- while (true)
- {
- #if ENABLE_REGEX_CONFIG_OPTIONS
- if (stats != 0)
- stats->numCompares++;
- #endif
- uint inputChar = Chars<Char>::CTU(input[offset + j]);
- if (inputChar != pat[j])
- {
- int goodSuffix = localGoodSuffix[j];
- Assert(patLen <= MaxCharCount);
- if (goodSuffix == (int)patLen)
- {
- offset += patLen;
- }
- else
- {
- const int32 e = j - localLastOccurrence->Get(inputChar);
- offset += e > goodSuffix ? e : goodSuffix;
- }
- break;
- }
- if (--j < 0)
- {
- inputOffset = offset;
- return true;
- }
- }
- }
- return false;
- }
- // explicit instantiation
- template struct TextbookBoyerMooreSetup<char16>;
- template class TextbookBoyerMoore<char16>;
- template class TextbookBoyerMooreWithLinearMap<char16>;
- template
- bool TextbookBoyerMoore<char16>::Match<1>
- ( const Char *const input
- , const CharCount inputLength
- , CharCount& inputOffset
- , const Char* pat
- , const CharCount patLen
- #if ENABLE_REGEX_CONFIG_OPTIONS
- , RegexStats* stats
- #endif
- ) const;
- template
- bool TextbookBoyerMoore<char16>::Match<CaseInsensitive::EquivClassSize>
- ( const Char *const input
- , const CharCount inputLength
- , CharCount& inputOffset
- , const Char* pat
- , const CharCount patLen
- #if ENABLE_REGEX_CONFIG_OPTIONS
- , RegexStats* stats
- #endif
- ) const;
- template
- bool TextbookBoyerMoore<char16>::Match<CaseInsensitive::EquivClassSize, 1>
- ( const Char *const input
- , const CharCount inputLength
- , CharCount& inputOffset
- , const Char* pat
- , const CharCount patLen
- #if ENABLE_REGEX_CONFIG_OPTIONS
- , RegexStats* stats
- #endif
- ) const;
- template
- bool TextbookBoyerMooreWithLinearMap<char16>::Match<1>
- ( const Char *const input
- , const CharCount inputLength
- , CharCount& inputOffset
- , const Char* pat
- , const CharCount patLen
- #if ENABLE_REGEX_CONFIG_OPTIONS
- , RegexStats* stats
- #endif
- ) const;
- }
|