//------------------------------------------------------------------------------------------------------- // Copyright (C) Microsoft. All rights reserved. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information. //------------------------------------------------------------------------------------------------------- #include "ParserPch.h" namespace UnifiedRegex { template void TextbookBoyerMooreSetup::Init() { Assert(patLen > 0); for (uint i = 0; i < MaxCharMapLinearChars; i++) { lastOcc[i] = -1; } numLinearChars = 1; // Always put the last character in the first index linearChar[0] = pat[patLen - 1]; for (CharCount i = 0; i < patLen; i++) { if (numLinearChars <= MaxCharMapLinearChars) { uint j = 0; for (; j < numLinearChars; j++) { if (linearChar[j] == pat[i]) { lastOcc[j] = i; break; } } if (j == numLinearChars) { if (numLinearChars < MaxCharMapLinearChars) { linearChar[numLinearChars] = pat[i]; lastOcc[numLinearChars] = i; } numLinearChars++; } } if (numLinearChars > MaxCharMapLinearChars) { break; } } if (numLinearChars <= MaxCharMapLinearChars) { scheme = LinearScheme; } else { scheme = DefaultScheme; } } template void TextbookBoyerMoore::Setup(ArenaAllocator* allocator, TextbookBoyerMooreSetup const& info) { Assert(info.GetScheme() == TextbookBoyerMooreSetup::DefaultScheme); this->Setup(allocator, info.pat, info.patLen, 1); } template void TextbookBoyerMoore::Setup(ArenaAllocator * allocator, const Char * pat, CharCount patLen, int skip) { // character c |-> index of last occurrence of c in pat, otherwise -1 for (CharCount i = 0; i < patLen; i++) { for (int j = 0; j < skip; j++) lastOccurrence.Set(allocator, pat[i * skip + j], i); } goodSuffix = TextbookBoyerMooreSetup::GetGoodSuffix(allocator, pat, patLen, skip); } template int32 * TextbookBoyerMooreSetup::GetGoodSuffix(ArenaAllocator* allocator, const Char * pat, CharCount patLen, int skip) { // pat offset q |-> longest prefix of pat which is a proper suffix of pat[0..q] // (thanks to equivalence classes being in canonical order we only need to look at the first // character of each skip grouping in the pattern) int32* prefix = AnewArray(allocator, int32, patLen); prefix[0] = 0; int32 k = 0; for (CharCount q = 1; q < patLen; q++) { while (k > 0 && pat[k * skip] != pat[q * skip]) k = prefix[k - 1]; if (pat[k * skip] == pat[q * skip]) k++; prefix[q] = k; } // As above, but for rev(pat) int32* revPrefix = AnewArray(allocator, int32, patLen); revPrefix[0] = 0; k = 0; for (CharCount q = 1; q < patLen; q++) { while (k > 0 && pat[(patLen - k - 1) * skip] != pat[(patLen - q - 1) * skip]) k = revPrefix[k - 1]; if (pat[(patLen - k - 1) * skip] == pat[(patLen - q - 1) * skip]) k++; revPrefix[q] = k; } // pat prefix length l |-> least shift s.t. pat[0..l-1] is not mismatched int32 * goodSuffix = AnewArray(allocator, int32, patLen + 1); for (CharCount j = 0; j <= patLen; j++) goodSuffix[j] = patLen - prefix[patLen - 1]; for (CharCount l = 1; l <= patLen; l++) { CharCount j = patLen - revPrefix[l - 1]; int32 s = l - revPrefix[l - 1]; if (goodSuffix[j] > s) goodSuffix[j] = s; } // shift above one to the left for (CharCount j = 0; j < patLen; j++) goodSuffix[j] = goodSuffix[j + 1]; AdeleteArray(allocator, patLen, prefix); AdeleteArray(allocator, patLen, revPrefix); return goodSuffix; } template void TextbookBoyerMoore::FreeBody(ArenaAllocator* allocator, CharCount patLen) { if(goodSuffix) { AdeleteArray(allocator, patLen + 1, PointerValue(goodSuffix)); #if DBG goodSuffix = nullptr; #endif } lastOccurrence.FreeBody(allocator); } template static bool MatchPatternAt(uint inputChar, char16 const * pat, CharCount index); template <> bool MatchPatternAt<1, 1>(uint inputChar, char16 const* pat, CharCount index) { return inputChar == pat[index]; } template <> bool MatchPatternAt(uint inputChar, char16 const * pat, CharCount index) { CompileAssert(CaseInsensitive::EquivClassSize == 4); return inputChar == pat[index * CaseInsensitive::EquivClassSize] || inputChar == pat[index * CaseInsensitive::EquivClassSize + 1] || inputChar == pat[index * CaseInsensitive::EquivClassSize + 2] || inputChar == pat[index * CaseInsensitive::EquivClassSize + 3]; } template <> bool MatchPatternAt(uint inputChar, char16 const * pat, CharCount index) { CompileAssert(CaseInsensitive::EquivClassSize == 4); return inputChar == pat[index * 4]; } template template bool TextbookBoyerMoore::Match ( const Char *const input , const CharCount inputLength , CharCount& inputOffset , const Char* pat , const CharCount patLen #if ENABLE_REGEX_CONFIG_OPTIONS , RegexStats* stats #endif ) const { Assert(input != 0); Assert(inputOffset <= inputLength); if (inputLength < patLen) return false; CharCount offset = inputOffset; const CharCount endOffset = inputLength - (patLen - 1); const int32* const localGoodSuffix = goodSuffix; const LastOccMap* const localLastOccurrence = &lastOccurrence; const CharCount lastPatCharIndex = (patLen - 1); while (offset < endOffset) { // A separate tight loop to find the last character while (true) { uint inputChar = Chars::CTU(input[offset + lastPatCharIndex]); if (MatchPatternAt(inputChar, pat, lastPatCharIndex)) { // Found a match. Break out of this loop and go to the match pattern loop break; } // Negative case is more common, // Write the checks so that we have a super tight loop int lastOcc; if (inputChar < localLastOccurrence->GetDirectMapSize()) { if (!localLastOccurrence->IsInDirectMap(inputChar)) { offset += patLen; if (offset >= endOffset) { return false; } continue; } lastOcc = localLastOccurrence->GetDirectMap(inputChar); } else if (!localLastOccurrence->GetNonDirect(inputChar, lastOcc)) { offset += patLen; if (offset >= endOffset) { return false; } continue; } Assert((int)lastPatCharIndex - lastOcc >= localGoodSuffix[lastPatCharIndex]); offset += lastPatCharIndex - lastOcc; if (offset >= endOffset) { return false; } } // CONSIDER: we can remove this check if we stop using TextbookBoyerMoore for one char pattern if (lastPatCharIndex == 0) { inputOffset = offset; return true; } // Match the rest of the pattern int32 j = lastPatCharIndex - 1; while (true) { #if ENABLE_REGEX_CONFIG_OPTIONS if (stats != 0) stats->numCompares++; #endif uint inputChar = Chars::CTU(input[offset + j]); if (!MatchPatternAt(inputChar, pat, j)) { const int32 e = j - localLastOccurrence->Get((Char)inputChar); offset += e > localGoodSuffix[j] ? e : localGoodSuffix[j]; break; } if (--j < 0) { inputOffset = offset; return true; } } } return false; } // Specialized linear char map version template void TextbookBoyerMooreWithLinearMap::Setup(ArenaAllocator * allocator, TextbookBoyerMooreSetup const& setup) { Assert(setup.GetScheme() == TextbookBoyerMooreSetup::LinearScheme); lastOccurrence.Set(setup.numLinearChars, setup.linearChar, setup.lastOcc); goodSuffix = TextbookBoyerMooreSetup::GetGoodSuffix(allocator, setup.pat, setup.patLen); } template void TextbookBoyerMooreWithLinearMap::FreeBody(ArenaAllocator* allocator, CharCount patLen) { if(goodSuffix) { AdeleteArray(allocator, patLen + 1, goodSuffix); #if DBG goodSuffix = 0; #endif } } template template bool TextbookBoyerMooreWithLinearMap::Match ( const Char *const input , const CharCount inputLength , CharCount& inputOffset , const Char* pat , const CharCount patLen #if ENABLE_REGEX_CONFIG_OPTIONS , RegexStats* stats #endif ) const { CompileAssert(equivClassSize == 1); Assert(input != 0); Assert(inputOffset <= inputLength); if (inputLength < patLen) return false; const int32* const localGoodSuffix = goodSuffix; const LastOccMap* const localLastOccurrence = &lastOccurrence; CharCount offset = inputOffset; const CharCount lastPatCharIndex = (patLen - 1); const CharCount endOffset = inputLength - lastPatCharIndex; // Using int size instead of Char value is faster const uint lastPatChar = pat[lastPatCharIndex]; Assert(lastPatChar == localLastOccurrence->GetChar(0)); while (offset < endOffset) { // A separate tight loop to find the last character while (true) { #if ENABLE_REGEX_CONFIG_OPTIONS if (stats != 0) stats->numCompares++; #endif uint inputChar = Chars::CTU(input[offset + lastPatCharIndex]); if (inputChar == lastPatChar) { // Found a match. Break out of this loop and go to the match pattern loop break; } // Negative case is more common, // Write the checks so that we have a super tight loop Assert(inputChar != localLastOccurrence->GetChar(0)); int32 lastOcc; if (localLastOccurrence->GetChar(1) != inputChar) { if (localLastOccurrence->GetChar(2) != inputChar) { if (localLastOccurrence->GetChar(3) != inputChar) { offset += patLen; if (offset >= endOffset) { return false; } continue; } lastOcc = localLastOccurrence->GetLastOcc(3); } else { lastOcc = localLastOccurrence->GetLastOcc(2); } } else { lastOcc = localLastOccurrence->GetLastOcc(1); } Assert((int)lastPatCharIndex - lastOcc >= localGoodSuffix[lastPatCharIndex]); offset += lastPatCharIndex - lastOcc; if (offset >= endOffset) { return false; } } // CONSIDER: we can remove this check if we stop using // TextbookBoyerMoore for one char pattern if (lastPatCharIndex == 0) { inputOffset = offset; return true; } // Match the rest of the pattern int32 j = lastPatCharIndex - 1; while (true) { #if ENABLE_REGEX_CONFIG_OPTIONS if (stats != 0) stats->numCompares++; #endif uint inputChar = Chars::CTU(input[offset + j]); if (inputChar != pat[j]) { int goodSuffix = localGoodSuffix[j]; Assert(patLen <= MaxCharCount); if (goodSuffix == (int)patLen) { offset += patLen; } else { const int32 e = j - localLastOccurrence->Get(inputChar); offset += e > goodSuffix ? e : goodSuffix; } break; } if (--j < 0) { inputOffset = offset; return true; } } } return false; } // explicit instantiation template struct TextbookBoyerMooreSetup; template class TextbookBoyerMoore; template class TextbookBoyerMooreWithLinearMap; template bool TextbookBoyerMoore::Match<1> ( const Char *const input , const CharCount inputLength , CharCount& inputOffset , const Char* pat , const CharCount patLen #if ENABLE_REGEX_CONFIG_OPTIONS , RegexStats* stats #endif ) const; template bool TextbookBoyerMoore::Match ( const Char *const input , const CharCount inputLength , CharCount& inputOffset , const Char* pat , const CharCount patLen #if ENABLE_REGEX_CONFIG_OPTIONS , RegexStats* stats #endif ) const; template bool TextbookBoyerMoore::Match ( const Char *const input , const CharCount inputLength , CharCount& inputOffset , const Char* pat , const CharCount patLen #if ENABLE_REGEX_CONFIG_OPTIONS , RegexStats* stats #endif ) const; template bool TextbookBoyerMooreWithLinearMap::Match<1> ( const Char *const input , const CharCount inputLength , CharCount& inputOffset , const Char* pat , const CharCount patLen #if ENABLE_REGEX_CONFIG_OPTIONS , RegexStats* stats #endif ) const; }