OctoquadIdentifier.cpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411
  1. //-------------------------------------------------------------------------------------------------------
  2. // Copyright (C) Microsoft. All rights reserved.
  3. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
  4. //-------------------------------------------------------------------------------------------------------
  5. #include "ParserPch.h"
  6. namespace UnifiedRegex
  7. {
  8. // ----------------------------------------------------------------------
  9. // Trigrams
  10. // ----------------------------------------------------------------------
  11. TrigramInfo::TrigramInfo(__in_ecount(PatternLength) char* pat1,__in_ecount(PatternLength) char* pat2, Recycler* recycler)
  12. {
  13. isTrigramPattern=true;
  14. hasCachedResultString = false;
  15. int k;
  16. triPat1=0;
  17. triPat2=0;
  18. resultCount=0;
  19. for (k=3;k<PatternLength;k++) {
  20. triPat1=(triPat1<<4)+pat1[k];
  21. triPat2=(triPat2<<4)+pat2[k];
  22. }
  23. }
  24. void TrigramAlphabet::InitTrigramMap() {
  25. input=NULL;
  26. // set up mapping from 9 bits to trigram
  27. for (int i=0;i<TrigramMapSize;i++) {
  28. int t1=i>>6;
  29. int t2=(i>>3)&0x7;
  30. int t3=i&0x7;
  31. if ((t1>=AlphaCount)||(t2>=AlphaCount)||(t3>=AlphaCount)) {
  32. trigramMap[i]=TrigramNotInPattern;
  33. }
  34. else {
  35. // number of trigram
  36. trigramMap[i]=(char)((t1<<4)+(t2<<2)+t3);
  37. }
  38. }
  39. for (int j=0;j<TrigramCount;j++) {
  40. trigramStarts[j].count=0;
  41. }
  42. }
  43. bool TrigramAlphabet::AddStarts(__in_xcount(TrigramInfo::PatternLength) char* pat1,__in_xcount(TrigramInfo::PatternLength) char* pat2, RegexPattern* pattern)
  44. {
  45. for (int k=0;k<TrigramCount;k++) {
  46. char t1=1<<(k>>4);
  47. char t2=1<<((k>>2)&0x3);
  48. char t3=1<<(k&0x3);
  49. if ((t1&pat1[0])&&(t2&pat1[1])&&(t3&pat1[2])) {
  50. if ((t1&pat2[0])&&(t2&pat2[1])&&(t3&pat2[2])) {
  51. return false;
  52. }
  53. else {
  54. TrigramStart* trigramStart=(&trigramStarts[k]);
  55. if (trigramStart->count>=TrigramStart::MaxPatPerStart) {
  56. return false;
  57. }
  58. else {
  59. PatternTri* tri= &(trigramStart->patterns[trigramStart->count++]);
  60. tri->pattern=pattern;
  61. tri->encodedPattern=pattern->rep.unified.trigramInfo->triPat1;
  62. }
  63. }
  64. }
  65. else if ((t1&pat2[0])&&(t2&pat2[1])&&(t3&pat2[2])) {
  66. TrigramStart* trigramStart=(&trigramStarts[k]);
  67. if (trigramStart->count>=TrigramStart::MaxPatPerStart) {
  68. return false;
  69. }
  70. else {
  71. PatternTri* tri= &(trigramStart->patterns[trigramStart->count++]);
  72. tri->pattern=pattern;
  73. tri->encodedPattern=pattern->rep.unified.trigramInfo->triPat2;
  74. }
  75. }
  76. }
  77. return true;
  78. }
  79. void TrigramAlphabet::MegaMatch(__in_ecount(inputLen) const char16* input,int inputLen) {
  80. this->input=input;
  81. this->inputLen=inputLen;
  82. if (inputLen<TrigramInfo::PatternLength) {
  83. return;
  84. }
  85. // prime the pump
  86. unsigned char c1=alphaBits[input[0]&UpperCaseMask];
  87. unsigned char c2=alphaBits[input[1]&UpperCaseMask];
  88. unsigned char c3=alphaBits[input[2]&UpperCaseMask];
  89. // pump
  90. for (int k=3;k<inputLen-5;k++) {
  91. int index=(c1<<6)+(c2<<3)+c3;
  92. if (index<TrigramMapSize) {
  93. int t=trigramMap[index];
  94. if (t!=TrigramNotInPattern) {
  95. int count=trigramStarts[t].count;
  96. if (count>0) {
  97. int inputMask=0;
  98. bool validInput=true;
  99. for (int j=0;j<5;j++) {
  100. // ascii check
  101. if (input[k+j]<128) {
  102. int bits=alphaBits[input[k+j]&UpperCaseMask];
  103. if (bits==BitsNotInAlpha) {
  104. validInput=false;
  105. break;
  106. }
  107. inputMask=(inputMask<<AlphaCount)+(1<<bits);
  108. }
  109. else {
  110. validInput=false;
  111. break;
  112. }
  113. }
  114. if (validInput) {
  115. for (int j=0;j<count;j++) {
  116. PatternTri* tri= &(trigramStarts[t].patterns[j]);
  117. if ((inputMask&(tri->encodedPattern))==inputMask) {
  118. if (tri->pattern->rep.unified.trigramInfo->resultCount<TrigramInfo::MaxResults) {
  119. tri->pattern->rep.unified.trigramInfo->offsets[tri->pattern->rep.unified.trigramInfo->resultCount++]=k-3;
  120. }
  121. else {
  122. tri->pattern->rep.unified.trigramInfo->isTrigramPattern=false;
  123. }
  124. }
  125. }
  126. }
  127. }
  128. }
  129. }
  130. c1=c2;
  131. c2=c3;
  132. c3=alphaBits[input[k]&UpperCaseMask];
  133. }
  134. }
  135. // ----------------------------------------------------------------------
  136. // OctoquadIdentifier
  137. // ----------------------------------------------------------------------
  138. bool OctoquadIdentifier::Qualifies(const Program *const program)
  139. {
  140. return (program->flags & (GlobalRegexFlag | IgnoreCaseRegexFlag)) == (GlobalRegexFlag | IgnoreCaseRegexFlag);
  141. }
  142. OctoquadIdentifier::OctoquadIdentifier(
  143. const int numCodes,
  144. char (&codeToChar)[TrigramAlphabet::AlphaCount],
  145. char (&charToCode)[TrigramAlphabet::AsciiTableSize])
  146. : numCodes(numCodes),
  147. codeToChar(codeToChar),
  148. charToCode(charToCode),
  149. currPatternLength(0),
  150. currPatternNum(-1)
  151. {
  152. // 'patternBits' will be initialized as necessary
  153. }
  154. int OctoquadIdentifier::GetOrAddCharCode(const Char c)
  155. {
  156. if (c >= static_cast<Char>('A') && c <= static_cast<Char>('Z'))
  157. {
  158. for (int i = 0; i < numCodes; i++)
  159. {
  160. if (codeToChar[i] == static_cast<char>(c))
  161. return i;
  162. }
  163. if (numCodes == TrigramAlphabet::AlphaCount)
  164. return -1;
  165. codeToChar[numCodes] = static_cast<char>(c);
  166. charToCode[c] = static_cast<char>(numCodes);
  167. return numCodes++;
  168. }
  169. else
  170. return -1;
  171. }
  172. bool OctoquadIdentifier::BeginConcat()
  173. {
  174. if (currPatternNum >= 0 && currPatternLength != TrigramInfo::PatternLength)
  175. return false;
  176. if (currPatternNum >= NumPatterns)
  177. return false;
  178. currPatternNum++;
  179. currPatternLength = 0;
  180. return true;
  181. }
  182. bool OctoquadIdentifier::CouldAppend(const CharCount n) const
  183. {
  184. return n <= static_cast<CharCount>(TrigramInfo::PatternLength - currPatternLength);
  185. }
  186. bool OctoquadIdentifier::AppendChar(Char c)
  187. {
  188. if (currPatternLength >= TrigramInfo::PatternLength || currPatternNum < 0 || currPatternNum >= NumPatterns)
  189. return false;
  190. int code = GetOrAddCharCode(c);
  191. if (code < 0)
  192. return false;
  193. patternBits[currPatternNum][currPatternLength++] = 1 << code;
  194. return true;
  195. }
  196. bool OctoquadIdentifier::BeginUnions()
  197. {
  198. if(currPatternLength >= TrigramInfo::PatternLength || currPatternNum < 0 || currPatternNum >= NumPatterns)
  199. return false;
  200. patternBits[currPatternNum][currPatternLength] = 0;
  201. return true;
  202. }
  203. bool OctoquadIdentifier::UnionChar(Char c)
  204. {
  205. if (currPatternLength >= TrigramInfo::PatternLength || currPatternNum < 0 || currPatternNum >= NumPatterns)
  206. return false;
  207. int code = GetOrAddCharCode(c);
  208. if (code < 0)
  209. return false;
  210. patternBits[currPatternNum][currPatternLength] |= 1 << code;
  211. return true;
  212. }
  213. void OctoquadIdentifier::EndUnions()
  214. {
  215. Assert(currPatternLength < TrigramInfo::PatternLength);
  216. ++currPatternLength;
  217. }
  218. bool OctoquadIdentifier::IsOctoquad()
  219. {
  220. return
  221. numCodes == TrigramAlphabet::AlphaCount &&
  222. currPatternLength == TrigramInfo::PatternLength &&
  223. currPatternNum == NumPatterns - 1;
  224. }
  225. void OctoquadIdentifier::SetTrigramAlphabet(Js::ScriptContext * scriptContext,
  226. __in_xcount(regex::TrigramAlphabet::AlphaCount) char* alpha
  227. , __in_xcount(regex::TrigramAlphabet::AsciiTableSize) char* alphaBits)
  228. {
  229. ArenaAllocator* alloc = scriptContext->RegexAllocator();
  230. TrigramAlphabet * trigramAlphabet = AnewStruct(alloc, UnifiedRegex::TrigramAlphabet);
  231. for (uint i = 0; i < UnifiedRegex::TrigramAlphabet::AsciiTableSize; i++) {
  232. trigramAlphabet->alphaBits[i] = UnifiedRegex::TrigramAlphabet::BitsNotInAlpha;
  233. }
  234. for (uint i = 0; i < UnifiedRegex::TrigramAlphabet::AlphaCount; i++) {
  235. trigramAlphabet->alpha[i] = alpha[i];
  236. trigramAlphabet->alphaBits[alpha[i]] = alphaBits[alpha[i]];
  237. }
  238. trigramAlphabet->InitTrigramMap();
  239. scriptContext->SetTrigramAlphabet(trigramAlphabet);
  240. }
  241. void OctoquadIdentifier::InitializeTrigramInfo(Js::ScriptContext* scriptContext, RegexPattern* const pattern)
  242. {
  243. if(!scriptContext->GetTrigramAlphabet())
  244. {
  245. this->SetTrigramAlphabet(scriptContext, codeToChar, charToCode);
  246. }
  247. const auto recycler = scriptContext->GetRecycler();
  248. pattern->rep.unified.trigramInfo = RecyclerNew(recycler, TrigramInfo, patternBits[0], patternBits[1], recycler);
  249. pattern->rep.unified.trigramInfo->isTrigramPattern =
  250. scriptContext->GetTrigramAlphabet()->AddStarts(patternBits[0], patternBits[1], pattern);
  251. }
  252. // ----------------------------------------------------------------------
  253. // OctoquadMatcher
  254. // ----------------------------------------------------------------------
  255. OctoquadMatcher::OctoquadMatcher(const StandardChars<Char>* standardChars, CaseInsensitive::MappingSource mappingSource, OctoquadIdentifier* identifier)
  256. {
  257. for (int i = 0; i < TrigramAlphabet::AlphaCount; i++)
  258. codeToChar[i] = (Char)identifier->codeToChar[i];
  259. for (int i = 0; i < TrigramAlphabet::AsciiTableSize; i++)
  260. charToBits[i] = 0;
  261. for (int i = 0; i < TrigramAlphabet::AlphaCount; i++)
  262. {
  263. Char equivs[CaseInsensitive::EquivClassSize];
  264. standardChars->ToEquivs(mappingSource, codeToChar[i], equivs);
  265. for (int j = 0; j < CaseInsensitive::EquivClassSize; j++)
  266. {
  267. if (CTU(equivs[j]) < TrigramAlphabet::AsciiTableSize)
  268. charToBits[CTU(equivs[j])] = 1 << i;
  269. }
  270. }
  271. for (int i = 0; i < OctoquadIdentifier::NumPatterns; i++)
  272. {
  273. patterns[i] = 0;
  274. for (int j = 0; j < TrigramInfo::PatternLength; j++)
  275. {
  276. patterns[i] <<= 4;
  277. patterns[i] |= (uint32)identifier->patternBits[i][j];
  278. }
  279. }
  280. }
  281. OctoquadMatcher *OctoquadMatcher::New(
  282. Recycler* recycler,
  283. const StandardChars<Char>* standardChars,
  284. CaseInsensitive::MappingSource mappingSource,
  285. OctoquadIdentifier* identifier)
  286. {
  287. return RecyclerNewLeaf(recycler, OctoquadMatcher, standardChars, mappingSource, identifier);
  288. }
  289. // It exploits the fact that each quad of bits has at most only one bit set.
  290. __inline bool oneBitSetInEveryQuad(uint32 x)
  291. {
  292. x -= 0x11111111;
  293. return (x & 0x88888888u) == 0;
  294. }
  295. bool OctoquadMatcher::Match
  296. ( const Char* const input
  297. , const CharCount inputLength
  298. , CharCount& offset
  299. #if ENABLE_REGEX_CONFIG_OPTIONS
  300. , RegexStats* stats
  301. #endif
  302. )
  303. {
  304. Assert(TrigramInfo::PatternLength == 8);
  305. Assert(OctoquadIdentifier::NumPatterns == 2);
  306. if (inputLength < TrigramInfo::PatternLength)
  307. return false;
  308. if (offset > inputLength - TrigramInfo::PatternLength)
  309. return false;
  310. uint32 v = 0;
  311. for (int i = 0; i < TrigramInfo::PatternLength; i++)
  312. {
  313. #if ENABLE_REGEX_CONFIG_OPTIONS
  314. if (stats != 0)
  315. stats->numCompares++;
  316. #endif
  317. v <<= 4;
  318. if (CTU(input[offset + i]) < TrigramAlphabet::AsciiTableSize)
  319. v |= charToBits[CTU(input[offset + i])];
  320. }
  321. const uint32 lp = patterns[0];
  322. const uint32 rp = patterns[1];
  323. CharCount next = offset + TrigramInfo::PatternLength;
  324. while (true)
  325. {
  326. if (oneBitSetInEveryQuad(v & lp) || oneBitSetInEveryQuad(v & rp))
  327. {
  328. offset = next - TrigramInfo::PatternLength;
  329. return true;
  330. }
  331. if (next >= inputLength)
  332. return false;
  333. #if ENABLE_REGEX_CONFIG_OPTIONS
  334. if (stats != 0)
  335. stats->numCompares++;
  336. #endif
  337. v <<= 4;
  338. if (CTU(input[next]) < TrigramAlphabet::AsciiTableSize)
  339. v |= charToBits[CTU(input[next])];
  340. next++;
  341. }
  342. }
  343. #if ENABLE_REGEX_CONFIG_OPTIONS
  344. void OctoquadMatcher::Print(DebugWriter* w) const
  345. {
  346. for (int i = 0; i < OctoquadIdentifier::NumPatterns; i++)
  347. {
  348. if (i > 0)
  349. w->Print(_u("|"));
  350. for (int j = 0; j < TrigramInfo::PatternLength; j++)
  351. {
  352. uint8 v = (patterns[i] >> ((TrigramInfo::PatternLength - j - 1) * TrigramAlphabet::AlphaCount)) & 0xf;
  353. int n = 0;
  354. uint8 x = v;
  355. while (x > 0)
  356. {
  357. x &= x-1;
  358. n++;
  359. }
  360. if (n != 1)
  361. w->Print(_u("["));
  362. for (int k = 0; k < TrigramAlphabet::AlphaCount; k++)
  363. {
  364. if ((v & 1) == 1)
  365. w->PrintEscapedChar(codeToChar[k]);
  366. v >>= 1;
  367. }
  368. if (n != 1)
  369. w->Print(_u("]"));
  370. }
  371. }
  372. }
  373. #endif
  374. }