CharSet.cpp 57 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873
  1. //-------------------------------------------------------------------------------------------------------
  2. // Copyright (C) Microsoft. All rights reserved.
  3. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
  4. //-------------------------------------------------------------------------------------------------------
  5. #include "ParserPch.h"
  6. namespace UnifiedRegex
  7. {
  8. // ----------------------------------------------------------------------
  9. // CharBitVec
  10. // ----------------------------------------------------------------------
  11. inline uint32 popcnt(uint32 x)
  12. {
  13. // sum set bits in every bit pair
  14. x -= (x >> 1) & 0x55555555u;
  15. // sum pairs into quads
  16. x = (x & 0x33333333u) + ((x >> 2) & 0x33333333u);
  17. // sum quads into octets
  18. x = (x + (x >> 4)) & 0x0f0f0f0fu;
  19. // sum octets into topmost octet
  20. x *= 0x01010101u;
  21. return x >> 24;
  22. }
  23. uint CharBitvec::Count() const
  24. {
  25. uint n = 0;
  26. for (int w = 0; w < vecSize; w++)
  27. {
  28. n += popcnt(vec[w]);
  29. }
  30. return n;
  31. }
  32. int CharBitvec::NextSet(int k) const
  33. {
  34. if (k < 0 || k >= Size)
  35. return -1;
  36. uint w = k / wordSize;
  37. uint o = k % wordSize;
  38. uint32 v = vec[w] >> o;
  39. do
  40. {
  41. if (v == 0)
  42. {
  43. k += wordSize - o;
  44. break;
  45. }
  46. else if ((v & 0x1) != 0)
  47. return k;
  48. else
  49. {
  50. v >>= 1;
  51. o++;
  52. k++;
  53. }
  54. }
  55. while (o < wordSize);
  56. w++;
  57. while (w < vecSize)
  58. {
  59. o = 0;
  60. v = vec[w];
  61. do
  62. {
  63. if (v == 0)
  64. {
  65. k += wordSize - o;
  66. break;
  67. }
  68. else if ((v & 0x1) != 0)
  69. return k;
  70. else
  71. {
  72. v >>= 1;
  73. o++;
  74. k++;
  75. }
  76. }
  77. while (o < wordSize);
  78. w++;
  79. }
  80. return -1;
  81. }
  82. int CharBitvec::NextClear(int k) const
  83. {
  84. if (k < 0 || k >= Size)
  85. return -1;
  86. uint w = k / wordSize;
  87. uint o = k % wordSize;
  88. uint32 v = vec[w] >> o;
  89. do
  90. {
  91. if (v == ones)
  92. {
  93. k += wordSize - o;
  94. break;
  95. }
  96. else if ((v & 0x1) == 0)
  97. return k;
  98. else
  99. {
  100. v >>= 1;
  101. o++;
  102. k++;
  103. }
  104. }
  105. while (o < wordSize);
  106. w++;
  107. while (w < vecSize)
  108. {
  109. o = 0;
  110. v = vec[w];
  111. do
  112. {
  113. if (v == ones)
  114. {
  115. k += wordSize - o;
  116. break;
  117. }
  118. else if ((v & 0x1) == 0)
  119. return k;
  120. else
  121. {
  122. v >>= 1;
  123. o++;
  124. k++;
  125. }
  126. }
  127. while (o < wordSize);
  128. w++;
  129. }
  130. return -1;
  131. }
  132. template <typename C>
  133. void CharBitvec::ToComplement(ArenaAllocator* allocator, uint base, CharSet<C>& result) const
  134. {
  135. int hi = -1;
  136. while (true)
  137. {
  138. // Find the next range of clear bits in vector
  139. int li = NextClear(hi + 1);
  140. if (li < 0)
  141. return;
  142. hi = NextSet(li + 1);
  143. if (hi < 0)
  144. hi = Size - 1;
  145. else
  146. {
  147. Assert(hi > 0);
  148. hi--;
  149. }
  150. // Add range as characters
  151. result.SetRange(allocator, Chars<C>::ITC(base + li), Chars<C>::ITC(base + hi));
  152. }
  153. }
  154. template <typename C>
  155. void CharBitvec::ToEquivClass(ArenaAllocator* allocator, uint base, uint& tblidx, CharSet<C>& result, codepoint_t baseOffset) const
  156. {
  157. int hi = -1;
  158. while (true)
  159. {
  160. // Find the next range of set bits in vector
  161. int li = NextSet(hi + 1);
  162. if (li < 0)
  163. return;
  164. hi = NextClear(li + 1);
  165. if (hi < 0)
  166. hi = Size - 1;
  167. else
  168. {
  169. Assert(hi > 0);
  170. hi--;
  171. }
  172. // Convert to character codes
  173. uint l = base + li + baseOffset;
  174. uint h = base + hi + baseOffset;
  175. do
  176. {
  177. uint acth;
  178. C equivl[CaseInsensitive::EquivClassSize];
  179. CaseInsensitive::RangeToEquivClass(tblidx, l, h, acth, equivl);
  180. uint n = acth - l;
  181. for (int i = 0; i < CaseInsensitive::EquivClassSize; i++)
  182. {
  183. result.SetRange(allocator, equivl[i], Chars<C>::Shift(equivl[i], n));
  184. }
  185. // Go around again for rest of this range
  186. l = acth + 1;
  187. }
  188. while (l <= h);
  189. }
  190. }
  191. // ----------------------------------------------------------------------
  192. // CharSetNode
  193. // ----------------------------------------------------------------------
  194. inline CharSetNode* CharSetNode::For(ArenaAllocator* allocator, int level)
  195. {
  196. if (level == 0)
  197. return Anew(allocator, CharSetLeaf);
  198. else
  199. return Anew(allocator, CharSetInner);
  200. }
  201. // ----------------------------------------------------------------------
  202. // CharSetFull
  203. // ----------------------------------------------------------------------
  204. CharSetFull CharSetFull::Instance;
  205. CharSetFull* const CharSetFull::TheFullNode = &CharSetFull::Instance;
  206. CharSetFull::CharSetFull() {}
  207. void CharSetFull::FreeSelf(ArenaAllocator* allocator)
  208. {
  209. Assert(this == TheFullNode);
  210. // Never allocated
  211. }
  212. CharSetNode* CharSetFull::Clone(ArenaAllocator* allocator) const
  213. {
  214. // Always shared
  215. return (CharSetNode*)this;
  216. }
  217. CharSetNode* CharSetFull::Set(ArenaAllocator* allocator, uint level, uint l, uint h)
  218. {
  219. return this;
  220. }
  221. CharSetNode* CharSetFull::ClearRange(ArenaAllocator* allocator, uint level, uint l, uint h)
  222. {
  223. AssertMsg(h <= lim(level), "The range for clearing provided is invalid for this level.");
  224. AssertMsg(l <= h, "Can't clear where lower is bigger than the higher.");
  225. if (l == 0 && h == lim(level))
  226. {
  227. return nullptr;
  228. }
  229. CharSetNode* toReturn = For(allocator, level);
  230. if (l > 0)
  231. {
  232. AssertVerify(toReturn->Set(allocator, level, 0, l - 1) == toReturn);
  233. }
  234. if (h < lim(level))
  235. {
  236. AssertVerify(toReturn->Set(allocator, level, h + 1, lim(level)) == toReturn);
  237. }
  238. return toReturn;
  239. }
  240. CharSetNode* CharSetFull::UnionInPlace(ArenaAllocator* allocator, uint level, const CharSetNode* other)
  241. {
  242. return this;
  243. }
  244. bool CharSetFull::Get(uint level, uint k) const
  245. {
  246. return true;
  247. }
  248. void CharSetFull::ToComplement(ArenaAllocator* allocator, uint level, uint base, CharSet<Char>& result) const
  249. {
  250. // Empty, so add nothing
  251. }
  252. void CharSetFull::ToEquivClassW(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<char16>& result) const
  253. {
  254. this->ToEquivClass<char16>(allocator, level, base, tblidx, result);
  255. }
  256. void CharSetFull::ToEquivClassCP(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<codepoint_t>& result, codepoint_t baseOffset) const
  257. {
  258. this->ToEquivClass<codepoint_t>(allocator, level, base, tblidx, result, baseOffset);
  259. }
  260. template <typename C>
  261. void CharSetFull::ToEquivClass(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<C>& result, codepoint_t baseOffset) const
  262. {
  263. uint l = base + (CharSetNode::levels - 1 == level ? 0xff : 0) + baseOffset;
  264. uint h = base + lim(level) + baseOffset;
  265. do
  266. {
  267. uint acth;
  268. C equivl[CaseInsensitive::EquivClassSize];
  269. CaseInsensitive::RangeToEquivClass(tblidx, l, h, acth, equivl);
  270. uint n = acth - l;
  271. for (int i = 0; i < CaseInsensitive::EquivClassSize; i++)
  272. {
  273. result.SetRange(allocator, equivl[i], Chars<C>::Shift(equivl[i], n));
  274. }
  275. // Go around again for rest of this range
  276. l = acth + 1;
  277. }
  278. while (l <= h);
  279. }
  280. bool CharSetFull::IsSubsetOf(uint level, const CharSetNode* other) const
  281. {
  282. Assert(other != nullptr);
  283. return other == TheFullNode;
  284. }
  285. bool CharSetFull::IsEqualTo(uint level, const CharSetNode* other) const
  286. {
  287. Assert(other != nullptr);
  288. return other == TheFullNode;
  289. }
  290. uint CharSetFull::Count(uint level) const
  291. {
  292. return lim(level) + 1;
  293. }
  294. _Success_(return)
  295. bool CharSetFull::GetNextRange(uint level, Char searchCharStart, _Out_ Char *outLowerChar, _Out_ Char *outHigherChar) const
  296. {
  297. Assert(searchCharStart < this->Count(level));
  298. *outLowerChar = searchCharStart;
  299. *outHigherChar = (Char)this->Count(level) - 1;
  300. return true;
  301. }
  302. #if DBG
  303. bool CharSetFull::IsLeaf() const
  304. {
  305. return false;
  306. }
  307. #endif
  308. // ----------------------------------------------------------------------
  309. // CharSetInner
  310. // ----------------------------------------------------------------------
  311. CharSetInner::CharSetInner()
  312. {
  313. for (uint i = 0; i < branchingPerInnerLevel; i++)
  314. children[i] = 0;
  315. }
  316. void CharSetInner::FreeSelf(ArenaAllocator* allocator)
  317. {
  318. for (uint i = 0; i < branchingPerInnerLevel; i++)
  319. {
  320. if (children[i] != 0)
  321. {
  322. children[i]->FreeSelf(allocator);
  323. #if DBG
  324. children[i] = 0;
  325. #endif
  326. }
  327. }
  328. Adelete(allocator, this);
  329. }
  330. CharSetNode* CharSetInner::Clone(ArenaAllocator* allocator) const
  331. {
  332. CharSetInner* res = Anew(allocator, CharSetInner);
  333. for (uint i = 0; i < branchingPerInnerLevel; i++)
  334. {
  335. if (children[i] != 0)
  336. res->children[i] = children[i]->Clone(allocator);
  337. }
  338. return res;
  339. }
  340. CharSetNode* CharSetInner::ClearRange(ArenaAllocator* allocator, uint level, uint l, uint h)
  341. {
  342. Assert(level > 0);
  343. AssertMsg(h <= lim(level), "The range for clearing provided is invalid for this level.");
  344. AssertMsg(l <= h, "Can't clear where lower is bigger than the higher.");
  345. if (l == 0 && h == lim(level))
  346. {
  347. return nullptr;
  348. }
  349. uint lowerIndex = innerIdx(level, l);
  350. uint higherIndex = innerIdx(level--, h);
  351. l = l & lim(level);
  352. h = h & lim(level);
  353. if (lowerIndex == higherIndex)
  354. {
  355. if (children[lowerIndex] != nullptr)
  356. {
  357. children[lowerIndex] = children[lowerIndex]->ClearRange(allocator, level, l, h);
  358. }
  359. }
  360. else
  361. {
  362. if (children[lowerIndex] != nullptr)
  363. {
  364. children[lowerIndex] = children[lowerIndex]->ClearRange(allocator, level, l, lim(level));
  365. }
  366. for (uint i = lowerIndex + 1; i < higherIndex; i++)
  367. {
  368. if (children[i] != nullptr)
  369. {
  370. children[i]->FreeSelf(allocator);
  371. }
  372. children[i] = nullptr;
  373. }
  374. if (children[higherIndex] != nullptr)
  375. {
  376. children[higherIndex] = children[higherIndex]->ClearRange(allocator, level, 0, h);
  377. }
  378. }
  379. for (int i = 0; i < branchingPerInnerLevel; i++)
  380. {
  381. if (children[i] != nullptr)
  382. {
  383. return this;
  384. }
  385. }
  386. return nullptr;
  387. }
  388. CharSetNode* CharSetInner::Set(ArenaAllocator* allocator, uint level, uint l, uint h)
  389. {
  390. Assert(level > 0);
  391. uint li = innerIdx(level, l);
  392. uint hi = innerIdx(level--, h);
  393. bool couldBeFull = true;
  394. if (li == hi)
  395. {
  396. if (children[li] == nullptr)
  397. {
  398. if (remain(level, l) == 0 && remain(level, h + 1) == 0)
  399. children[li] = CharSetFull::TheFullNode;
  400. else
  401. {
  402. children[li] = For(allocator, level);
  403. children[li] = children[li]->Set(allocator, level, l, h);
  404. couldBeFull = false;
  405. }
  406. }
  407. else
  408. children[li] = children[li]->Set(allocator, level, l, h);
  409. }
  410. else
  411. {
  412. if (children[li] == nullptr)
  413. {
  414. if (remain(level, l) == 0)
  415. children[li] = CharSetFull::TheFullNode;
  416. else
  417. {
  418. children[li] = For(allocator, level);
  419. children[li] = children[li]->Set(allocator, level, l, lim(level));
  420. couldBeFull = false;
  421. }
  422. }
  423. else
  424. children[li] = children[li]->Set(allocator, level, l, lim(level));
  425. for (uint i = li + 1; i < hi; i++)
  426. {
  427. if (children[i] != nullptr)
  428. children[i]->FreeSelf(allocator);
  429. children[i] = CharSetFull::TheFullNode;
  430. }
  431. if (children[hi] == nullptr)
  432. {
  433. if (remain(level, h + 1) == 0)
  434. children[hi] = CharSetFull::TheFullNode;
  435. else
  436. {
  437. children[hi] = For(allocator, level);
  438. children[hi] = children[hi]->Set(allocator, level, 0, h);
  439. couldBeFull = false;
  440. }
  441. }
  442. else
  443. children[hi] = children[hi]->Set(allocator, level, 0, h);
  444. }
  445. if (couldBeFull)
  446. {
  447. for (uint i = 0; i < branchingPerInnerLevel; i++)
  448. {
  449. if (children[i] != CharSetFull::TheFullNode)
  450. return this;
  451. }
  452. FreeSelf(allocator);
  453. return CharSetFull::TheFullNode;
  454. }
  455. else
  456. return this;
  457. }
  458. CharSetNode* CharSetInner::UnionInPlace(ArenaAllocator* allocator, uint level, const CharSetNode* other)
  459. {
  460. Assert(level > 0);
  461. Assert(other != nullptr && other != CharSetFull::TheFullNode && !other->IsLeaf());
  462. CharSetInner* otherInner = (CharSetInner*)other;
  463. level--;
  464. bool isFull = true;
  465. for (uint i = 0; i < branchingPerInnerLevel; i++)
  466. {
  467. if (otherInner->children[i] != nullptr)
  468. {
  469. if (otherInner->children[i] == CharSetFull::TheFullNode)
  470. {
  471. if (children[i] != nullptr)
  472. children[i]->FreeSelf(allocator);
  473. children[i] = CharSetFull::TheFullNode;
  474. }
  475. else
  476. {
  477. if (children[i] == nullptr)
  478. children[i] = For(allocator, level);
  479. children[i] = children[i]->UnionInPlace(allocator, level, otherInner->children[i]);
  480. if (children[i] != CharSetFull::TheFullNode)
  481. isFull = false;
  482. }
  483. }
  484. else if (children[i] != CharSetFull::TheFullNode)
  485. isFull = false;
  486. }
  487. if (isFull)
  488. {
  489. FreeSelf(allocator);
  490. return CharSetFull::TheFullNode;
  491. }
  492. else
  493. return this;
  494. }
  495. bool CharSetInner::Get(uint level, uint k) const
  496. {
  497. Assert(level > 0);
  498. uint i = innerIdx(level--, k);
  499. if (children[i] == nullptr)
  500. return false;
  501. else
  502. return children[i]->Get(level, k);
  503. }
  504. void CharSetInner::ToComplement(ArenaAllocator* allocator, uint level, uint base, CharSet<Char>& result) const
  505. {
  506. Assert(level > 0);
  507. level--;
  508. uint delta = lim(level) + 1;
  509. for (uint i = 0; i < branchingPerInnerLevel; i++)
  510. {
  511. if (children[i] == nullptr)
  512. // Caution: Part of the range for this child may overlap with direct vector
  513. result.SetRange(allocator, UTC(max(base, directSize)), UTC(base + delta - 1));
  514. else
  515. children[i]->ToComplement(allocator, level, base, result);
  516. base += delta;
  517. }
  518. }
  519. void CharSetInner::ToEquivClassW(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<char16>& result) const
  520. {
  521. Assert(level > 0);
  522. level--;
  523. uint delta = lim(level) + 1;
  524. for (uint i = 0; i < branchingPerInnerLevel; i++)
  525. {
  526. if (children[i] != nullptr)
  527. {
  528. children[i]->ToEquivClassW(allocator, level, base, tblidx, result);
  529. }
  530. base += delta;
  531. }
  532. }
  533. void CharSetInner::ToEquivClassCP(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<codepoint_t>& result, codepoint_t baseOffset) const
  534. {
  535. Assert(level > 0);
  536. level--;
  537. uint delta = lim(level) + 1;
  538. for (uint i = 0; i < branchingPerInnerLevel; i++)
  539. {
  540. if (children[i] != nullptr)
  541. {
  542. children[i]->ToEquivClassCP(allocator, level, base, tblidx, result, baseOffset);
  543. }
  544. base += delta;
  545. }
  546. }
  547. bool CharSetInner::IsSubsetOf(uint level, const CharSetNode* other) const
  548. {
  549. Assert(level > 0);
  550. Assert(other != nullptr && !other->IsLeaf());
  551. if (other == CharSetFull::TheFullNode)
  552. return true;
  553. level--;
  554. const CharSetInner* otherInner = (CharSetInner*)other;
  555. for (uint i = 0; i < branchingPerInnerLevel; i++)
  556. {
  557. if (children[i] != nullptr)
  558. {
  559. if (otherInner->children[i] == nullptr)
  560. return false;
  561. if (children[i]->IsSubsetOf(level, otherInner->children[i]))
  562. return false;
  563. }
  564. }
  565. return true;
  566. }
  567. bool CharSetInner::IsEqualTo(uint level, const CharSetNode* other) const
  568. {
  569. Assert(level > 0);
  570. Assert(other != nullptr && !other->IsLeaf());
  571. if (other == CharSetFull::TheFullNode)
  572. return false;
  573. level--;
  574. const CharSetInner* otherInner = (CharSetInner*)other;
  575. for (uint i = 0; i < branchingPerInnerLevel; i++)
  576. {
  577. if (children[i] != 0)
  578. {
  579. if (otherInner->children[i] == nullptr)
  580. return false;
  581. if (children[i]->IsSubsetOf(level, otherInner->children[i]))
  582. return false;
  583. }
  584. }
  585. return true;
  586. }
  587. uint CharSetInner::Count(uint level) const
  588. {
  589. uint n = 0;
  590. Assert(level > 0);
  591. level--;
  592. for (uint i = 0; i < branchingPerInnerLevel; i++)
  593. {
  594. if (children[i] != nullptr)
  595. n += children[i]->Count(level);
  596. }
  597. return n;
  598. }
  599. _Success_(return)
  600. bool CharSetInner::GetNextRange(uint level, Char searchCharStart, _Out_ Char *outLowerChar, _Out_ Char *outHigherChar) const
  601. {
  602. Assert(searchCharStart < this->lim(level) + 1);
  603. uint innerIndex = innerIdx(level--, searchCharStart);
  604. Char currentLowChar = 0, currentHighChar = 0;
  605. for (; innerIndex < branchingPerInnerLevel; innerIndex++)
  606. {
  607. if (children[innerIndex] != nullptr && children[innerIndex]->GetNextRange(level, (Char)remain(level, searchCharStart), &currentLowChar, &currentHighChar))
  608. {
  609. break;
  610. }
  611. if (innerIndex < branchingPerInnerLevel - 1)
  612. {
  613. searchCharStart = (Char)indexToValue(level + 1, innerIndex + 1, 0);
  614. }
  615. }
  616. if (innerIndex == branchingPerInnerLevel)
  617. {
  618. return false;
  619. }
  620. currentLowChar = (Char)indexToValue(level + 1, innerIndex, currentLowChar);
  621. currentHighChar = (Char)indexToValue(level + 1, innerIndex, currentHighChar);
  622. innerIndex += 1;
  623. for (; remain(level, currentHighChar) == lim(level) && innerIndex < branchingPerInnerLevel; innerIndex++)
  624. {
  625. Char tempLower, tempHigher;
  626. if (children[innerIndex] == nullptr || !children[innerIndex]->GetNextRange(level, 0x0, &tempLower, &tempHigher) || remain(level, tempLower) != 0)
  627. {
  628. break;
  629. }
  630. currentHighChar = (Char)indexToValue(level + 1, innerIndex, tempHigher);
  631. }
  632. *outLowerChar = currentLowChar;
  633. *outHigherChar = currentHighChar;
  634. return true;
  635. }
  636. #if DBG
  637. bool CharSetInner::IsLeaf() const
  638. {
  639. return false;
  640. }
  641. #endif
  642. // ----------------------------------------------------------------------
  643. // CharSetLeaf
  644. // ----------------------------------------------------------------------
  645. CharSetLeaf::CharSetLeaf()
  646. {
  647. vec.Clear();
  648. }
  649. void CharSetLeaf::FreeSelf(ArenaAllocator* allocator)
  650. {
  651. Adelete(allocator, this);
  652. }
  653. CharSetNode* CharSetLeaf::Clone(ArenaAllocator* allocator) const
  654. {
  655. return Anew(allocator, CharSetLeaf, *this);
  656. }
  657. CharSetNode* CharSetLeaf::Set(ArenaAllocator* allocator, uint level, uint l, uint h)
  658. {
  659. Assert(level == 0);
  660. vec.SetRange(leafIdx(l), leafIdx(h));
  661. if (vec.IsFull())
  662. {
  663. FreeSelf(allocator);
  664. return CharSetFull::TheFullNode;
  665. }
  666. else
  667. return this;
  668. }
  669. CharSetNode* CharSetLeaf::ClearRange(ArenaAllocator* allocator, uint level, uint l, uint h)
  670. {
  671. Assert(level == 0);
  672. AssertMsg(h <= lim(level), "The range for clearing provided is invalid for this level.");
  673. AssertMsg(l <= h, "Can't clear where lower is bigger than the higher.");
  674. if (l == 0 && h == lim(level))
  675. {
  676. return nullptr;
  677. }
  678. vec.ClearRange(leafIdx(l), leafIdx(h));
  679. if (vec.IsEmpty())
  680. {
  681. FreeSelf(allocator);
  682. return nullptr;
  683. }
  684. return this;
  685. }
  686. CharSetNode* CharSetLeaf::UnionInPlace(ArenaAllocator* allocator, uint level, const CharSetNode* other)
  687. {
  688. Assert(level == 0);
  689. Assert(other != nullptr && other->IsLeaf());
  690. CharSetLeaf* otherLeaf = (CharSetLeaf*)other;
  691. if (vec.UnionInPlaceFullCheck(otherLeaf->vec))
  692. {
  693. FreeSelf(allocator);
  694. return CharSetFull::TheFullNode;
  695. }
  696. else
  697. return this;
  698. }
  699. bool CharSetLeaf::Get(uint level, uint k) const
  700. {
  701. Assert(level == 0);
  702. return vec.Get(leafIdx(k));
  703. }
  704. void CharSetLeaf::ToComplement(ArenaAllocator* allocator, uint level, uint base, CharSet<Char>& result) const
  705. {
  706. Assert(level == 0);
  707. vec.ToComplement<char16>(allocator, base, result);
  708. }
  709. void CharSetLeaf::ToEquivClassW(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<char16>& result) const
  710. {
  711. this->ToEquivClass<char16>(allocator, level, base, tblidx, result);
  712. }
  713. void CharSetLeaf::ToEquivClassCP(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<codepoint_t>& result, codepoint_t baseOffset) const
  714. {
  715. this->ToEquivClass<codepoint_t>(allocator, level, base, tblidx, result, baseOffset);
  716. }
  717. template <typename C>
  718. void CharSetLeaf::ToEquivClass(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<C>& result, codepoint_t baseOffset) const
  719. {
  720. Assert(level == 0);
  721. vec.ToEquivClass<C>(allocator, base, tblidx, result, baseOffset);
  722. }
  723. bool CharSetLeaf::IsSubsetOf(uint level, const CharSetNode* other) const
  724. {
  725. Assert(level == 0);
  726. Assert(other != nullptr);
  727. if (other == CharSetFull::TheFullNode)
  728. return true;
  729. Assert(other->IsLeaf());
  730. CharSetLeaf* otherLeaf = (CharSetLeaf*)other;
  731. return vec.IsSubsetOf(otherLeaf->vec);
  732. }
  733. bool CharSetLeaf::IsEqualTo(uint level, const CharSetNode* other) const
  734. {
  735. Assert(level == 0);
  736. Assert(other != nullptr);
  737. if (other == CharSetFull::TheFullNode)
  738. return false;
  739. Assert(other->IsLeaf());
  740. CharSetLeaf* otherLeaf = (CharSetLeaf*)other;
  741. return vec.IsSubsetOf(otherLeaf->vec);
  742. }
  743. uint CharSetLeaf::Count(uint level) const
  744. {
  745. Assert(level == 0);
  746. return vec.Count();
  747. }
  748. _Success_(return)
  749. bool CharSetLeaf::GetNextRange(uint level, Char searchCharStart, _Out_ Char *outLowerChar, _Out_ Char *outHigherChar) const
  750. {
  751. Assert(searchCharStart < lim(level) + 1);
  752. int nextSet = vec.NextSet(searchCharStart);
  753. if (nextSet == -1)
  754. {
  755. return false;
  756. }
  757. *outLowerChar = (char16)nextSet;
  758. int nextClear = vec.NextClear(nextSet);
  759. *outHigherChar = UTC(nextClear == -1 ? lim(level) : nextClear - 1);
  760. return true;
  761. }
  762. #if DBG
  763. bool CharSetLeaf::IsLeaf() const
  764. {
  765. return true;
  766. }
  767. #endif
  768. // ----------------------------------------------------------------------
  769. // CharSet<char16>
  770. // ----------------------------------------------------------------------
  771. void CharSet<char16>::SwitchRepresentations(ArenaAllocator* allocator)
  772. {
  773. Assert(IsCompact());
  774. uint existCount = this->GetCompactLength();
  775. __assume(existCount <= MaxCompact);
  776. if (existCount <= MaxCompact)
  777. {
  778. Char existCs[MaxCompact];
  779. for (uint i = 0; i < existCount; i++)
  780. {
  781. existCs[i] = GetCompactChar(i);
  782. }
  783. rep.full.root = nullptr;
  784. rep.full.direct.Clear();
  785. for (uint i = 0; i < existCount; i++)
  786. Set(allocator, existCs[i]);
  787. }
  788. }
  789. void CharSet<char16>::Sort()
  790. {
  791. Assert(IsCompact());
  792. __assume(this->GetCompactLength() <= MaxCompact);
  793. for (uint i = 1; i < this->GetCompactLength(); i++)
  794. {
  795. uint curr = GetCompactCharU(i);
  796. for (uint j = 0; j < i; j++)
  797. {
  798. if (GetCompactCharU(j) > curr)
  799. {
  800. for (int k = i; k > (int)j; k--)
  801. {
  802. this->ReplaceCompactCharU(k, this->GetCompactCharU(k - 1));
  803. }
  804. this->ReplaceCompactCharU(j, curr);
  805. break;
  806. }
  807. }
  808. }
  809. }
  810. CharSet<char16>::CharSet()
  811. {
  812. Assert(sizeof(Node*) == sizeof(size_t));
  813. Assert(sizeof(CompactRep) == sizeof(FullRep));
  814. rep.compact.countPlusOne = 1;
  815. for (int i = 0; i < MaxCompact; i++)
  816. rep.compact.cs[i] = emptySlot;
  817. }
  818. void CharSet<char16>::FreeBody(ArenaAllocator* allocator)
  819. {
  820. if (!IsCompact() && rep.full.root != nullptr)
  821. {
  822. rep.full.root->FreeSelf(allocator);
  823. #if DBG
  824. rep.full.root = nullptr;
  825. #endif
  826. }
  827. }
  828. void CharSet<char16>::Clear(ArenaAllocator* allocator)
  829. {
  830. if (!IsCompact() && rep.full.root != nullptr)
  831. rep.full.root->FreeSelf(allocator);
  832. rep.compact.countPlusOne = 1;
  833. for (int i = 0; i < MaxCompact; i++)
  834. rep.compact.cs[i] = emptySlot;
  835. }
  836. void CharSet<char16>::CloneFrom(ArenaAllocator* allocator, const CharSet<Char>& other)
  837. {
  838. Clear(allocator);
  839. Assert(IsCompact());
  840. if (other.IsCompact())
  841. {
  842. this->SetCompactLength(other.GetCompactLength());
  843. for (uint i = 0; i < other.GetCompactLength(); i++)
  844. {
  845. this->ReplaceCompactCharU(i, other.GetCompactCharU(i));
  846. }
  847. }
  848. else
  849. {
  850. rep.full.root = other.rep.full.root == nullptr ? nullptr : other.rep.full.root->Clone(allocator);
  851. rep.full.direct.CloneFrom(other.rep.full.direct);
  852. }
  853. }
  854. void CharSet<char16>::CloneNonSurrogateCodeUnitsTo(ArenaAllocator* allocator, CharSet<Char>& other)
  855. {
  856. if (this->IsCompact())
  857. {
  858. for (uint i = 0; i < this->GetCompactLength(); i++)
  859. {
  860. Char c = this->GetCompactChar(i);
  861. uint uChar = CTU(c);
  862. if (uChar < 0xD800 || uChar > 0xDFFF)
  863. {
  864. other.Set(allocator, c);
  865. }
  866. }
  867. }
  868. else
  869. {
  870. other.rep.full.direct.CloneFrom(rep.full.direct);
  871. if (rep.full.root == nullptr)
  872. {
  873. other.rep.full.root = nullptr;
  874. }
  875. else
  876. {
  877. other.rep.full.root = rep.full.root->Clone(allocator);
  878. other.rep.full.root->ClearRange(allocator, CharSetNode::levels - 1, 0xD800, 0XDFFF);
  879. }
  880. }
  881. }
  882. void CharSet<char16>::CloneSurrogateCodeUnitsTo(ArenaAllocator* allocator, CharSet<Char>& other)
  883. {
  884. if (this->IsCompact())
  885. {
  886. for (uint i = 0; i < this->GetCompactLength(); i++)
  887. {
  888. Char c = this->GetCompactChar(i);
  889. uint uChar = CTU(c);
  890. if (0xD800 <= uChar && uChar <= 0xDFFF)
  891. {
  892. other.Set(allocator, c);
  893. }
  894. }
  895. }
  896. else
  897. {
  898. other.rep.full.direct.CloneFrom(rep.full.direct);
  899. if (rep.full.root == nullptr)
  900. {
  901. other.rep.full.root = nullptr;
  902. }
  903. else
  904. {
  905. other.rep.full.root = rep.full.root->Clone(allocator);
  906. other.rep.full.root->ClearRange(allocator, CharSetNode::levels - 1, 0, 0xD7FF);
  907. }
  908. }
  909. }
  910. void CharSet<char16>::SubtractRange(ArenaAllocator* allocator, Char lowerChar, Char higherChar)
  911. {
  912. uint lowerValue = CTU(lowerChar);
  913. uint higherValue = CTU(higherChar);
  914. if (higherValue < lowerValue)
  915. return;
  916. if (IsCompact())
  917. {
  918. for (uint i = 0; i < this->GetCompactLength(); )
  919. {
  920. uint value = this->GetCompactCharU(i);
  921. if (value >= lowerValue && value <= higherValue)
  922. {
  923. this->RemoveCompactChar(i);
  924. }
  925. else
  926. {
  927. i++;
  928. }
  929. }
  930. }
  931. else if(lowerValue == 0 && higherValue == MaxUChar)
  932. {
  933. this->Clear(allocator);
  934. }
  935. else
  936. {
  937. if (lowerValue < CharSetNode::directSize)
  938. {
  939. uint maxDirectValue = min(higherValue, CharSetNode::directSize - 1);
  940. rep.full.direct.ClearRange(lowerValue, maxDirectValue);
  941. }
  942. if (rep.full.root != nullptr)
  943. {
  944. rep.full.root = rep.full.root->ClearRange(allocator, CharSetNode::levels - 1, lowerValue, higherValue);
  945. }
  946. }
  947. }
  948. void CharSet<char16>::SetRange(ArenaAllocator* allocator, Char lc, Char hc)
  949. {
  950. uint l = CTU(lc);
  951. uint h = CTU(hc);
  952. if (h < l)
  953. return;
  954. if (IsCompact())
  955. {
  956. if (h - l < MaxCompact)
  957. {
  958. do
  959. {
  960. uint i;
  961. for (i = 0; i < this->GetCompactLength(); i++)
  962. {
  963. __assume(l <= MaxUChar);
  964. if (l <= MaxUChar && i < MaxCompact)
  965. {
  966. if (this->GetCompactCharU(i) == l)
  967. break;
  968. }
  969. }
  970. if (i == this->GetCompactLength())
  971. {
  972. // Character not already in compact set
  973. if (i < MaxCompact)
  974. {
  975. this->AddCompactCharU(l);
  976. }
  977. else
  978. // Must switch representations
  979. break;
  980. }
  981. l++;
  982. }
  983. while (l <= h);
  984. if (h < l)
  985. // All chars are now in compact set
  986. return;
  987. // else: fall-through to general case for remaining chars
  988. }
  989. // else: no use even trying
  990. SwitchRepresentations(allocator);
  991. }
  992. Assert(!IsCompact());
  993. if (l == 0 && h == MaxUChar)
  994. {
  995. rep.full.direct.SetRange(0, CharSetNode::directSize - 1);
  996. if (rep.full.root != nullptr)
  997. rep.full.root->FreeSelf(allocator);
  998. rep.full.root = CharSetFull::TheFullNode;
  999. }
  1000. else
  1001. {
  1002. if (l < CharSetNode::directSize)
  1003. {
  1004. if (h < CharSetNode::directSize)
  1005. {
  1006. rep.full.direct.SetRange(l, h);
  1007. return;
  1008. }
  1009. rep.full.direct.SetRange(l, CharSetNode::directSize - 1);
  1010. l = CharSetNode::directSize;
  1011. }
  1012. if (rep.full.root == nullptr)
  1013. rep.full.root = Anew(allocator, CharSetInner);
  1014. rep.full.root = rep.full.root->Set(allocator, CharSetNode::levels - 1, l, h);
  1015. }
  1016. }
  1017. void CharSet<char16>::SetRanges(ArenaAllocator* allocator, int numSortedPairs, const Char* sortedPairs)
  1018. {
  1019. for (int i = 0; i < numSortedPairs * 2; i += 2)
  1020. {
  1021. Assert(i == 0 || sortedPairs[i-1] < sortedPairs[i]);
  1022. Assert(sortedPairs[i] <= sortedPairs[i+1]);
  1023. SetRange(allocator, sortedPairs[i], sortedPairs[i+1]);
  1024. }
  1025. }
  1026. void CharSet<char16>::SetNotRanges(ArenaAllocator* allocator, int numSortedPairs, const Char* sortedPairs)
  1027. {
  1028. if (numSortedPairs == 0)
  1029. SetRange(allocator, MinChar, MaxChar);
  1030. else
  1031. {
  1032. if (sortedPairs[0] != MinChar)
  1033. SetRange(allocator, MinChar, sortedPairs[0] - 1);
  1034. for (int i = 1; i < numSortedPairs * 2 - 1; i += 2)
  1035. SetRange(allocator, sortedPairs[i] + 1, sortedPairs[i+1] - 1);
  1036. if (sortedPairs[numSortedPairs * 2 - 1] != MaxChar)
  1037. SetRange(allocator, sortedPairs[numSortedPairs * 2 - 1] + 1, MaxChar);
  1038. }
  1039. }
  1040. void CharSet<char16>::UnionInPlace(ArenaAllocator* allocator, const CharSet<Char>& other)
  1041. {
  1042. if (other.IsCompact())
  1043. {
  1044. for (uint i = 0; i < other.GetCompactLength(); i++)
  1045. {
  1046. Set(allocator, other.GetCompactChar(i));
  1047. }
  1048. return;
  1049. }
  1050. if (IsCompact())
  1051. SwitchRepresentations(allocator);
  1052. Assert(!IsCompact() && !other.IsCompact());
  1053. rep.full.direct.UnionInPlace(other.rep.full.direct);
  1054. if (other.rep.full.root != nullptr)
  1055. {
  1056. if (other.rep.full.root == CharSetFull::TheFullNode)
  1057. {
  1058. if (rep.full.root != nullptr)
  1059. rep.full.root->FreeSelf(allocator);
  1060. rep.full.root = CharSetFull::TheFullNode;
  1061. }
  1062. else
  1063. {
  1064. if (rep.full.root == nullptr)
  1065. rep.full.root = Anew(allocator, CharSetInner);
  1066. rep.full.root = rep.full.root->UnionInPlace(allocator, CharSetNode::levels - 1, other.rep.full.root);
  1067. }
  1068. }
  1069. }
  1070. _Success_(return)
  1071. bool CharSet<char16>::GetNextRange(Char searchCharStart, _Out_ Char *outLowerChar, _Out_ Char *outHigherChar)
  1072. {
  1073. int count = this->Count();
  1074. if (count == 0)
  1075. {
  1076. return false;
  1077. }
  1078. else if (count == 1)
  1079. {
  1080. Char singleton = this->Singleton();
  1081. if (singleton < searchCharStart)
  1082. {
  1083. return false;
  1084. }
  1085. *outLowerChar = *outHigherChar = singleton;
  1086. return true;
  1087. }
  1088. if (IsCompact())
  1089. {
  1090. this->Sort();
  1091. uint i = 0;
  1092. size_t compactLength = this->GetCompactLength();
  1093. for (; i < compactLength; i++)
  1094. {
  1095. Char nextChar = this->GetCompactChar(i);
  1096. if (nextChar >= searchCharStart)
  1097. {
  1098. *outLowerChar = *outHigherChar = nextChar;
  1099. break;
  1100. }
  1101. }
  1102. if (i == compactLength)
  1103. {
  1104. return false;
  1105. }
  1106. i++;
  1107. for (; i < compactLength; i++)
  1108. {
  1109. Char nextChar = this->GetCompactChar(i);
  1110. if (nextChar != *outHigherChar + 1)
  1111. {
  1112. return true;
  1113. }
  1114. *outHigherChar += 1;
  1115. }
  1116. return true;
  1117. }
  1118. else
  1119. {
  1120. bool found = false;
  1121. if (CTU(searchCharStart) < CharSetNode::directSize)
  1122. {
  1123. int nextSet = rep.full.direct.NextSet(searchCharStart);
  1124. if (nextSet != -1)
  1125. {
  1126. found = true;
  1127. *outLowerChar = (char16)nextSet;
  1128. int nextClear = rep.full.direct.NextClear(nextSet);
  1129. if (nextClear != -1)
  1130. {
  1131. *outHigherChar = UTC(nextClear - 1);
  1132. return true;
  1133. }
  1134. *outHigherChar = CharSetNode::directSize - 1;
  1135. }
  1136. }
  1137. if (rep.full.root == nullptr)
  1138. {
  1139. return found;
  1140. }
  1141. Char tempLowChar = 0, tempHighChar = 0;
  1142. if (found)
  1143. {
  1144. searchCharStart = *outHigherChar + 1;
  1145. }
  1146. else
  1147. {
  1148. searchCharStart = searchCharStart > CharSetNode::directSize ? searchCharStart : CharSetNode::directSize;
  1149. }
  1150. if (rep.full.root->GetNextRange(CharSetNode::levels - 1, searchCharStart, &tempLowChar, &tempHighChar) && (!found || tempLowChar == *outHigherChar + 1))
  1151. {
  1152. if (!found)
  1153. {
  1154. *outLowerChar = tempLowChar;
  1155. }
  1156. *outHigherChar = tempHighChar;
  1157. return true;
  1158. }
  1159. return found;
  1160. }
  1161. }
  1162. bool CharSet<char16>::Get_helper(uint k) const
  1163. {
  1164. Assert(!IsCompact());
  1165. CharSetNode* curr = rep.full.root;
  1166. for (int level = CharSetNode::levels - 1; level > 0; level--)
  1167. {
  1168. if (curr == CharSetFull::TheFullNode)
  1169. return true;
  1170. CharSetInner* inner = (CharSetInner*)curr;
  1171. uint i = CharSetNode::innerIdx(level, k);
  1172. if (inner->children[i] == 0)
  1173. return false;
  1174. else
  1175. curr = inner->children[i];
  1176. }
  1177. if (curr == CharSetFull::TheFullNode)
  1178. return true;
  1179. CharSetLeaf* leaf = (CharSetLeaf*)curr;
  1180. return leaf->vec.Get(CharSetNode::leafIdx(k));
  1181. }
  1182. void CharSet<char16>::ToComplement(ArenaAllocator* allocator, CharSet<Char>& result)
  1183. {
  1184. if (IsCompact())
  1185. {
  1186. Sort();
  1187. if (this->GetCompactLength() > 0)
  1188. {
  1189. if (this->GetCompactCharU(0) > 0)
  1190. result.SetRange(allocator, UTC(0), UTC(this->GetCompactCharU(0) - 1));
  1191. for (uint i = 0; i < this->GetCompactLength() - 1; i++)
  1192. {
  1193. result.SetRange(allocator, UTC(this->GetCompactCharU(i) + 1), UTC(this->GetCompactCharU(i + 1) - 1));
  1194. }
  1195. if (this->GetCompactCharU(this->GetCompactLength() - 1) < MaxUChar)
  1196. {
  1197. result.SetRange(allocator, UTC(this->GetCompactCharU(this->GetCompactLength() - 1) + 1), UTC(MaxUChar));
  1198. }
  1199. }
  1200. else if (this->GetCompactLength() == 0)
  1201. {
  1202. result.SetRange(allocator, UTC(0), UTC(MaxUChar));
  1203. }
  1204. }
  1205. else
  1206. {
  1207. rep.full.direct.ToComplement<char16>(allocator, 0, result);
  1208. if (rep.full.root == nullptr)
  1209. result.SetRange(allocator, UTC(CharSetNode::directSize), MaxChar);
  1210. else
  1211. rep.full.root->ToComplement(allocator, CharSetNode::levels - 1, 0, result);
  1212. }
  1213. }
  1214. void CharSet<char16>::ToEquivClass(ArenaAllocator* allocator, CharSet<Char>& result)
  1215. {
  1216. uint tblidx = 0;
  1217. if (IsCompact())
  1218. {
  1219. Sort();
  1220. for (uint i = 0; i < this->GetCompactLength(); i++)
  1221. {
  1222. uint acth;
  1223. Char equivs[CaseInsensitive::EquivClassSize];
  1224. if (CaseInsensitive::RangeToEquivClass(tblidx, this->GetCompactCharU(i), this->GetCompactCharU(i), acth, equivs))
  1225. {
  1226. for (int j = 0; j < CaseInsensitive::EquivClassSize; j++)
  1227. {
  1228. result.Set(allocator, equivs[j]);
  1229. }
  1230. }
  1231. else
  1232. {
  1233. result.Set(allocator, this->GetCompactChar(i));
  1234. }
  1235. }
  1236. }
  1237. else
  1238. {
  1239. rep.full.direct.ToEquivClass<char16>(allocator, 0, tblidx, result);
  1240. if (rep.full.root != nullptr)
  1241. {
  1242. rep.full.root->ToEquivClassW(allocator, CharSetNode::levels - 1, 0, tblidx, result);
  1243. }
  1244. }
  1245. }
  1246. void CharSet<char16>::ToEquivClassCP(ArenaAllocator* allocator, CharSet<codepoint_t>& result, codepoint_t baseOffset)
  1247. {
  1248. uint tblidx = 0;
  1249. if (IsCompact())
  1250. {
  1251. Sort();
  1252. for (uint i = 0; i < this->GetCompactLength(); i++)
  1253. {
  1254. uint acth;
  1255. codepoint_t equivs[CaseInsensitive::EquivClassSize];
  1256. if (CaseInsensitive::RangeToEquivClass(tblidx, this->GetCompactCharU(i) + baseOffset, this->GetCompactCharU(i) + baseOffset, acth, equivs))
  1257. {
  1258. for (int j = 0; j < CaseInsensitive::EquivClassSize; j++)
  1259. {
  1260. result.Set(allocator, equivs[j]);
  1261. }
  1262. }
  1263. else
  1264. {
  1265. result.Set(allocator, this->GetCompactChar(i) + baseOffset);
  1266. }
  1267. }
  1268. }
  1269. else
  1270. {
  1271. rep.full.direct.ToEquivClass<codepoint_t>(allocator, 0, tblidx, result, baseOffset);
  1272. if (rep.full.root != nullptr)
  1273. {
  1274. rep.full.root->ToEquivClassCP(allocator, CharSetNode::levels - 1, 0, tblidx, result, baseOffset);
  1275. }
  1276. }
  1277. }
  1278. int CharSet<char16>::GetCompactEntries(uint max, __out_ecount(max) Char* entries) const
  1279. {
  1280. Assert(max <= MaxCompact);
  1281. if (!IsCompact())
  1282. return -1;
  1283. uint count = min(max, (uint)(this->GetCompactLength()));
  1284. __analysis_assume(count <= max);
  1285. for (uint i = 0; i < count; i++)
  1286. {
  1287. // Bug in oacr. it can't figure out count is less than or equal to max
  1288. #pragma warning(suppress: 22102)
  1289. entries[i] = this->GetCompactChar(i);
  1290. }
  1291. return static_cast<int>(rep.compact.countPlusOne - 1);
  1292. }
  1293. bool CharSet<char16>::IsSubsetOf(const CharSet<Char>& other) const
  1294. {
  1295. if (IsCompact())
  1296. {
  1297. for (uint i = 0; i < this->GetCompactLength(); i++)
  1298. {
  1299. if (!other.Get(this->GetCompactChar(i)))
  1300. return false;
  1301. }
  1302. return true;
  1303. }
  1304. else
  1305. {
  1306. if (other.IsCompact())
  1307. return false;
  1308. if (!rep.full.direct.IsSubsetOf(other.rep.full.direct))
  1309. return false;
  1310. if (rep.full.root == nullptr)
  1311. return true;
  1312. if (other.rep.full.root == nullptr)
  1313. return false;
  1314. return rep.full.root->IsSubsetOf(CharSetNode::levels - 1, other.rep.full.root);
  1315. }
  1316. }
  1317. bool CharSet<char16>::IsEqualTo(const CharSet<Char>& other) const
  1318. {
  1319. if (IsCompact())
  1320. {
  1321. if (!other.IsCompact())
  1322. return false;
  1323. if (rep.compact.countPlusOne != other.rep.compact.countPlusOne)
  1324. return false;
  1325. for (uint i = 0; i < this->GetCompactLength(); i++)
  1326. {
  1327. if (!other.Get(this->GetCompactChar(i)))
  1328. return false;
  1329. }
  1330. return true;
  1331. }
  1332. else
  1333. {
  1334. if (other.IsCompact())
  1335. return false;
  1336. if (!rep.full.direct.IsEqualTo(other.rep.full.direct))
  1337. return false;
  1338. if ((rep.full.root == nullptr) != (other.rep.full.root == nullptr))
  1339. return false;
  1340. if (rep.full.root == nullptr)
  1341. return true;
  1342. return rep.full.root->IsEqualTo(CharSetNode::levels - 1, other.rep.full.root);
  1343. }
  1344. }
  1345. #if ENABLE_REGEX_CONFIG_OPTIONS
  1346. // CAUTION: This method is very slow.
  1347. void CharSet<char16>::Print(DebugWriter* w) const
  1348. {
  1349. w->Print(_u("["));
  1350. int start = -1;
  1351. for (uint i = 0; i < NumChars; i++)
  1352. {
  1353. if (Get(UTC(i)))
  1354. {
  1355. if (start < 0)
  1356. {
  1357. start = i;
  1358. w->PrintEscapedChar(UTC(i));
  1359. }
  1360. }
  1361. else
  1362. {
  1363. if (start >= 0)
  1364. {
  1365. if (i > (uint)(start + 1))
  1366. {
  1367. if (i > (uint)(start + 2))
  1368. w->Print(_u("-"));
  1369. w->PrintEscapedChar(UTC(i - 1));
  1370. }
  1371. start = -1;
  1372. }
  1373. }
  1374. }
  1375. if (start >= 0)
  1376. {
  1377. if ((uint)start < MaxUChar - 1)
  1378. w->Print(_u("-"));
  1379. w->PrintEscapedChar(MaxChar);
  1380. }
  1381. w->Print(_u("]"));
  1382. }
  1383. #endif
  1384. // ----------------------------------------------------------------------
  1385. // CharSet<codepoint_t>
  1386. // ----------------------------------------------------------------------
  1387. CharSet<codepoint_t>::CharSet()
  1388. {
  1389. #if DBG
  1390. for (int i = 0; i < NumberOfPlanes; i++)
  1391. {
  1392. this->characterPlanes[i].IsEmpty();
  1393. }
  1394. #endif
  1395. }
  1396. void CharSet<codepoint_t>::FreeBody(ArenaAllocator* allocator)
  1397. {
  1398. for (int i = 0; i < NumberOfPlanes; i++)
  1399. {
  1400. this->characterPlanes[i].FreeBody(allocator);
  1401. }
  1402. }
  1403. void CharSet<codepoint_t>::Clear(ArenaAllocator* allocator)
  1404. {
  1405. for (int i = 0; i < NumberOfPlanes; i++)
  1406. {
  1407. this->characterPlanes[i].Clear(allocator);
  1408. }
  1409. }
  1410. void CharSet<codepoint_t>::CloneFrom(ArenaAllocator* allocator, const CharSet<Char>& other)
  1411. {
  1412. for (int i = 0; i < NumberOfPlanes; i++)
  1413. {
  1414. this->characterPlanes[i].Clear(allocator);
  1415. this->characterPlanes[i].CloneFrom(allocator, other.characterPlanes[i]);
  1416. }
  1417. }
  1418. void CharSet<codepoint_t>::CloneSimpleCharsTo(ArenaAllocator* allocator, CharSet<char16>& other) const
  1419. {
  1420. other.CloneFrom(allocator, this->characterPlanes[0]);
  1421. }
  1422. void CharSet<codepoint_t>::SetRange(ArenaAllocator* allocator, Char lc, Char hc)
  1423. {
  1424. Assert(lc <= hc);
  1425. int lowerIndex = this->CharToIndex(lc);
  1426. int upperIndex = this->CharToIndex(hc);
  1427. if (lowerIndex == upperIndex)
  1428. {
  1429. this->characterPlanes[lowerIndex].SetRange(allocator, this->RemoveOffset(lc), this->RemoveOffset(hc));
  1430. }
  1431. else
  1432. {
  1433. // Do the partial ranges
  1434. char16 partialLower = this->RemoveOffset(lc);
  1435. char16 partialHigher = this->RemoveOffset(hc);
  1436. if (partialLower != 0)
  1437. {
  1438. this->characterPlanes[lowerIndex].SetRange(allocator, partialLower, Chars<char16>::MaxUChar);
  1439. lowerIndex++;
  1440. }
  1441. for(; lowerIndex < upperIndex; lowerIndex++)
  1442. {
  1443. this->characterPlanes[lowerIndex].SetRange(allocator, 0, Chars<char16>::MaxUChar);
  1444. }
  1445. this->characterPlanes[upperIndex].SetRange(allocator, 0, partialHigher);
  1446. }
  1447. }
  1448. void CharSet<codepoint_t>::SetRanges(ArenaAllocator* allocator, int numSortedPairs, const Char* sortedPairs)
  1449. {
  1450. for (int i = 0; i < numSortedPairs * 2; i += 2)
  1451. {
  1452. Assert(i == 0 || sortedPairs[i-1] < sortedPairs[i]);
  1453. Assert(sortedPairs[i] <= sortedPairs[i+1]);
  1454. SetRange(allocator, sortedPairs[i], sortedPairs[i+1]);
  1455. }
  1456. }
  1457. void CharSet<codepoint_t>::SetNotRanges(ArenaAllocator* allocator, int numSortedPairs, const Char* sortedPairs)
  1458. {
  1459. if (numSortedPairs == 0)
  1460. {
  1461. for (int i = 0; i < NumberOfPlanes; i++)
  1462. {
  1463. this->characterPlanes[i].SetRange(allocator, 0, Chars<char16>::MaxUChar);
  1464. }
  1465. }
  1466. else
  1467. {
  1468. if (sortedPairs[0] != MinChar)
  1469. {
  1470. SetRange(allocator, MinChar, sortedPairs[0] - 1);
  1471. }
  1472. for (int i = 1; i < numSortedPairs * 2 - 1; i += 2)
  1473. {
  1474. SetRange(allocator, sortedPairs[i] + 1, sortedPairs[i+1] - 1);
  1475. }
  1476. if (sortedPairs[numSortedPairs * 2 - 1] != MaxChar)
  1477. {
  1478. SetRange(allocator, sortedPairs[numSortedPairs * 2 - 1] + 1, MaxChar);
  1479. }
  1480. }
  1481. }
  1482. void CharSet<codepoint_t>::UnionInPlace(ArenaAllocator* allocator, const CharSet<Char>& other)
  1483. {
  1484. for (int i = 0; i < NumberOfPlanes; i++)
  1485. {
  1486. this->characterPlanes[i].UnionInPlace(allocator, other.characterPlanes[i]);
  1487. }
  1488. }
  1489. void CharSet<codepoint_t>::UnionInPlace(ArenaAllocator* allocator, const CharSet<char16>& other)
  1490. {
  1491. this->characterPlanes[0].UnionInPlace(allocator, other);
  1492. }
  1493. _Success_(return)
  1494. bool CharSet<codepoint_t>::GetNextRange(Char searchCharStart, _Out_ Char *outLowerChar, _Out_ Char *outHigherChar)
  1495. {
  1496. Assert(outLowerChar != nullptr);
  1497. Assert(outHigherChar != nullptr);
  1498. if (searchCharStart >= 0x110000)
  1499. {
  1500. return false;
  1501. }
  1502. char16 currentLowChar = 1, currentHighChar = 0;
  1503. int index = this->CharToIndex(searchCharStart);
  1504. char16 offsetLessSearchCharStart = this->RemoveOffset(searchCharStart);
  1505. for (; index < NumberOfPlanes; index++)
  1506. {
  1507. if (this->characterPlanes[index].GetNextRange(offsetLessSearchCharStart, &currentLowChar, &currentHighChar))
  1508. {
  1509. break;
  1510. }
  1511. offsetLessSearchCharStart = 0x0;
  1512. }
  1513. if (index == NumberOfPlanes)
  1514. {
  1515. return false;
  1516. }
  1517. Assert(currentHighChar >= currentLowChar);
  1518. // else found range
  1519. *outLowerChar = this->AddOffset(currentLowChar, index);
  1520. *outHigherChar = this->AddOffset(currentHighChar, index);
  1521. // Check if range crosses plane boundaries
  1522. index ++;
  1523. for (; index < NumberOfPlanes; index++)
  1524. {
  1525. if (!this->characterPlanes[index].GetNextRange(0x0, &currentLowChar, &currentHighChar) || *outHigherChar + 1 != this->AddOffset(currentLowChar, index))
  1526. {
  1527. break;
  1528. }
  1529. Assert(this->AddOffset(currentHighChar, index) > *outHigherChar);
  1530. *outHigherChar = this->AddOffset(currentHighChar, index);
  1531. }
  1532. return true;
  1533. }
  1534. void CharSet<codepoint_t>::ToComplement(ArenaAllocator* allocator, CharSet<Char>& result)
  1535. {
  1536. for (int i = 0; i < NumberOfPlanes; i++)
  1537. {
  1538. this->characterPlanes[i].ToComplement(allocator, result.characterPlanes[i]);
  1539. }
  1540. }
  1541. void CharSet<codepoint_t>::ToSimpleComplement(ArenaAllocator* allocator, CharSet<Char>& result)
  1542. {
  1543. this->characterPlanes[0].ToComplement(allocator, result.characterPlanes[0]);
  1544. }
  1545. void CharSet<codepoint_t>::ToSimpleComplement(ArenaAllocator* allocator, CharSet<char16>& result)
  1546. {
  1547. this->characterPlanes[0].ToComplement(allocator, result);
  1548. }
  1549. void CharSet<codepoint_t>::ToEquivClass(ArenaAllocator* allocator, CharSet<Char>& result)
  1550. {
  1551. for (int i = 0; i < NumberOfPlanes; i++)
  1552. {
  1553. this->characterPlanes[i].ToEquivClassCP(allocator, result, AddOffset(0, i));
  1554. }
  1555. }
  1556. void CharSet<codepoint_t>::ToSurrogateEquivClass(ArenaAllocator* allocator, CharSet<Char>& result)
  1557. {
  1558. this->CloneSimpleCharsTo(allocator, result.characterPlanes[0]);
  1559. for (int i = 1; i < NumberOfPlanes; i++)
  1560. {
  1561. this->characterPlanes[i].ToEquivClassCP(allocator, result, AddOffset(0, i));
  1562. }
  1563. }
  1564. #if ENABLE_REGEX_CONFIG_OPTIONS
  1565. void CharSet<codepoint_t>::Print(DebugWriter* w) const
  1566. {
  1567. w->Print(_u("Characters 0 - 65535"));
  1568. for (int i = 0; i < NumberOfPlanes; i++)
  1569. {
  1570. int base = (i + 1) * 0x10000;
  1571. w->Print(_u("Characters %d - %d"), base, base + 0xFFFF);
  1572. this->characterPlanes[i].Print(w);
  1573. }
  1574. }
  1575. #endif
  1576. // ----------------------------------------------------------------------
  1577. // RuntimeCharSet<char16>
  1578. // ----------------------------------------------------------------------
  1579. RuntimeCharSet<char16>::RuntimeCharSet()
  1580. {
  1581. root = nullptr;
  1582. direct.Clear();
  1583. }
  1584. void RuntimeCharSet<char16>::FreeBody(ArenaAllocator* allocator)
  1585. {
  1586. if (root != nullptr)
  1587. {
  1588. root->FreeSelf(allocator);
  1589. #if DBG
  1590. root = nullptr;
  1591. #endif
  1592. }
  1593. }
  1594. void RuntimeCharSet<char16>::CloneFrom(ArenaAllocator* allocator, const CharSet<Char>& other)
  1595. {
  1596. Assert(root == nullptr);
  1597. Assert(direct.Count() == 0);
  1598. if (other.IsCompact())
  1599. {
  1600. for (uint i = 0; i < other.GetCompactLength(); i++)
  1601. {
  1602. uint k = other.GetCompactCharU(i);
  1603. if (k < CharSetNode::directSize)
  1604. direct.Set(k);
  1605. else
  1606. {
  1607. if (root == nullptr)
  1608. root = Anew(allocator, CharSetInner);
  1609. #if DBG
  1610. CharSetNode* newRoot =
  1611. #endif
  1612. root->Set(allocator, CharSetNode::levels - 1, k, k);
  1613. #if DBG
  1614. // NOTE: Since we can add at most MaxCompact characters, we can never fill a leaf or inner node,
  1615. // thus we will never need to reallocated nodes
  1616. Assert(newRoot == root);
  1617. #endif
  1618. }
  1619. }
  1620. }
  1621. else
  1622. {
  1623. root = other.rep.full.root == nullptr ? nullptr : other.rep.full.root->Clone(allocator);
  1624. direct.CloneFrom(other.rep.full.direct);
  1625. }
  1626. }
  1627. bool RuntimeCharSet<char16>::Get_helper(uint k) const
  1628. {
  1629. CharSetNode* curr = root;
  1630. for (int level = CharSetNode::levels - 1; level > 0; level--)
  1631. {
  1632. if (curr == CharSetFull::TheFullNode)
  1633. return true;
  1634. CharSetInner* inner = (CharSetInner*)curr;
  1635. uint i = CharSetNode::innerIdx(level, k);
  1636. if (inner->children[i] == 0)
  1637. return false;
  1638. else
  1639. curr = inner->children[i];
  1640. }
  1641. if (curr == CharSetFull::TheFullNode)
  1642. return true;
  1643. CharSetLeaf* leaf = (CharSetLeaf*)curr;
  1644. return leaf->vec.Get(CharSetNode::leafIdx(k));
  1645. }
  1646. #if ENABLE_REGEX_CONFIG_OPTIONS
  1647. // CAUTION: This method is very slow.
  1648. void RuntimeCharSet<char16>::Print(DebugWriter* w) const
  1649. {
  1650. w->Print(_u("["));
  1651. int start = -1;
  1652. for (uint i = 0; i < NumChars; i++)
  1653. {
  1654. if (Get(UTC(i)))
  1655. {
  1656. if (start < 0)
  1657. {
  1658. start = i;
  1659. w->PrintEscapedChar(UTC(i));
  1660. }
  1661. }
  1662. else
  1663. {
  1664. if (start >= 0)
  1665. {
  1666. if (i > (uint)(start + 1))
  1667. {
  1668. if (i > (uint)(start + 2))
  1669. w->Print(_u("-"));
  1670. w->PrintEscapedChar(UTC(i - 1));
  1671. }
  1672. start = -1;
  1673. }
  1674. }
  1675. }
  1676. if (start >= 0)
  1677. {
  1678. if ((uint)start < MaxUChar - 1)
  1679. w->Print(_u("-"));
  1680. w->PrintEscapedChar(MaxChar);
  1681. }
  1682. w->Print(_u("]"));
  1683. }
  1684. #endif
  1685. }