CharSet.h 28 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796
  1. //-------------------------------------------------------------------------------------------------------
  2. // Copyright (C) Microsoft. All rights reserved.
  3. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
  4. //-------------------------------------------------------------------------------------------------------
  5. namespace UnifiedRegex
  6. {
  7. template <typename C>
  8. class CharSet;
  9. template <typename C>
  10. class RuntimeCharSet;
  11. class CharBitvec : private Chars<char>
  12. {
  13. public:
  14. static const int Width = Chars<char>::CharWidth;
  15. static const int Size = NumChars;
  16. private:
  17. static const int wordSize = sizeof(uint32) * 8;
  18. static const int vecSize = Size / wordSize;
  19. static const uint32 ones = (uint32)-1;
  20. static const uint8 oneBits[Size];
  21. uint32 vec[vecSize];
  22. inline static void setrng(uint32 &v, uint l, uint h)
  23. {
  24. uint w = h - l + 1;
  25. if (w == wordSize)
  26. {
  27. v = ones;
  28. }
  29. else
  30. {
  31. v |= ((1U << w) - 1) << l;
  32. }
  33. }
  34. inline static void clearrng(uint32 &v, uint l, uint h)
  35. {
  36. uint w = h - l + 1;
  37. if (w == wordSize)
  38. {
  39. v = 0;
  40. }
  41. else
  42. {
  43. v &= ~(((1U << w) - 1) << l);
  44. }
  45. }
  46. public:
  47. inline void CloneFrom(const CharBitvec& other)
  48. {
  49. for (int w = 0; w < vecSize; w++)
  50. {
  51. vec[w] = other.vec[w];
  52. }
  53. }
  54. inline void Clear()
  55. {
  56. for (int w = 0; w < vecSize; w++)
  57. {
  58. vec[w] = 0;
  59. }
  60. }
  61. inline void SetAll()
  62. {
  63. for (int w = 0; w < vecSize; w++)
  64. {
  65. vec[w] = ones;
  66. }
  67. }
  68. inline void Set(uint k)
  69. {
  70. Assert(k < Size);
  71. __assume(k < Size);
  72. if (k < Size)
  73. {
  74. vec[k / wordSize] |= 1U << (k % wordSize);
  75. }
  76. }
  77. inline void SetRange(uint l, uint h)
  78. {
  79. Assert(l < Size);
  80. Assert(h < Size);
  81. __assume(l < Size);
  82. __assume(h < Size);
  83. if (l < Size && h < Size)
  84. {
  85. if (l == h)
  86. {
  87. vec[l / wordSize] |= 1U << (l % wordSize);
  88. }
  89. else if (l < h)
  90. {
  91. int lw = l / wordSize;
  92. int hw = h / wordSize;
  93. int lo = l % wordSize;
  94. int hio = h % wordSize;
  95. if (lw == hw)
  96. {
  97. setrng(vec[lw], lo, hio);
  98. }
  99. else
  100. {
  101. setrng(vec[lw], lo, wordSize - 1);
  102. for (int w = lw + 1; w < hw; w++)
  103. {
  104. vec[w] = ones;
  105. }
  106. setrng(vec[hw], 0, hio);
  107. }
  108. }
  109. }
  110. }
  111. inline void ClearRange(uint l, uint h)
  112. {
  113. Assert(l < Size);
  114. Assert(h < Size);
  115. __assume(l < Size);
  116. __assume(h < Size);
  117. if (l < Size && h < Size)
  118. {
  119. if (l == h)
  120. {
  121. vec[l / wordSize] &= ~(1U << (l % wordSize));
  122. }
  123. else if (l < h)
  124. {
  125. int lw = l / wordSize;
  126. int hw = h / wordSize;
  127. int lo = l % wordSize;
  128. int hio = h % wordSize;
  129. if (lw == hw)
  130. {
  131. clearrng(vec[lw], lo, hio);
  132. }
  133. else
  134. {
  135. clearrng(vec[lw], lo, wordSize - 1);
  136. for (int w = lw + 1; w < hw; w++)
  137. {
  138. vec[w] = 0;
  139. }
  140. clearrng(vec[hw], 0, hio);
  141. }
  142. }
  143. }
  144. }
  145. inline bool IsEmpty()
  146. {
  147. for (int i = 0; i < vecSize; i++)
  148. {
  149. if (vec[i] != 0)
  150. {
  151. return false;
  152. }
  153. }
  154. return true;
  155. }
  156. inline void UnionInPlace(const CharBitvec& other)
  157. {
  158. for (int w = 0; w < vecSize; w++)
  159. {
  160. vec[w] |= other.vec[w];
  161. }
  162. }
  163. inline bool UnionInPlaceFullCheck(const CharBitvec& other)
  164. {
  165. bool isFull = true;
  166. for (int w = 0; w < vecSize; w++)
  167. {
  168. vec[w] |= other.vec[w];
  169. if (vec[w] != ones)
  170. {
  171. isFull = false;
  172. }
  173. }
  174. return isFull;
  175. }
  176. inline bool Get(uint k) const
  177. {
  178. Assert(k < Size);
  179. __assume(k < Size);
  180. return ((vec[k / wordSize] >> (k % wordSize)) & 1) != 0;
  181. }
  182. inline bool IsFull() const
  183. {
  184. for (int w = 0; w < vecSize; w++)
  185. {
  186. if (vec[w] != ones)
  187. {
  188. return false;
  189. }
  190. }
  191. return true;
  192. }
  193. inline bool IsSubsetOf(const CharBitvec& other) const
  194. {
  195. for (int w = 0; w < vecSize; w++)
  196. {
  197. uint32 v = other.vec[w];
  198. if (v != (vec[w] | v))
  199. {
  200. return false;
  201. }
  202. }
  203. return true;
  204. }
  205. inline bool IsEqualTo(const CharBitvec& other) const
  206. {
  207. for (int w = 0; w < vecSize; w++)
  208. {
  209. if (vec[w] != other.vec[w])
  210. {
  211. return false;
  212. }
  213. }
  214. return true;
  215. }
  216. uint Count() const;
  217. int NextSet(int k) const;
  218. int NextClear(int k) const;
  219. template <typename C>
  220. void ToComplement(ArenaAllocator* allocator, uint base, CharSet<C>& result) const;
  221. template <typename C>
  222. void ToEquivClass(ArenaAllocator* allocator, uint base, uint& tblidx, CharSet<C>& result, codepoint_t baseOffset = 0x0) const;
  223. };
  224. template <typename C>
  225. class CharSet {};
  226. struct CharSetNode : protected Chars<char16>
  227. {
  228. static const int directBits = Chars<char>::CharWidth;
  229. static const uint directSize = Chars<char>::NumChars;
  230. static const uint bitsPerInnerLevel = 4;
  231. static const uint branchingPerInnerLevel = 1 << bitsPerInnerLevel;
  232. static const uint innerMask = branchingPerInnerLevel - 1;
  233. static const int bitsPerLeafLevel = CharBitvec::Width;
  234. static const int branchingPerLeafLevel = CharBitvec::Size;
  235. static const uint leafMask = branchingPerLeafLevel - 1;
  236. static const uint levels = 1 + (CharWidth - bitsPerLeafLevel) / bitsPerInnerLevel;
  237. inline static uint innerIdx(uint level, uint v)
  238. {
  239. return (v >> ((level + 1) * bitsPerInnerLevel)) & innerMask;
  240. }
  241. inline static uint indexToValue(uint level, uint index, uint offset)
  242. {
  243. Assert((index & innerMask) == index);
  244. Assert((uint)(1 << ((level + 1) * bitsPerInnerLevel)) > offset);
  245. return (index << ((level + 1) * bitsPerInnerLevel)) + offset;
  246. }
  247. inline static uint leafIdx(uint v)
  248. {
  249. return v & leafMask;
  250. }
  251. inline static uint lim(uint level)
  252. {
  253. return (1U << (bitsPerLeafLevel + level * bitsPerInnerLevel)) - 1;
  254. }
  255. inline static uint remain(uint level, uint v)
  256. {
  257. return v & lim(level);
  258. }
  259. virtual void FreeSelf(ArenaAllocator* allocator) = 0;
  260. virtual CharSetNode* Clone(ArenaAllocator* allocator) const = 0;
  261. virtual CharSetNode* Set(ArenaAllocator* allocator, uint level, uint l, uint h) = 0;
  262. virtual CharSetNode* ClearRange(ArenaAllocator* allocator, uint level, uint l, uint h) = 0;
  263. virtual CharSetNode* UnionInPlace(ArenaAllocator* allocator, uint level, const CharSetNode* other) = 0;
  264. virtual bool Get(uint level, uint k) const = 0;
  265. virtual void ToComplement(ArenaAllocator* allocator, uint level, uint base, CharSet<Char>& result) const = 0;
  266. virtual void ToEquivClassW(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<char16>& result) const = 0;
  267. virtual void ToEquivClassCP(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<codepoint_t>& result, codepoint_t baseOffset) const = 0;
  268. virtual bool IsSubsetOf(uint level, const CharSetNode* other) const = 0;
  269. virtual bool IsEqualTo(uint level, const CharSetNode* other) const = 0;
  270. virtual uint Count(uint level) const = 0;
  271. _Success_(return) virtual bool GetNextRange(uint level, Char searchCharStart, _Out_ Char *outLowerChar, _Out_ Char *outHigherChar) const = 0;
  272. #if DBG
  273. virtual bool IsLeaf() const = 0;
  274. #endif
  275. static CharSetNode* For(ArenaAllocator* allocator, int level);
  276. };
  277. struct CharSetFull : CharSetNode
  278. {
  279. private:
  280. template <typename C>
  281. void ToEquivClass(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<C>& result, codepoint_t baseOffset = 0x0) const;
  282. public:
  283. static CharSetFull Instance;
  284. static CharSetFull * const TheFullNode;
  285. CharSetFull();
  286. void FreeSelf(ArenaAllocator* allocator) override;
  287. CharSetNode* Clone(ArenaAllocator* allocator) const override;
  288. CharSetNode* Set(ArenaAllocator* allocator, uint level, uint l, uint h) override;
  289. CharSetNode* ClearRange(ArenaAllocator* allocator, uint level, uint l, uint h) override;
  290. CharSetNode* UnionInPlace(ArenaAllocator* allocator, uint level, const CharSetNode* other) override;
  291. bool Get(uint level, uint k) const override;
  292. void ToComplement(ArenaAllocator* allocator, uint level, uint base, CharSet<Char>& result) const override;
  293. void ToEquivClassW(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<char16>& result) const override;
  294. void ToEquivClassCP(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<codepoint_t>& result, codepoint_t baseOffset) const override;
  295. bool IsSubsetOf(uint level, const CharSetNode* other) const override;
  296. bool IsEqualTo(uint level, const CharSetNode* other) const override;
  297. uint Count(uint level) const override;
  298. _Success_(return) bool GetNextRange(uint level, Char searchCharStart, _Out_ Char *outLowerChar, _Out_ Char *outHigherChar) const override;
  299. #if DBG
  300. bool IsLeaf() const override;
  301. #endif
  302. };
  303. struct CharSetInner sealed : CharSetNode
  304. {
  305. private:
  306. template <typename C>
  307. void ToEquivClass(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<C>& result) const;
  308. public:
  309. CharSetNode * children[branchingPerInnerLevel];
  310. CharSetInner();
  311. void FreeSelf(ArenaAllocator* allocator) override;
  312. CharSetNode* Clone(ArenaAllocator* allocator) const override;
  313. CharSetNode* Set(ArenaAllocator* allocator, uint level, uint l, uint h) override;
  314. CharSetNode* ClearRange(ArenaAllocator* allocator, uint level, uint l, uint h) override;
  315. CharSetNode* UnionInPlace(ArenaAllocator* allocator, uint level, const CharSetNode* other) override;
  316. bool Get(uint level, uint k) const override;
  317. void ToComplement(ArenaAllocator* allocator, uint level, uint base, CharSet<Char>& result) const override;
  318. void ToEquivClassW(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<char16>& result) const override;
  319. void ToEquivClassCP(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<codepoint_t>& result, codepoint_t baseOffset) const override;
  320. bool IsSubsetOf(uint level, const CharSetNode* other) const override;
  321. bool IsEqualTo(uint level, const CharSetNode* other) const override;
  322. uint Count(uint level) const override;
  323. _Success_(return) bool GetNextRange(uint level, Char searchCharStart, _Out_ Char *outLowerChar, _Out_ Char *outHigherChar) const override;
  324. #if DBG
  325. bool IsLeaf() const override;
  326. #endif
  327. };
  328. struct CharSetLeaf sealed : CharSetNode
  329. {
  330. private:
  331. template <typename C>
  332. void ToEquivClass(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<C>& result, codepoint_t baseOffset = 0x0) const;
  333. public:
  334. CharBitvec vec;
  335. CharSetLeaf();
  336. void FreeSelf(ArenaAllocator* allocator) override;
  337. CharSetNode* Clone(ArenaAllocator* allocator) const override;
  338. CharSetNode* Set(ArenaAllocator* allocator, uint level, uint l, uint h) override;
  339. CharSetNode* ClearRange(ArenaAllocator* allocator, uint level, uint l, uint h) override;
  340. CharSetNode* UnionInPlace(ArenaAllocator* allocator, uint level, const CharSetNode* other) override;
  341. bool Get(uint level, uint k) const override;
  342. void ToComplement(ArenaAllocator* allocator, uint level, uint base, CharSet<Char>& result) const override;
  343. void ToEquivClassW(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<char16>& result) const override;
  344. void ToEquivClassCP(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<codepoint_t>& result, codepoint_t baseOffset) const override;
  345. bool IsSubsetOf(uint level, const CharSetNode* other) const override;
  346. bool IsEqualTo(uint level, const CharSetNode* other) const override;
  347. uint Count(uint level) const override;
  348. _Success_(return) bool GetNextRange(uint level, Char searchCharStart, _Out_ Char *outLowerChar, _Out_ Char *outHigherChar) const override;
  349. #if DBG
  350. bool IsLeaf() const override;
  351. #endif
  352. };
  353. template <>
  354. class CharSet<char16> : private Chars<char16>
  355. {
  356. public:
  357. static const uint MaxCompact = 4;
  358. static const uint emptySlot = (uint)-1;
  359. struct CompactRep
  360. {
  361. // 1 + number of distinct characters, 1..MaxCompact+1
  362. size_t countPlusOne;
  363. // Characters, in no particular order, or (uint)-1 for tail empty slots
  364. uint cs[MaxCompact];
  365. uint8 padding[sizeof(CharBitvec) - sizeof(uint) * MaxCompact];
  366. };
  367. struct FullRep
  368. {
  369. // Trie for remaining characters. Pointer value will be 0 or >> MaxCompact.
  370. CharSetNode* root;
  371. // Entries for first 256 characters
  372. CharBitvec direct;
  373. };
  374. union Rep
  375. {
  376. struct CompactRep compact;
  377. struct FullRep full;
  378. } rep;
  379. static const int compactSize = sizeof(CompactRep);
  380. static const int fullSize = sizeof(FullRep);
  381. inline bool IsCompact() const { return rep.compact.countPlusOne - 1 <= MaxCompact; }
  382. void SwitchRepresentations(ArenaAllocator* allocator);
  383. void Sort();
  384. public:
  385. CharSet();
  386. void FreeBody(ArenaAllocator* allocator);
  387. void Clear(ArenaAllocator* allocator);
  388. void CloneFrom(ArenaAllocator* allocator, const CharSet<Char>& other);
  389. void CloneNonSurrogateCodeUnitsTo(ArenaAllocator* allocator, CharSet<Char>& other);
  390. void CloneSurrogateCodeUnitsTo(ArenaAllocator* allocator, CharSet<Char>& other);
  391. inline void Set(ArenaAllocator* allocator, Char kc) { SetRange(allocator, kc, kc); }
  392. void SetRange(ArenaAllocator* allocator, Char lc, Char hc);
  393. void SubtractRange(ArenaAllocator* allocator, Char lc, Char hc);
  394. void SetRanges(ArenaAllocator* allocator, int numSortedPairs, const Char* sortedPairs);
  395. void SetNotRanges(ArenaAllocator* allocator, int numSortedPairs, const Char* sortedPairs);
  396. void UnionInPlace(ArenaAllocator* allocator, const CharSet<Char>& other);
  397. _Success_(return) bool GetNextRange(Char searchCharStart, _Out_ Char *outLowerChar, _Out_ Char *outHigherChar);
  398. bool Get_helper(uint k) const;
  399. inline bool Get(Char kc) const
  400. {
  401. if (IsCompact())
  402. {
  403. Assert(MaxCompact == 4);
  404. return rep.compact.cs[0] == CTU(kc) ||
  405. rep.compact.cs[1] == CTU(kc) ||
  406. rep.compact.cs[2] == CTU(kc) ||
  407. rep.compact.cs[3] == CTU(kc);
  408. }
  409. else
  410. {
  411. if (CTU(kc) < CharSetNode::directSize)
  412. {
  413. return rep.full.direct.Get(CTU(kc));
  414. }
  415. else if (rep.full.root == 0)
  416. {
  417. return false;
  418. }
  419. else
  420. {
  421. return Get_helper(CTU(kc));
  422. }
  423. }
  424. }
  425. inline bool IsEmpty() const
  426. {
  427. return rep.compact.countPlusOne == 1;
  428. }
  429. inline bool IsSingleton() const
  430. {
  431. return rep.compact.countPlusOne == 2;
  432. }
  433. // Helpers to clean up the code
  434. inline uint GetCompactLength() const
  435. {
  436. Assert(IsCompact());
  437. return (uint)(rep.compact.countPlusOne - 1u);
  438. }
  439. inline void SetCompactLength(size_t length)
  440. {
  441. rep.compact.countPlusOne = length + 1;
  442. }
  443. inline uint GetCompactCharU(uint index) const
  444. {
  445. Assert(index < this->GetCompactLength());
  446. Assert(IsCompact());
  447. Assert(rep.compact.cs[index] <= MaxUChar);
  448. return rep.compact.cs[index];
  449. }
  450. inline Char GetCompactChar(uint index) const
  451. {
  452. return (Char)(GetCompactCharU(index));
  453. }
  454. //Replaces an existing character with a new value
  455. inline void ReplaceCompactCharU(uint index, uint value)
  456. {
  457. Assert(index < this->GetCompactLength());
  458. Assert(IsCompact());
  459. Assert(value <= MaxUChar);
  460. rep.compact.cs[index] = value;
  461. }
  462. inline void ClearCompactChar(uint index)
  463. {
  464. Assert(index < this->GetCompactLength());
  465. Assert(IsCompact());
  466. rep.compact.cs[index] = emptySlot;
  467. }
  468. // Adds the character to the end, assuming there is enough space. (Assert in place)
  469. // Increments count.
  470. inline void AddCompactCharU(uint value)
  471. {
  472. Assert(this->GetCompactLength() < MaxCompact);
  473. Assert(IsCompact());
  474. rep.compact.cs[this->GetCompactLength()] = value;
  475. rep.compact.countPlusOne += 1;
  476. }
  477. // Adds the character to the end, assuming there is enough space. (Assert in place)
  478. // Increments count.
  479. inline void AddCompactChar(Char value)
  480. {
  481. AddCompactCharU((Char)(value));
  482. }
  483. // This performs a check to see if the index is the last char, if so sets it to emptySlot
  484. // If not, replaces it with last index.
  485. inline void RemoveCompactChar(uint index)
  486. {
  487. Assert(index < this->GetCompactLength());
  488. Assert(IsCompact());
  489. if (index == this->GetCompactLength() - 1)
  490. {
  491. this->ClearCompactChar(index);
  492. }
  493. else
  494. {
  495. this->ReplaceCompactCharU(index, this->GetCompactCharU((uint)this->GetCompactLength() - 1));
  496. }
  497. rep.compact.countPlusOne -= 1;
  498. }
  499. inline char16 Singleton() const
  500. {
  501. Assert(IsSingleton());
  502. Assert(rep.compact.cs[0] <= MaxUChar);
  503. return UTC(rep.compact.cs[0]);
  504. }
  505. int GetCompactEntries(uint max, __out_ecount(max) Char* entries) const;
  506. bool IsSubsetOf(const CharSet<Char>& other) const;
  507. bool IsEqualTo(const CharSet<Char>& other) const;
  508. inline uint Count() const
  509. {
  510. if (IsCompact())
  511. {
  512. return (uint)rep.compact.countPlusOne - 1;
  513. }
  514. else if (rep.full.root == 0)
  515. {
  516. return rep.full.direct.Count();
  517. }
  518. else
  519. {
  520. //The bit vector
  521. Assert(rep.full.root == CharSetFull::TheFullNode || rep.full.root->Count(CharSetNode::levels - 1) <= 0xFF00);
  522. return rep.full.direct.Count() + (rep.full.root == CharSetFull::TheFullNode ? 0xFF00 : rep.full.root->Count(CharSetNode::levels - 1));
  523. }
  524. }
  525. // NOTE: These are not 'const' methods since they may sort the compact representation internally
  526. void ToComplement(ArenaAllocator* allocator, CharSet<Char>& result);
  527. void ToEquivClass(ArenaAllocator* allocator, CharSet<Char>& result);
  528. void ToEquivClassCP(ArenaAllocator* allocator, CharSet<codepoint_t>& result, codepoint_t baseOffset);
  529. #if ENABLE_REGEX_CONFIG_OPTIONS
  530. void Print(DebugWriter* w) const;
  531. #endif
  532. };
  533. template <>
  534. class CharSet<codepoint_t> : private Chars<codepoint_t>
  535. {
  536. static const int NumberOfPlanes = 17;
  537. private:
  538. // Character planes are composed of 65536 characters each.
  539. // First plane is the Basic Multilingual Plane (characters 0 - 65535)
  540. // Every subsequent plane also stores characters in the form [0 - 65535]; to get the actual value, add 'index * 0x10000' to it
  541. CharSet<char16> characterPlanes[NumberOfPlanes];
  542. // Takes a character, and returns the index of the CharSet<char16> that holds it.
  543. inline int CharToIndex(Char c) const
  544. {
  545. Assert(c <= Chars<codepoint_t>::MaxUChar);
  546. return (int)(CTU(c) / (Chars<char16>::MaxUChar + 1));
  547. }
  548. // Takes a character, and removes the offset to make it < 0x10000
  549. inline char16 RemoveOffset(Char c) const
  550. {
  551. Assert(c <= Chars<codepoint_t>::MaxUChar);
  552. return (char16)(CTU(c) % 0x10000);
  553. }
  554. // Takes a character, and removes the offset to make it < 0x10000
  555. inline Char AddOffset(char16 c, int index) const
  556. {
  557. Assert(c <= Chars<char16>::MaxUChar);
  558. Assert(index >= 0);
  559. Assert(index < NumberOfPlanes);
  560. return ((Char)c) + 0x10000 * index;
  561. }
  562. public:
  563. CharSet();
  564. void FreeBody(ArenaAllocator* allocator);
  565. void Clear(ArenaAllocator* allocator);
  566. void CloneFrom(ArenaAllocator* allocator, const CharSet<Char>& other);
  567. void CloneSimpleCharsTo(ArenaAllocator* allocator, CharSet<char16>& other) const;
  568. inline void CloneNonSurrogateCodeUnitsTo(ArenaAllocator* allocator, CharSet<char16>& other)
  569. {
  570. Assert(this->SimpleCharCount() > 0);
  571. AssertMsg(this->ContainSurrogateCodeUnits(), "This doesn't contain surrogate code units, a simple clone is faster.");
  572. this->characterPlanes[0].CloneNonSurrogateCodeUnitsTo(allocator, other);
  573. }
  574. inline void CloneSurrogateCodeUnitsTo(ArenaAllocator* allocator, CharSet<char16>& other)
  575. {
  576. Assert(this->SimpleCharCount() > 0);
  577. AssertMsg(this->ContainSurrogateCodeUnits(), "This doesn't contain surrogate code units, will not produce any result.");
  578. this->characterPlanes[0].CloneSurrogateCodeUnitsTo(allocator, other);
  579. }
  580. inline void Set(ArenaAllocator* allocator, Char kc) { SetRange(allocator, kc, kc); }
  581. inline bool ContainSurrogateCodeUnits()
  582. {
  583. char16 outLower = 0xFFFF, ignore = 0x0;
  584. return this->characterPlanes[0].GetNextRange(0xD800, &outLower, &ignore) ? outLower <= 0xDFFF : false;
  585. }
  586. void SetRange(ArenaAllocator* allocator, Char lc, Char hc);
  587. void SetRanges(ArenaAllocator* allocator, int numSortedPairs, const Char* sortedPairs);
  588. void SetNotRanges(ArenaAllocator* allocator, int numSortedPairs, const Char* sortedPairs);
  589. void UnionInPlace(ArenaAllocator* allocator, const CharSet<Char>& other);
  590. void UnionInPlace(ArenaAllocator* allocator, const CharSet<char16>& other);
  591. _Success_(return) bool GetNextRange(Char searchCharStart, _Out_ Char *outLowerChar, _Out_ Char *outHigherChar);
  592. inline bool Get(Char kc) const
  593. {
  594. return this->characterPlanes[CharToIndex(kc)].Get(RemoveOffset(kc));
  595. }
  596. inline bool IsEmpty() const
  597. {
  598. for (int i = 0; i < NumberOfPlanes; i++)
  599. {
  600. if (!this->characterPlanes[i].IsEmpty())
  601. {
  602. return false;
  603. }
  604. }
  605. return true;
  606. }
  607. inline bool IsSimpleCharASingleton() const
  608. {
  609. return this->characterPlanes[0].IsSingleton();
  610. }
  611. inline char16 SimpleCharSingleton() const
  612. {
  613. return this->characterPlanes[0].Singleton();
  614. }
  615. inline bool IsSingleton() const
  616. {
  617. return this->Count() == 1;
  618. }
  619. inline codepoint_t Singleton() const
  620. {
  621. Assert(IsSingleton());
  622. for (int i = 0; i < NumberOfPlanes; i++)
  623. {
  624. if (this->characterPlanes[i].IsSingleton())
  625. {
  626. return AddOffset(this->characterPlanes[i].Singleton(), i);
  627. }
  628. }
  629. AssertMsg(false, "Should not reach here, first Assert verifies we are a singleton.");
  630. return INVALID_CODEPOINT;
  631. }
  632. bool IsSubsetOf(const CharSet<Char>& other) const;
  633. bool IsEqualTo(const CharSet<Char>& other) const;
  634. inline uint Count() const
  635. {
  636. uint totalCount = 0;
  637. for (int i = 0; i < NumberOfPlanes; i++)
  638. {
  639. totalCount += this->characterPlanes[i].Count();
  640. }
  641. return totalCount;
  642. }
  643. inline uint SimpleCharCount() const
  644. {
  645. return this->characterPlanes[0].Count();
  646. }
  647. // NOTE: These are not 'const' methods since they may sort the compact representation internally
  648. void ToComplement(ArenaAllocator* allocator, CharSet<Char>& result);
  649. void ToSimpleComplement(ArenaAllocator* allocator, CharSet<codepoint_t>& result);
  650. void ToSimpleComplement(ArenaAllocator* allocator, CharSet<char16>& result);
  651. void ToEquivClass(ArenaAllocator* allocator, CharSet<Char>& result);
  652. void ToSurrogateEquivClass(ArenaAllocator* allocator, CharSet<Char>& result);
  653. #if ENABLE_REGEX_CONFIG_OPTIONS
  654. void Print(DebugWriter* w) const;
  655. #endif
  656. };
  657. template <>
  658. class RuntimeCharSet<char16> : private Chars<char16>
  659. {
  660. private:
  661. // Trie for remaining characters. Pointer value will be 0 or >> MaxCompact.
  662. CharSetNode * root;
  663. // Entries for first 256 characters
  664. CharBitvec direct;
  665. public:
  666. RuntimeCharSet();
  667. void FreeBody(ArenaAllocator* allocator);
  668. void CloneFrom(ArenaAllocator* allocator, const CharSet<Char>& other);
  669. bool Get_helper(uint k) const;
  670. inline bool Get(Char kc) const
  671. {
  672. if (CTU(kc) < CharSetNode::directSize)
  673. {
  674. return direct.Get(CTU(kc));
  675. }
  676. else if (root == 0)
  677. {
  678. return false;
  679. }
  680. else
  681. {
  682. return Get_helper(CTU(kc));
  683. }
  684. }
  685. #if ENABLE_REGEX_CONFIG_OPTIONS
  686. void Print(DebugWriter* w) const;
  687. #endif
  688. };
  689. typedef CharSet<char16> UnicodeCharSet;
  690. typedef RuntimeCharSet<char16> UnicodeRuntimeCharSet;
  691. }