2
0

CharSet.h 27 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754
  1. //-------------------------------------------------------------------------------------------------------
  2. // Copyright (C) Microsoft. All rights reserved.
  3. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
  4. //-------------------------------------------------------------------------------------------------------
  5. namespace UnifiedRegex
  6. {
  7. template <typename C>
  8. class CharSet;
  9. template <typename C>
  10. class RuntimeCharSet;
  11. class CharBitvec : private Chars<char>
  12. {
  13. public:
  14. static const int Width = Chars<char>::CharWidth;
  15. static const int Size = NumChars;
  16. private:
  17. static const int wordSize = sizeof(uint32) * 8;
  18. static const int vecSize = Size / wordSize;
  19. static const uint32 ones = (uint32)-1;
  20. static const uint8 oneBits[Size];
  21. uint32 vec[vecSize];
  22. inline static void setrng(uint32 &v, uint l, uint h)
  23. {
  24. uint w = h - l + 1;
  25. if (w == wordSize)
  26. v = ones;
  27. else
  28. v |= ((1U << w) - 1) << l;
  29. }
  30. inline static void clearrng(uint32 &v, uint l, uint h)
  31. {
  32. uint w = h - l + 1;
  33. if (w == wordSize)
  34. v = 0;
  35. else
  36. v &= ~(((1U << w) - 1) << l);
  37. }
  38. public:
  39. inline void CloneFrom(const CharBitvec& other)
  40. {
  41. for (int w = 0; w < vecSize; w++)
  42. vec[w] = other.vec[w];
  43. }
  44. inline void Clear()
  45. {
  46. for (int w = 0; w < vecSize; w++)
  47. vec[w] = 0;
  48. }
  49. inline void SetAll()
  50. {
  51. for (int w = 0; w < vecSize; w++)
  52. vec[w] = ones;
  53. }
  54. inline void Set(uint k)
  55. {
  56. Assert(k < Size);
  57. __assume(k < Size);
  58. if (k < Size)
  59. vec[k / wordSize] |= 1U << (k % wordSize);
  60. }
  61. inline void SetRange(uint l, uint h)
  62. {
  63. Assert(l < Size);
  64. Assert(h < Size);
  65. __assume(l < Size);
  66. __assume(h < Size);
  67. if (l < Size && h < Size)
  68. {
  69. if (l == h)
  70. vec[l / wordSize] |= 1U << (l % wordSize);
  71. else if (l < h)
  72. {
  73. int lw = l / wordSize;
  74. int hw = h / wordSize;
  75. int lo = l % wordSize;
  76. int hio = h % wordSize;
  77. if (lw == hw)
  78. setrng(vec[lw], lo, hio);
  79. else
  80. {
  81. setrng(vec[lw], lo, wordSize-1);
  82. for (int w = lw + 1; w < hw; w++)
  83. vec[w] = ones;
  84. setrng(vec[hw], 0, hio);
  85. }
  86. }
  87. }
  88. }
  89. inline void ClearRange(uint l, uint h)
  90. {
  91. Assert(l < Size);
  92. Assert(h < Size);
  93. __assume(l < Size);
  94. __assume(h < Size);
  95. if (l < Size && h < Size)
  96. {
  97. if (l == h)
  98. {
  99. vec[l / wordSize] &= ~(1U << (l % wordSize));
  100. }
  101. else if (l < h)
  102. {
  103. int lw = l / wordSize;
  104. int hw = h / wordSize;
  105. int lo = l % wordSize;
  106. int hio = h % wordSize;
  107. if (lw == hw)
  108. {
  109. clearrng(vec[lw], lo, hio);
  110. }
  111. else
  112. {
  113. clearrng(vec[lw], lo, wordSize-1);
  114. for (int w = lw + 1; w < hw; w++)
  115. vec[w] = 0;
  116. clearrng(vec[hw], 0, hio);
  117. }
  118. }
  119. }
  120. }
  121. inline bool IsEmpty()
  122. {
  123. for (int i = 0; i < vecSize; i++)
  124. {
  125. if(vec[i] != 0)
  126. {
  127. return false;
  128. }
  129. }
  130. return true;
  131. }
  132. inline void UnionInPlace(const CharBitvec& other)
  133. {
  134. for (int w = 0; w < vecSize; w++)
  135. vec[w] |= other.vec[w];
  136. }
  137. inline bool UnionInPlaceFullCheck(const CharBitvec& other)
  138. {
  139. bool isFull = true;
  140. for (int w = 0; w < vecSize; w++)
  141. {
  142. vec[w] |= other.vec[w];
  143. if (vec[w] != ones)
  144. isFull = false;
  145. }
  146. return isFull;
  147. }
  148. inline bool Get(uint k) const
  149. {
  150. Assert(k < Size);
  151. __assume(k < Size);
  152. return ((vec[k / wordSize] >> (k % wordSize)) & 1) != 0;
  153. }
  154. inline bool IsFull() const
  155. {
  156. for (int w = 0; w < vecSize; w++)
  157. {
  158. if (vec[w] != ones)
  159. return false;
  160. }
  161. return true;
  162. }
  163. inline bool IsSubsetOf(const CharBitvec& other) const
  164. {
  165. for (int w = 0; w < vecSize; w++)
  166. {
  167. uint32 v = other.vec[w];
  168. if (v != (vec[w] | v))
  169. return false;
  170. }
  171. return true;
  172. }
  173. inline bool IsEqualTo(const CharBitvec& other) const
  174. {
  175. for (int w = 0; w < vecSize; w++)
  176. {
  177. if (vec[w] != other.vec[w])
  178. return false;
  179. }
  180. return true;
  181. }
  182. uint Count() const;
  183. int NextSet(int k) const;
  184. int NextClear(int k) const;
  185. template <typename C>
  186. void ToComplement(ArenaAllocator* allocator, uint base, CharSet<C>& result) const;
  187. template <typename C>
  188. void ToEquivClass(ArenaAllocator* allocator, uint base, uint& tblidx, CharSet<C>& result, codepoint_t baseOffset = 0x0) const;
  189. };
  190. template <typename C>
  191. class CharSet {};
  192. struct CharSetNode : protected Chars<char16>
  193. {
  194. static const int directBits = Chars<char>::CharWidth;
  195. static const uint directSize = Chars<char>::NumChars;
  196. static const uint bitsPerInnerLevel = 4;
  197. static const uint branchingPerInnerLevel = 1 << bitsPerInnerLevel;
  198. static const uint innerMask = branchingPerInnerLevel - 1;
  199. static const int bitsPerLeafLevel = CharBitvec::Width;
  200. static const int branchingPerLeafLevel = CharBitvec::Size;
  201. static const uint leafMask = branchingPerLeafLevel - 1;
  202. static const uint levels = 1 + (CharWidth - bitsPerLeafLevel) / bitsPerInnerLevel;
  203. inline static uint innerIdx(uint level, uint v)
  204. {
  205. return (v >> ((level + 1) * bitsPerInnerLevel)) & innerMask;
  206. }
  207. inline static uint indexToValue(uint level, uint index, uint offset)
  208. {
  209. Assert((index & innerMask) == index);
  210. Assert((uint)(1 << ((level + 1) * bitsPerInnerLevel)) > offset);
  211. return (index << ((level + 1) * bitsPerInnerLevel)) + offset;
  212. }
  213. inline static uint leafIdx(uint v)
  214. {
  215. return v & leafMask;
  216. }
  217. inline static uint lim(uint level)
  218. {
  219. return (1U << (bitsPerLeafLevel + level * bitsPerInnerLevel)) - 1;
  220. }
  221. inline static uint remain(uint level, uint v)
  222. {
  223. return v & lim(level);
  224. }
  225. virtual void FreeSelf(ArenaAllocator* allocator) = 0;
  226. virtual CharSetNode* Clone(ArenaAllocator* allocator) const = 0;
  227. virtual CharSetNode* Set(ArenaAllocator* allocator, uint level, uint l, uint h) = 0;
  228. virtual CharSetNode* ClearRange(ArenaAllocator* allocator, uint level, uint l, uint h) = 0;
  229. virtual CharSetNode* UnionInPlace(ArenaAllocator* allocator, uint level, const CharSetNode* other) = 0;
  230. virtual bool Get(uint level, uint k) const = 0;
  231. virtual void ToComplement(ArenaAllocator* allocator, uint level, uint base, CharSet<Char>& result) const = 0;
  232. virtual void ToEquivClassW(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<char16>& result) const = 0;
  233. virtual void ToEquivClassCP(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<codepoint_t>& result, codepoint_t baseOffset) const = 0;
  234. virtual bool IsSubsetOf(uint level, const CharSetNode* other) const = 0;
  235. virtual bool IsEqualTo(uint level, const CharSetNode* other) const = 0;
  236. virtual uint Count(uint level) const = 0;
  237. _Success_(return) virtual bool GetNextRange(uint level, Char searchCharStart, _Out_ Char *outLowerChar, _Out_ Char *outHigherChar) const = 0;
  238. #if DBG
  239. virtual bool IsLeaf() const = 0;
  240. #endif
  241. static CharSetNode* For(ArenaAllocator* allocator, int level);
  242. };
  243. struct CharSetFull : CharSetNode
  244. {
  245. private:
  246. template <typename C>
  247. void ToEquivClass(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<C>& result, codepoint_t baseOffset = 0x0) const;
  248. public:
  249. static CharSetFull Instance;
  250. static CharSetFull* const TheFullNode;
  251. CharSetFull();
  252. void FreeSelf(ArenaAllocator* allocator) override;
  253. CharSetNode* Clone(ArenaAllocator* allocator) const override;
  254. CharSetNode* Set(ArenaAllocator* allocator, uint level, uint l, uint h) override;
  255. CharSetNode* ClearRange(ArenaAllocator* allocator, uint level, uint l, uint h) override;
  256. CharSetNode* UnionInPlace(ArenaAllocator* allocator, uint level, const CharSetNode* other) override;
  257. bool Get(uint level, uint k) const override;
  258. void ToComplement(ArenaAllocator* allocator, uint level, uint base, CharSet<Char>& result) const override;
  259. void ToEquivClassW(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<char16>& result) const override;
  260. void ToEquivClassCP(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<codepoint_t>& result, codepoint_t baseOffset) const override;
  261. bool IsSubsetOf(uint level, const CharSetNode* other) const override;
  262. bool IsEqualTo(uint level, const CharSetNode* other) const override;
  263. uint Count(uint level) const override;
  264. _Success_(return) bool GetNextRange(uint level, Char searchCharStart, _Out_ Char *outLowerChar, _Out_ Char *outHigherChar) const override;
  265. #if DBG
  266. bool IsLeaf() const override;
  267. #endif
  268. };
  269. struct CharSetInner sealed : CharSetNode
  270. {
  271. private:
  272. template <typename C>
  273. void ToEquivClass(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<C>& result) const;
  274. public:
  275. CharSetNode* children[branchingPerInnerLevel];
  276. CharSetInner();
  277. void FreeSelf(ArenaAllocator* allocator) override;
  278. CharSetNode* Clone(ArenaAllocator* allocator) const override;
  279. CharSetNode* Set(ArenaAllocator* allocator, uint level, uint l, uint h) override;
  280. CharSetNode* ClearRange(ArenaAllocator* allocator, uint level, uint l, uint h) override;
  281. CharSetNode* UnionInPlace(ArenaAllocator* allocator, uint level, const CharSetNode* other) override;
  282. bool Get(uint level, uint k) const override;
  283. void ToComplement(ArenaAllocator* allocator, uint level, uint base, CharSet<Char>& result) const override;\
  284. void ToEquivClassW(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<char16>& result) const override;
  285. void ToEquivClassCP(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<codepoint_t>& result, codepoint_t baseOffset) const override;
  286. bool IsSubsetOf(uint level, const CharSetNode* other) const override;
  287. bool IsEqualTo(uint level, const CharSetNode* other) const override;
  288. uint Count(uint level) const override;
  289. _Success_(return) bool GetNextRange(uint level, Char searchCharStart, _Out_ Char *outLowerChar, _Out_ Char *outHigherChar) const override;
  290. #if DBG
  291. bool IsLeaf() const override;
  292. #endif
  293. };
  294. struct CharSetLeaf sealed: CharSetNode
  295. {
  296. private:
  297. template <typename C>
  298. void ToEquivClass(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<C>& result, codepoint_t baseOffset = 0x0) const;
  299. public:
  300. CharBitvec vec;
  301. CharSetLeaf();
  302. void FreeSelf(ArenaAllocator* allocator) override;
  303. CharSetNode* Clone(ArenaAllocator* allocator) const override;
  304. CharSetNode* Set(ArenaAllocator* allocator, uint level, uint l, uint h) override;
  305. CharSetNode* ClearRange(ArenaAllocator* allocator, uint level, uint l, uint h) override;
  306. CharSetNode* UnionInPlace(ArenaAllocator* allocator, uint level, const CharSetNode* other) override;
  307. bool Get(uint level, uint k) const override;
  308. void ToComplement(ArenaAllocator* allocator, uint level, uint base, CharSet<Char>& result) const override;
  309. void ToEquivClassW(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<char16>& result) const override;
  310. void ToEquivClassCP(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<codepoint_t>& result, codepoint_t baseOffset) const override;
  311. bool IsSubsetOf(uint level, const CharSetNode* other) const override;
  312. bool IsEqualTo(uint level, const CharSetNode* other) const override;
  313. uint Count(uint level) const override;
  314. _Success_(return) bool GetNextRange(uint level, Char searchCharStart, _Out_ Char *outLowerChar, _Out_ Char *outHigherChar) const override;
  315. #if DBG
  316. bool IsLeaf() const override;
  317. #endif
  318. };
  319. template <>
  320. class CharSet<char16> : private Chars<char16>
  321. {
  322. public:
  323. static const uint MaxCompact = 4;
  324. static const uint emptySlot = (uint)-1;
  325. struct CompactRep
  326. {
  327. // 1 + number of distinct characters, 1..MaxCompact+1
  328. size_t countPlusOne;
  329. // Characters, in no particular order, or (uint)-1 for tail empty slots
  330. uint cs[MaxCompact];
  331. uint8 padding[sizeof(CharBitvec) - sizeof(uint) * MaxCompact];
  332. };
  333. struct FullRep
  334. {
  335. // Trie for remaining characters. Pointer value will be 0 or >> MaxCompact.
  336. CharSetNode* root;
  337. // Entries for first 256 characters
  338. CharBitvec direct;
  339. };
  340. union Rep
  341. {
  342. struct CompactRep compact;
  343. struct FullRep full;
  344. } rep;
  345. static const int compactSize = sizeof(CompactRep);
  346. static const int fullSize = sizeof(FullRep);
  347. inline bool IsCompact() const { return rep.compact.countPlusOne - 1 <= MaxCompact; }
  348. void SwitchRepresentations(ArenaAllocator* allocator);
  349. void Sort();
  350. public:
  351. CharSet();
  352. void FreeBody(ArenaAllocator* allocator);
  353. void Clear(ArenaAllocator* allocator);
  354. void CloneFrom(ArenaAllocator* allocator, const CharSet<Char>& other);
  355. void CloneNonSurrogateCodeUnitsTo(ArenaAllocator* allocator, CharSet<Char>& other);
  356. void CloneSurrogateCodeUnitsTo(ArenaAllocator* allocator, CharSet<Char>& other);
  357. inline void Set(ArenaAllocator* allocator, Char kc) { SetRange(allocator, kc, kc); }
  358. void SetRange(ArenaAllocator* allocator, Char lc, Char hc);
  359. void SubtractRange(ArenaAllocator* allocator, Char lc, Char hc);
  360. void SetRanges(ArenaAllocator* allocator, int numSortedPairs, const Char* sortedPairs);
  361. void SetNotRanges(ArenaAllocator* allocator, int numSortedPairs, const Char* sortedPairs);
  362. void UnionInPlace(ArenaAllocator* allocator, const CharSet<Char>& other);
  363. _Success_(return) bool GetNextRange(Char searchCharStart, _Out_ Char *outLowerChar, _Out_ Char *outHigherChar);
  364. bool Get_helper(uint k) const;
  365. __inline bool Get(Char kc) const
  366. {
  367. if (IsCompact())
  368. {
  369. Assert(MaxCompact == 4);
  370. return rep.compact.cs[0] == CTU(kc) ||
  371. rep.compact.cs[1] == CTU(kc) ||
  372. rep.compact.cs[2] == CTU(kc) ||
  373. rep.compact.cs[3] == CTU(kc);
  374. }
  375. else
  376. {
  377. if (CTU(kc) < CharSetNode::directSize)
  378. return rep.full.direct.Get(CTU(kc));
  379. else if (rep.full.root == 0)
  380. return false;
  381. else
  382. return Get_helper(CTU(kc));
  383. }
  384. }
  385. inline bool IsEmpty() const
  386. {
  387. return rep.compact.countPlusOne == 1;
  388. }
  389. inline bool IsSingleton() const
  390. {
  391. return rep.compact.countPlusOne == 2;
  392. }
  393. // Helpers to clean up the code
  394. inline uint GetCompactLength() const
  395. {
  396. Assert(IsCompact());
  397. return (uint)(rep.compact.countPlusOne - 1u);
  398. }
  399. inline void SetCompactLength(size_t length)
  400. {
  401. rep.compact.countPlusOne = length + 1;
  402. }
  403. inline uint GetCompactCharU(uint index) const
  404. {
  405. Assert(index < this->GetCompactLength());
  406. Assert(IsCompact());
  407. Assert(rep.compact.cs[index] <= MaxUChar);
  408. return rep.compact.cs[index];
  409. }
  410. inline Char GetCompactChar(uint index) const
  411. {
  412. return (Char)(GetCompactCharU(index));
  413. }
  414. //Replaces an existing character with a new value
  415. inline void ReplaceCompactChar(uint index, Char value)
  416. {
  417. ReplaceCompactChar(index, (Char)(value));
  418. }
  419. //Replaces an existing character with a new value
  420. inline void ReplaceCompactCharU(uint index, uint value)
  421. {
  422. Assert(index < this->GetCompactLength());
  423. Assert(IsCompact());
  424. Assert(value <= MaxUChar);
  425. rep.compact.cs[index] = value;
  426. }
  427. inline void ClearCompactChar(uint index)
  428. {
  429. Assert(index < this->GetCompactLength());
  430. Assert(IsCompact());
  431. rep.compact.cs[index] = emptySlot;
  432. }
  433. // Adds the character to the end, assuming there is enough space. (Assert in place)
  434. // Increments count.
  435. inline void AddCompactCharU(uint value)
  436. {
  437. Assert(this->GetCompactLength() < MaxCompact);
  438. Assert(IsCompact());
  439. rep.compact.cs[this->GetCompactLength()] = value;
  440. rep.compact.countPlusOne += 1;
  441. }
  442. // Adds the character to the end, assuming there is enough space. (Assert in place)
  443. // Increments count.
  444. inline void AddCompactChar(Char value)
  445. {
  446. AddCompactCharU((Char)(value));
  447. }
  448. // This performs a check to see if the index is the last char, if so sets it to emptySlot
  449. // If not, replaces it with last index.
  450. inline void RemoveCompactChar(uint index)
  451. {
  452. Assert(index < this->GetCompactLength());
  453. Assert(IsCompact());
  454. if (index == this->GetCompactLength() - 1)
  455. {
  456. this->ClearCompactChar(index);
  457. }
  458. else
  459. {
  460. this->ReplaceCompactCharU(index, this->GetCompactCharU((uint)this->GetCompactLength() - 1));
  461. }
  462. rep.compact.countPlusOne -= 1;
  463. }
  464. inline char16 Singleton() const
  465. {
  466. Assert(IsSingleton());
  467. Assert(rep.compact.cs[0] <= MaxUChar);
  468. return UTC(rep.compact.cs[0]);
  469. }
  470. int GetCompactEntries(uint max, __out_ecount(max) Char* entries) const;
  471. bool IsSubsetOf(const CharSet<Char>& other) const;
  472. bool IsEqualTo(const CharSet<Char>& other) const;
  473. inline uint Count() const
  474. {
  475. if (IsCompact())
  476. return (uint)rep.compact.countPlusOne - 1;
  477. else if (rep.full.root == 0)
  478. return rep.full.direct.Count();
  479. else
  480. {
  481. //The bit vector
  482. Assert(rep.full.root == CharSetFull::TheFullNode || rep.full.root->Count(CharSetNode::levels - 1) <= 0xFF00);
  483. return rep.full.direct.Count() + (rep.full.root == CharSetFull::TheFullNode ? 0xFF00 : rep.full.root->Count(CharSetNode::levels - 1));
  484. }
  485. }
  486. // NOTE: These are not 'const' methods since they may sort the compact representation internally
  487. void ToComplement(ArenaAllocator* allocator, CharSet<Char>& result);
  488. void ToEquivClass(ArenaAllocator* allocator, CharSet<Char>& result);
  489. void ToEquivClassCP(ArenaAllocator* allocator, CharSet<codepoint_t>& result, codepoint_t baseOffset);
  490. #if ENABLE_REGEX_CONFIG_OPTIONS
  491. void Print(DebugWriter* w) const;
  492. #endif
  493. };
  494. template <>
  495. class CharSet<codepoint_t> : private Chars<codepoint_t>
  496. {
  497. static const int NumberOfPlanes = 17;
  498. private:
  499. // Character planes are composed of 65536 characters each.
  500. // First plane is the Basic Multilingual Plane (characters 0 - 65535)
  501. // Every subsequent plane also stores characters in the form [0 - 65535]; to get the actual value, add 'index * 0x10000' to it
  502. CharSet<char16> characterPlanes [NumberOfPlanes];
  503. // Takes a character, and returns the index of the CharSet<char16> that holds it.
  504. inline int CharToIndex(Char c) const
  505. {
  506. Assert(c <= Chars<codepoint_t>::MaxUChar);
  507. return (int)(CTU(c) / (Chars<char16>::MaxUChar + 1));
  508. }
  509. // Takes a character, and removes the offset to make it < 0x10000
  510. inline char16 RemoveOffset(Char c) const
  511. {
  512. Assert(c <= Chars<codepoint_t>::MaxUChar);
  513. return (char16)(CTU(c) % 0x10000);
  514. }
  515. // Takes a character, and removes the offset to make it < 0x10000
  516. inline Char AddOffset(char16 c, int index) const
  517. {
  518. Assert(c <= Chars<char16>::MaxUChar);
  519. Assert(index >= 0);
  520. Assert(index < NumberOfPlanes);
  521. return (Char)(c) + 0x10000 * index;
  522. }
  523. public:
  524. CharSet();
  525. void FreeBody(ArenaAllocator* allocator);
  526. void Clear(ArenaAllocator* allocator);
  527. void CloneFrom(ArenaAllocator* allocator, const CharSet<Char>& other);
  528. void CloneSimpleCharsTo(ArenaAllocator* allocator, CharSet<char16>& other) const;
  529. inline void CloneNonSurrogateCodeUnitsTo(ArenaAllocator* allocator, CharSet<char16>& other)
  530. {
  531. Assert(this->SimpleCharCount() > 0);
  532. AssertMsg(this->ContainSurrogateCodeUnits(), "This doesn't contain surrogate code units, a simple clone is faster.");
  533. this->characterPlanes[0].CloneNonSurrogateCodeUnitsTo(allocator, other);
  534. }
  535. inline void CloneSurrogateCodeUnitsTo(ArenaAllocator* allocator, CharSet<char16>& other)
  536. {
  537. Assert(this->SimpleCharCount() > 0);
  538. AssertMsg(this->ContainSurrogateCodeUnits(), "This doesn't contain surrogate code units, will not produce any result.");
  539. this->characterPlanes[0].CloneSurrogateCodeUnitsTo(allocator, other);
  540. }
  541. inline void Set(ArenaAllocator* allocator, Char kc) { SetRange(allocator, kc, kc); }
  542. inline bool ContainSurrogateCodeUnits()
  543. {
  544. char16 outLower = 0xFFFF, ignore = 0x0;
  545. return this->characterPlanes[0].GetNextRange(0xD800, &outLower, &ignore) ? outLower <= 0xDFFF : false;
  546. }
  547. void SetRange(ArenaAllocator* allocator, Char lc, Char hc);
  548. void SetRanges(ArenaAllocator* allocator, int numSortedPairs, const Char* sortedPairs);
  549. void SetNotRanges(ArenaAllocator* allocator, int numSortedPairs, const Char* sortedPairs);
  550. void UnionInPlace(ArenaAllocator* allocator, const CharSet<Char>& other);
  551. void UnionInPlace(ArenaAllocator* allocator, const CharSet<char16>& other);
  552. _Success_(return) bool GetNextRange(Char searchCharStart, _Out_ Char *outLowerChar, _Out_ Char *outHigherChar);
  553. inline bool Get(Char kc) const
  554. {
  555. return this->characterPlanes[CharToIndex(kc)].Get(RemoveOffset(kc));
  556. }
  557. inline bool IsEmpty() const
  558. {
  559. for (int i = 0; i < NumberOfPlanes; i++)
  560. {
  561. if (!this->characterPlanes[i].IsEmpty())
  562. {
  563. return false;
  564. }
  565. }
  566. return true;
  567. }
  568. inline bool IsSimpleCharASingleton() const
  569. {
  570. return this->characterPlanes[0].IsSingleton();
  571. }
  572. inline char16 SimpleCharSingleton() const
  573. {
  574. return this->characterPlanes[0].Singleton();
  575. }
  576. inline bool IsSingleton() const
  577. {
  578. return this->Count() == 1;
  579. }
  580. inline codepoint_t Singleton() const
  581. {
  582. Assert(IsSingleton());
  583. for (int i = 0; i < NumberOfPlanes; i++)
  584. {
  585. if (this->characterPlanes[i].IsSingleton())
  586. {
  587. return AddOffset(this->characterPlanes[i].Singleton(), i);
  588. }
  589. }
  590. AssertMsg(false, "Should not reach here, first Assert verifies we are a singleton.");
  591. return INVALID_CODEPOINT;
  592. }
  593. bool IsSubsetOf(const CharSet<Char>& other) const;
  594. bool IsEqualTo(const CharSet<Char>& other) const;
  595. inline uint Count() const
  596. {
  597. uint totalCount = 0;
  598. for (int i = 0; i < NumberOfPlanes; i++)
  599. {
  600. totalCount += this->characterPlanes[i].Count();
  601. }
  602. return totalCount;
  603. }
  604. inline uint SimpleCharCount() const
  605. {
  606. return this->characterPlanes[0].Count();
  607. }
  608. // NOTE: These are not 'const' methods since they may sort the compact representation internally
  609. void ToComplement(ArenaAllocator* allocator, CharSet<Char>& result);
  610. void ToSimpleComplement(ArenaAllocator* allocator, CharSet<codepoint_t>& result);
  611. void ToSimpleComplement(ArenaAllocator* allocator, CharSet<char16>& result);
  612. void ToEquivClass(ArenaAllocator* allocator, CharSet<Char>& result);
  613. void ToSurrogateEquivClass(ArenaAllocator* allocator, CharSet<Char>& result);
  614. #if ENABLE_REGEX_CONFIG_OPTIONS
  615. void Print(DebugWriter* w) const;
  616. #endif
  617. };
  618. template <>
  619. class RuntimeCharSet<char16> : private Chars<char16>
  620. {
  621. private:
  622. // Trie for remaining characters. Pointer value will be 0 or >> MaxCompact.
  623. CharSetNode* root;
  624. // Entries for first 256 characters
  625. CharBitvec direct;
  626. public:
  627. RuntimeCharSet();
  628. void FreeBody(ArenaAllocator* allocator);
  629. void CloneFrom(ArenaAllocator* allocator, const CharSet<Char>& other);
  630. bool Get_helper(uint k) const;
  631. __inline bool Get(Char kc) const
  632. {
  633. if (CTU(kc) < CharSetNode::directSize)
  634. return direct.Get(CTU(kc));
  635. else if (root == 0)
  636. return false;
  637. else
  638. return Get_helper(CTU(kc));
  639. }
  640. #if ENABLE_REGEX_CONFIG_OPTIONS
  641. void Print(DebugWriter* w) const;
  642. #endif
  643. };
  644. typedef CharSet<char16> UnicodeCharSet;
  645. typedef RuntimeCharSet<char16> UnicodeRuntimeCharSet;
  646. }