Utf8Codex.cpp 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554
  1. //-------------------------------------------------------------------------------------------------------
  2. // Copyright (C) Microsoft. All rights reserved.
  3. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
  4. //-------------------------------------------------------------------------------------------------------
  5. #include "Utf8Codex.h"
  6. #ifndef _WIN32
  7. #undef _Analysis_assume_
  8. #define _Analysis_assume_(expr)
  9. #endif
  10. extern void CodexAssert(bool condition);
  11. namespace utf8
  12. {
  13. const unsigned int mAlignmentMask = 0x3;
  14. inline bool IsAligned(LPCUTF8 pch)
  15. {
  16. return (reinterpret_cast<size_t>(pch) & mAlignmentMask) == 0;
  17. }
  18. inline bool IsAligned(LPCOLESTR pch)
  19. {
  20. return (reinterpret_cast<size_t>(pch) & mAlignmentMask) == 0;
  21. }
  22. inline bool ShouldFastPath(LPCUTF8 pb, LPCOLESTR pch)
  23. {
  24. return (reinterpret_cast<size_t>(pb) & mAlignmentMask) == 0 || (reinterpret_cast<size_t>(pch) & mAlignmentMask) == 0;
  25. }
  26. inline size_t EncodedBytes(wchar16 prefix)
  27. {
  28. CodexAssert(0 == (prefix & 0xFF00)); // prefix must really be a byte. We use wchar16 for as a convenience for the API.
  29. // The number of bytes in an UTF8 encoding is determined by the 4 high-order bits of the first byte.
  30. // 0xxx -> 1
  31. // 10xx -> 1 (invalid)
  32. // 110x -> 2
  33. // 1110 -> 3
  34. // 1111 -> 4
  35. // If this value is XOR with 0xF0 and shift 3 bits to the right it can be used as an
  36. // index into a 16 element 2 bit array encoded as a uint32 of n - 1 where n is the number
  37. // of bits in the encoding.
  38. // The XOR prefix bits mapped to n - 1.
  39. // 1xxx -> 00 (8 - 15)
  40. // 01xx -> 00 (4 - 7)
  41. // 001x -> 01 (2 - 3)
  42. // 0001 -> 10 (1)
  43. // 0000 -> 11 (0)
  44. // This produces the following bit sequence:
  45. // 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00
  46. // 00 00 00 00 00 00 00 00 00 00 00 00 01 01 10 11
  47. // which is 0x5B
  48. return ((0x5B >> (((prefix ^ 0xF0) >> 3) & 0x1E)) & 0x03) + 1;
  49. }
  50. const wchar16 g_chUnknown = wchar16(UNICODE_UNKNOWN_CHAR_MARK);
  51. const wchar16 WCH_UTF16_HIGH_FIRST = wchar16(0xd800);
  52. const wchar16 WCH_UTF16_HIGH_LAST = wchar16(0xdbff);
  53. const wchar16 WCH_UTF16_LOW_FIRST = wchar16(0xdc00);
  54. const wchar16 WCH_UTF16_LOW_LAST = wchar16(0xdfff);
  55. inline BOOL InRange(const wchar16 ch, const wchar16 chMin, const wchar16 chMax)
  56. {
  57. return (unsigned)(ch - chMin) <= (unsigned)(chMax - chMin);
  58. }
  59. inline BOOL IsValidWideChar(const wchar16 ch)
  60. {
  61. return (ch < 0xfdd0) || ((ch > 0xfdef) && (ch <= 0xffef)) || ((ch >= 0xfff9) && (ch <= 0xfffd));
  62. }
  63. inline BOOL IsHighSurrogateChar(wchar16 ch)
  64. {
  65. return InRange( ch, WCH_UTF16_HIGH_FIRST, WCH_UTF16_HIGH_LAST );
  66. }
  67. inline BOOL IsLowSurrogateChar(wchar16 ch)
  68. {
  69. return InRange( ch, WCH_UTF16_LOW_FIRST, WCH_UTF16_LOW_LAST );
  70. }
  71. _At_(ptr, _In_reads_(end - ptr) _Post_satisfies_(ptr >= _Old_(ptr) - 1 && ptr <= end))
  72. inline wchar16 DecodeTail(wchar16 c1, LPCUTF8& ptr, LPCUTF8 end, DecodeOptions& options)
  73. {
  74. wchar16 ch = 0;
  75. BYTE c2, c3, c4;
  76. switch (EncodedBytes(c1))
  77. {
  78. case 1:
  79. if (c1 < 0x80) return c1;
  80. if ((options & doSecondSurrogatePair) != 0)
  81. {
  82. // We're in the middle of decoding a surrogate pair from a four-byte utf8 sequence.
  83. // The high word has already been returned, but without advancing ptr, which was on byte 1.
  84. // ptr was then advanced externally when reading c1, which is byte 1, so ptr is now on byte 2.
  85. // byte 1 must have been a continuation byte, hence will be in case 1.
  86. ptr--; // back to byte 1
  87. c1 = ptr[-1]; // the original first byte
  88. // ptr is now on c2. We must also have c3 and c4, otherwise doSecondSurrogatePair won't set.
  89. _Analysis_assume_(ptr + 2 < end);
  90. goto LFourByte;
  91. }
  92. // 10xxxxxx (trail byte appearing in a lead byte position
  93. return g_chUnknown;
  94. case 2:
  95. // Look for an overlong utf-8 sequence.
  96. if (ptr >= end)
  97. {
  98. if ((options & doChunkedEncoding) != 0)
  99. // The is a sequence that spans a chunk, push ptr back to the beginning of the sequence.
  100. ptr--;
  101. return g_chUnknown;
  102. }
  103. c2 = *ptr++;
  104. // 110XXXXx 10xxxxxx
  105. // UTF16 | UTF8 1st byte 2nd byte
  106. // U+0080..U+07FF | C2..DF 80..BF
  107. if (
  108. InRange(c1, 0xC2, 0xDF)
  109. && InRange(c2, 0x80, 0xBF)
  110. )
  111. {
  112. ch |= WCHAR(c1 & 0x1f) << 6; // 0x0080 - 0x07ff
  113. ch |= WCHAR(c2 & 0x3f);
  114. if (!IsValidWideChar(ch) && ((options & doAllowInvalidWCHARs) == 0))
  115. ch = g_chUnknown;
  116. }
  117. else
  118. {
  119. ptr--;
  120. ch = g_chUnknown;
  121. }
  122. break;
  123. case 3:
  124. // 1110XXXX 10Xxxxxx 10xxxxxx
  125. // Look for overlong utf-8 sequence.
  126. if (ptr + 1 >= end)
  127. {
  128. if ((options & doChunkedEncoding) != 0)
  129. // The is a sequence that spans a chunk, push ptr back to the beginning of the sequence.
  130. ptr--;
  131. return g_chUnknown;
  132. }
  133. // UTF16 | UTF8 1st byte 2nd byte 3rd byte
  134. // U+0800..U+0FFF | E0 A0..BF 80..BF
  135. // U+1000..U+CFFF | E1..EC 80..BF 80..BF
  136. // U+D000..U+D7FF | ED 80..9F 80..BF
  137. // U+E000..U+FFFF | EE..EF 80..BF 80..BF
  138. c2 = ptr[0];
  139. c3 = ptr[1];
  140. if (
  141. // any following be true
  142. (c1 == 0xE0
  143. && InRange(c2, 0xA0, 0xBF)
  144. && InRange(c3, 0x80, 0xBF))
  145. ||
  146. (InRange(c1, 0xE1, 0xEC)
  147. && InRange(c2, 0x80, 0xBF)
  148. && InRange(c3, 0x80, 0xBF))
  149. ||
  150. (c1 == 0xED
  151. && InRange(c2, 0x80, 0x9F)
  152. && InRange(c3, 0x80, 0xBF))
  153. ||
  154. (InRange(c1, 0xEE, 0xEF)
  155. && InRange(c2, 0x80, 0xBF)
  156. && InRange(c3, 0x80, 0xBF))
  157. ||
  158. (((options & doAllowThreeByteSurrogates) != 0)
  159. &&
  160. c1 == 0xED
  161. && InRange(c2, 0x80, 0xBF)
  162. && InRange(c3, 0x80, 0xBF)
  163. )
  164. )
  165. {
  166. ch = WCHAR(c1 & 0x0f) << 12; // 0x0800 - 0xffff
  167. ch |= WCHAR(c2 & 0x3f) << 6; // 0x0080 - 0x07ff
  168. ch |= WCHAR(c3 & 0x3f);
  169. if (!IsValidWideChar(ch) && ((options & (doAllowThreeByteSurrogates | doAllowInvalidWCHARs)) == 0))
  170. ch = g_chUnknown;
  171. ptr += 2;
  172. }
  173. else
  174. {
  175. ch = g_chUnknown;
  176. // Windows OS 1713952. Only drop the illegal leading byte
  177. // Retry next byte.
  178. // ptr is already advanced.
  179. }
  180. break;
  181. case 4:
  182. LFourByte:
  183. // 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx or 11111xxx ....
  184. // NOTE: 11111xxx is not supported
  185. if (ptr + 2 >= end)
  186. {
  187. if ((options & doChunkedEncoding) != 0)
  188. // The is a sequence that spans a chunk, push ptr back to the beginning of the sequence.
  189. ptr--;
  190. ch = g_chUnknown;
  191. break;
  192. }
  193. c2 = ptr[0];
  194. c3 = ptr[1];
  195. c4 = ptr[2];
  196. // UTF16 | UTF8 1st byte 2nd byte 3rd byte 4th byte
  197. // U+10000..U+3FFFF | F0 90..BF 80..BF 80..BF
  198. // U+40000..U+FFFFF | F1..F3 80..BF 80..BF 80..BF
  199. // U+100000..U+10FFFF | F4 80..8F 80..BF 80..BF
  200. if (! // NOT Unicode well-formed byte sequences
  201. (
  202. // any following be true
  203. (c1 == 0xF0
  204. && InRange(c2, 0x90,0xBF)
  205. && InRange(c3, 0x80,0xBF)
  206. && InRange(c4, 0x80,0xBF))
  207. ||
  208. (InRange(c1, 0xF1, 0xF3)
  209. && InRange(c2, 0x80,0xBF)
  210. && InRange(c3, 0x80,0xBF)
  211. && InRange(c4, 0x80,0xBF))
  212. ||
  213. (c1 == 0xF4
  214. && InRange(c2, 0x80,0x8F)
  215. && InRange(c3, 0x80,0xBF)
  216. && InRange(c4, 0x80,0xBF))
  217. )
  218. )
  219. {
  220. // Windows OS 1713952. Only drop the illegal leading byte.
  221. // Retry next byte.
  222. // ptr is already advanced 1.
  223. ch = g_chUnknown;
  224. break;
  225. }
  226. if ((options & doSecondSurrogatePair) == 0)
  227. {
  228. // Decode high 10 bits of utf-8 20 bit char
  229. ch = WCHAR(c1 & 0x07) << 2;
  230. ch |= WCHAR(c2 & 0x30) >> 4;
  231. ch = (ch - 1) << 6; // ch == 0000 00ww ww00 0000
  232. ch |= WCHAR(c2 & 0x0f) << 2; // ch == 0000 00ww wwzz zz00
  233. ch |= WCHAR(c3 & 0x30) >> 4; // ch == 0000 00ww wwzz zzyy
  234. // Encode first word of utf-16 surrogate pair
  235. ch += 0xD800;
  236. // Remember next call must return second word
  237. options = (DecodeOptions)(options | doSecondSurrogatePair);
  238. // Leave ptr on byte 1, this way:
  239. // - callers who test that ptr has been advanced by utf8::Decode will see progress for
  240. // both words of the surrogate pair.
  241. // - callers who calculate the number of multi-unit chars by subtracting after from before ptr
  242. // will accumulate 0 for first word and 2 for second, thus utf8 chars equals 2 utf16 chars + 2
  243. // multi-unit chars, as it should be.
  244. }
  245. else
  246. {
  247. // Decode low 10 bits of utf-8 20 bit char
  248. ch = WCHAR(c3 & 0x0f) << 6; // ch == 0000 00yy yy00 0000
  249. ch |= WCHAR(c4 & 0x3f); // ch == 0000 00yy yyxx xxxx
  250. // Encode second word of utf-16 surrogate pair
  251. ch += 0xDC00;
  252. // We're done with this char
  253. options = (DecodeOptions)(options & ~doSecondSurrogatePair);
  254. ptr += 3; // remember, got here by subtracting one from ptr in case 1, so effective increment is 2
  255. }
  256. break;
  257. }
  258. return ch;
  259. }
  260. LPUTF8 EncodeFull(wchar16 ch, __out_ecount(3) LPUTF8 ptr)
  261. {
  262. if( ch < 0x0080 )
  263. {
  264. // One byte
  265. *ptr++ = static_cast< utf8char_t >(ch);
  266. }
  267. else if( ch < 0x0800 )
  268. {
  269. // Two bytes : 110yyyxx 10xxxxxx
  270. *ptr++ = static_cast<utf8char_t>(ch >> 6) | 0xc0;
  271. *ptr++ = static_cast<utf8char_t>(ch & 0x3F) | 0x80;
  272. }
  273. else
  274. {
  275. // Three bytes : 1110yyyy 10yyyyxx 10xxxxxx
  276. *ptr++ = static_cast<utf8char_t>(ch >> 12) | 0xE0;
  277. *ptr++ = static_cast<utf8char_t>((ch >> 6) & 0x3F) | 0x80;
  278. *ptr++ = static_cast<utf8char_t>(ch & 0x3F) | 0x80;
  279. }
  280. return ptr;
  281. }
  282. LPCUTF8 NextCharFull(LPCUTF8 ptr)
  283. {
  284. return ptr + EncodedBytes(*ptr);
  285. }
  286. LPCUTF8 PrevCharFull(LPCUTF8 ptr, LPCUTF8 start)
  287. {
  288. if (ptr > start)
  289. {
  290. LPCUTF8 current = ptr - 1;
  291. while (current > start && (*current & 0xC0) == 0x80)
  292. current--;
  293. if (NextChar(current) == ptr)
  294. return current;
  295. // It is not a valid encoding, just go back one character.
  296. return ptr - 1;
  297. }
  298. else
  299. return ptr;
  300. }
  301. void DecodeInto(__out_ecount_full(cch) wchar16 *buffer, LPCUTF8 ptr, size_t cch, DecodeOptions options)
  302. {
  303. DecodeOptions localOptions = options;
  304. if (!ShouldFastPath(ptr, buffer)) goto LSlowPath;
  305. LFastPath:
  306. while (cch >= 4)
  307. {
  308. uint32 bytes = *(uint32 *)ptr;
  309. if ((bytes & 0x80808080) != 0) goto LSlowPath;
  310. ((uint32 *)buffer)[0] = (bytes & 0x7F) | ((bytes << 8) & 0x7F0000);
  311. ((uint32 *)buffer)[1] = ((bytes >> 16) & 0x7F) | ((bytes >> 8) & 0x7F0000);
  312. ptr += 4;
  313. buffer += 4;
  314. cch -= 4;
  315. }
  316. LSlowPath:
  317. while (cch-- > 0)
  318. {
  319. *buffer++ = Decode(ptr, ptr + 4, localOptions); // WARNING: Assume cch correct, suppress end-of-buffer checking
  320. if (ShouldFastPath(ptr, buffer)) goto LFastPath;
  321. }
  322. }
  323. void DecodeIntoAndNullTerminate(__out_ecount(cch+1) __nullterminated wchar16 *buffer, LPCUTF8 ptr, size_t cch, DecodeOptions options)
  324. {
  325. DecodeInto(buffer, ptr, cch, options);
  326. buffer[cch] = 0;
  327. }
  328. _Ret_range_(0, pbEnd - _Old_(pbUtf8))
  329. size_t DecodeUnitsInto(_Out_writes_(pbEnd - pbUtf8) wchar16 *buffer, LPCUTF8& pbUtf8, LPCUTF8 pbEnd, DecodeOptions options)
  330. {
  331. DecodeOptions localOptions = options;
  332. LPCUTF8 p = pbUtf8;
  333. wchar16 *dest = buffer;
  334. if (!ShouldFastPath(p, dest)) goto LSlowPath;
  335. LFastPath:
  336. while (p + 3 < pbEnd)
  337. {
  338. unsigned bytes = *(unsigned *)p;
  339. if ((bytes & 0x80808080) != 0) goto LSlowPath;
  340. ((uint32 *)dest)[0] = (wchar16(bytes) & 0x00FF) | ((wchar16(bytes) & 0xFF00) << 8);
  341. ((uint32 *)dest)[1] = (wchar16(bytes >> 16) & 0x00FF) | ((wchar16(bytes >> 16) & 0xFF00) << 8);
  342. p += 4;
  343. dest += 4;
  344. }
  345. LSlowPath:
  346. while (p < pbEnd)
  347. {
  348. LPCUTF8 s = p;
  349. wchar16 chDest = Decode(p, pbEnd, localOptions);
  350. if (s < p)
  351. {
  352. // We decoded the character, store it
  353. *dest++ = chDest;
  354. }
  355. else
  356. {
  357. // Nothing was converted. This might happen at the end of a buffer with doChunkedEncoding.
  358. break;
  359. }
  360. if (ShouldFastPath(p, dest)) goto LFastPath;
  361. }
  362. pbUtf8 = p;
  363. return dest - buffer;
  364. }
  365. size_t DecodeUnitsIntoAndNullTerminate(__out_ecount(pbEnd - pbUtf8 + 1) __nullterminated wchar16 *buffer, LPCUTF8& pbUtf8, LPCUTF8 pbEnd, DecodeOptions options)
  366. {
  367. size_t result = DecodeUnitsInto(buffer, pbUtf8, pbEnd, options);
  368. buffer[(int)result] = 0;
  369. return result;
  370. }
  371. bool CharsAreEqual(__in_ecount(cch) LPCOLESTR pch, LPCUTF8 bch, size_t cch, DecodeOptions options)
  372. {
  373. DecodeOptions localOptions = options;
  374. while (cch-- > 0)
  375. {
  376. if (*pch++ != utf8::Decode(bch, bch + 4, localOptions)) // WARNING: Assume cch correct, suppress end-of-buffer checking
  377. return false;
  378. }
  379. return true;
  380. }
  381. __range(0, cch * 3)
  382. size_t EncodeInto(__out_ecount(cch * 3) LPUTF8 buffer, __in_ecount(cch) const wchar16 *source, charcount_t cch)
  383. {
  384. LPUTF8 dest = buffer;
  385. if (!ShouldFastPath(dest, source)) goto LSlowPath;
  386. LFastPath:
  387. while (cch >= 4)
  388. {
  389. uint32 first = ((const uint32 *)source)[0];
  390. if ( (first & 0xFF80FF80) != 0) goto LSlowPath;
  391. uint32 second = ((const uint32 *)source)[1];
  392. if ( (second & 0xFF80FF80) != 0) goto LSlowPath;
  393. *(uint32 *)dest = (first & 0x0000007F) | ((first & 0x007F0000) >> 8) | ((second & 0x0000007f) << 16) | ((second & 0x007F0000) << 8);
  394. dest += 4;
  395. source += 4;
  396. cch -= 4;
  397. }
  398. LSlowPath:
  399. while( cch-- > 0 )
  400. {
  401. dest = Encode(*source++, dest);
  402. if (ShouldFastPath(dest, source)) goto LFastPath;
  403. }
  404. return dest - buffer;
  405. }
  406. __range(0, cch * 3)
  407. size_t EncodeIntoAndNullTerminate(__out_ecount(cch * 3 + 1) utf8char_t *buffer, __in_ecount(cch) const wchar16 *source, charcount_t cch)
  408. {
  409. size_t result = EncodeInto(buffer, source, cch);
  410. buffer[result] = 0;
  411. return result;
  412. }
  413. // Convert the character index into a byte index.
  414. size_t CharacterIndexToByteIndex(__in_ecount(cbLength) LPCUTF8 pch, size_t cbLength, charcount_t cchIndex, DecodeOptions options)
  415. {
  416. return CharacterIndexToByteIndex(pch, cbLength, cchIndex, 0, 0, options);
  417. }
  418. size_t CharacterIndexToByteIndex(__in_ecount(cbLength) LPCUTF8 pch, size_t cbLength, const charcount_t cchIndex, size_t cbStartIndex, charcount_t cchStartIndex, DecodeOptions options)
  419. {
  420. DecodeOptions localOptions = options;
  421. LPCUTF8 pchCurrent = pch + cbStartIndex;
  422. LPCUTF8 pchEnd = pch + cbLength;
  423. LPCUTF8 pchEndMinus4 = pch + (cbLength - 4);
  424. charcount_t i = cchIndex - cchStartIndex;
  425. // Avoid using a reinterpret_cast to start a misaligned read.
  426. if (!IsAligned(pchCurrent)) goto LSlowPath;
  427. LFastPath:
  428. // Skip 4 bytes at a time.
  429. while (pchCurrent < pchEndMinus4 && i > 4)
  430. {
  431. uint32 ch4 = *reinterpret_cast<const uint32 *>(pchCurrent);
  432. if ((ch4 & 0x80808080) == 0)
  433. {
  434. pchCurrent += 4;
  435. i -= 4;
  436. }
  437. else break;
  438. }
  439. LSlowPath:
  440. while (pchCurrent < pchEnd && i > 0)
  441. {
  442. Decode(pchCurrent, pchEnd, localOptions);
  443. i--;
  444. // Try to return to the fast path avoiding misaligned reads.
  445. if (i > 4 && IsAligned(pchCurrent)) goto LFastPath;
  446. }
  447. return i > 0 ? cbLength : pchCurrent - pch;
  448. }
  449. // Convert byte index into character index
  450. charcount_t ByteIndexIntoCharacterIndex(__in_ecount(cbIndex) LPCUTF8 pch, size_t cbIndex, DecodeOptions options)
  451. {
  452. DecodeOptions localOptions = options;
  453. LPCUTF8 pchCurrent = pch;
  454. LPCUTF8 pchEnd = pch + cbIndex;
  455. LPCUTF8 pchEndMinus4 = pch + (cbIndex - 4);
  456. charcount_t i = 0;
  457. // Avoid using a reinterpret_cast to start a misaligned read.
  458. if (!IsAligned(pchCurrent)) goto LSlowPath;
  459. LFastPath:
  460. // Skip 4 bytes at a time.
  461. while (pchCurrent < pchEndMinus4)
  462. {
  463. uint32 ch4 = *reinterpret_cast<const uint32 *>(pchCurrent);
  464. if ((ch4 & 0x80808080) == 0)
  465. {
  466. pchCurrent += 4;
  467. i += 4;
  468. }
  469. else break;
  470. }
  471. LSlowPath:
  472. while (pchCurrent < pchEnd)
  473. {
  474. LPCUTF8 s = pchCurrent;
  475. Decode(pchCurrent, pchEnd, localOptions);
  476. if (s == pchCurrent) break;
  477. i++;
  478. // Try to return to the fast path avoiding misaligned reads.
  479. if (IsAligned(pchCurrent)) goto LFastPath;
  480. }
  481. return i;
  482. }
  483. } // namespace utf8