Utf8Codex.cpp 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643
  1. //-------------------------------------------------------------------------------------------------------
  2. // Copyright (C) Microsoft. All rights reserved.
  3. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
  4. //-------------------------------------------------------------------------------------------------------
  5. #include "Utf8Codex.h"
  6. #ifndef _WIN32
  7. #undef _Analysis_assume_
  8. #define _Analysis_assume_(expr)
  9. #endif
  10. #ifdef _MSC_VER
  11. //=============================
  12. // Disabled Warnings
  13. //=============================
  14. #pragma warning(push)
  15. #pragma warning(disable: 4127) // constant expression for template parameter
  16. #pragma warning(disable: 26451) // size-conversion/arithmetic-operation ordering
  17. #endif
  18. namespace utf8
  19. {
  20. const unsigned int mAlignmentMask = 0x3;
  21. inline bool IsAligned(LPCUTF8 pch)
  22. {
  23. return (reinterpret_cast<size_t>(pch) & mAlignmentMask) == 0;
  24. }
  25. inline bool IsAligned(LPCOLESTR pch)
  26. {
  27. return (reinterpret_cast<size_t>(pch) & mAlignmentMask) == 0;
  28. }
  29. inline bool ShouldFastPath(LPCUTF8 pb, LPCOLESTR pch)
  30. {
  31. return (reinterpret_cast<size_t>(pb) & mAlignmentMask) == 0 && (reinterpret_cast<size_t>(pch) & mAlignmentMask) == 0;
  32. }
  33. inline size_t EncodedBytes(char16 prefix)
  34. {
  35. CodexAssert(0 == (prefix & 0xFF00)); // prefix must really be a byte. We use char16 for as a convenience for the API.
  36. // The number of bytes in an UTF8 encoding is determined by the 4 high-order bits of the first byte.
  37. // 0xxx -> 1
  38. // 10xx -> 1 (invalid)
  39. // 110x -> 2
  40. // 1110 -> 3
  41. // 1111 -> 4
  42. // If this value is XOR with 0xF0 and shift 3 bits to the right it can be used as an
  43. // index into a 16 element 2 bit array encoded as a uint32 of n - 1 where n is the number
  44. // of bits in the encoding.
  45. // The XOR prefix bits mapped to n - 1.
  46. // 1xxx -> 00 (8 - 15)
  47. // 01xx -> 00 (4 - 7)
  48. // 001x -> 01 (2 - 3)
  49. // 0001 -> 10 (1)
  50. // 0000 -> 11 (0)
  51. // This produces the following bit sequence:
  52. // 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00
  53. // 00 00 00 00 00 00 00 00 00 00 00 00 01 01 10 11
  54. // which is 0x5B
  55. return ((0x5B >> (((prefix ^ 0xF0) >> 3) & 0x1E)) & 0x03) + 1;
  56. }
  57. const char16 WCH_UTF16_HIGH_FIRST = char16(0xd800);
  58. const char16 WCH_UTF16_HIGH_LAST = char16(0xdbff);
  59. const char16 WCH_UTF16_LOW_FIRST = char16(0xdc00);
  60. const char16 WCH_UTF16_LOW_LAST = char16(0xdfff);
  61. char16 GetUnknownCharacter(DecodeOptions options = doDefault)
  62. {
  63. if ((options & doThrowOnInvalidWCHARs) != 0)
  64. {
  65. throw InvalidWideCharException();
  66. }
  67. return char16(UNICODE_UNKNOWN_CHAR_MARK);
  68. }
  69. inline BOOL InRange(const char16 ch, const char16 chMin, const char16 chMax)
  70. {
  71. return (unsigned)(ch - chMin) <= (unsigned)(chMax - chMin);
  72. }
  73. BOOL IsValidWideChar(char16 ch)
  74. {
  75. return (ch < 0xfdd0) || ((ch > 0xfdef) && (ch <= 0xffef)) || ((ch >= 0xfff9) && (ch <= 0xfffd));
  76. }
  77. inline BOOL IsHighSurrogateChar(char16 ch)
  78. {
  79. return InRange( ch, WCH_UTF16_HIGH_FIRST, WCH_UTF16_HIGH_LAST );
  80. }
  81. inline BOOL IsLowSurrogateChar(char16 ch)
  82. {
  83. return InRange( ch, WCH_UTF16_LOW_FIRST, WCH_UTF16_LOW_LAST );
  84. }
// Decodes one UTF-16 code unit from a UTF-8 sequence whose first byte, c1, has already been read
// by the caller; on entry ptr points at the byte following c1.
//
//   c1       - the byte already consumed by the caller (held in a char16 for convenience)
//   ptr      - in/out read position; advanced past the bytes consumed, or moved back by one in
//              the chunked-truncation and second-surrogate cases described below
//   end      - one past the last readable byte
//   options  - in/out decode flags; doSecondSurrogatePair is set and cleared here to carry state
//              between the two calls needed for a four-byte sequence
//   chunkEndsAtTruncatedSequence - optional; set to true when doChunkedEncoding is on and the
//              sequence is cut off by `end`
//
// Returns the decoded code unit. Malformed, overlong or truncated input yields
// GetUnknownCharacter(options), which throws instead when doThrowOnInvalidWCHARs is set.
// A four-byte sequence is returned as a surrogate pair over two consecutive calls.
_At_(ptr, _In_reads_(end - ptr) _Post_satisfies_(ptr >= _Old_(ptr) - 1 && ptr <= end))
inline char16 DecodeTail(char16 c1, LPCUTF8& ptr, LPCUTF8 end, DecodeOptions& options, bool *chunkEndsAtTruncatedSequence)
{
    char16 ch = 0;
    BYTE c2, c3, c4;
    switch (EncodedBytes(c1))
    {
    case 1:
        // Plain ASCII needs no further decoding.
        if (c1 < 0x80) return c1;
        if ((options & doSecondSurrogatePair) != 0)
        {
            // We're in the middle of decoding a surrogate pair from a four-byte utf8 sequence.
            // The high word has already been returned, but without advancing ptr, which was on byte 1.
            // ptr was then advanced externally when reading c1, which is byte 1, so ptr is now on byte 2.
            // byte 1 must have been a continuation byte, hence will be in case 1.
            ptr--; // back to byte 1
            c1 = ptr[-1]; // the original first byte
            // ptr is now on c2. We must also have c3 and c4, otherwise doSecondSurrogatePair won't be set.
            _Analysis_assume_(ptr + 2 < end);
            goto LFourByte;
        }
        // 10xxxxxx (trail byte appearing in a lead byte position)
        return GetUnknownCharacter(options);
    case 2:
        // The trail byte is missing: the sequence is cut off by `end`.
        if (ptr >= end)
        {
            if ((options & doChunkedEncoding) != 0)
            {
                // This is a sequence that spans a chunk, push ptr back to the beginning of the sequence.
                ptr--;
                if (chunkEndsAtTruncatedSequence)
                {
                    *chunkEndsAtTruncatedSequence = true;
                }
            }
            return GetUnknownCharacter(options);
        }
        c2 = *ptr++;
        // 110XXXXx 10xxxxxx
        // UTF16 | UTF8 1st byte 2nd byte
        // U+0080..U+07FF | C2..DF 80..BF
        // Lead bytes C0 and C1 are rejected here because they could only form overlong encodings.
        if (
            InRange(c1, 0xC2, 0xDF)
            && InRange(c2, 0x80, 0xBF)
            )
        {
            ch |= WCHAR(c1 & 0x1f) << 6; // 0x0080 - 0x07ff
            ch |= WCHAR(c2 & 0x3f);
            if (!IsValidWideChar(ch) && ((options & doAllowInvalidWCHARs) == 0))
            {
                ch = GetUnknownCharacter(options);
            }
        }
        else
        {
            // Give c2 back so that only the bad lead byte is dropped.
            ptr--;
            ch = GetUnknownCharacter(options);
        }
        break;
    case 3:
        // 1110XXXX 10Xxxxxx 10xxxxxx
        // Fewer than two trail bytes remain: the sequence is cut off by `end`.
        if (ptr + 1 >= end)
        {
            if ((options & doChunkedEncoding) != 0)
            {
                // This is a sequence that spans a chunk, push ptr back to the beginning of the sequence.
                ptr--;
                if (chunkEndsAtTruncatedSequence)
                {
                    *chunkEndsAtTruncatedSequence = true;
                }
            }
            return GetUnknownCharacter(options);
        }
        // UTF16 | UTF8 1st byte 2nd byte 3rd byte
        // U+0800..U+0FFF | E0 A0..BF 80..BF
        // U+1000..U+CFFF | E1..EC 80..BF 80..BF
        // U+D000..U+D7FF | ED 80..9F 80..BF
        // U+E000..U+FFFF | EE..EF 80..BF 80..BF
        c2 = ptr[0];
        c3 = ptr[1];
        if (
            // any of the following must be true
            (c1 == 0xE0
            && InRange(c2, 0xA0, 0xBF)
            && InRange(c3, 0x80, 0xBF))
            ||
            (InRange(c1, 0xE1, 0xEC)
            && InRange(c2, 0x80, 0xBF)
            && InRange(c3, 0x80, 0xBF))
            ||
            (c1 == 0xED
            && InRange(c2, 0x80, 0x9F)
            && InRange(c3, 0x80, 0xBF))
            ||
            (InRange(c1, 0xEE, 0xEF)
            && InRange(c2, 0x80, 0xBF)
            && InRange(c3, 0x80, 0xBF))
            ||
            // With doAllowThreeByteSurrogates, ED A0..BF xx (a surrogate encoded in three bytes) is also accepted.
            (((options & doAllowThreeByteSurrogates) != 0)
            &&
            c1 == 0xED
            && InRange(c2, 0x80, 0xBF)
            && InRange(c3, 0x80, 0xBF)
            )
            )
        {
            ch = WCHAR(c1 & 0x0f) << 12; // 0x0800 - 0xffff
            ch |= WCHAR(c2 & 0x3f) << 6; // 0x0080 - 0x07ff
            ch |= WCHAR(c3 & 0x3f);
            if (!IsValidWideChar(ch) && ((options & (doAllowThreeByteSurrogates | doAllowInvalidWCHARs)) == 0))
            {
                ch = GetUnknownCharacter(options);
            }
            // Consume the two trail bytes that were only peeked at above.
            ptr += 2;
        }
        else
        {
            ch = GetUnknownCharacter(options);
            // Windows OS 1713952. Only drop the illegal leading byte
            // Retry next byte.
            // ptr is already advanced.
        }
        break;
    case 4:
    LFourByte:
        // 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx or 11111xxx ....
        // NOTE: 11111xxx is not supported
        // Fewer than three trail bytes remain: the sequence is cut off by `end`.
        if (ptr + 2 >= end)
        {
            if ((options & doChunkedEncoding) != 0)
            {
                // This is a sequence that spans a chunk, push ptr back to the beginning of the sequence.
                ptr--;
                if (chunkEndsAtTruncatedSequence)
                {
                    *chunkEndsAtTruncatedSequence = true;
                }
            }
            ch = GetUnknownCharacter(options);
            break;
        }
        c2 = ptr[0];
        c3 = ptr[1];
        c4 = ptr[2];
        // UTF16 | UTF8 1st byte 2nd byte 3rd byte 4th byte
        // U+10000..U+3FFFF | F0 90..BF 80..BF 80..BF
        // U+40000..U+FFFFF | F1..F3 80..BF 80..BF 80..BF
        // U+100000..U+10FFFF | F4 80..8F 80..BF 80..BF
        if (! // NOT Unicode well-formed byte sequences
            (
            // any of the following must be true
            (c1 == 0xF0
            && InRange(c2, 0x90,0xBF)
            && InRange(c3, 0x80,0xBF)
            && InRange(c4, 0x80,0xBF))
            ||
            (InRange(c1, 0xF1, 0xF3)
            && InRange(c2, 0x80,0xBF)
            && InRange(c3, 0x80,0xBF)
            && InRange(c4, 0x80,0xBF))
            ||
            (c1 == 0xF4
            && InRange(c2, 0x80,0x8F)
            && InRange(c3, 0x80,0xBF)
            && InRange(c4, 0x80,0xBF))
            )
            )
        {
            // Windows OS 1713952. Only drop the illegal leading byte.
            // Retry next byte.
            // ptr is already advanced 1.
            ch = GetUnknownCharacter(options);
            break;
        }
        if ((options & doSecondSurrogatePair) == 0)
        {
            // Decode high 10 bits of utf-8 20 bit char
            ch = WCHAR(c1 & 0x07) << 2;
            ch |= WCHAR(c2 & 0x30) >> 4;
            // Subtracting 1 here is the UTF-16 "minus 0x10000" step applied to the top bits.
            ch = (ch - 1) << 6; // ch == 0000 00ww ww00 0000
            ch |= WCHAR(c2 & 0x0f) << 2; // ch == 0000 00ww wwzz zz00
            ch |= WCHAR(c3 & 0x30) >> 4; // ch == 0000 00ww wwzz zzyy
            // Encode first word of utf-16 surrogate pair
            ch += 0xD800;
            // Remember next call must return second word
            options = (DecodeOptions)(options | doSecondSurrogatePair);
            // Leave ptr on byte 1, this way:
            // - callers who test that ptr has been advanced by utf8::Decode will see progress for
            // both words of the surrogate pair.
            // - callers who calculate the number of multi-unit chars by subtracting after from before ptr
            // will accumulate 0 for first word and 2 for second, thus utf8 chars equals 2 utf16 chars + 2
            // multi-unit chars, as it should be.
        }
        else
        {
            // Decode low 10 bits of utf-8 20 bit char
            ch = WCHAR(c3 & 0x0f) << 6; // ch == 0000 00yy yy00 0000
            ch |= WCHAR(c4 & 0x3f); // ch == 0000 00yy yyxx xxxx
            // Encode second word of utf-16 surrogate pair
            ch += 0xDC00;
            // We're done with this char
            options = (DecodeOptions)(options & ~doSecondSurrogatePair);
            ptr += 3; // remember, got here by subtracting one from ptr in case 1, so effective increment is 2
        }
        break;
    }
    return ch;
}
  296. LPCUTF8 NextCharFull(LPCUTF8 ptr)
  297. {
  298. return ptr + EncodedBytes(*ptr);
  299. }
  300. LPCUTF8 PrevCharFull(LPCUTF8 ptr, LPCUTF8 start)
  301. {
  302. if (ptr > start)
  303. {
  304. LPCUTF8 current = ptr - 1;
  305. while (current > start && (*current & 0xC0) == 0x80)
  306. current--;
  307. if (NextChar(current) == ptr)
  308. return current;
  309. // It is not a valid encoding, just go back one character.
  310. return ptr - 1;
  311. }
  312. else
  313. return ptr;
  314. }
  315. _Use_decl_annotations_
  316. size_t DecodeUnitsInto(char16 *buffer, LPCUTF8& pbUtf8, LPCUTF8 pbEnd, DecodeOptions options, bool *chunkEndsAtTruncatedSequence)
  317. {
  318. DecodeOptions localOptions = options;
  319. if (chunkEndsAtTruncatedSequence)
  320. {
  321. *chunkEndsAtTruncatedSequence = false;
  322. }
  323. LPCUTF8 p = pbUtf8;
  324. char16 *dest = buffer;
  325. if (!ShouldFastPath(p, dest)) goto LSlowPath;
  326. LFastPath:
  327. while (p + 3 < pbEnd)
  328. {
  329. unsigned bytes = *(unsigned *)p;
  330. if ((bytes & 0x80808080) != 0) goto LSlowPath;
  331. ((uint32 *)dest)[0] = (char16(bytes) & 0x00FF) | ((char16(bytes) & 0xFF00) << 8);
  332. ((uint32 *)dest)[1] = (char16(bytes >> 16) & 0x00FF) | ((char16(bytes >> 16) & 0xFF00) << 8);
  333. p += 4;
  334. dest += 4;
  335. }
  336. LSlowPath:
  337. while (p < pbEnd)
  338. {
  339. LPCUTF8 s = p;
  340. char16 chDest = Decode(p, pbEnd, localOptions, chunkEndsAtTruncatedSequence);
  341. if (s < p)
  342. {
  343. // We decoded the character, store it
  344. *dest++ = chDest;
  345. }
  346. else
  347. {
  348. // Nothing was converted. This might happen at the end of a buffer with doChunkedEncoding.
  349. break;
  350. }
  351. if (ShouldFastPath(p, dest)) goto LFastPath;
  352. }
  353. pbUtf8 = p;
  354. return dest - buffer;
  355. }
  356. _Use_decl_annotations_
  357. size_t DecodeUnitsIntoAndNullTerminate(char16 *buffer, LPCUTF8& pbUtf8, LPCUTF8 pbEnd, DecodeOptions options, bool *chunkEndsAtTruncatedSequence)
  358. {
  359. size_t result = DecodeUnitsInto(buffer, pbUtf8, pbEnd, options, chunkEndsAtTruncatedSequence);
  360. buffer[result] = 0;
  361. return result;
  362. }
  363. _Use_decl_annotations_
  364. size_t DecodeUnitsIntoAndNullTerminateNoAdvance(char16 *buffer, LPCUTF8 pbUtf8, LPCUTF8 pbEnd, DecodeOptions options, bool *chunkEndsAtTruncatedSequence)
  365. {
  366. return DecodeUnitsIntoAndNullTerminate(buffer, pbUtf8, pbEnd, options, chunkEndsAtTruncatedSequence);
  367. }
  368. bool CharsAreEqual(LPCOLESTR pch, LPCUTF8 bch, LPCUTF8 end, DecodeOptions options)
  369. {
  370. DecodeOptions localOptions = options;
  371. while (bch < end)
  372. {
  373. if (*pch++ != utf8::Decode(bch, end, localOptions))
  374. {
  375. return false;
  376. }
  377. }
  378. return true;
  379. }
  380. template <Utf8EncodingKind encoding, bool countBytesOnly = false>
  381. __range(0, cbDest)
  382. size_t EncodeIntoImpl(
  383. _When_(!countBytesOnly, _Out_writes_(cbDest)) utf8char_t *destBuffer,
  384. __range(0, cchSource * 3) size_t cbDest,
  385. _In_reads_(cchSource) const char16 *source,
  386. __range(0, INT_MAX) charcount_t cchSource)
  387. {
  388. charcount_t cch = cchSource; // SAL analysis gets confused by EncodeTrueUtf8's dest buffer requirement unless we alias cchSource with a local
  389. LPUTF8 dest = destBuffer;
  390. utf8char_t *bufferEnd = &destBuffer[cbDest];
  391. CodexAssertOrFailFast(dest <= bufferEnd);
  392. if (!ShouldFastPath(dest, source)) goto LSlowPath;
  393. LFastPath:
  394. while (cch >= 4)
  395. {
  396. uint32 first = ((const uint32 *)source)[0];
  397. if ( (first & 0xFF80FF80) != 0) goto LSlowPath;
  398. uint32 second = ((const uint32 *)source)[1];
  399. if ( (second & 0xFF80FF80) != 0) goto LSlowPath;
  400. if (!countBytesOnly)
  401. {
  402. CodexAssertOrFailFast(dest + 4 <= bufferEnd);
  403. *(uint32 *)dest = (first & 0x0000007F) | ((first & 0x007F0000) >> 8) | ((second & 0x0000007f) << 16) | ((second & 0x007F0000) << 8);
  404. }
  405. dest += 4;
  406. source += 4;
  407. cch -= 4;
  408. }
  409. LSlowPath:
  410. if (encoding == Utf8EncodingKind::Cesu8)
  411. {
  412. while (cch-- > 0)
  413. {
  414. dest = Encode<countBytesOnly>(*source++, dest, bufferEnd);
  415. if (ShouldFastPath(dest, source)) goto LFastPath;
  416. }
  417. }
  418. else
  419. {
  420. while (cch-- > 0)
  421. {
  422. // We increment the source pointer here since at least one utf16 code unit is read here
  423. // If the code unit turns out to be the high surrogate in a surrogate pair, then
  424. // EncodeTrueUtf8 will consume the low surrogate code unit too by decrementing cch
  425. // and incrementing source
  426. dest = EncodeTrueUtf8<countBytesOnly>(*source++, &source, &cch, dest, bufferEnd);
  427. if (ShouldFastPath(dest, source)) goto LFastPath;
  428. }
  429. }
  430. return dest - destBuffer;
  431. }
  432. template <Utf8EncodingKind encoding>
  433. __range(0, cbDest)
  434. size_t EncodeInto(
  435. _Out_writes_(cbDest) utf8char_t *dest,
  436. __range(0, cchSource * 3) size_t cbDest,
  437. _In_reads_(cchSource) const char16 *source,
  438. __range(0, INT_MAX) charcount_t cchSource)
  439. {
  440. return EncodeIntoImpl<encoding>(dest, cbDest, source, cchSource);
  441. }
  442. template <Utf8EncodingKind encoding>
  443. __range(0, cbDest)
  444. size_t EncodeIntoAndNullTerminate(
  445. _Out_writes_z_(cbDest) utf8char_t *dest,
  446. __range(1, cchSource * 3 + 1) size_t cbDest, // must be at least large enough to write null terminator
  447. _In_reads_(cchSource) const char16 *source,
  448. __range(0, INT_MAX) charcount_t cchSource)
  449. {
  450. size_t destWriteMaxBytes = cbDest - 1; // leave room for null terminator
  451. size_t result = EncodeIntoImpl<encoding>(dest, destWriteMaxBytes, source, cchSource);
  452. dest[result] = 0;
  453. return result;
  454. }
// Explicit instantiations of the encoder entry points for both encoding kinds.
// NOTE(review): presumably needed because the templates are defined in this file rather than
// in the header, so other translation units can link against them - confirm against Utf8Codex.h.

// EncodeInto, CESU-8 flavor.
template
__range(0, cbDest)
size_t EncodeInto<Utf8EncodingKind::Cesu8>(
    _Out_writes_(cbDest) utf8char_t *dest,
    __range(0, cchSource * 3) size_t cbDest,
    _In_reads_(cchSource) const char16 *source,
    __range(0, INT_MAX) charcount_t cchSource);
// EncodeInto, true UTF-8 flavor.
template
__range(0, cbDest)
size_t EncodeInto<Utf8EncodingKind::TrueUtf8>(
    _Out_writes_(cbDest) utf8char_t *dest,
    __range(0, cchSource * 3) size_t cbDest,
    _In_reads_(cchSource) const char16 *source,
    __range(0, INT_MAX) charcount_t cchSource);
// EncodeIntoAndNullTerminate, CESU-8 flavor.
template
__range(0, cbDest)
size_t EncodeIntoAndNullTerminate<Utf8EncodingKind::Cesu8>(
    _Out_writes_z_(cbDest) utf8char_t *dest,
    __range(1, cchSource * 3 + 1) size_t cbDest,
    _In_reads_(cchSource) const char16 *source,
    __range(0, INT_MAX) charcount_t cchSource);
// EncodeIntoAndNullTerminate, true UTF-8 flavor.
template
__range(0, cbDest)
size_t EncodeIntoAndNullTerminate<Utf8EncodingKind::TrueUtf8>(
    _Out_writes_z_(cbDest) utf8char_t *dest,
    __range(1, cchSource * 3 + 1) size_t cbDest,
    _In_reads_(cchSource) const char16 *source,
    __range(0, INT_MAX) charcount_t cchSource);
  483. // Since we are not actually encoding, the return value is bounded on cch
  484. __range(0, cch * 3)
  485. size_t CountTrueUtf8(__in_ecount(cch) const char16 *source, charcount_t cch)
  486. {
  487. return EncodeIntoImpl<Utf8EncodingKind::TrueUtf8, true /*count only*/>(nullptr, 0, source, cch);
  488. }
  489. // Convert the character index into a byte index.
  490. size_t CharacterIndexToByteIndex(__in_ecount(cbLength) LPCUTF8 pch, size_t cbLength, charcount_t cchIndex, DecodeOptions options)
  491. {
  492. return CharacterIndexToByteIndex(pch, cbLength, cchIndex, 0, 0, options);
  493. }
  494. size_t CharacterIndexToByteIndex(__in_ecount(cbLength) LPCUTF8 pch, size_t cbLength, const charcount_t cchIndex, size_t cbStartIndex, charcount_t cchStartIndex, DecodeOptions options)
  495. {
  496. DecodeOptions localOptions = options;
  497. LPCUTF8 pchCurrent = pch + cbStartIndex;
  498. LPCUTF8 pchEnd = pch + cbLength;
  499. LPCUTF8 pchEndMinus4 = pch + (cbLength - 4);
  500. charcount_t i = cchIndex - cchStartIndex;
  501. // Avoid using a reinterpret_cast to start a misaligned read.
  502. if (!IsAligned(pchCurrent)) goto LSlowPath;
  503. LFastPath:
  504. // Skip 4 bytes at a time.
  505. while (pchCurrent < pchEndMinus4 && i > 4)
  506. {
  507. uint32 ch4 = *reinterpret_cast<const uint32 *>(pchCurrent);
  508. if ((ch4 & 0x80808080) == 0)
  509. {
  510. pchCurrent += 4;
  511. i -= 4;
  512. }
  513. else break;
  514. }
  515. LSlowPath:
  516. while (pchCurrent < pchEnd && i > 0)
  517. {
  518. Decode(pchCurrent, pchEnd, localOptions);
  519. i--;
  520. // Try to return to the fast path avoiding misaligned reads.
  521. if (i > 4 && IsAligned(pchCurrent)) goto LFastPath;
  522. }
  523. return i > 0 ? cbLength : pchCurrent - pch;
  524. }
  525. // Convert byte index into character index
  526. charcount_t ByteIndexIntoCharacterIndex(__in_ecount(cbIndex) LPCUTF8 pch, size_t cbIndex, DecodeOptions options)
  527. {
  528. DecodeOptions localOptions = options;
  529. LPCUTF8 pchCurrent = pch;
  530. LPCUTF8 pchEnd = pch + cbIndex;
  531. LPCUTF8 pchEndMinus4 = pch + (cbIndex - 4);
  532. charcount_t i = 0;
  533. // Avoid using a reinterpret_cast to start a misaligned read.
  534. if (!IsAligned(pchCurrent)) goto LSlowPath;
  535. LFastPath:
  536. // Skip 4 bytes at a time.
  537. while (pchCurrent < pchEndMinus4)
  538. {
  539. uint32 ch4 = *reinterpret_cast<const uint32 *>(pchCurrent);
  540. if ((ch4 & 0x80808080) == 0)
  541. {
  542. pchCurrent += 4;
  543. i += 4;
  544. }
  545. else break;
  546. }
  547. LSlowPath:
  548. while (pchCurrent < pchEnd)
  549. {
  550. LPCUTF8 s = pchCurrent;
  551. Decode(pchCurrent, pchEnd, localOptions);
  552. if (s == pchCurrent) break;
  553. i++;
  554. // Try to return to the fast path avoiding misaligned reads.
  555. if (IsAligned(pchCurrent)) goto LFastPath;
  556. }
  557. return i;
  558. }
  559. } // namespace utf8
  560. #ifdef _MSC_VER
  561. #pragma warning(pop)
  562. #endif