Utf8Codex.h 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476
  1. //-------------------------------------------------------------------------------------------------------
  2. // Copyright (C) Microsoft. All rights reserved.
  3. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
  4. //-------------------------------------------------------------------------------------------------------
  5. #pragma once
  6. #ifdef _WIN32
  7. #include <windows.h>
  8. #include <wtypes.h>
  9. #else
  // TODO: Abstract out into its own file
  11. #include "pal.h"
  12. #include "inc/rt/palrt.h"
  13. #include <stdint.h>
  14. #endif
  // Utf8Codex.h needs to be self contained, so these type defs are duplicated from CommonTypeDefs.h
  //
  // char16 is the 16-bit code unit type used throughout this header, and _u(s) builds a
  // string/character literal of the matching width.
  #ifdef _WIN32
  // On Windows char16 is WCHAR and _u() produces an L"..." literal.
  typedef WCHAR char16;
  #define _u(s) L##s
  #else
  // Elsewhere _u() produces a u"..." literal. char16 is not defined in this branch;
  // NOTE(review): presumably it comes from the PAL headers included above -- confirm.
  #define _u(s) u##s
  #endif
  typedef char16 wchar;
  // Unused(var) simply expands to var; it is only defined if no earlier header provided it.
  #ifndef Unused
  #define Unused(var) var
  #endif
  // Assertion hooks that must be defined by whoever consumes this header (declared extern here).
  // NOTE(review): by name, CodexAssertOrFailFast is expected to terminate the process when the
  // condition is false; the implementation is not visible in this file.
  extern void CodexAssert(bool condition);
  extern void CodexAssertOrFailFast(bool condition);
  28. #ifdef _MSC_VER
  29. //=============================
  30. // Disabled Warnings
  31. //=============================
  32. #pragma warning(push)
  33. #pragma warning(disable: 4127) // constant expression for template parameter
  34. #endif
  #ifndef _WIN32
  // Non-Windows replacement for DEFINE_ENUM_FLAG_OPERATORS.
  // NOTE(review): on Windows the macro is presumably supplied by the Windows headers -- confirm.
  //
  // Templates are defined here in order to avoid a dependency on C++
  // <type_traits> header file,
  // or on compiler-specific constructs.
  extern "C++" {
      // Maps a size in bytes (1, 2 or 4) to the signed integer type of that size.
      // Only these three sizes are specialized; any other size leaves the primary
      // template undefined, so using it is a compile error.
      template <size_t S>
      struct _ENUM_FLAG_INTEGER_FOR_SIZE;
      template <>
      struct _ENUM_FLAG_INTEGER_FOR_SIZE<1>
      {
          typedef int8_t type;
      };
      template <>
      struct _ENUM_FLAG_INTEGER_FOR_SIZE<2>
      {
          typedef int16_t type;
      };
      template <>
      struct _ENUM_FLAG_INTEGER_FOR_SIZE<4>
      {
          typedef int32_t type;
      };
      // used as an approximation of std::underlying_type<T>
      // (selects the integer type purely by sizeof(T)).
      template <class T>
      struct _ENUM_FLAG_SIZED_INTEGER
      {
          typedef typename _ENUM_FLAG_INTEGER_FOR_SIZE<sizeof(T)>::type
              type;
      };
  }
  // Defines the bitwise operators |, |=, &, &=, ~, ^ and ^= for the enum type ENUMTYPE.
  // Each operator casts its operands to the same-sized integer type, applies the integer
  // operation, and casts the result back to ENUMTYPE.
  #define DEFINE_ENUM_FLAG_OPERATORS(ENUMTYPE) \
  extern "C++" { \
  inline ENUMTYPE operator | (ENUMTYPE a, ENUMTYPE b) { return ENUMTYPE(((_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type)a) | ((_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type)b)); } \
  inline ENUMTYPE &operator |= (ENUMTYPE &a, ENUMTYPE b) { return (ENUMTYPE &)(((_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type &)a) |= ((_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type)b)); } \
  inline ENUMTYPE operator & (ENUMTYPE a, ENUMTYPE b) { return ENUMTYPE(((_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type)a) & ((_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type)b)); } \
  inline ENUMTYPE &operator &= (ENUMTYPE &a, ENUMTYPE b) { return (ENUMTYPE &)(((_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type &)a) &= ((_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type)b)); } \
  inline ENUMTYPE operator ~ (ENUMTYPE a) { return ENUMTYPE(~((_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type)a)); } \
  inline ENUMTYPE operator ^ (ENUMTYPE a, ENUMTYPE b) { return ENUMTYPE(((_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type)a) ^ ((_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type)b)); } \
  inline ENUMTYPE &operator ^= (ENUMTYPE &a, ENUMTYPE b) { return (ENUMTYPE &)(((_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type &)a) ^= ((_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type)b)); } \
  }
  #endif
  // 32-bit unsigned integer used as the basis for character counts.
  typedef unsigned __int32 uint32;
  // charcount_t represents a count of characters in a String
  // It is unsigned and the maximum value is (INT_MAX-1)
  typedef uint32 charcount_t;
  // A single UTF-8 code unit (one byte), plus const / pointer / pointer-to-const aliases for it.
  typedef BYTE utf8char_t;
  typedef const utf8char_t CUTF8;
  typedef utf8char_t* LPUTF8;
  typedef const utf8char_t *LPCUTF8;
  // Unicode 4.0, unknown char should be converted to replace mark, U+FFFD.
  #define UNICODE_UNKNOWN_CHAR_MARK 0xFFFD
  // Same replacement character expressed as a _T() character literal.
  // (The "UKNOWN" spelling is part of the macro's name and is kept for existing users.)
  #define UNICODE_TCHAR_UKNOWN_CHAR_MARK _T('\xFFFD')
  87. namespace utf8
  88. {
  // Empty tag type thrown when an invalid wide char is seen and the
  // doThrowOnInvalidWCHARs decode option is set (see DecodeOptions).
  class InvalidWideCharException {};
  // Terminology -
  // Code point - An ordinal value mapped to a standard ideograph as defined by ISO/IEC 10646-1. Here
  //              also referred to as a UCS code point, but it can also often be referred to as a
  //              UNICODE code point.
  // UTF-8 - An encoding of UCS code points as defined by RFC-3629.
  // UTF-16 - An encoding of UCS code points as defined by RFC-2781. Used as a synonym for UNICODE or
  //          UCS-2. This is technically incorrect but usually harmless. This file assumes char16 *
  //          maps to a UTF-16LE (little-endian) encoded sequence of words.
  // Unit - The unit of encoding. For UTF-8 it is a byte (octet). For UTF-16 it is a word (two octets).
  // Valid - A UTF-8 byte sequence conforming to RFC-3629.
  // Well-formed - A sequence of bytes that conforms to the encoding pattern of UTF-8 but might be too
  //               long or otherwise invalid. For example C0 80 is a well-formed but invalid encoding
  //               of U+0000.
  // Start byte - A byte that can start a well-formed UTF-8 sequence.
  // Lead byte - A byte that can start a well-formed multi-unit sequence but not a single byte sequence.
  // Trail byte - A byte that can appear after a lead byte in a well-formed multi-unit sequence.
  // Surrogate pair - A UTF-16 word pair used to encode characters outside the Unicode base plane as
  //                  defined by RFC-2781. Two char16 values are used to encode one UCS code point.
  // Character index - The index into a UTF-16 sequence.
  // Byte index - The index into a UTF-8 sequence.
  109. // Return the number of bytes needed to encode the given character (ignoring surrogate pairs)
  110. inline size_t EncodedSize(char16 ch)
  111. {
  112. if (ch < 0x0080) return 1;
  113. if (ch < 0x0800) return 2;
  114. return 3;
  115. }
  // Bit flags controlling how Decode()/DecodeTail() and the DecodeUnits* helpers treat their input.
  // Values are combined with the bitwise operators generated by DEFINE_ENUM_FLAG_OPERATORS below.
  enum DecodeOptions
  {
      doDefault = 0x00,
      doAllowThreeByteSurrogates = 0x01, // Allow invalid 3 byte encodings as would be encoded by CESU-8
      doChunkedEncoding = 0x02, // For sequences at the end of a buffer do not advance into incomplete sequences
                                // If incomplete UTF-8 sequence is encountered at the end of a buffer, this
                                // option will cause Decode() to not advance the ptr value and DecodeTail to
                                // move the pointer back one position so it again points to where c1 was read by
                                // Decode(). In effect, incomplete sequences are treated as if end pointed to the
                                // beginning of the incomplete sequence instead of in the middle of it.
      doSecondSurrogatePair = 0x04, // A previous call to DecodeTail returned the first word of a UTF-16
                                    // surrogate pair. The second call will return the second word and reset
                                    // this 'option'.
      doAllowInvalidWCHARs = 0x08, // Don't replace invalid wide chars with 0xFFFD
      doThrowOnInvalidWCHARs = 0x10, // throw InvalidWideCharException if an invalid wide char is seen. Incompatible with doAllowInvalidWCHARs
  };
  DEFINE_ENUM_FLAG_OPERATORS(DecodeOptions);
  // Declared here, defined elsewhere.
  // NOTE(review): by name, reports whether ch is acceptable as a decoded wide char (see the
  // doAllowInvalidWCHARs / doThrowOnInvalidWCHARs options); exact criteria are not visible here.
  BOOL IsValidWideChar(char16 ch);
  // Inclusive bounds of the UTF-16 high (0xD800-0xDBFF) and low (0xDC00-0xDFFF) surrogate ranges.
  const char16 WCH_UTF16_HIGH_FIRST = char16(0xd800);
  const char16 WCH_UTF16_HIGH_LAST = char16(0xdbff);
  const char16 WCH_UTF16_LOW_FIRST = char16(0xdc00);
  const char16 WCH_UTF16_LOW_LAST = char16(0xdfff);
  138. inline BOOL InRange(const char16 ch, const char16 chMin, const char16 chMax)
  139. {
  140. return (unsigned)(ch - chMin) <= (unsigned)(chMax - chMin);
  141. }
  142. inline BOOL IsHighSurrogateChar(char16 ch)
  143. {
  144. return InRange(ch, WCH_UTF16_HIGH_FIRST, WCH_UTF16_HIGH_LAST);
  145. }
  146. inline BOOL IsLowSurrogateChar(char16 ch)
  147. {
  148. return InRange(ch, WCH_UTF16_LOW_FIRST, WCH_UTF16_LOW_LAST);
  149. }
  // Decode the trail bytes after the UTF8 lead byte c1 but returning 0xFFFD if trail bytes are expected after end.
  //   c1      - the lead byte already consumed by the caller (Decode() passes the byte it read).
  //   ptr     - in/out position just past c1. Per the annotation below it ends no lower than one
  //             position before its starting value and never past end.
  //   end     - one past the last readable byte.
  //   options - in/out decode flags (see DecodeOptions; doSecondSurrogatePair is updated across calls).
  //   chunkEndsAtTruncatedSequence - optional out flag.
  //             NOTE(review): presumably set when the buffer ends inside a sequence -- confirm in the definition.
  _At_(ptr, _In_reads_(end - ptr) _Post_satisfies_(ptr >= _Old_(ptr) - 1 && ptr <= end))
  char16 DecodeTail(char16 c1, LPCUTF8& ptr, LPCUTF8 end, DecodeOptions& options, bool *chunkEndsAtTruncatedSequence = nullptr);
  153. // Decode the UTF8 sequence into a UTF16 encoding. Code points outside the Unicode base plain will generate
  154. // surrogate pairs, using the 'doSecondSurrogatePair' option to remember the first word has already been returned.
  155. // If ptr == end 0x0000 is emitted. If ptr < end but the lead byte of the UTF8 sequence
  156. // expects trail bytes past end then 0xFFFD are emitted until ptr == end.
  157. _At_(ptr, _In_reads_(end - ptr) _Post_satisfies_(ptr >= _Old_(ptr) && ptr <= end))
  158. inline char16 Decode(LPCUTF8& ptr, LPCUTF8 end, DecodeOptions& options, bool *chunkEndsAtTruncatedSequence = nullptr)
  159. {
  160. if (ptr >= end) return 0;
  161. utf8char_t c1 = *ptr++;
  162. if (c1 < 0x80) return static_cast<char16>(c1);
  163. return DecodeTail(c1, ptr, end, options, chunkEndsAtTruncatedSequence);
  164. }
  165. // Encode ch into a UTF8 sequence ignoring surrogate pairs (which are encoded as two
  166. // separate code points). Use Encode() instead of EncodeFull() directly because it
  167. // special cases ASCII to avoid a call the most common characters.
  168. template <bool countBytesOnly>
  169. LPUTF8 EncodeFull(char16 ch, __out_ecount(3) LPUTF8 ptr, const void * bufferEnd)
  170. {
  171. if (ch < 0x0080)
  172. {
  173. // One byte
  174. if (countBytesOnly)
  175. {
  176. ptr++;
  177. }
  178. else
  179. {
  180. CodexAssertOrFailFast(ptr < bufferEnd);
  181. *ptr++ = static_cast<utf8char_t>(ch);
  182. }
  183. }
  184. else if (ch < 0x0800)
  185. {
  186. // Two bytes : 110yyyxx 10xxxxxx
  187. if (countBytesOnly)
  188. {
  189. ptr += 2;
  190. }
  191. else
  192. {
  193. CodexAssertOrFailFast(ptr + 2 <= bufferEnd);
  194. *ptr++ = static_cast<utf8char_t>(ch >> 6) | 0xc0;
  195. *ptr++ = static_cast<utf8char_t>(ch & 0x3F) | 0x80;
  196. }
  197. }
  198. else
  199. {
  200. // Three bytes : 1110yyyy 10yyyyxx 10xxxxxx
  201. if (countBytesOnly)
  202. {
  203. ptr += 3;
  204. }
  205. else
  206. {
  207. CodexAssertOrFailFast(ptr + 3 <= bufferEnd);
  208. *ptr++ = static_cast<utf8char_t>(ch >> 12) | 0xE0;
  209. *ptr++ = static_cast<utf8char_t>((ch >> 6) & 0x3F) | 0x80;
  210. *ptr++ = static_cast<utf8char_t>(ch & 0x3F) | 0x80;
  211. }
  212. }
  213. return ptr;
  214. }
  215. // Encode a surrogate pair into a utf8 sequence
  216. template <bool countBytesOnly>
  217. LPUTF8 EncodeSurrogatePair(char16 surrogateHigh, char16 surrogateLow, __out_ecount(4) LPUTF8 ptr)
  218. {
  219. // A unicode codepoint is encoded into a surrogate pair by doing the following:
  220. // subtract 0x10000 from the codepoint
  221. // Split the resulting value into the high-ten bits and low-ten bits
  222. // Add 0xD800 to the high ten bits, and 0xDC00 to the low ten bits
  223. // Below, we want to decode the surrogate pair to its original codepoint
  224. // So we do the above process in reverse
  225. uint32 highTen = (surrogateHigh - 0xD800);
  226. uint32 lowTen = (surrogateLow - 0xDC00);
  227. uint32 codepoint = 0x10000 + ((highTen << 10) | lowTen);
  228. // This is the maximum valid unicode codepoint
  229. // This should be ensured anyway since you can't encode a value higher
  230. // than this as a surrogate pair, so we assert this here
  231. CodexAssert(codepoint <= 0x10FFFF);
  232. // Now we need to encode the code point into utf-8
  233. // Codepoints in the range that gets encoded into a surrogate pair
  234. // gets encoded into 4 bytes under utf8
  235. // Since the codepoint can be represented by 21 bits, the encoding
  236. // does the following: first 3 bits in the first byte, the next 6 in the
  237. // second, the next six in the third, and the last six in the 4th byte
  238. if (countBytesOnly) {
  239. ptr += 4;
  240. }
  241. else
  242. {
  243. *ptr++ = static_cast<utf8char_t>(codepoint >> 18) | 0xF0;
  244. *ptr++ = static_cast<utf8char_t>((codepoint >> 12) & 0x3F) | 0x80;
  245. *ptr++ = static_cast<utf8char_t>((codepoint >> 6) & 0x3F) | 0x80;
  246. *ptr++ = static_cast<utf8char_t>(codepoint & 0x3F) | 0x80;
  247. }
  248. return ptr;
  249. }
  250. // Encode ch into a UTF8 sequence ignoring surrogate pairs (which are encoded as two
  251. // separate code points).
  252. template <bool countBytesOnly>
  253. inline LPUTF8 Encode(char16 ch, _When_(!countBytesOnly, __out_ecount(3)) LPUTF8 ptr, const void * bufferEnd)
  254. {
  255. if (ch < 0x80)
  256. {
  257. if (!countBytesOnly)
  258. {
  259. CodexAssertOrFailFast(ptr < bufferEnd);
  260. *ptr = static_cast<utf8char_t>(ch);
  261. }
  262. return ptr + 1;
  263. }
  264. return EncodeFull<countBytesOnly>(ch, ptr, bufferEnd);
  265. }
  266. // Encode ch into a UTF8 sequence while being aware of surrogate pairs.
  267. template <bool countBytesOnly>
  268. inline LPUTF8 EncodeTrueUtf8(char16 ch, const char16** source, charcount_t* cch, _When_(!countBytesOnly, __out_ecount((*cch + 1) * 3)) LPUTF8 ptr, const void * bufferEnd)
  269. {
  270. if (ch < 0x80)
  271. {
  272. if (!countBytesOnly)
  273. {
  274. CodexAssertOrFailFast(ptr < bufferEnd);
  275. *ptr = static_cast<utf8char_t>(ch);
  276. }
  277. return ptr + 1;
  278. }
  279. else if (ch < 0xD800 || (ch >= 0xE000 && ch <= 0xFFFF))
  280. {
  281. return EncodeFull<countBytesOnly>(ch, ptr, bufferEnd);
  282. }
  283. // We're now decoding a surrogate pair. If the input is malformed (eg. low surrogate is absent)
  284. // we'll instead encode the unicode replacement character as utf8
  285. if (*cch > 0)
  286. {
  287. char16 surrogateHigh = ch;
  288. char16 surrogateLow = **source;
  289. // Validate that the surrogate code units are within the appropriate
  290. // ranges for high and low surrogates
  291. if ((surrogateHigh >= 0xD800 && surrogateHigh <= 0xDBFF) &&
  292. (surrogateLow >= 0xDC00 && surrogateLow <= 0xDFFF))
  293. {
  294. LPUTF8 retptr = EncodeSurrogatePair<countBytesOnly>(surrogateHigh, surrogateLow, ptr);
  295. // SAL analysis gets confused if we call EncodeSurrogatePair after
  296. // modifying cch
  297. // Consume the low surrogate
  298. *source = *source + 1;
  299. *cch = *cch - 1;
  300. return retptr;
  301. }
  302. }
  303. // Invalid input: insert the unicode replacement character instead
  304. if (!countBytesOnly)
  305. {
  306. CodexAssertOrFailFast(ptr + 3 <= bufferEnd);
  307. ptr[0] = 0xEF;
  308. ptr[1] = 0xBF;
  309. ptr[2] = 0xBD;
  310. }
  311. return ptr + 3;
  312. }
  313. // Return true if ch is a lead byte of a UTF8 multi-unit sequence.
  314. inline bool IsLeadByte(utf8char_t ch)
  315. {
  316. return ch >= 0xC0;
  317. }
  318. // Return true if ch is a byte that starts a well-formed UTF8 sequence (i.e. is an ASCII character or a valid UTF8 lead byte)
  319. inline bool IsStartByte(utf8char_t ch)
  320. {
  321. return ch < 0x80 || ch >= 0xC0;
  322. }
  323. // Returns true if ch is a UTF8 multi-unit sequence trail byte.
  324. inline bool IsTrailByte(utf8char_t ch)
  325. {
  326. return (ch & 0xC0) == 0x80;
  327. }
  328. // Returns true if ptr points to a well-formed UTF8
  329. inline bool IsCharStart(LPCUTF8 ptr)
  330. {
  331. return IsStartByte(*ptr);
  332. }
  // Return the start of the next well-formed UTF-8 sequence. Use NextChar() instead of
  // NextCharFull() since NextChar() avoids a call if ptr references a single byte sequence.
  // Declared here, defined elsewhere; no end pointer is passed.
  // NOTE(review): the caller is presumably responsible for ptr referencing readable memory -- confirm.
  LPCUTF8 NextCharFull(LPCUTF8 ptr);
  336. // Return the start of the next well-formed UTF-8 sequence.
  337. inline LPCUTF8 NextChar(LPCUTF8 ptr)
  338. {
  339. if (*ptr < 0x80) return ptr + 1;
  340. return NextCharFull(ptr);
  341. }
  // Return the start of the previous well-formed UTF-8 sequence prior to ptr, or start if
  // ptr is already start or no well-formed sequence starts at start. Use PrevChar() instead of
  // PrevCharFull() since PrevChar() avoids a call if the previous sequence is a single byte
  // sequence.
  // Declared here, defined elsewhere.
  LPCUTF8 PrevCharFull(LPCUTF8 ptr, LPCUTF8 start);
  347. // Return the start of the previous well-formed UTF-8 sequence prior to start or start if
  348. // if ptr is already start or no well-formed sequence starts a start.
  349. inline LPCUTF8 PrevChar(LPCUTF8 ptr, LPCUTF8 start)
  350. {
  351. if (ptr > start && *(ptr - 1) < 0x80) return ptr - 1;
  352. return PrevCharFull(ptr, start);
  353. }
  // Decode the bytes in [pbUtf8, pbEnd) into buffer, returning the number of characters converted
  // and written to buffer. Per the annotations, buffer must have room for (pbEnd - pbUtf8) char16
  // values and the result is at most that many.
  // pbUtf8 is taken by reference. NOTE(review): presumably it is advanced past the consumed bytes --
  // confirm in the definition.
  _Ret_range_(0, pbEnd - _Old_(pbUtf8))
  size_t DecodeUnitsInto(_Out_writes_(pbEnd - pbUtf8) char16 *buffer, LPCUTF8& pbUtf8, LPCUTF8 pbEnd, DecodeOptions options = doDefault, bool *chunkEndsAtTruncatedSequence = nullptr);
  // Decode the bytes in [pbUtf8, pbEnd) into buffer, returning the number of characters converted
  // and written to buffer (excluding the null terminator). Per the annotation, buffer needs room
  // for (pbEnd - pbUtf8 + 1) char16 values.
  size_t DecodeUnitsIntoAndNullTerminate(__out_ecount(pbEnd - pbUtf8 + 1) __nullterminated char16 *buffer, LPCUTF8& pbUtf8, LPCUTF8 pbEnd, DecodeOptions options = doDefault, bool *chunkEndsAtTruncatedSequence = nullptr);
  // Same parameters as DecodeUnitsIntoAndNullTerminate, except that pbUtf8 is taken by value, so
  // the caller's pointer is left unchanged.
  size_t DecodeUnitsIntoAndNullTerminateNoAdvance(__out_ecount(pbEnd - pbUtf8 + 1) __nullterminated char16 *buffer, LPCUTF8 pbUtf8, LPCUTF8 pbEnd, DecodeOptions options = doDefault, bool *chunkEndsAtTruncatedSequence = nullptr);
  360. // Encode a UTF-8 sequence into a UTF-8 sequence (which is just a memcpy). This is included for convenience in templates
  361. // when the character encoding is a template parameter.
  362. __range(cbSource, cbDest)
  363. __precond(cbDest == cbSource)
  364. inline size_t EncodeInto(
  365. _Out_writes_(cbDest) utf8char_t *dest,
  366. size_t cbDest,
  367. _In_reads_(cbSource) const utf8char_t *source,
  368. size_t cbSource)
  369. {
  370. memcpy_s(dest, cbDest * sizeof(utf8char_t), source, cbSource * sizeof(utf8char_t));
  371. return cbDest;
  372. }
  // Selects the encoder variant used by the EncodeInto / EncodeIntoAndNullTerminate templates.
  // NOTE(review): presumably Cesu8 encodes each surrogate half as its own 3-byte sequence (as
  // Encode() does) and TrueUtf8 combines pairs into 4-byte sequences (as EncodeTrueUtf8() does);
  // the template definitions are not visible in this file -- confirm.
  enum class Utf8EncodingKind
  {
      Cesu8,
      TrueUtf8
  };
  // Encode a UTF16-LE sequence of cchSource words (char16) into a UTF-8 sequence returning the number of bytes needed.
  // Since a UTF16 encoding can encode to at most 3 bytes (utf8char_t) per char16, cbDest (dest buffer size) can be
  // at most 3 * cchSource.
  // Returns the number of bytes copied into the dest buffer.
  // The encoding template argument selects the encoder variant (see Utf8EncodingKind).
  template <Utf8EncodingKind encoding>
  __range(0, cchSource * 3)
  size_t EncodeInto(
      _Out_writes_(cbDest) utf8char_t *dest,
      __range(0, cchSource * 3) size_t cbDest,
      _In_reads_(cchSource) const char16 *source,
      __range(0, INT_MAX) charcount_t cchSource);
  // Like EncodeInto but ensures that dest[return value] == 0.
  // cbDest therefore includes room for the terminator (at most 3 * cchSource + 1).
  template <Utf8EncodingKind encoding>
  __range(0, cchSource * 3)
  size_t EncodeIntoAndNullTerminate(
      _Out_writes_z_(cbDest) utf8char_t *dest,
      __range(1, cchSource * 3 + 1) size_t cbDest, // must be at least large enough to write null terminator
      _In_reads_(cchSource) const char16 *source,
      __range(0, INT_MAX) charcount_t cchSource);
  // Determine the number of UTF-8 bytes needed to represent a UTF16-LE sequence of cch words (char16).
  // Per the annotation the result is at most cch * 3.
  __range(0, cch * 3)
  size_t CountTrueUtf8(__in_ecount(cch) const char16 *source, charcount_t cch);
  // Returns true if the pch refers to a UTF-16LE encoding of the given UTF-8 encoding bch.
  // The UTF-8 input is bounded by end; no length is passed for pch.
  // NOTE(review): pch is presumably null-terminated or at least as long as the decoded text -- confirm.
  bool CharsAreEqual(LPCOLESTR pch, LPCUTF8 bch, LPCUTF8 end, DecodeOptions options = doDefault);
  // Convert the character index into a byte index.
  // The first overload additionally takes a known (cbStartIndex, cchStartIndex) pair.
  // NOTE(review): presumably used as the position to resume scanning from -- confirm in the definition.
  size_t CharacterIndexToByteIndex(__in_ecount(cbLength) LPCUTF8 pch, size_t cbLength, const charcount_t cchIndex, size_t cbStartIndex, charcount_t cchStartIndex, DecodeOptions options = doDefault);
  size_t CharacterIndexToByteIndex(__in_ecount(cbLength) LPCUTF8 pch, size_t cbLength, const charcount_t cchIndex, DecodeOptions options = doDefault);
  // Convert byte index into character index
  charcount_t ByteIndexIntoCharacterIndex(__in_ecount(cbIndex) LPCUTF8 pch, size_t cbIndex, DecodeOptions options = doDefault);
  407. }
  408. #ifdef _MSC_VER
  409. #pragma warning(pop)
  410. #endif