Utf8Codex.h 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261
  1. //-------------------------------------------------------------------------------------------------------
  2. // Copyright (C) Microsoft. All rights reserved.
  3. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
  4. //-------------------------------------------------------------------------------------------------------
  5. #pragma once
  6. #ifdef _WIN32
  7. #include <windows.h>
  8. #include <wtypes.h>
  9. typedef wchar_t wchar16;
  10. #else
  11. typedef char16_t wchar16;
  12. // TODO: Abstract out into its own file
  13. #include "pal.h"
  14. #include "inc/rt/palrt.h"
  // The templates below pick a signed integer type that has the same size as an enum type.
  // They are defined here in order to avoid a dependency on the C++ <type_traits> header file,
  // or on compiler-specific constructs.
  extern "C++" {

  // Maps a size in bytes (S) to a signed integer type of exactly that size.
  // Only the sizes specialized below are supported; any other size leaves the
  // primary template incomplete, which is a compile-time error at the point of use.
  template <size_t S>
  struct _ENUM_FLAG_INTEGER_FOR_SIZE;

  template <>
  struct _ENUM_FLAG_INTEGER_FOR_SIZE<1>
  {
      typedef int8_t type;
  };

  template <>
  struct _ENUM_FLAG_INTEGER_FOR_SIZE<2>
  {
      typedef int16_t type;
  };

  template <>
  struct _ENUM_FLAG_INTEGER_FOR_SIZE<4>
  {
      typedef int32_t type;
  };

  // Enums with a 64-bit underlying type need an 8-byte mapping as well; without it
  // DEFINE_ENUM_FLAG_OPERATORS cannot be instantiated for such enums.
  template <>
  struct _ENUM_FLAG_INTEGER_FOR_SIZE<8>
  {
      typedef int64_t type;
  };

  // Used as an approximation of std::underlying_type<T>: yields the signed integer
  // type whose size equals sizeof(T).
  template <class T>
  struct _ENUM_FLAG_SIZED_INTEGER
  {
      typedef typename _ENUM_FLAG_INTEGER_FOR_SIZE<sizeof(T)>::type type;
  };

  }
  // Defines the bitwise operators |, |=, &, &=, ~, ^ and ^= for the enum type ENUMTYPE.
  // Each operator casts its operands to the integer type selected by
  // _ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>, applies the corresponding integer operator and
  // casts the result back to ENUMTYPE (or to ENUMTYPE& for the compound assignments,
  // which modify their first argument in place).
  44. #define DEFINE_ENUM_FLAG_OPERATORS(ENUMTYPE) \
  45. extern "C++" { \
  46. inline ENUMTYPE operator | (ENUMTYPE a, ENUMTYPE b) { return ENUMTYPE(((_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type)a) | ((_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type)b)); } \
  47. inline ENUMTYPE &operator |= (ENUMTYPE &a, ENUMTYPE b) { return (ENUMTYPE &)(((_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type &)a) |= ((_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type)b)); } \
  48. inline ENUMTYPE operator & (ENUMTYPE a, ENUMTYPE b) { return ENUMTYPE(((_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type)a) & ((_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type)b)); } \
  49. inline ENUMTYPE &operator &= (ENUMTYPE &a, ENUMTYPE b) { return (ENUMTYPE &)(((_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type &)a) &= ((_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type)b)); } \
  50. inline ENUMTYPE operator ~ (ENUMTYPE a) { return ENUMTYPE(~((_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type)a)); } \
  51. inline ENUMTYPE operator ^ (ENUMTYPE a, ENUMTYPE b) { return ENUMTYPE(((_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type)a) ^ ((_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type)b)); } \
  52. inline ENUMTYPE &operator ^= (ENUMTYPE &a, ENUMTYPE b) { return (ENUMTYPE &)(((_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type &)a) ^= ((_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type)b)); } \
  53. }
  54. #endif
  // Basic integer and UTF-8 pointer typedefs shared by the declarations in this header.
  55. typedef unsigned __int32 uint32;
  56. // charcount_t represents a count of characters in a String.
  57. // It is unsigned and the maximum value is (INT_MAX-1).
  58. typedef uint32 charcount_t;
  // utf8char_t is a single unit (byte) of a UTF-8 sequence; the typedefs after it are the
  // const unit, pointer-to-unit and pointer-to-const-unit forms.
  59. typedef BYTE utf8char_t;
  60. typedef const utf8char_t CUTF8;
  61. typedef utf8char_t* LPUTF8;
  62. typedef const utf8char_t *LPCUTF8;
  63. // Unicode 4.0: an unknown character should be converted to the replacement mark, U+FFFD.
  // NOTE: "UKNOWN" in the second macro name is a misspelling of "UNKNOWN"; the name is left
  // unchanged because code outside this header may refer to it.
  64. #define UNICODE_UNKNOWN_CHAR_MARK 0xFFFD
  65. #define UNICODE_TCHAR_UKNOWN_CHAR_MARK _T('\xFFFD')
  66. namespace utf8
  67. {
  68. // Terminology -
  69. // Code point - An ordinal value mapped to a standard ideograph as defined by ISO/IEC 10646-1. Here
  70. // also referred to as a UCS code point, but it is also often referred to as a UNICODE
  71. // code point.
  72. // UTF-8 - An encoding of UCS code points as defined by RFC-3629.
  73. // UTF-16 - An encoding of UCS code points as defined by RFC-2781. Use as a synonym for UNICODE or
  74. // UCS-2. This is technically incorrect but usually harmless. This file assumes wchar16 *
  75. // maps to an UTF-16LE (little-endian) encoded sequence of words.
  76. // Unit - The unit of encoding. For UTF-8 it is a byte (octet). For UTF-16 it is a word (two octets).
  77. // Valid - A UTF-8 byte sequence conforming to RFC-3629.
  78. // Well-formed - A sequence of bytes that conform to the encoding pattern of UTF8 but might be too long or
  79. // otherwise invalid. For example C0 80 is a well-formed but invalid encoding of U+0000.
  80. // Start byte - A byte that can start a well-formed UTF-8 sequence.
  81. // Lead byte - A byte that can start a well-formed multi-unit sequence but not a single byte sequence.
  82. // Trail byte - A byte that can appear after a lead-byte in a well-formed multi-unit sequence.
  83. // Surrogate pair - A UTF-16 word pair used to encode characters outside the Unicode base plane as defined by
  84. // RFC-2781. Two wchar16 values are used to encode one UCS code point.
  85. // character index - The index into a UTF-16 sequence.
  86. // byte index - The index into a UTF-8 sequence.
  87. // Return the number of bytes needed to encode the given character (ignoring surrogate pairs)
  88. inline size_t EncodedSize(wchar16 ch)
  89. {
  90. if (ch < 0x0080) return 1;
  91. if (ch < 0x0800) return 2;
  92. return 3;
  93. }
  94. enum DecodeOptions
  95. {
  96. doDefault = 0x00,
  97. doAllowThreeByteSurrogates = 0x01, // Allow invalid 3 byte encodings as would be encoded by CSEU-8
  98. doChunkedEncoding = 0x02, // For sequences at the end of a buffer do not advance into incomplete sequences
  99. // If incomplete UTF-8 sequence is encountered at the end of a buffer, this
  100. // option will cause Decode() to not advance the ptr value and DecodeTail to
  101. // move the pointer back one position so it again points to where c1 was read by
  102. // Decode(). In effect, incomplete sequences are treated as if end pointed to the
  103. // beginning incomplete sequence instead of in the middle of it.
  104. doSecondSurrogatePair = 0x04, // A previous call to DecodeTail returned the first word of a UTF-16
  105. // surrogate pair. The second call will return the second word and reset
  106. // this 'option'.
  107. doAllowInvalidWCHARs = 0x08, // Don't replace invalid wide chars with 0xFFFD
  108. };
  109. DEFINE_ENUM_FLAG_OPERATORS(DecodeOptions);
  110. // Decode the trail bytes that follow the UTF8 lead byte c1, returning 0xFFFD if trail bytes are expected past end.
  // ptr and options are passed by reference and may be updated by the call. Per the annotation
  // below, ptr may finish one position before its incoming value (see doChunkedEncoding) but
  // never beyond end.
  111. _At_(ptr, _In_reads_(end - ptr) _Post_satisfies_(ptr >= _Old_(ptr) - 1 && ptr <= end))
  112. wchar16 DecodeTail(wchar16 c1, LPCUTF8& ptr, LPCUTF8 end, DecodeOptions& options);
  113. // Decode the UTF8 sequence into a UTF16 encoding. Code points outside the Unicode base plain will generate
  114. // surrogate pairs, using the 'doSecondSurrogatePair' option to remember the first word has already been returned.
  115. // If ptr == end 0x0000 is emitted. If ptr < end but the lead byte of the UTF8 sequence
  116. // expects trail bytes past end then 0xFFFD are emitted until ptr == end.
  117. _At_(ptr, _In_reads_(end - ptr) _Post_satisfies_(ptr >= _Old_(ptr) && ptr <= end))
  118. inline wchar16 Decode(LPCUTF8& ptr, LPCUTF8 end, DecodeOptions& options)
  119. {
  120. if (ptr >= end) return 0;
  121. utf8char_t c1 = *ptr++;
  122. if (c1 < 0x80) return static_cast<wchar16>(c1);
  123. return DecodeTail(c1, ptr, end, options);
  124. }
  125. // Encode ch into a UTF8 sequence ignoring surrogate pairs (which are encoded as two
  126. // separate code points). Use Encode() instead of EncodeFull() directly because it
  127. // special cases ASCII to avoid a call for the most common characters.
  // ptr is annotated as a writable buffer of 3 units. The returned pointer is presumably the
  // position just past the bytes written, as Encode() returns for ASCII - confirm against the
  // implementation.
  128. LPUTF8 EncodeFull(wchar16 ch, __out_ecount(3) LPUTF8 ptr);
  129. // Encode ch into a UTF8 sequence ignoring surrogate pairs (which are encoded as two
  130. // separate code points).
  131. inline LPUTF8 Encode(wchar16 ch, __out_ecount(3) LPUTF8 ptr)
  132. {
  133. if (ch < 0x80)
  134. {
  135. *ptr = static_cast<utf8char_t>(ch);
  136. return ptr + 1;
  137. }
  138. return EncodeFull(ch, ptr);
  139. }
  140. // Return true if ch is a lead byte of a UTF8 multi-unit sequence.
  141. inline bool IsLeadByte(utf8char_t ch)
  142. {
  143. return ch >= 0xC0;
  144. }
  145. // Return true if ch is a byte that starts a well-formed UTF8 sequence (i.e. is an ASCII character or a valid UTF8 lead byte)
  146. inline bool IsStartByte(utf8char_t ch)
  147. {
  148. return ch < 0x80 || ch >= 0xC0;
  149. }
  150. // Returns true if ch is a UTF8 multi-unit sequence trail byte.
  151. inline bool IsTrailByte(utf8char_t ch)
  152. {
  153. return (ch & 0xC0) == 0x80;
  154. }
  155. // Returns true if ptr points to a well-formed UTF8
  156. inline bool IsCharStart(LPCUTF8 ptr)
  157. {
  158. return IsStartByte(*ptr);
  159. }
  160. // Return the start of the next well-formed UTF-8 sequence. Use NextChar() instead of
  161. // NextCharFull() since NextChar() avoids a call if ptr references a single byte sequence.
  162. LPCUTF8 NextCharFull(LPCUTF8 ptr);
  163. // Return the start of the next well-formed UTF-8 sequence.
  164. inline LPCUTF8 NextChar(LPCUTF8 ptr)
  165. {
  166. if (*ptr < 0x80) return ptr + 1;
  167. return NextCharFull(ptr);
  168. }
  169. // Return the start of the previous well-formed UTF-8 sequence prior to ptr, or start
  170. // if ptr is already start or no well-formed sequence starts at start. Use PrevChar() instead of
  171. // PrevCharFull() since PrevChar() avoids a call if the previous sequence is a single byte
  172. // sequence.
  173. LPCUTF8 PrevCharFull(LPCUTF8 ptr, LPCUTF8 start);
  174. // Return the start of the previous well-formed UTF-8 sequence prior to start or start if
  175. // if ptr is already start or no well-formed sequence starts a start.
  176. inline LPCUTF8 PrevChar(LPCUTF8 ptr, LPCUTF8 start)
  177. {
  178. if (ptr > start && *(ptr - 1) < 0x80) return ptr - 1;
  179. return PrevCharFull(ptr, start);
  180. }
  181. // Decode a UTF-8 sequence into cch UTF-16 characters written to buffer. Decoding may read up to
  182. // 3 times cch bytes from ptr, so DecodeInto should only be used when it is already known that
  183. // ptr refers to at least cch number of UTF-8 sequences.
  // buffer is annotated as being completely filled with cch elements; options defaults to doDefault.
  184. void DecodeInto(__out_ecount_full(cch) wchar16 *buffer, LPCUTF8 ptr, size_t cch, DecodeOptions options = doDefault);
  185. // Provided for dual-mode templates
  186. inline void DecodeInto(__out_ecount_full(cch) wchar16 *buffer, const wchar16 *ptr, size_t cch, DecodeOptions /* options */ = doDefault)
  187. {
  188. memcpy_s(buffer, cch * sizeof(wchar16), ptr, cch * sizeof(wchar16));
  189. }
  190. // Like DecodeInto but ensures buffer ends with a NULL at buffer[cch]; buffer must therefore hold cch + 1 elements.
  191. void DecodeIntoAndNullTerminate(__out_ecount(cch+1) __nullterminated wchar16 *buffer, LPCUTF8 ptr, size_t cch, DecodeOptions options = doDefault);
  192. // Decode the bytes between pbUtf8 and pbEnd into buffer, returning the number of characters converted and written to buffer.
  // pbUtf8 is passed by reference, so the call may update it. buffer is annotated as holding
  // pbEnd - pbUtf8 elements, which is also the annotated upper bound of the return value.
  193. _Ret_range_(0, pbEnd - _Old_(pbUtf8))
  194. size_t DecodeUnitsInto(_Out_writes_(pbEnd - pbUtf8) wchar16 *buffer, LPCUTF8& pbUtf8, LPCUTF8 pbEnd, DecodeOptions options = doDefault);
  195. // Decode the bytes between pbUtf8 and pbEnd into buffer and null terminate it, returning the number of characters converted and written to buffer (excluding the null terminator).
  // buffer is annotated as holding pbEnd - pbUtf8 + 1 elements to leave room for the terminator.
  196. size_t DecodeUnitsIntoAndNullTerminate(__out_ecount(pbEnd - pbUtf8 + 1) __nullterminated wchar16 *buffer, LPCUTF8& pbUtf8, LPCUTF8 pbEnd, DecodeOptions options = doDefault);
  197. // Encode a UTF-8 sequence into a UTF-8 sequence (which is just a memcpy). This is included for convenience in templates
  198. // when the character encoding is a template parameter.
  199. __range(cch, cch)
  200. inline size_t EncodeInto(__out_ecount(cch) utf8char_t *buffer, const utf8char_t *source, size_t cch)
  201. {
  202. memcpy_s(buffer, cch * sizeof(utf8char_t), source, cch * sizeof(utf8char_t));
  203. return cch;
  204. }
  205. // Encode a UTF16-LE sequence of cch words into a UTF-8 sequence returning the number of bytes needed.
  206. // Since a UTF16 word can take up to 3 bytes to encode, buffer must refer to a buffer at least
  207. // 3 times larger than cch.
  208. // Returns the number of bytes copied into the buffer.
  209. __range(0, cch * 3)
  210. size_t EncodeInto(__out_ecount(cch * 3) LPUTF8 buffer, __in_ecount(cch) const wchar16 *source, charcount_t cch);
  211. // Like EncodeInto but ensures that buffer[return value] == 0; buffer must therefore hold cch * 3 + 1 bytes.
  212. __range(0, cch * 3)
  213. size_t EncodeIntoAndNullTerminate(__out_ecount(cch * 3 + 1) utf8char_t *buffer, __in_ecount(cch) const wchar16 *source, charcount_t cch);
  214. // Returns true if pch refers to a UTF-16LE encoding of the given UTF-8 encoding bch.
  // pch is annotated as holding cch elements.
  215. bool CharsAreEqual(__in_ecount(cch) LPCOLESTR pch, LPCUTF8 bch, size_t cch, DecodeOptions options = doDefault);
  216. // Convert the character index cchIndex into a byte index within pch (cbLength bytes long).
  // NOTE(review): cbStartIndex / cchStartIndex look like a known byte/character position pair
  // from which the conversion can resume - confirm against the implementation.
  217. size_t CharacterIndexToByteIndex(__in_ecount(cbLength) LPCUTF8 pch, size_t cbLength, const charcount_t cchIndex, size_t cbStartIndex, charcount_t cchStartIndex, DecodeOptions options = doDefault);
  // Overload of the conversion above that takes no starting position.
  218. size_t CharacterIndexToByteIndex(__in_ecount(cbLength) LPCUTF8 pch, size_t cbLength, const charcount_t cchIndex, DecodeOptions options = doDefault);
  219. // Convert the byte index cbIndex into a character index.
  220. charcount_t ByteIndexIntoCharacterIndex(__in_ecount(cbIndex) LPCUTF8 pch, size_t cbIndex, DecodeOptions options = doDefault);
  221. }