Utf8Helper.h 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248
  1. //-------------------------------------------------------------------------------------------------------
  2. // Copyright (C) Microsoft. All rights reserved.
  3. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
  4. //-------------------------------------------------------------------------------------------------------
  5. #pragma once
  6. #include "Utf8Codex.h"
  7. namespace utf8
  8. {
  9. ///
  10. /// Use the codex library to encode a UTF16 string to UTF8.
  11. /// The caller is responsible for freeing the memory, which is allocated
  12. /// using Allocator.
  13. /// The returned string is null terminated.
  14. ///
  15. template <class Allocator>
  16. HRESULT WideStringToNarrow(_In_ LPCWSTR sourceString, size_t sourceCount, _Out_ LPSTR* destStringPtr, _Out_ size_t* destCount, size_t* allocateCount = nullptr)
  17. {
  18. size_t cchSourceString = sourceCount;
  19. if (cchSourceString >= MAXUINT32)
  20. {
  21. return E_OUTOFMEMORY;
  22. }
  23. size_t cbDestString = (cchSourceString + 1) * 3;
  24. // Check for overflow- cbDestString should be >= cchSourceString
  25. if (cbDestString < cchSourceString)
  26. {
  27. return E_OUTOFMEMORY;
  28. }
  29. utf8char_t* destString = (utf8char_t*)Allocator::allocate(cbDestString);
  30. if (destString == nullptr)
  31. {
  32. return E_OUTOFMEMORY;
  33. }
  34. size_t cbEncoded = utf8::EncodeTrueUtf8IntoAndNullTerminate(destString, sourceString, (charcount_t) cchSourceString);
  35. Assert(cbEncoded <= cbDestString);
  36. static_assert(sizeof(utf8char_t) == sizeof(char), "Needs to be valid for cast");
  37. *destStringPtr = (char*)destString;
  38. *destCount = cbEncoded;
  39. if (allocateCount != nullptr) *allocateCount = cbEncoded;
  40. return S_OK;
  41. }
  42. ///
  43. /// Use the codex library to encode a UTF8 string to UTF16.
  44. /// The caller is responsible for freeing the memory, which is allocated
  45. /// using Allocator.
  46. /// The returned string is null terminated.
  47. ///
  48. template <class Allocator>
  49. HRESULT NarrowStringToWide(_In_ LPCSTR sourceString, size_t sourceCount, _Out_ LPWSTR* destStringPtr, _Out_ size_t* destCount, size_t* allocateCount = nullptr)
  50. {
  51. size_t cbSourceString = sourceCount;
  52. size_t sourceStart = 0;
  53. size_t cbDestString = (sourceCount + 1) * sizeof(WCHAR);
  54. if (cbDestString < sourceCount) // overflow ?
  55. {
  56. return E_OUTOFMEMORY;
  57. }
  58. WCHAR* destString = (WCHAR*)Allocator::allocate(cbDestString);
  59. if (destString == nullptr)
  60. {
  61. return E_OUTOFMEMORY;
  62. }
  63. if (allocateCount != nullptr) *allocateCount = cbDestString;
  64. for (; sourceStart < sourceCount; sourceStart++)
  65. {
  66. const char ch = sourceString[sourceStart];
  67. if ( ! (ch > 0 && ch < 0x0080) )
  68. {
  69. size_t fallback = sourceStart > 3 ? 3 : sourceStart; // 3 + 1 -> fallback at least 1 unicode char
  70. sourceStart -= fallback;
  71. break;
  72. }
  73. destString[sourceStart] = (WCHAR) ch;
  74. }
  75. if (sourceStart == sourceCount)
  76. {
  77. *destCount = sourceCount;
  78. destString[sourceCount] = WCHAR(0);
  79. *destStringPtr = destString;
  80. }
  81. else
  82. {
  83. LPCUTF8 remSourceString = (LPCUTF8)sourceString + sourceStart;
  84. WCHAR *remDestString = destString + sourceStart;
  85. charcount_t cchDestString = utf8::ByteIndexIntoCharacterIndex(remSourceString, cbSourceString - sourceStart);
  86. cchDestString += (charcount_t)sourceStart;
  87. Assert (cchDestString <= sourceCount);
  88. // Some node tests depend on the utf8 decoder not swallowing invalid unicode characters
  89. // instead of replacing them with the "replacement" chracter. Pass a flag to our
  90. // decoder to require such behavior
  91. utf8::DecodeUnitsIntoAndNullTerminateNoAdvance(remDestString, remSourceString, (LPCUTF8) sourceString + cbSourceString, DecodeOptions::doAllowInvalidWCHARs);
  92. Assert(destString[cchDestString] == 0);
  93. static_assert(sizeof(utf8char_t) == sizeof(char), "Needs to be valid for cast");
  94. *destStringPtr = destString;
  95. *destCount = cchDestString;
  96. }
  97. return S_OK;
  98. }
  ///
  /// Allocator policy backed by the C runtime heap.
  /// Provides the static allocate/free pair that the conversion templates in
  /// this file expect from their Allocator parameter.
  ///
  class malloc_allocator
  {
  public:
      /// Returns whatever ::malloc returns for a request of `size` bytes.
      static void* allocate(size_t size)
      {
          return ::malloc(size);
      }

      /// Hands `ptr` to ::free. The byte count is accepted to satisfy the
      /// allocator interface but is not used.
      static void free(void* ptr, size_t /* count */)
      {
          ::free(ptr);
      }
  };
  105. inline HRESULT WideStringToNarrowDynamic(_In_ LPCWSTR sourceString, _Out_ LPSTR* destStringPtr)
  106. {
  107. size_t unused;
  108. return WideStringToNarrow<malloc_allocator>(
  109. sourceString, wcslen(sourceString), destStringPtr, &unused);
  110. }
  111. inline HRESULT NarrowStringToWideDynamic(_In_ LPCSTR sourceString, _Out_ LPWSTR* destStringPtr)
  112. {
  113. size_t unused;
  114. return NarrowStringToWide<malloc_allocator>(
  115. sourceString, strlen(sourceString), destStringPtr, &unused);
  116. }
  117. inline HRESULT NarrowStringToWideDynamicGetLength(_In_ LPCSTR sourceString, _Out_ LPWSTR* destStringPtr, _Out_ size_t* destLength)
  118. {
  119. return NarrowStringToWide<malloc_allocator>(
  120. sourceString, strlen(sourceString), destStringPtr, destLength);
  121. }
  122. template <class Allocator, class SrcType, class DstType>
  123. class NarrowWideStringConverter
  124. {
  125. public:
  126. static size_t Length(const SrcType& src);
  127. static HRESULT Convert(
  128. SrcType src, size_t srcCount, DstType* dst, size_t* dstCount, size_t* allocateCount = nullptr);
  129. };
  130. template <class Allocator>
  131. class NarrowWideStringConverter<Allocator, LPCSTR, LPWSTR>
  132. {
  133. public:
  134. // Note: Typically caller should pass in Utf8 string length. Following
  135. // is used as fallback.
  136. static size_t Length(LPCSTR src)
  137. {
  138. return strnlen(src, INT_MAX);
  139. }
  140. static HRESULT Convert(
  141. LPCSTR sourceString, size_t sourceCount,
  142. LPWSTR* destStringPtr, size_t* destCount, size_t* allocateCount = nullptr)
  143. {
  144. return NarrowStringToWide<Allocator>(
  145. sourceString, sourceCount, destStringPtr, destCount, allocateCount);
  146. }
  147. };
  148. template <class Allocator>
  149. class NarrowWideStringConverter<Allocator, LPCWSTR, LPSTR>
  150. {
  151. public:
  152. // Note: Typically caller should pass in WCHAR string length. Following
  153. // is used as fallback.
  154. static size_t Length(LPCWSTR src)
  155. {
  156. return wcslen(src);
  157. }
  158. static HRESULT Convert(
  159. LPCWSTR sourceString, size_t sourceCount,
  160. LPSTR* destStringPtr, size_t* destCount, size_t* allocateCount = nullptr)
  161. {
  162. return WideStringToNarrow<Allocator>(
  163. sourceString, sourceCount, destStringPtr, destCount, allocateCount);
  164. }
  165. };
  166. template <class Allocator, class SrcType, class DstType>
  167. class NarrowWideConverter
  168. {
  169. typedef NarrowWideStringConverter<Allocator, SrcType, DstType>
  170. StringConverter;
  171. private:
  172. DstType dst;
  173. size_t dstCount;
  174. size_t allocateCount;
  175. public:
  176. NarrowWideConverter() : dst()
  177. {
  178. // do nothing
  179. }
  180. NarrowWideConverter(const SrcType& src, size_t srcCount = -1): dst()
  181. {
  182. Initialize(src, srcCount);
  183. }
  184. void Initialize(const SrcType& src, size_t srcCount = -1)
  185. {
  186. if (srcCount == -1)
  187. {
  188. srcCount = StringConverter::Length(src);
  189. }
  190. StringConverter::Convert(src, srcCount, &dst, &dstCount, &allocateCount);
  191. }
  192. ~NarrowWideConverter()
  193. {
  194. if (dst)
  195. {
  196. Allocator::free(dst, allocateCount);
  197. }
  198. }
  199. DstType Detach()
  200. {
  201. DstType result = dst;
  202. dst = DstType();
  203. return result;
  204. }
  205. operator DstType()
  206. {
  207. return dst;
  208. }
  209. size_t Length() const
  210. {
  211. return dstCount;
  212. }
  213. };
  214. typedef NarrowWideConverter<malloc_allocator, LPCSTR, LPWSTR> NarrowToWide;
  215. typedef NarrowWideConverter<malloc_allocator, LPCWSTR, LPSTR> WideToNarrow;
  216. }