Utf8Helper.h 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260
  1. //-------------------------------------------------------------------------------------------------------
  2. // Copyright (C) Microsoft. All rights reserved.
  3. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
  4. //-------------------------------------------------------------------------------------------------------
  5. #pragma once
  6. #include "Utf8Codex.h"
  7. namespace utf8
  8. {
  9. ///
  10. /// Use the codex library to encode a UTF16 string to UTF8.
  11. /// The caller is responsible for freeing the memory, which is allocated
  12. /// using Allocator.
  13. /// The returned string is null terminated.
  14. ///
  15. template <typename AllocatorFunction>
  16. HRESULT WideStringToNarrow(_In_ AllocatorFunction allocator, _In_ LPCWSTR sourceString, size_t sourceCount, _Out_ LPSTR* destStringPtr, _Out_ size_t* destCount, size_t* allocateCount = nullptr)
  17. {
  18. size_t cchSourceString = sourceCount;
  19. if (cchSourceString >= MAXUINT32)
  20. {
  21. return E_OUTOFMEMORY;
  22. }
  23. size_t cbDestString = (cchSourceString + 1) * 3;
  24. // Check for overflow- cbDestString should be >= cchSourceString
  25. if (cbDestString < cchSourceString)
  26. {
  27. return E_OUTOFMEMORY;
  28. }
  29. utf8char_t* destString = (utf8char_t*)allocator(cbDestString);
  30. if (destString == nullptr)
  31. {
  32. return E_OUTOFMEMORY;
  33. }
  34. size_t cbEncoded = utf8::EncodeTrueUtf8IntoAndNullTerminate(destString, sourceString, (charcount_t) cchSourceString);
  35. Assert(cbEncoded <= cbDestString);
  36. static_assert(sizeof(utf8char_t) == sizeof(char), "Needs to be valid for cast");
  37. *destStringPtr = (char*)destString;
  38. *destCount = cbEncoded;
  39. if (allocateCount != nullptr) *allocateCount = cbEncoded;
  40. return S_OK;
  41. }
  42. template <class Allocator>
  43. HRESULT WideStringToNarrow(_In_ LPCWSTR sourceString, size_t sourceCount, _Out_ LPSTR* destStringPtr, _Out_ size_t* destCount, size_t* allocateCount = nullptr)
  44. {
  45. return WideStringToNarrow(Allocator::allocate, sourceString, sourceCount, destStringPtr, destCount, allocateCount);
  46. }
  47. ///
  48. /// Use the codex library to encode a UTF8 string to UTF16.
  49. /// The caller is responsible for freeing the memory, which is allocated
  50. /// using Allocator.
  51. /// The returned string is null terminated.
  52. ///
  53. template <typename AllocatorFunction>
  54. HRESULT NarrowStringToWide(_In_ AllocatorFunction allocator,_In_ LPCSTR sourceString, size_t sourceCount, _Out_ LPWSTR* destStringPtr, _Out_ size_t* destCount, size_t* allocateCount = nullptr)
  55. {
  56. size_t cbSourceString = sourceCount;
  57. size_t sourceStart = 0;
  58. size_t cbDestString = (sourceCount + 1) * sizeof(WCHAR);
  59. if (cbDestString < sourceCount) // overflow ?
  60. {
  61. return E_OUTOFMEMORY;
  62. }
  63. WCHAR* destString = (WCHAR*)allocator(cbDestString);
  64. if (destString == nullptr)
  65. {
  66. return E_OUTOFMEMORY;
  67. }
  68. if (allocateCount != nullptr) *allocateCount = cbDestString;
  69. for (; sourceStart < sourceCount; sourceStart++)
  70. {
  71. const char ch = sourceString[sourceStart];
  72. if ( ! (ch > 0 && ch < 0x0080) )
  73. {
  74. size_t fallback = sourceStart > 3 ? 3 : sourceStart; // 3 + 1 -> fallback at least 1 unicode char
  75. sourceStart -= fallback;
  76. break;
  77. }
  78. destString[sourceStart] = (WCHAR) ch;
  79. }
  80. if (sourceStart == sourceCount)
  81. {
  82. *destCount = sourceCount;
  83. destString[sourceCount] = WCHAR(0);
  84. *destStringPtr = destString;
  85. }
  86. else
  87. {
  88. LPCUTF8 remSourceString = (LPCUTF8)sourceString + sourceStart;
  89. WCHAR *remDestString = destString + sourceStart;
  90. charcount_t cchDestString = utf8::ByteIndexIntoCharacterIndex(remSourceString, cbSourceString - sourceStart);
  91. cchDestString += (charcount_t)sourceStart;
  92. Assert (cchDestString <= sourceCount);
  93. // Some node tests depend on the utf8 decoder not swallowing invalid unicode characters
  94. // instead of replacing them with the "replacement" chracter. Pass a flag to our
  95. // decoder to require such behavior
  96. utf8::DecodeUnitsIntoAndNullTerminateNoAdvance(remDestString, remSourceString, (LPCUTF8) sourceString + cbSourceString, DecodeOptions::doAllowInvalidWCHARs);
  97. Assert(destString[cchDestString] == 0);
  98. static_assert(sizeof(utf8char_t) == sizeof(char), "Needs to be valid for cast");
  99. *destStringPtr = destString;
  100. *destCount = cchDestString;
  101. }
  102. return S_OK;
  103. }
  104. template <class Allocator>
  105. HRESULT NarrowStringToWide(_In_ LPCSTR sourceString, size_t sourceCount, _Out_ LPWSTR* destStringPtr, _Out_ size_t* destCount, size_t* allocateCount = nullptr)
  106. {
  107. return NarrowStringToWide(Allocator::allocate, sourceString, sourceCount, destStringPtr, destCount, allocateCount);
  108. }
  ///
  /// Allocator policy that forwards to the C runtime's ::malloc / ::free.
  ///
  class malloc_allocator
  {
  public:
      /// Returns whatever ::malloc(size) returns.
      static void* allocate(size_t size)
      {
          void* const block = ::malloc(size);
          return block;
      }

      /// Releases ptr with ::free. count is not used by this allocator.
      static void free(void* ptr, size_t count)
      {
          (void)count;
          ::free(ptr);
      }
  };
  115. inline HRESULT WideStringToNarrowDynamic(_In_ LPCWSTR sourceString, _Out_ LPSTR* destStringPtr)
  116. {
  117. size_t unused;
  118. return WideStringToNarrow<malloc_allocator>(
  119. sourceString, wcslen(sourceString), destStringPtr, &unused);
  120. }
  121. inline HRESULT NarrowStringToWideDynamic(_In_ LPCSTR sourceString, _Out_ LPWSTR* destStringPtr)
  122. {
  123. size_t unused;
  124. return NarrowStringToWide<malloc_allocator>(
  125. sourceString, strlen(sourceString), destStringPtr, &unused);
  126. }
  127. inline HRESULT NarrowStringToWideDynamicGetLength(_In_ LPCSTR sourceString, _Out_ LPWSTR* destStringPtr, _Out_ size_t* destLength)
  128. {
  129. return NarrowStringToWide<malloc_allocator>(
  130. sourceString, strlen(sourceString), destStringPtr, destLength);
  131. }
  132. template <class Allocator, class SrcType, class DstType>
  133. class NarrowWideStringConverter
  134. {
  135. public:
  136. static size_t Length(const SrcType& src);
  137. static HRESULT Convert(
  138. SrcType src, size_t srcCount, DstType* dst, size_t* dstCount, size_t* allocateCount = nullptr);
  139. };
  140. template <class Allocator>
  141. class NarrowWideStringConverter<Allocator, LPCSTR, LPWSTR>
  142. {
  143. public:
  144. // Note: Typically caller should pass in Utf8 string length. Following
  145. // is used as fallback.
  146. static size_t Length(LPCSTR src)
  147. {
  148. return strnlen(src, INT_MAX);
  149. }
  150. static HRESULT Convert(
  151. LPCSTR sourceString, size_t sourceCount,
  152. LPWSTR* destStringPtr, size_t* destCount, size_t* allocateCount = nullptr)
  153. {
  154. return NarrowStringToWide<Allocator>(
  155. sourceString, sourceCount, destStringPtr, destCount, allocateCount);
  156. }
  157. };
  158. template <class Allocator>
  159. class NarrowWideStringConverter<Allocator, LPCWSTR, LPSTR>
  160. {
  161. public:
  162. // Note: Typically caller should pass in WCHAR string length. Following
  163. // is used as fallback.
  164. static size_t Length(LPCWSTR src)
  165. {
  166. return wcslen(src);
  167. }
  168. static HRESULT Convert(
  169. LPCWSTR sourceString, size_t sourceCount,
  170. LPSTR* destStringPtr, size_t* destCount, size_t* allocateCount = nullptr)
  171. {
  172. return WideStringToNarrow<Allocator>(
  173. sourceString, sourceCount, destStringPtr, destCount, allocateCount);
  174. }
  175. };
  176. template <class Allocator, class SrcType, class DstType>
  177. class NarrowWideConverter
  178. {
  179. typedef NarrowWideStringConverter<Allocator, SrcType, DstType>
  180. StringConverter;
  181. private:
  182. DstType dst;
  183. size_t dstCount;
  184. size_t allocateCount;
  185. public:
  186. NarrowWideConverter() : dst()
  187. {
  188. // do nothing
  189. }
  190. NarrowWideConverter(const SrcType& src, size_t srcCount = -1): dst()
  191. {
  192. Initialize(src, srcCount);
  193. }
  194. void Initialize(const SrcType& src, size_t srcCount = -1)
  195. {
  196. if (srcCount == -1)
  197. {
  198. srcCount = StringConverter::Length(src);
  199. }
  200. StringConverter::Convert(src, srcCount, &dst, &dstCount, &allocateCount);
  201. }
  202. ~NarrowWideConverter()
  203. {
  204. if (dst)
  205. {
  206. Allocator::free(dst, allocateCount);
  207. }
  208. }
  209. DstType Detach()
  210. {
  211. DstType result = dst;
  212. dst = DstType();
  213. return result;
  214. }
  215. operator DstType()
  216. {
  217. return dst;
  218. }
  219. size_t Length() const
  220. {
  221. return dstCount;
  222. }
  223. };
  224. typedef NarrowWideConverter<malloc_allocator, LPCSTR, LPWSTR> NarrowToWide;
  225. typedef NarrowWideConverter<malloc_allocator, LPCWSTR, LPSTR> WideToNarrow;
  226. }