Utf8Helper.h 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372
  1. //-------------------------------------------------------------------------------------------------------
  2. // Copyright (C) Microsoft. All rights reserved.
  3. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
  4. //-------------------------------------------------------------------------------------------------------
  5. #pragma once
  6. #include "Utf8Codex.h"
  7. namespace utf8
  8. {
  9. ///
  10. /// Use the codex library to encode a UTF16 string to UTF8.
  11. /// The caller is responsible for freeing the memory, which is allocated
  12. /// using Allocator.
  13. /// The returned string is null terminated.
  14. /// TODO(jahorto): This file's dependencies mean that it cannot be included in PlatformAgnostic
  15. /// Thus, this function is currently ~duplicated in PlatformAgnostic::Intl::Utf16ToUtf8 (Intl.cpp)
  16. /// As long as that function exists, it _must_ be updated alongside any updates here
  17. ///
  18. template <typename AllocatorFunction>
  19. HRESULT WideStringToNarrow(
  20. _In_ AllocatorFunction allocator,
  21. _In_ LPCWSTR sourceString,
  22. size_t sourceCount,
  23. _Out_ LPSTR* destStringPtr,
  24. _Out_ size_t* destCount,
  25. size_t* allocateCount = nullptr)
  26. {
  27. size_t cchSourceString = sourceCount;
  28. if (cchSourceString >= MAXUINT32)
  29. {
  30. return E_OUTOFMEMORY;
  31. }
  32. // Multiply by 3 for max size of encoded character, plus 1 for the null terminator (don't need 3 bytes for the null terminator)
  33. size_t cbDestString = (cchSourceString * 3) + 1;
  34. // Check for overflow- cbDestString should be >= cchSourceString
  35. if (cbDestString < cchSourceString)
  36. {
  37. return E_OUTOFMEMORY;
  38. }
  39. utf8char_t* destString = (utf8char_t*)allocator(cbDestString);
  40. if (destString == nullptr)
  41. {
  42. return E_OUTOFMEMORY;
  43. }
  44. size_t cbEncoded = utf8::EncodeIntoAndNullTerminate<utf8::Utf8EncodingKind::TrueUtf8>(destString, cbDestString, sourceString, static_cast<charcount_t>(cchSourceString));
  45. Assert(cbEncoded <= cbDestString);
  46. static_assert(sizeof(utf8char_t) == sizeof(char), "Needs to be valid for cast");
  47. *destStringPtr = (char*)destString;
  48. *destCount = cbEncoded;
  49. if (allocateCount != nullptr)
  50. {
  51. *allocateCount = cbEncoded;
  52. }
  53. return S_OK;
  54. }
  55. ///
  56. /// Use the codex library to encode a UTF16 string to UTF8.
  57. /// The caller is responsible for providing the buffer
  58. /// The returned string is null terminated.
  59. ///
  60. inline HRESULT WideStringToNarrowNoAlloc(
  61. _In_ LPCWSTR sourceString,
  62. size_t sourceCount,
  63. __out_ecount(destCount) LPSTR destString,
  64. size_t destCount,
  65. size_t* writtenCount = nullptr)
  66. {
  67. size_t cchSourceString = sourceCount;
  68. if (cchSourceString >= MAXUINT32)
  69. {
  70. return E_OUTOFMEMORY;
  71. }
  72. static_assert(sizeof(utf8char_t) == sizeof(char), "Needs to be valid for cast");
  73. size_t cbEncoded = 0;
  74. if (destString == nullptr)
  75. {
  76. cbEncoded = utf8::CountTrueUtf8(sourceString, (charcount_t)cchSourceString);
  77. }
  78. else
  79. {
  80. cbEncoded = utf8::EncodeInto<utf8::Utf8EncodingKind::TrueUtf8>((utf8char_t*)destString, destCount, sourceString, static_cast<charcount_t>(cchSourceString));
  81. Assert(cbEncoded <= destCount);
  82. }
  83. if (writtenCount != nullptr)
  84. {
  85. *writtenCount = cbEncoded;
  86. }
  87. return S_OK;
  88. }
  89. template <class Allocator>
  90. HRESULT WideStringToNarrow(_In_ LPCWSTR sourceString, size_t sourceCount, _Out_ LPSTR* destStringPtr, _Out_ size_t* destCount, size_t* allocateCount = nullptr)
  91. {
  92. return WideStringToNarrow(Allocator::allocate, sourceString, sourceCount, destStringPtr, destCount, allocateCount);
  93. }
  94. inline HRESULT NarrowStringToWideNoAlloc(_In_ LPCSTR sourceString, size_t sourceCount,
  95. __out_ecount(destBufferCount) LPWSTR destString, size_t destBufferCount, _Out_ charcount_t* destCount)
  96. {
  97. size_t sourceStart = 0;
  98. size_t cbSourceString = sourceCount;
  99. if (sourceCount >= MAXUINT32)
  100. {
  101. destString[0] = WCHAR(0);
  102. return E_OUTOFMEMORY;
  103. }
  104. if (destString == nullptr)
  105. {
  106. return E_INVALIDARG;
  107. }
  108. if (sourceCount >= destBufferCount)
  109. {
  110. destString[0] = WCHAR(0);
  111. return E_INVALIDARG;
  112. }
  113. for (; sourceStart < sourceCount; sourceStart++)
  114. {
  115. const char ch = sourceString[sourceStart];
  116. if ( ! (ch > 0 && ch < 0x0080) )
  117. {
  118. size_t fallback = sourceStart > 3 ? 3 : sourceStart; // 3 + 1 -> fallback at least 1 unicode char
  119. sourceStart -= fallback;
  120. break;
  121. }
  122. destString[sourceStart] = (WCHAR) ch;
  123. }
  124. if (sourceStart == sourceCount)
  125. {
  126. *destCount = static_cast<charcount_t>(sourceCount);
  127. destString[sourceCount] = WCHAR(0);
  128. }
  129. else
  130. {
  131. LPCUTF8 remSourceString = (LPCUTF8)sourceString + sourceStart;
  132. WCHAR *remDestString = destString + sourceStart;
  133. charcount_t cchDestString = utf8::ByteIndexIntoCharacterIndex(remSourceString, cbSourceString - sourceStart);
  134. cchDestString += (charcount_t)sourceStart;
  135. if (cchDestString > sourceCount)
  136. {
  137. return E_OUTOFMEMORY;
  138. }
  139. // Some node tests depend on the utf8 decoder not swallowing invalid unicode characters
  140. // instead of replacing them with the "replacement" chracter. Pass a flag to our
  141. // decoder to require such behavior
  142. utf8::DecodeUnitsIntoAndNullTerminateNoAdvance(remDestString, remSourceString, (LPCUTF8) sourceString + cbSourceString, DecodeOptions::doAllowInvalidWCHARs);
  143. static_assert(sizeof(utf8char_t) == sizeof(char), "Needs to be valid for cast");
  144. *destCount = cchDestString;
  145. }
  146. Assert(destString[*destCount] == 0);
  147. return S_OK;
  148. }
  149. ///
  150. /// Use the codex library to encode a UTF8 string to UTF16.
  151. /// The caller is responsible for freeing the memory, which is allocated
  152. /// using Allocator.
  153. /// The returned string is null terminated.
  154. ///
  155. template <typename AllocatorFunction>
  156. HRESULT NarrowStringToWide(_In_ AllocatorFunction allocator,_In_ LPCSTR sourceString,
  157. size_t sourceCount, _Out_ LPWSTR* destStringPtr, _Out_ charcount_t* destCount, size_t* allocateCount = nullptr)
  158. {
  159. size_t cbDestString = (sourceCount + 1) * sizeof(WCHAR);
  160. if (cbDestString < sourceCount) // overflow ?
  161. {
  162. return E_OUTOFMEMORY;
  163. }
  164. WCHAR* destString = (WCHAR*)allocator(cbDestString);
  165. if (destString == nullptr)
  166. {
  167. return E_OUTOFMEMORY;
  168. }
  169. if (allocateCount != nullptr)
  170. {
  171. *allocateCount = cbDestString;
  172. }
  173. *destStringPtr = destString;
  174. return NarrowStringToWideNoAlloc(sourceString, sourceCount, destString, sourceCount + 1, destCount);
  175. }
  176. template <class Allocator>
  177. HRESULT NarrowStringToWide(_In_ LPCSTR sourceString, size_t sourceCount, _Out_ LPWSTR* destStringPtr, _Out_ charcount_t* destCount, size_t* allocateCount = nullptr)
  178. {
  179. return NarrowStringToWide(Allocator::allocate, sourceString, sourceCount, destStringPtr, destCount, allocateCount);
  180. }
  ///
  /// Allocator policy backed by the C runtime heap. Exposes the static
  /// allocate/free pair that the converters in this file call on their
  /// Allocator template argument.
  ///
  class malloc_allocator
  {
  public:
      /// Returns the result of ::malloc(size).
      static void* allocate(size_t size)
      {
          void* const block = ::malloc(size);
          return block;
      }

      /// Passes ptr to ::free. The size argument is accepted only so the
      /// signature matches what callers pass; it is not used.
      static void free(void* ptr, size_t /* count */)
      {
          ::free(ptr);
      }
  };
  187. inline HRESULT WideStringToNarrowDynamic(_In_ LPCWSTR sourceString, _Out_ LPSTR* destStringPtr)
  188. {
  189. size_t unused;
  190. return WideStringToNarrow<malloc_allocator>(
  191. sourceString, wcslen(sourceString), destStringPtr, &unused);
  192. }
  193. inline HRESULT NarrowStringToWideDynamic(_In_ LPCSTR sourceString, _Out_ LPWSTR* destStringPtr)
  194. {
  195. charcount_t unused;
  196. return NarrowStringToWide<malloc_allocator>(
  197. sourceString, strlen(sourceString), destStringPtr, &unused);
  198. }
  199. inline HRESULT NarrowStringToWideDynamicGetLength(_In_ LPCSTR sourceString, _Out_ LPWSTR* destStringPtr, _Out_ charcount_t* destLength)
  200. {
  201. return NarrowStringToWide<malloc_allocator>(
  202. sourceString, strlen(sourceString), destStringPtr, destLength);
  203. }
  204. template <class Allocator, class SrcType, class DstType, class CountType>
  205. class NarrowWideStringConverter
  206. {
  207. public:
  208. static size_t Length(const SrcType& src);
  209. static HRESULT Convert(
  210. SrcType src, size_t srcCount, DstType* dst, CountType* dstCount, size_t* allocateCount = nullptr);
  211. static HRESULT ConvertNoAlloc(
  212. SrcType src, size_t srcCount, DstType dst, CountType dstCount, CountType* written);
  213. };
  214. template <class Allocator>
  215. class NarrowWideStringConverter<Allocator, LPCSTR, LPWSTR, charcount_t>
  216. {
  217. public:
  218. // Note: Typically caller should pass in Utf8 string length. Following
  219. // is used as fallback.
  220. static size_t Length(LPCSTR src)
  221. {
  222. return strnlen(src, INT_MAX);
  223. }
  224. static HRESULT Convert(
  225. LPCSTR sourceString, size_t sourceCount,
  226. LPWSTR* destStringPtr, charcount_t * destCount, size_t* allocateCount = nullptr)
  227. {
  228. return NarrowStringToWide<Allocator>(
  229. sourceString, sourceCount, destStringPtr, destCount, allocateCount);
  230. }
  231. static HRESULT ConvertNoAlloc(
  232. LPCSTR sourceString, size_t sourceCount,
  233. LPWSTR destStringPtr, charcount_t destCount, charcount_t* written)
  234. {
  235. return NarrowStringToWideNoAlloc(
  236. sourceString, sourceCount, destStringPtr, destCount, written);
  237. }
  238. };
  239. template <class Allocator>
  240. class NarrowWideStringConverter<Allocator, LPCWSTR, LPSTR, size_t>
  241. {
  242. public:
  243. // Note: Typically caller should pass in WCHAR string length. Following
  244. // is used as fallback.
  245. static size_t Length(LPCWSTR src)
  246. {
  247. return wcslen(src);
  248. }
  249. static HRESULT Convert(
  250. LPCWSTR sourceString, size_t sourceCount,
  251. LPSTR* destStringPtr, size_t* destCount, size_t* allocateCount = nullptr)
  252. {
  253. return WideStringToNarrow<Allocator>(
  254. sourceString, sourceCount, destStringPtr, destCount, allocateCount);
  255. }
  256. static HRESULT ConvertNoAlloc(
  257. LPCWSTR sourceString, size_t sourceCount,
  258. LPSTR destStringPtr, size_t destCount, size_t* written)
  259. {
  260. return WideStringToNarrowNoAlloc(
  261. sourceString, sourceCount, destStringPtr, destCount, written);
  262. }
  263. };
  264. template <class Allocator, class SrcType, class DstType, class CountType>
  265. class NarrowWideConverter
  266. {
  267. typedef NarrowWideStringConverter<Allocator, SrcType, DstType, CountType>
  268. StringConverter;
  269. private:
  270. DstType dst;
  271. CountType dstCount;
  272. size_t allocateCount;
  273. bool freeDst;
  274. public:
  275. NarrowWideConverter() : dst()
  276. {
  277. // do nothing
  278. }
  279. NarrowWideConverter(const SrcType& src, size_t srcCount = -1): dst()
  280. {
  281. Initialize(src, srcCount);
  282. }
  283. NarrowWideConverter(const SrcType& src, size_t srcCount, DstType dst, size_t dstSize) : dst(dst), freeDst(false)
  284. {
  285. StringConverter::ConvertNoAlloc(src, srcCount, dst, dstSize, &dstCount);
  286. }
  287. void Initialize(const SrcType& src, size_t srcCount = -1)
  288. {
  289. if (srcCount == -1)
  290. {
  291. srcCount = StringConverter::Length(src);
  292. }
  293. StringConverter::Convert(src, srcCount, &dst, &dstCount, &allocateCount);
  294. freeDst = true;
  295. }
  296. ~NarrowWideConverter()
  297. {
  298. if (dst && freeDst)
  299. {
  300. Allocator::free(dst, allocateCount);
  301. }
  302. }
  303. DstType Detach()
  304. {
  305. DstType result = dst;
  306. dst = DstType();
  307. return result;
  308. }
  309. operator DstType()
  310. {
  311. return dst;
  312. }
  313. size_t Length() const
  314. {
  315. return dstCount;
  316. }
  317. };
  318. typedef NarrowWideConverter<malloc_allocator, LPCSTR, LPWSTR, charcount_t> NarrowToWide;
  319. typedef NarrowWideConverter<malloc_allocator, LPCWSTR, LPSTR, size_t> WideToNarrow;
  320. }