Utf8Helper.h 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371
  1. //-------------------------------------------------------------------------------------------------------
  2. // Copyright (C) Microsoft. All rights reserved.
  3. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
  4. //-------------------------------------------------------------------------------------------------------
  5. #pragma once
  6. #include "Utf8Codex.h"
  7. namespace utf8
  8. {
  9. ///
  10. /// Use the codex library to encode a UTF16 string to UTF8.
  11. /// The caller is responsible for freeing the memory, which is allocated
  12. /// using Allocator.
  13. /// The returned string is null terminated.
  14. /// TODO(jahorto): This file's dependencies mean that it cannot be included in PlatformAgnostic
  15. /// Thus, this function is currently ~duplicated in PlatformAgnostic::Intl::Utf16ToUtf8 (Intl.cpp)
  16. /// As long as that function exists, it _must_ be updated alongside any updates here
  17. ///
  18. template <typename AllocatorFunction>
  19. HRESULT WideStringToNarrow(
  20. _In_ AllocatorFunction allocator,
  21. _In_ LPCWSTR sourceString,
  22. size_t sourceCount,
  23. _Out_ LPSTR* destStringPtr,
  24. _Out_ size_t* destCount,
  25. size_t* allocateCount = nullptr)
  26. {
  27. size_t cchSourceString = sourceCount;
  28. if (cchSourceString >= MAXUINT32)
  29. {
  30. return E_OUTOFMEMORY;
  31. }
  32. size_t cbDestString = (cchSourceString + 1) * 3;
  33. // Check for overflow- cbDestString should be >= cchSourceString
  34. if (cbDestString < cchSourceString)
  35. {
  36. return E_OUTOFMEMORY;
  37. }
  38. utf8char_t* destString = (utf8char_t*)allocator(cbDestString);
  39. if (destString == nullptr)
  40. {
  41. return E_OUTOFMEMORY;
  42. }
  43. size_t cbEncoded = utf8::EncodeIntoAndNullTerminate<utf8::Utf8EncodingKind::TrueUtf8>(destString, cbDestString, sourceString, static_cast<charcount_t>(cchSourceString));
  44. Assert(cbEncoded <= cbDestString);
  45. static_assert(sizeof(utf8char_t) == sizeof(char), "Needs to be valid for cast");
  46. *destStringPtr = (char*)destString;
  47. *destCount = cbEncoded;
  48. if (allocateCount != nullptr)
  49. {
  50. *allocateCount = cbEncoded;
  51. }
  52. return S_OK;
  53. }
  54. ///
  55. /// Use the codex library to encode a UTF16 string to UTF8.
  56. /// The caller is responsible for providing the buffer
  57. /// The returned string is null terminated.
  58. ///
  59. inline HRESULT WideStringToNarrowNoAlloc(
  60. _In_ LPCWSTR sourceString,
  61. size_t sourceCount,
  62. __out_ecount(destCount) LPSTR destString,
  63. size_t destCount,
  64. size_t* writtenCount = nullptr)
  65. {
  66. size_t cchSourceString = sourceCount;
  67. if (cchSourceString >= MAXUINT32)
  68. {
  69. return E_OUTOFMEMORY;
  70. }
  71. static_assert(sizeof(utf8char_t) == sizeof(char), "Needs to be valid for cast");
  72. size_t cbEncoded = 0;
  73. if (destString == nullptr)
  74. {
  75. cbEncoded = utf8::CountTrueUtf8(sourceString, (charcount_t)cchSourceString);
  76. }
  77. else
  78. {
  79. cbEncoded = utf8::EncodeInto<utf8::Utf8EncodingKind::TrueUtf8>((utf8char_t*)destString, destCount, sourceString, static_cast<charcount_t>(cchSourceString));
  80. Assert(cbEncoded <= destCount);
  81. }
  82. if (writtenCount != nullptr)
  83. {
  84. *writtenCount = cbEncoded;
  85. }
  86. return S_OK;
  87. }
  88. template <class Allocator>
  89. HRESULT WideStringToNarrow(_In_ LPCWSTR sourceString, size_t sourceCount, _Out_ LPSTR* destStringPtr, _Out_ size_t* destCount, size_t* allocateCount = nullptr)
  90. {
  91. return WideStringToNarrow(Allocator::allocate, sourceString, sourceCount, destStringPtr, destCount, allocateCount);
  92. }
  93. inline HRESULT NarrowStringToWideNoAlloc(_In_ LPCSTR sourceString, size_t sourceCount,
  94. __out_ecount(destBufferCount) LPWSTR destString, size_t destBufferCount, _Out_ charcount_t* destCount)
  95. {
  96. size_t sourceStart = 0;
  97. size_t cbSourceString = sourceCount;
  98. if (sourceCount >= MAXUINT32)
  99. {
  100. destString[0] = WCHAR(0);
  101. return E_OUTOFMEMORY;
  102. }
  103. if (destString == nullptr)
  104. {
  105. return E_INVALIDARG;
  106. }
  107. if (sourceCount >= destBufferCount)
  108. {
  109. destString[0] = WCHAR(0);
  110. return E_INVALIDARG;
  111. }
  112. for (; sourceStart < sourceCount; sourceStart++)
  113. {
  114. const char ch = sourceString[sourceStart];
  115. if ( ! (ch > 0 && ch < 0x0080) )
  116. {
  117. size_t fallback = sourceStart > 3 ? 3 : sourceStart; // 3 + 1 -> fallback at least 1 unicode char
  118. sourceStart -= fallback;
  119. break;
  120. }
  121. destString[sourceStart] = (WCHAR) ch;
  122. }
  123. if (sourceStart == sourceCount)
  124. {
  125. *destCount = static_cast<charcount_t>(sourceCount);
  126. destString[sourceCount] = WCHAR(0);
  127. }
  128. else
  129. {
  130. LPCUTF8 remSourceString = (LPCUTF8)sourceString + sourceStart;
  131. WCHAR *remDestString = destString + sourceStart;
  132. charcount_t cchDestString = utf8::ByteIndexIntoCharacterIndex(remSourceString, cbSourceString - sourceStart);
  133. cchDestString += (charcount_t)sourceStart;
  134. if (cchDestString > sourceCount)
  135. {
  136. return E_OUTOFMEMORY;
  137. }
  138. // Some node tests depend on the utf8 decoder not swallowing invalid unicode characters
  139. // instead of replacing them with the "replacement" chracter. Pass a flag to our
  140. // decoder to require such behavior
  141. utf8::DecodeUnitsIntoAndNullTerminateNoAdvance(remDestString, remSourceString, (LPCUTF8) sourceString + cbSourceString, DecodeOptions::doAllowInvalidWCHARs);
  142. static_assert(sizeof(utf8char_t) == sizeof(char), "Needs to be valid for cast");
  143. *destCount = cchDestString;
  144. }
  145. Assert(destString[*destCount] == 0);
  146. return S_OK;
  147. }
  148. ///
  149. /// Use the codex library to encode a UTF8 string to UTF16.
  150. /// The caller is responsible for freeing the memory, which is allocated
  151. /// using Allocator.
  152. /// The returned string is null terminated.
  153. ///
  154. template <typename AllocatorFunction>
  155. HRESULT NarrowStringToWide(_In_ AllocatorFunction allocator,_In_ LPCSTR sourceString,
  156. size_t sourceCount, _Out_ LPWSTR* destStringPtr, _Out_ charcount_t* destCount, size_t* allocateCount = nullptr)
  157. {
  158. size_t cbDestString = (sourceCount + 1) * sizeof(WCHAR);
  159. if (cbDestString < sourceCount) // overflow ?
  160. {
  161. return E_OUTOFMEMORY;
  162. }
  163. WCHAR* destString = (WCHAR*)allocator(cbDestString);
  164. if (destString == nullptr)
  165. {
  166. return E_OUTOFMEMORY;
  167. }
  168. if (allocateCount != nullptr)
  169. {
  170. *allocateCount = cbDestString;
  171. }
  172. *destStringPtr = destString;
  173. return NarrowStringToWideNoAlloc(sourceString, sourceCount, destString, sourceCount + 1, destCount);
  174. }
  175. template <class Allocator>
  176. HRESULT NarrowStringToWide(_In_ LPCSTR sourceString, size_t sourceCount, _Out_ LPWSTR* destStringPtr, _Out_ charcount_t* destCount, size_t* allocateCount = nullptr)
  177. {
  178. return NarrowStringToWide(Allocator::allocate, sourceString, sourceCount, destStringPtr, destCount, allocateCount);
  179. }
  ///
  /// Allocator policy that forwards to the C runtime's ::malloc and ::free.
  ///
  class malloc_allocator
  {
  public:
      /// Returns whatever ::malloc returns for a request of `size` bytes.
      static void* allocate(size_t size)
      {
          void* const block = ::malloc(size);
          return block;
      }

      /// Passes `ptr` to ::free. The size argument is not used.
      static void free(void* ptr, size_t /* count */)
      {
          ::free(ptr);
      }
  };
  186. inline HRESULT WideStringToNarrowDynamic(_In_ LPCWSTR sourceString, _Out_ LPSTR* destStringPtr)
  187. {
  188. size_t unused;
  189. return WideStringToNarrow<malloc_allocator>(
  190. sourceString, wcslen(sourceString), destStringPtr, &unused);
  191. }
  192. inline HRESULT NarrowStringToWideDynamic(_In_ LPCSTR sourceString, _Out_ LPWSTR* destStringPtr)
  193. {
  194. charcount_t unused;
  195. return NarrowStringToWide<malloc_allocator>(
  196. sourceString, strlen(sourceString), destStringPtr, &unused);
  197. }
  198. inline HRESULT NarrowStringToWideDynamicGetLength(_In_ LPCSTR sourceString, _Out_ LPWSTR* destStringPtr, _Out_ charcount_t* destLength)
  199. {
  200. return NarrowStringToWide<malloc_allocator>(
  201. sourceString, strlen(sourceString), destStringPtr, destLength);
  202. }
  203. template <class Allocator, class SrcType, class DstType, class CountType>
  204. class NarrowWideStringConverter
  205. {
  206. public:
  207. static size_t Length(const SrcType& src);
  208. static HRESULT Convert(
  209. SrcType src, size_t srcCount, DstType* dst, CountType* dstCount, size_t* allocateCount = nullptr);
  210. static HRESULT ConvertNoAlloc(
  211. SrcType src, size_t srcCount, DstType dst, CountType dstCount, CountType* written);
  212. };
  213. template <class Allocator>
  214. class NarrowWideStringConverter<Allocator, LPCSTR, LPWSTR, charcount_t>
  215. {
  216. public:
  217. // Note: Typically caller should pass in Utf8 string length. Following
  218. // is used as fallback.
  219. static size_t Length(LPCSTR src)
  220. {
  221. return strnlen(src, INT_MAX);
  222. }
  223. static HRESULT Convert(
  224. LPCSTR sourceString, size_t sourceCount,
  225. LPWSTR* destStringPtr, charcount_t * destCount, size_t* allocateCount = nullptr)
  226. {
  227. return NarrowStringToWide<Allocator>(
  228. sourceString, sourceCount, destStringPtr, destCount, allocateCount);
  229. }
  230. static HRESULT ConvertNoAlloc(
  231. LPCSTR sourceString, size_t sourceCount,
  232. LPWSTR destStringPtr, charcount_t destCount, charcount_t* written)
  233. {
  234. return NarrowStringToWideNoAlloc(
  235. sourceString, sourceCount, destStringPtr, destCount, written);
  236. }
  237. };
  238. template <class Allocator>
  239. class NarrowWideStringConverter<Allocator, LPCWSTR, LPSTR, size_t>
  240. {
  241. public:
  242. // Note: Typically caller should pass in WCHAR string length. Following
  243. // is used as fallback.
  244. static size_t Length(LPCWSTR src)
  245. {
  246. return wcslen(src);
  247. }
  248. static HRESULT Convert(
  249. LPCWSTR sourceString, size_t sourceCount,
  250. LPSTR* destStringPtr, size_t* destCount, size_t* allocateCount = nullptr)
  251. {
  252. return WideStringToNarrow<Allocator>(
  253. sourceString, sourceCount, destStringPtr, destCount, allocateCount);
  254. }
  255. static HRESULT ConvertNoAlloc(
  256. LPCWSTR sourceString, size_t sourceCount,
  257. LPSTR destStringPtr, size_t destCount, size_t* written)
  258. {
  259. return WideStringToNarrowNoAlloc(
  260. sourceString, sourceCount, destStringPtr, destCount, written);
  261. }
  262. };
  263. template <class Allocator, class SrcType, class DstType, class CountType>
  264. class NarrowWideConverter
  265. {
  266. typedef NarrowWideStringConverter<Allocator, SrcType, DstType, CountType>
  267. StringConverter;
  268. private:
  269. DstType dst;
  270. CountType dstCount;
  271. size_t allocateCount;
  272. bool freeDst;
  273. public:
  274. NarrowWideConverter() : dst()
  275. {
  276. // do nothing
  277. }
  278. NarrowWideConverter(const SrcType& src, size_t srcCount = -1): dst()
  279. {
  280. Initialize(src, srcCount);
  281. }
  282. NarrowWideConverter(const SrcType& src, size_t srcCount, DstType dst, size_t dstSize) : dst(dst), freeDst(false)
  283. {
  284. StringConverter::ConvertNoAlloc(src, srcCount, dst, dstSize, &dstCount);
  285. }
  286. void Initialize(const SrcType& src, size_t srcCount = -1)
  287. {
  288. if (srcCount == -1)
  289. {
  290. srcCount = StringConverter::Length(src);
  291. }
  292. StringConverter::Convert(src, srcCount, &dst, &dstCount, &allocateCount);
  293. freeDst = true;
  294. }
  295. ~NarrowWideConverter()
  296. {
  297. if (dst && freeDst)
  298. {
  299. Allocator::free(dst, allocateCount);
  300. }
  301. }
  302. DstType Detach()
  303. {
  304. DstType result = dst;
  305. dst = DstType();
  306. return result;
  307. }
  308. operator DstType()
  309. {
  310. return dst;
  311. }
  312. size_t Length() const
  313. {
  314. return dstCount;
  315. }
  316. };
  317. typedef NarrowWideConverter<malloc_allocator, LPCSTR, LPWSTR, charcount_t> NarrowToWide;
  318. typedef NarrowWideConverter<malloc_allocator, LPCWSTR, LPSTR, size_t> WideToNarrow;
  319. }