Utf8Helper.h 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352
  1. //-------------------------------------------------------------------------------------------------------
  2. // Copyright (C) Microsoft. All rights reserved.
  3. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
  4. //-------------------------------------------------------------------------------------------------------
  5. #pragma once
  6. #include "Utf8Codex.h"
  7. namespace utf8
  8. {
  9. ///
  10. /// Use the codex library to encode a UTF16 string to UTF8.
  11. /// The caller is responsible for freeing the memory, which is allocated
  12. /// using Allocator.
  13. /// The returned string is null terminated.
  14. /// TODO(jahorto): This file's dependencies mean that it cannot be included in PlatformAgnostic
  15. /// Thus, this function is currently ~duplicated in PlatformAgnostic::Intl::Utf16ToUtf8 (Intl.cpp)
  16. /// As long as that function exists, it _must_ be updated alongside any updates here
  17. ///
  18. template <typename AllocatorFunction>
  19. HRESULT WideStringToNarrow(_In_ AllocatorFunction allocator, _In_ LPCWSTR sourceString, size_t sourceCount, _Out_ LPSTR* destStringPtr, _Out_ size_t* destCount, size_t* allocateCount = nullptr)
  20. {
  21. size_t cchSourceString = sourceCount;
  22. if (cchSourceString >= MAXUINT32)
  23. {
  24. return E_OUTOFMEMORY;
  25. }
  26. size_t cbDestString = (cchSourceString + 1) * 3;
  27. // Check for overflow- cbDestString should be >= cchSourceString
  28. if (cbDestString < cchSourceString)
  29. {
  30. return E_OUTOFMEMORY;
  31. }
  32. utf8char_t* destString = (utf8char_t*)allocator(cbDestString);
  33. if (destString == nullptr)
  34. {
  35. return E_OUTOFMEMORY;
  36. }
  37. size_t cbEncoded = utf8::EncodeTrueUtf8IntoAndNullTerminate(destString, sourceString, (charcount_t) cchSourceString);
  38. Assert(cbEncoded <= cbDestString);
  39. static_assert(sizeof(utf8char_t) == sizeof(char), "Needs to be valid for cast");
  40. *destStringPtr = (char*)destString;
  41. *destCount = cbEncoded;
  42. if (allocateCount != nullptr) *allocateCount = cbEncoded;
  43. return S_OK;
  44. }
  45. ///
  46. /// Use the codex library to encode a UTF16 string to UTF8.
  47. /// The caller is responsible for providing the buffer
  48. /// The returned string is null terminated.
  49. ///
  50. inline HRESULT WideStringToNarrowNoAlloc(_In_ LPCWSTR sourceString, size_t sourceCount, __out_ecount(destCount) LPSTR destString, size_t destCount, size_t* writtenCount = nullptr)
  51. {
  52. size_t cchSourceString = sourceCount;
  53. if (cchSourceString >= MAXUINT32)
  54. {
  55. return E_OUTOFMEMORY;
  56. }
  57. static_assert(sizeof(utf8char_t) == sizeof(char), "Needs to be valid for cast");
  58. size_t cbEncoded = 0;
  59. if (destString == nullptr)
  60. {
  61. cbEncoded = utf8::CountTrueUtf8(sourceString, (charcount_t)cchSourceString);
  62. }
  63. else
  64. {
  65. cbEncoded = utf8::EncodeTrueUtf8IntoBoundsChecked((utf8char_t*)destString, sourceString, (charcount_t)cchSourceString, &destString[destCount]);
  66. Assert(cbEncoded <= destCount);
  67. }
  68. if (writtenCount != nullptr) *writtenCount = cbEncoded;
  69. return S_OK;
  70. }
  71. template <class Allocator>
  72. HRESULT WideStringToNarrow(_In_ LPCWSTR sourceString, size_t sourceCount, _Out_ LPSTR* destStringPtr, _Out_ size_t* destCount, size_t* allocateCount = nullptr)
  73. {
  74. return WideStringToNarrow(Allocator::allocate, sourceString, sourceCount, destStringPtr, destCount, allocateCount);
  75. }
  76. inline HRESULT NarrowStringToWideNoAlloc(_In_ LPCSTR sourceString, size_t sourceCount,
  77. __out_ecount(destBufferCount) LPWSTR destString, size_t destBufferCount, _Out_ charcount_t* destCount)
  78. {
  79. size_t sourceStart = 0;
  80. size_t cbSourceString = sourceCount;
  81. if (sourceCount >= MAXUINT32)
  82. {
  83. destString[0] = WCHAR(0);
  84. return E_OUTOFMEMORY;
  85. }
  86. if (destString == nullptr)
  87. {
  88. return E_INVALIDARG;
  89. }
  90. if (sourceCount >= destBufferCount)
  91. {
  92. destString[0] = WCHAR(0);
  93. return E_INVALIDARG;
  94. }
  95. for (; sourceStart < sourceCount; sourceStart++)
  96. {
  97. const char ch = sourceString[sourceStart];
  98. if ( ! (ch > 0 && ch < 0x0080) )
  99. {
  100. size_t fallback = sourceStart > 3 ? 3 : sourceStart; // 3 + 1 -> fallback at least 1 unicode char
  101. sourceStart -= fallback;
  102. break;
  103. }
  104. destString[sourceStart] = (WCHAR) ch;
  105. }
  106. if (sourceStart == sourceCount)
  107. {
  108. *destCount = static_cast<charcount_t>(sourceCount);
  109. destString[sourceCount] = WCHAR(0);
  110. }
  111. else
  112. {
  113. LPCUTF8 remSourceString = (LPCUTF8)sourceString + sourceStart;
  114. WCHAR *remDestString = destString + sourceStart;
  115. charcount_t cchDestString = utf8::ByteIndexIntoCharacterIndex(remSourceString, cbSourceString - sourceStart);
  116. cchDestString += (charcount_t)sourceStart;
  117. if (cchDestString > sourceCount)
  118. {
  119. return E_OUTOFMEMORY;
  120. }
  121. // Some node tests depend on the utf8 decoder not swallowing invalid unicode characters
  122. // instead of replacing them with the "replacement" chracter. Pass a flag to our
  123. // decoder to require such behavior
  124. utf8::DecodeUnitsIntoAndNullTerminateNoAdvance(remDestString, remSourceString, (LPCUTF8) sourceString + cbSourceString, DecodeOptions::doAllowInvalidWCHARs);
  125. static_assert(sizeof(utf8char_t) == sizeof(char), "Needs to be valid for cast");
  126. *destCount = cchDestString;
  127. }
  128. Assert(destString[*destCount] == 0);
  129. return S_OK;
  130. }
  131. ///
  132. /// Use the codex library to encode a UTF8 string to UTF16.
  133. /// The caller is responsible for freeing the memory, which is allocated
  134. /// using Allocator.
  135. /// The returned string is null terminated.
  136. ///
  137. template <typename AllocatorFunction>
  138. HRESULT NarrowStringToWide(_In_ AllocatorFunction allocator,_In_ LPCSTR sourceString,
  139. size_t sourceCount, _Out_ LPWSTR* destStringPtr, _Out_ charcount_t* destCount, size_t* allocateCount = nullptr)
  140. {
  141. size_t cbDestString = (sourceCount + 1) * sizeof(WCHAR);
  142. if (cbDestString < sourceCount) // overflow ?
  143. {
  144. return E_OUTOFMEMORY;
  145. }
  146. WCHAR* destString = (WCHAR*)allocator(cbDestString);
  147. if (destString == nullptr)
  148. {
  149. return E_OUTOFMEMORY;
  150. }
  151. if (allocateCount != nullptr)
  152. {
  153. *allocateCount = cbDestString;
  154. }
  155. *destStringPtr = destString;
  156. return NarrowStringToWideNoAlloc(sourceString, sourceCount, destString, sourceCount + 1, destCount);
  157. }
  158. template <class Allocator>
  159. HRESULT NarrowStringToWide(_In_ LPCSTR sourceString, size_t sourceCount, _Out_ LPWSTR* destStringPtr, _Out_ charcount_t* destCount, size_t* allocateCount = nullptr)
  160. {
  161. return NarrowStringToWide(Allocator::allocate, sourceString, sourceCount, destStringPtr, destCount, allocateCount);
  162. }
  ///
  /// Allocator policy backed by the C runtime heap.
  ///
  class malloc_allocator
  {
  public:
      /// Returns the result of ::malloc for the requested number of bytes.
      static void* allocate(size_t size)
      {
          void* const block = ::malloc(size);
          return block;
      }

      /// Releases ptr with ::free. The size argument exists so the signature matches the
      /// allocator policy used by the converters; it is not needed here.
      static void free(void* ptr, size_t count)
      {
          (void)count;
          ::free(ptr);
      }
  };
  169. inline HRESULT WideStringToNarrowDynamic(_In_ LPCWSTR sourceString, _Out_ LPSTR* destStringPtr)
  170. {
  171. size_t unused;
  172. return WideStringToNarrow<malloc_allocator>(
  173. sourceString, wcslen(sourceString), destStringPtr, &unused);
  174. }
  175. inline HRESULT NarrowStringToWideDynamic(_In_ LPCSTR sourceString, _Out_ LPWSTR* destStringPtr)
  176. {
  177. charcount_t unused;
  178. return NarrowStringToWide<malloc_allocator>(
  179. sourceString, strlen(sourceString), destStringPtr, &unused);
  180. }
  181. inline HRESULT NarrowStringToWideDynamicGetLength(_In_ LPCSTR sourceString, _Out_ LPWSTR* destStringPtr, _Out_ charcount_t* destLength)
  182. {
  183. return NarrowStringToWide<malloc_allocator>(
  184. sourceString, strlen(sourceString), destStringPtr, destLength);
  185. }
  186. template <class Allocator, class SrcType, class DstType, class CountType>
  187. class NarrowWideStringConverter
  188. {
  189. public:
  190. static size_t Length(const SrcType& src);
  191. static HRESULT Convert(
  192. SrcType src, size_t srcCount, DstType* dst, CountType* dstCount, size_t* allocateCount = nullptr);
  193. static HRESULT ConvertNoAlloc(
  194. SrcType src, size_t srcCount, DstType dst, CountType dstCount, CountType* written);
  195. };
  196. template <class Allocator>
  197. class NarrowWideStringConverter<Allocator, LPCSTR, LPWSTR, charcount_t>
  198. {
  199. public:
  200. // Note: Typically caller should pass in Utf8 string length. Following
  201. // is used as fallback.
  202. static size_t Length(LPCSTR src)
  203. {
  204. return strnlen(src, INT_MAX);
  205. }
  206. static HRESULT Convert(
  207. LPCSTR sourceString, size_t sourceCount,
  208. LPWSTR* destStringPtr, charcount_t * destCount, size_t* allocateCount = nullptr)
  209. {
  210. return NarrowStringToWide<Allocator>(
  211. sourceString, sourceCount, destStringPtr, destCount, allocateCount);
  212. }
  213. static HRESULT ConvertNoAlloc(
  214. LPCSTR sourceString, size_t sourceCount,
  215. LPWSTR destStringPtr, charcount_t destCount, charcount_t* written)
  216. {
  217. return NarrowStringToWideNoAlloc(
  218. sourceString, sourceCount, destStringPtr, destCount, written);
  219. }
  220. };
  221. template <class Allocator>
  222. class NarrowWideStringConverter<Allocator, LPCWSTR, LPSTR, size_t>
  223. {
  224. public:
  225. // Note: Typically caller should pass in WCHAR string length. Following
  226. // is used as fallback.
  227. static size_t Length(LPCWSTR src)
  228. {
  229. return wcslen(src);
  230. }
  231. static HRESULT Convert(
  232. LPCWSTR sourceString, size_t sourceCount,
  233. LPSTR* destStringPtr, size_t* destCount, size_t* allocateCount = nullptr)
  234. {
  235. return WideStringToNarrow<Allocator>(
  236. sourceString, sourceCount, destStringPtr, destCount, allocateCount);
  237. }
  238. static HRESULT ConvertNoAlloc(
  239. LPCWSTR sourceString, size_t sourceCount,
  240. LPSTR destStringPtr, size_t destCount, size_t* written)
  241. {
  242. return WideStringToNarrowNoAlloc(
  243. sourceString, sourceCount, destStringPtr, destCount, written);
  244. }
  245. };
  246. template <class Allocator, class SrcType, class DstType, class CountType>
  247. class NarrowWideConverter
  248. {
  249. typedef NarrowWideStringConverter<Allocator, SrcType, DstType, CountType>
  250. StringConverter;
  251. private:
  252. DstType dst;
  253. CountType dstCount;
  254. size_t allocateCount;
  255. bool freeDst;
  256. public:
  257. NarrowWideConverter() : dst()
  258. {
  259. // do nothing
  260. }
  261. NarrowWideConverter(const SrcType& src, size_t srcCount = -1): dst()
  262. {
  263. Initialize(src, srcCount);
  264. }
  265. NarrowWideConverter(const SrcType& src, size_t srcCount, DstType dst, size_t dstSize) : dst(dst), freeDst(false)
  266. {
  267. StringConverter::ConvertNoAlloc(src, srcCount, dst, dstSize, &dstCount);
  268. }
  269. void Initialize(const SrcType& src, size_t srcCount = -1)
  270. {
  271. if (srcCount == -1)
  272. {
  273. srcCount = StringConverter::Length(src);
  274. }
  275. StringConverter::Convert(src, srcCount, &dst, &dstCount, &allocateCount);
  276. freeDst = true;
  277. }
  278. ~NarrowWideConverter()
  279. {
  280. if (dst && freeDst)
  281. {
  282. Allocator::free(dst, allocateCount);
  283. }
  284. }
  285. DstType Detach()
  286. {
  287. DstType result = dst;
  288. dst = DstType();
  289. return result;
  290. }
  291. operator DstType()
  292. {
  293. return dst;
  294. }
  295. size_t Length() const
  296. {
  297. return dstCount;
  298. }
  299. };
  300. typedef NarrowWideConverter<malloc_allocator, LPCSTR, LPWSTR, charcount_t> NarrowToWide;
  301. typedef NarrowWideConverter<malloc_allocator, LPCWSTR, LPSTR, size_t> WideToNarrow;
  302. }