2
0

CodexTests.cpp 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290
  1. //-------------------------------------------------------------------------------------------------------
  2. // Copyright (C) Microsoft. All rights reserved.
  3. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
  4. //-------------------------------------------------------------------------------------------------------
  5. #include "stdafx.h"
  6. #pragma warning(disable:26434) // Function definition hides non-virtual function in base class
  7. #pragma warning(disable:26439) // Implicit noexcept
  8. #pragma warning(disable:26451) // Arithmetic overflow
  9. #pragma warning(disable:26495) // Uninitialized member variable
  10. #include "catch.hpp"
  11. #include <process.h>
  12. #include "Codex\Utf8Codex.h"
  13. #pragma warning(disable:4100) // unreferenced formal parameter
  14. #pragma warning(disable:6387) // suppressing preFAST which raises warning for passing null to the JsRT APIs
  15. #pragma warning(disable:6262) // CATCH is using stack variables to report errors, suppressing the preFAST warning.
  16. namespace CodexTest
  17. {
  18. ///
  19. /// The following test verifies that for invalid characters, we replace them
  20. /// with the unicode replacement character
  21. ///
  22. // Verify single utf8-encoded codepoint
  23. void CheckIsUnicodeReplacementChar(const utf8char_t* encodedBuffer)
  24. {
  25. CHECK(encodedBuffer[0] == 0xEF);
  26. CHECK(encodedBuffer[1] == 0xBF);
  27. CHECK(encodedBuffer[2] == 0xBD);
  28. }
  29. //
  30. // Following test cases are based on the Utf-8 decoder tests
  31. // suggested by Markus Kuhn at https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
  32. //
  33. TEST_CASE("CodexTest_EncodeTrueUtf8_SingleSurrogates", "[CodexTest]")
  34. {
  35. const size_t charCount = 1;
  36. constexpr size_t cbEncodedBuffer = charCount * 3 + 1; // +1 since the buffer will be null-terminated
  37. utf8char_t encodedBuffer[cbEncodedBuffer];
  38. char16 testValues[] = { 0xD800, 0xDB7F, 0xDB80, 0xDBFF, 0xDC00, 0xDF80, 0xDFFF };
  39. const int numTestCases = _countof(testValues);
  40. for (int i = 0; i < numTestCases; i++)
  41. {
  42. size_t numEncodedBytes = utf8::EncodeIntoAndNullTerminate<utf8::Utf8EncodingKind::TrueUtf8>(encodedBuffer, cbEncodedBuffer, &testValues[i], charCount);
  43. CHECK(numEncodedBytes == 3);
  44. CheckIsUnicodeReplacementChar(encodedBuffer);
  45. }
  46. }
  47. //
  48. // Test encoding of given utf16-encoded strings into another encoding
  49. //
  50. // In the expected encoded string, extra bytes are represented as 0
  51. //
  52. template <typename TTestCase, typename TEncodingFunc>
  53. void RunUtf8EncodingTestCase(const TTestCase &testCases, const TEncodingFunc func)
  54. {
  55. const int numTestCases = _countof(testCases);
  56. const charcount_t charCount = _countof(testCases[0].surrogatePair);
  57. const charcount_t maxEncodedByteCount = _countof(testCases[0].utf8Encoding);
  58. const size_t encodedBufferSize = maxEncodedByteCount + 1; // +1 in case a null-terminating func is passed in
  59. utf8char_t encodedBuffer[encodedBufferSize];
  60. for (int i = 0; i < numTestCases; i++)
  61. {
  62. size_t numEncodedBytes = func(encodedBuffer, encodedBufferSize, testCases[i].surrogatePair, charCount);
  63. CHECK(numEncodedBytes <= maxEncodedByteCount);
  64. for (size_t j = 0; j < numEncodedBytes; j++)
  65. {
  66. CHECK(encodedBuffer[j] == testCases[i].utf8Encoding[j]);
  67. }
  68. // Check and make sure there were no other bytes expected in the encoded string
  69. if (numEncodedBytes < maxEncodedByteCount)
  70. {
  71. for (size_t j = numEncodedBytes; j < maxEncodedByteCount; j++)
  72. {
  73. CHECK(testCases[i].utf8Encoding[j] == 0);
  74. }
  75. }
  76. }
  77. }
  78. TEST_CASE("CodexTest_EncodeCesu_PairedSurrogates", "[CodexTest]")
  79. {
  80. // Each of these test cases verifies the encoding
  81. // of a single surrogate pair into a 6 byte CESU string
  82. // Each surrogate-pair unit is encoded separately into utf8
  83. struct TestCase
  84. {
  85. char16 surrogatePair[2];
  86. utf8char_t utf8Encoding[6];
  87. };
  88. TestCase testCases[] = {
  89. { { 0xD800, 0xDC00 }, { 0xED, 0xA0, 0x80, 0xED, 0xB0, 0x80 } }, // U+010000 LINEAR B SYLLABLE B008 A character
  90. { { 0xD800, 0xDFFF }, { 0xED, 0xA0, 0x80, 0xED, 0xBF, 0xBF } }, // U+0103FF
  91. { { 0xDB7F, 0xDC00 }, { 0xED, 0xAD, 0xBF, 0xED, 0xB0, 0x80 } }, // U+0EFC00
  92. { { 0xDB7F, 0xDFFF }, { 0xED, 0xAD, 0xBF, 0xED, 0xBF, 0xBF } }, // U+0EFFFF
  93. { { 0xDB80, 0xDC00 }, { 0xED, 0xAE, 0x80, 0xED, 0xB0, 0x80 } }, // U+0F0000 Plane 15 Private Use First
  94. { { 0xDB80, 0xDFFF }, { 0xED, 0xAE, 0x80, 0xED, 0xBF, 0xBF } }, // U+0F03FF
  95. { { 0xDBFF, 0xDC00 }, { 0xED, 0xAF, 0xBF, 0xED, 0xB0, 0x80 } }, // U+10FC00
  96. { { 0xDBFF, 0xDFFF }, { 0xED, 0xAF, 0xBF, 0xED, 0xBF, 0xBF } } // U+10FFFF
  97. };
  98. RunUtf8EncodingTestCase(testCases, static_cast<size_t (*)(utf8char_t*, size_t, const char16*, charcount_t)>(utf8::EncodeInto<utf8::Utf8EncodingKind::Cesu8>));
  99. }
  100. TEST_CASE("CodexTest_EncodeUtf8_PairedSurrogates", "[CodexTest]")
  101. {
  102. // Each of these test cases verifies the encoding
  103. // of a single surrogate pair into a 4 byte utf8 string
  104. // Each surrogate-pair unit is decoded to its original codepoint
  105. // and then encoded into utf8
  106. struct TestCase
  107. {
  108. char16 surrogatePair[2];
  109. utf8char_t utf8Encoding[4];
  110. };
  111. TestCase testCases[] = {
  112. { { 0xD800, 0xDC00 }, { 0xF0, 0x90, 0x80, 0x80 } }, // U+010000 LINEAR B SYLLABLE B008 A character
  113. { { 0xD800, 0xDFFF }, { 0xF0, 0x90, 0x8F, 0xBF } }, // U+0103FF
  114. { { 0xDB7F, 0xDC00 }, { 0xF3, 0xAF, 0xB0, 0x80 } }, // U+0EFC00
  115. { { 0xDB7F, 0xDFFF }, { 0xF3, 0xAF, 0xBF, 0xBF } }, // U+0EFFFF
  116. { { 0xDB80, 0xDC00 }, { 0xF3, 0xB0, 0x80, 0x80 } }, // U+0F0000 Plane 15 Private Use First
  117. { { 0xDB80, 0xDFFF }, { 0xF3, 0xB0, 0x8F, 0xBF } }, // U+0F03FF
  118. { { 0xDBFF, 0xDC00 }, { 0xF4, 0x8F, 0xB0, 0x80 } }, // U+10FC00
  119. { { 0xDBFF, 0xDFFF }, { 0xF4, 0x8F, 0xBF, 0xBF } } // U+10FFFF
  120. };
  121. RunUtf8EncodingTestCase(testCases, utf8::EncodeIntoAndNullTerminate<utf8::Utf8EncodingKind::TrueUtf8>);
  122. }
  123. TEST_CASE("CodexTest_EncodeUtf8_NonCharacters", "[CodexTest]")
  124. {
  125. // Each of these test cases verifies the encoding
  126. // of certain problematic codepoints that do not represent
  127. // characters
  128. struct TestCase
  129. {
  130. char16 surrogatePair[1];
  131. utf8char_t utf8Encoding[3];
  132. };
  133. TestCase testCases[] = {
  134. { { 0xFFFE }, { 0xEF, 0xBF, 0xBE } }, // U+FFFE
  135. { { 0xFFFF }, { 0xEF, 0xBF, 0xBF } } // U+FFFF
  136. };
  137. RunUtf8EncodingTestCase(testCases, utf8::EncodeIntoAndNullTerminate<utf8::Utf8EncodingKind::TrueUtf8>);
  138. }
  139. TEST_CASE("CodexTest_EncodeUtf8_BoundaryChars", "[CodexTest]")
  140. {
  141. // Each of these test cases verifies the encoding
  142. // of boundary conditions
  143. struct SingleChar16TestCase
  144. {
  145. char16 surrogatePair[1];
  146. utf8char_t utf8Encoding[3];
  147. };
  148. SingleChar16TestCase testCases[] = {
  149. { { 0xD7FF }, { 0xED, 0x9F, 0xBF } }, // U+D7FF
  150. { { 0xE000 }, { 0xEE, 0x80, 0x80 } }, // U+E000
  151. { { 0xFFFD }, { 0xEF, 0xBF, 0xBD } } // U+FFFD
  152. };
  153. struct TwoChar16TestCase
  154. {
  155. char16 surrogatePair[2];
  156. utf8char_t utf8Encoding[4];
  157. };
  158. TwoChar16TestCase testCases2[] = {
  159. { { 0xDBFF, 0xDFFF }, { 0xF4, 0x8F, 0xBF, 0xBF } } // U+10FFFF
  160. };
  161. RunUtf8EncodingTestCase(testCases, utf8::EncodeIntoAndNullTerminate<utf8::Utf8EncodingKind::TrueUtf8>);
  162. RunUtf8EncodingTestCase(testCases2, utf8::EncodeIntoAndNullTerminate<utf8::Utf8EncodingKind::TrueUtf8>);
  163. }
  164. TEST_CASE("CodexTest_EncodeUtf8_SimpleCharacters", "[CodexTest]")
  165. {
  166. // Each of these test cases verifies the encoding
  167. // of certain problematic codepoints that do not represent
  168. // characters
  169. struct TestCase
  170. {
  171. char16 surrogatePair[1];
  172. utf8char_t utf8Encoding[3];
  173. };
  174. TestCase testCases[] = {
  175. { { 0x0024 }, { 0x24 } }, // U+0024 - Dollar Symbol
  176. { { 0x00A2 }, { 0xC2, 0xA2 } }, // U+00A2 - Cent symbol
  177. { { 0x20AC }, { 0xE2, 0x82, 0xAC } } // U+20AC - Euro symbol
  178. };
  179. RunUtf8EncodingTestCase(testCases, utf8::EncodeIntoAndNullTerminate<utf8::Utf8EncodingKind::TrueUtf8>);
  180. }
  181. TEST_CASE("CodexTest_EncodeTrueUtf8_SimpleString", "[CodexTest]")
  182. {
  183. const charcount_t charCount = 3;
  184. constexpr size_t cbEncodedBuffer = charCount * 3 + 1; // +1 since the buffer will be null terminated
  185. utf8char_t encodedBuffer[cbEncodedBuffer];
  186. const char16* sourceBuffer = L"abc";
  187. size_t numEncodedBytes = utf8::EncodeIntoAndNullTerminate<utf8::Utf8EncodingKind::TrueUtf8>(encodedBuffer, cbEncodedBuffer, sourceBuffer, charCount);
  188. CHECK(numEncodedBytes == charCount);
  189. for (int i = 0; i < charCount; i++)
  190. {
  191. CHECK(sourceBuffer[i] == (const char16)encodedBuffer[i]);
  192. }
  193. }
  194. template <typename TTestCase, typename TDecodeFunc>
  195. void RunUtf8DecodeTestCase(const TTestCase &testCases, const TDecodeFunc func)
  196. {
  197. const int numTestCases = _countof(testCases);
  198. const charcount_t charCount = _countof(testCases[0].result);
  199. char16 decodedBuffer[charCount + 1]; // +1 in case a null-terminating func is passed in
  200. for (int i = 0; i < numTestCases; i++)
  201. {
  202. bool chunkEndsInTruncatedSequence = false;
  203. size_t decodedCount = func(decodedBuffer, testCases[i].utf8Encoding, testCases[i].utf8Encoding + testCases[i].bytesToDecode, utf8::DecodeOptions::doChunkedEncoding, &chunkEndsInTruncatedSequence);
  204. CHECK(decodedCount == testCases[i].expectedDecodedChars);
  205. for (size_t j = 0; j < decodedCount; j++)
  206. {
  207. CHECK(decodedBuffer[j] == testCases[i].result[j]);
  208. }
  209. CHECK(testCases[i].shouldEndInTruncation == chunkEndsInTruncatedSequence);
  210. }
  211. }
  212. TEST_CASE("CodexTest_DecodeUnitsInto_ChunkEndsInTruncatedSequence", "[CodexTest]")
  213. {
  214. struct TestCase
  215. {
  216. int bytesToDecode;
  217. size_t expectedDecodedChars;
  218. bool shouldEndInTruncation;
  219. char16 result[8];
  220. utf8char_t utf8Encoding[8];
  221. };
  222. TestCase testCases[] = {
  223. { 2, 1, false, { 0xc1 }, { 0xc3, 0x81 } }, // Valid 2-byte sequence
  224. { 1, 0, true, { 0x0 }, { 0xc3, 0x81 } }, // Valid 2-byte sequence truncated at the end of the chunk
  225. { 2, 2, false, { 0xfffd, 0x79 },{ 0xc3, 0x79 } }, // Invalid 2-byte sequence
  226. { 1, 0, true, { 0x0 }, { 0xc3, 0x79 } }, // Invalid 2-byte sequence truncated at the end of the chunk
  227. { 3, 1, false, { 0x3042 },{ 0xe3, 0x81, 0x82 } }, // Valid 3-byte sequence
  228. { 1, 0, true, { 0x0 }, { 0xe3, 0x81, 0x82 } }, // Valid 3-byte sequence truncated at the end of the chunk
  229. { 2, 0, true, { 0x0 }, { 0xe3, 0x81, 0x82 } }, // Valid 3-byte sequence truncated at the end of the chunk
  230. { 3, 3, false, { 0xfffd, 0x79, 0xfffd }, { 0xe3, 0x79, 0x82 } }, // Invalid 3-byte sequence
  231. { 1, 0, true, { 0x0 }, { 0xe3, 0x79, 0x82 } }, // Invalid 3-byte sequence truncated at the end of the chunk
  232. { 2, 0, true, { 0x0 }, { 0xe3, 0x79, 0x82 } }, // Invalid 3-byte sequence truncated at the end of the chunk
  233. { 3, 3, false, { 0xfffd, 0xfffd, 0x79 }, { 0xe3, 0x81, 0x79 } }, // Invalid 3-byte sequence
  234. { 1, 0, true, { 0x0 }, { 0xe3, 0x81, 0x79 } }, // Invalid 3-byte sequence truncated at the end of the chunk
  235. { 2, 0, true, { 0x0 }, { 0xe3, 0x81, 0x79 } }, // Invalid 3-byte sequence truncated at the end of the chunk
  236. { 4, 2, false, { 0xd9c4, 0xdc83 }, { 0xf2, 0x81, 0x82, 0x83 } }, // Valid 4-byte sequence
  237. { 1, 0, true, { 0x0 }, { 0xf2, 0x81, 0x82, 0x83 } }, // Valid 4-byte sequence truncated at the end of the chunk
  238. { 2, 0, true, { 0x0 }, { 0xf2, 0x81, 0x82, 0x83 } }, // Valid 4-byte sequence truncated at the end of the chunk
  239. { 3, 0, true, { 0x0 }, { 0xf2, 0x81, 0x82, 0x83 } }, // Valid 4-byte sequence truncated at the end of the chunk
  240. { 4, 4, false, { 0xfffd, 0x79, 0xfffd, 0xfffd }, { 0xf2, 0x79, 0x82, 0x83 } }, // Invalid 4-byte sequence
  241. { 1, 0, true, { 0x0 }, { 0xf2, 0x79, 0x82, 0x83 } }, // Invalid 4-byte sequence truncated at the end of the chunk
  242. { 2, 0, true, { 0x0 }, { 0xf2, 0x79, 0x82, 0x83 } }, // Invalid 4-byte sequence truncated at the end of the chunk
  243. { 3, 0, true, { 0x0 }, { 0xf2, 0x79, 0x82, 0x83 } }, // Invalid 4-byte sequence truncated at the end of the chunk
  244. { 4, 4, false, { 0xfffd, 0xfffd, 0x79, 0xfffd }, { 0xf2, 0x81, 0x79, 0x83 } }, // Invalid 4-byte sequence
  245. { 1, 0, true, { 0x0 }, { 0xf2, 0x81, 0x79, 0x83 } }, // Invalid 4-byte sequence truncated at the end of the chunk
  246. { 2, 0, true, { 0x0 }, { 0xf2, 0x81, 0x79, 0x83 } }, // Invalid 4-byte sequence truncated at the end of the chunk
  247. { 3, 0, true, { 0x0 }, { 0xf2, 0x81, 0x79, 0x83 } }, // Invalid 4-byte sequence truncated at the end of the chunk
  248. { 4, 4, false, { 0xfffd, 0xfffd, 0xfffd, 0x79 }, { 0xf2, 0x81, 0x82, 0x79 } }, // Invalid 4-byte sequence
  249. { 1, 0, true, { 0x0 }, { 0xf2, 0x81, 0x82, 0x79 } }, // Invalid 4-byte sequence truncated at the end of the chunk
  250. { 2, 0, true, { 0x0 }, { 0xf2, 0x81, 0x82, 0x79 } }, // Invalid 4-byte sequence truncated at the end of the chunk
  251. { 3, 0, true, { 0x0 }, { 0xf2, 0x81, 0x82, 0x79 } }, // Invalid 4-byte sequence truncated at the end of the chunk
  252. };
  253. RunUtf8DecodeTestCase(testCases, utf8::DecodeUnitsIntoAndNullTerminateNoAdvance);
  254. }
  255. };