| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290 |
- //-------------------------------------------------------------------------------------------------------
- // Copyright (C) Microsoft. All rights reserved.
- // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
- //-------------------------------------------------------------------------------------------------------
- #include "stdafx.h"
- #pragma warning(disable:26434) // Function definition hides non-virtual function in base class
- #pragma warning(disable:26439) // Implicit noexcept
- #pragma warning(disable:26451) // Arithmetic overflow
- #pragma warning(disable:26495) // Uninitialized member variable
- #include "catch.hpp"
- #include <process.h>
- #include "Codex\Utf8Codex.h"
- #pragma warning(disable:4100) // unreferenced formal parameter
- #pragma warning(disable:6387) // suppressing preFAST which raises warning for passing null to the JsRT APIs
- #pragma warning(disable:6262) // CATCH is using stack variables to report errors, suppressing the preFAST warning.
- namespace CodexTest
- {
- ///
- /// The following test verifies that for invalid characters, we replace them
- /// with the unicode replacement character
- ///
- // Verify single utf8-encoded codepoint
- void CheckIsUnicodeReplacementChar(const utf8char_t* encodedBuffer)
- {
- CHECK(encodedBuffer[0] == 0xEF);
- CHECK(encodedBuffer[1] == 0xBF);
- CHECK(encodedBuffer[2] == 0xBD);
- }
- //
- // Following test cases are based on the Utf-8 decoder tests
- // suggested by Markus Kuhn at https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
- //
- TEST_CASE("CodexTest_EncodeTrueUtf8_SingleSurrogates", "[CodexTest]")
- {
- const size_t charCount = 1;
- constexpr size_t cbEncodedBuffer = charCount * 3 + 1; // +1 since the buffer will be null-terminated
- utf8char_t encodedBuffer[cbEncodedBuffer];
- char16 testValues[] = { 0xD800, 0xDB7F, 0xDB80, 0xDBFF, 0xDC00, 0xDF80, 0xDFFF };
- const int numTestCases = _countof(testValues);
- for (int i = 0; i < numTestCases; i++)
- {
- size_t numEncodedBytes = utf8::EncodeIntoAndNullTerminate<utf8::Utf8EncodingKind::TrueUtf8>(encodedBuffer, cbEncodedBuffer, &testValues[i], charCount);
- CHECK(numEncodedBytes == 3);
- CheckIsUnicodeReplacementChar(encodedBuffer);
- }
- }
- //
- // Test encoding of given utf16-encoded strings into another encoding
- //
- // In the expected encoded string, extra bytes are represented as 0
- //
- template <typename TTestCase, typename TEncodingFunc>
- void RunUtf8EncodingTestCase(const TTestCase &testCases, const TEncodingFunc func)
- {
- const int numTestCases = _countof(testCases);
- const charcount_t charCount = _countof(testCases[0].surrogatePair);
- const charcount_t maxEncodedByteCount = _countof(testCases[0].utf8Encoding);
- const size_t encodedBufferSize = maxEncodedByteCount + 1; // +1 in case a null-terminating func is passed in
- utf8char_t encodedBuffer[encodedBufferSize];
- for (int i = 0; i < numTestCases; i++)
- {
- size_t numEncodedBytes = func(encodedBuffer, encodedBufferSize, testCases[i].surrogatePair, charCount);
- CHECK(numEncodedBytes <= maxEncodedByteCount);
- for (size_t j = 0; j < numEncodedBytes; j++)
- {
- CHECK(encodedBuffer[j] == testCases[i].utf8Encoding[j]);
- }
- // Check and make sure there were no other bytes expected in the encoded string
- if (numEncodedBytes < maxEncodedByteCount)
- {
- for (size_t j = numEncodedBytes; j < maxEncodedByteCount; j++)
- {
- CHECK(testCases[i].utf8Encoding[j] == 0);
- }
- }
- }
- }
- TEST_CASE("CodexTest_EncodeCesu_PairedSurrogates", "[CodexTest]")
- {
- // Each of these test cases verifies the encoding
- // of a single surrogate pair into a 6 byte CESU string
- // Each surrogate-pair unit is encoded separately into utf8
- struct TestCase
- {
- char16 surrogatePair[2];
- utf8char_t utf8Encoding[6];
- };
- TestCase testCases[] = {
- { { 0xD800, 0xDC00 }, { 0xED, 0xA0, 0x80, 0xED, 0xB0, 0x80 } }, // U+010000 LINEAR B SYLLABLE B008 A character
- { { 0xD800, 0xDFFF }, { 0xED, 0xA0, 0x80, 0xED, 0xBF, 0xBF } }, // U+0103FF
- { { 0xDB7F, 0xDC00 }, { 0xED, 0xAD, 0xBF, 0xED, 0xB0, 0x80 } }, // U+0EFC00
- { { 0xDB7F, 0xDFFF }, { 0xED, 0xAD, 0xBF, 0xED, 0xBF, 0xBF } }, // U+0EFFFF
- { { 0xDB80, 0xDC00 }, { 0xED, 0xAE, 0x80, 0xED, 0xB0, 0x80 } }, // U+0F0000 Plane 15 Private Use First
- { { 0xDB80, 0xDFFF }, { 0xED, 0xAE, 0x80, 0xED, 0xBF, 0xBF } }, // U+0F03FF
- { { 0xDBFF, 0xDC00 }, { 0xED, 0xAF, 0xBF, 0xED, 0xB0, 0x80 } }, // U+10FC00
- { { 0xDBFF, 0xDFFF }, { 0xED, 0xAF, 0xBF, 0xED, 0xBF, 0xBF } } // U+10FFFF
- };
- RunUtf8EncodingTestCase(testCases, static_cast<size_t (*)(utf8char_t*, size_t, const char16*, charcount_t)>(utf8::EncodeInto<utf8::Utf8EncodingKind::Cesu8>));
- }
- TEST_CASE("CodexTest_EncodeUtf8_PairedSurrogates", "[CodexTest]")
- {
- // Each of these test cases verifies the encoding
- // of a single surrogate pair into a 4 byte utf8 string
- // Each surrogate-pair unit is decoded to its original codepoint
- // and then encoded into utf8
- struct TestCase
- {
- char16 surrogatePair[2];
- utf8char_t utf8Encoding[4];
- };
- TestCase testCases[] = {
- { { 0xD800, 0xDC00 }, { 0xF0, 0x90, 0x80, 0x80 } }, // U+010000 LINEAR B SYLLABLE B008 A character
- { { 0xD800, 0xDFFF }, { 0xF0, 0x90, 0x8F, 0xBF } }, // U+0103FF
- { { 0xDB7F, 0xDC00 }, { 0xF3, 0xAF, 0xB0, 0x80 } }, // U+0EFC00
- { { 0xDB7F, 0xDFFF }, { 0xF3, 0xAF, 0xBF, 0xBF } }, // U+0EFFFF
- { { 0xDB80, 0xDC00 }, { 0xF3, 0xB0, 0x80, 0x80 } }, // U+0F0000 Plane 15 Private Use First
- { { 0xDB80, 0xDFFF }, { 0xF3, 0xB0, 0x8F, 0xBF } }, // U+0F03FF
- { { 0xDBFF, 0xDC00 }, { 0xF4, 0x8F, 0xB0, 0x80 } }, // U+10FC00
- { { 0xDBFF, 0xDFFF }, { 0xF4, 0x8F, 0xBF, 0xBF } } // U+10FFFF
- };
- RunUtf8EncodingTestCase(testCases, utf8::EncodeIntoAndNullTerminate<utf8::Utf8EncodingKind::TrueUtf8>);
- }
- TEST_CASE("CodexTest_EncodeUtf8_NonCharacters", "[CodexTest]")
- {
- // Each of these test cases verifies the encoding
- // of certain problematic codepoints that do not represent
- // characters
- struct TestCase
- {
- char16 surrogatePair[1];
- utf8char_t utf8Encoding[3];
- };
- TestCase testCases[] = {
- { { 0xFFFE }, { 0xEF, 0xBF, 0xBE } }, // U+FFFE
- { { 0xFFFF }, { 0xEF, 0xBF, 0xBF } } // U+FFFF
- };
- RunUtf8EncodingTestCase(testCases, utf8::EncodeIntoAndNullTerminate<utf8::Utf8EncodingKind::TrueUtf8>);
- }
- TEST_CASE("CodexTest_EncodeUtf8_BoundaryChars", "[CodexTest]")
- {
- // Each of these test cases verifies the encoding
- // of boundary conditions
- struct SingleChar16TestCase
- {
- char16 surrogatePair[1];
- utf8char_t utf8Encoding[3];
- };
- SingleChar16TestCase testCases[] = {
- { { 0xD7FF }, { 0xED, 0x9F, 0xBF } }, // U+D7FF
- { { 0xE000 }, { 0xEE, 0x80, 0x80 } }, // U+E000
- { { 0xFFFD }, { 0xEF, 0xBF, 0xBD } } // U+FFFD
- };
- struct TwoChar16TestCase
- {
- char16 surrogatePair[2];
- utf8char_t utf8Encoding[4];
- };
- TwoChar16TestCase testCases2[] = {
- { { 0xDBFF, 0xDFFF }, { 0xF4, 0x8F, 0xBF, 0xBF } } // U+10FFFF
- };
- RunUtf8EncodingTestCase(testCases, utf8::EncodeIntoAndNullTerminate<utf8::Utf8EncodingKind::TrueUtf8>);
- RunUtf8EncodingTestCase(testCases2, utf8::EncodeIntoAndNullTerminate<utf8::Utf8EncodingKind::TrueUtf8>);
- }
- TEST_CASE("CodexTest_EncodeUtf8_SimpleCharacters", "[CodexTest]")
- {
- // Each of these test cases verifies the encoding
- // of certain problematic codepoints that do not represent
- // characters
- struct TestCase
- {
- char16 surrogatePair[1];
- utf8char_t utf8Encoding[3];
- };
- TestCase testCases[] = {
- { { 0x0024 }, { 0x24 } }, // U+0024 - Dollar Symbol
- { { 0x00A2 }, { 0xC2, 0xA2 } }, // U+00A2 - Cent symbol
- { { 0x20AC }, { 0xE2, 0x82, 0xAC } } // U+20AC - Euro symbol
- };
- RunUtf8EncodingTestCase(testCases, utf8::EncodeIntoAndNullTerminate<utf8::Utf8EncodingKind::TrueUtf8>);
- }
- TEST_CASE("CodexTest_EncodeTrueUtf8_SimpleString", "[CodexTest]")
- {
- const charcount_t charCount = 3;
- constexpr size_t cbEncodedBuffer = charCount * 3 + 1; // +1 since the buffer will be null terminated
- utf8char_t encodedBuffer[cbEncodedBuffer];
- const char16* sourceBuffer = L"abc";
- size_t numEncodedBytes = utf8::EncodeIntoAndNullTerminate<utf8::Utf8EncodingKind::TrueUtf8>(encodedBuffer, cbEncodedBuffer, sourceBuffer, charCount);
- CHECK(numEncodedBytes == charCount);
- for (int i = 0; i < charCount; i++)
- {
- CHECK(sourceBuffer[i] == (const char16)encodedBuffer[i]);
- }
- }
- template <typename TTestCase, typename TDecodeFunc>
- void RunUtf8DecodeTestCase(const TTestCase &testCases, const TDecodeFunc func)
- {
- const int numTestCases = _countof(testCases);
- const charcount_t charCount = _countof(testCases[0].result);
- char16 decodedBuffer[charCount + 1]; // +1 in case a null-terminating func is passed in
- for (int i = 0; i < numTestCases; i++)
- {
- bool chunkEndsInTruncatedSequence = false;
- size_t decodedCount = func(decodedBuffer, testCases[i].utf8Encoding, testCases[i].utf8Encoding + testCases[i].bytesToDecode, utf8::DecodeOptions::doChunkedEncoding, &chunkEndsInTruncatedSequence);
- CHECK(decodedCount == testCases[i].expectedDecodedChars);
- for (size_t j = 0; j < decodedCount; j++)
- {
- CHECK(decodedBuffer[j] == testCases[i].result[j]);
- }
- CHECK(testCases[i].shouldEndInTruncation == chunkEndsInTruncatedSequence);
- }
- }
- TEST_CASE("CodexTest_DecodeUnitsInto_ChunkEndsInTruncatedSequence", "[CodexTest]")
- {
- struct TestCase
- {
- int bytesToDecode;
- size_t expectedDecodedChars;
- bool shouldEndInTruncation;
- char16 result[8];
- utf8char_t utf8Encoding[8];
- };
- TestCase testCases[] = {
- { 2, 1, false, { 0xc1 }, { 0xc3, 0x81 } }, // Valid 2-byte sequence
- { 1, 0, true, { 0x0 }, { 0xc3, 0x81 } }, // Valid 2-byte sequence truncated at the end of the chunk
- { 2, 2, false, { 0xfffd, 0x79 },{ 0xc3, 0x79 } }, // Invalid 2-byte sequence
- { 1, 0, true, { 0x0 }, { 0xc3, 0x79 } }, // Invalid 2-byte sequence truncated at the end of the chunk
- { 3, 1, false, { 0x3042 },{ 0xe3, 0x81, 0x82 } }, // Valid 3-byte sequence
- { 1, 0, true, { 0x0 }, { 0xe3, 0x81, 0x82 } }, // Valid 3-byte sequence truncated at the end of the chunk
- { 2, 0, true, { 0x0 }, { 0xe3, 0x81, 0x82 } }, // Valid 3-byte sequence truncated at the end of the chunk
- { 3, 3, false, { 0xfffd, 0x79, 0xfffd }, { 0xe3, 0x79, 0x82 } }, // Invalid 3-byte sequence
- { 1, 0, true, { 0x0 }, { 0xe3, 0x79, 0x82 } }, // Invalid 3-byte sequence truncated at the end of the chunk
- { 2, 0, true, { 0x0 }, { 0xe3, 0x79, 0x82 } }, // Invalid 3-byte sequence truncated at the end of the chunk
- { 3, 3, false, { 0xfffd, 0xfffd, 0x79 }, { 0xe3, 0x81, 0x79 } }, // Invalid 3-byte sequence
- { 1, 0, true, { 0x0 }, { 0xe3, 0x81, 0x79 } }, // Invalid 3-byte sequence truncated at the end of the chunk
- { 2, 0, true, { 0x0 }, { 0xe3, 0x81, 0x79 } }, // Invalid 3-byte sequence truncated at the end of the chunk
- { 4, 2, false, { 0xd9c4, 0xdc83 }, { 0xf2, 0x81, 0x82, 0x83 } }, // Valid 4-byte sequence
- { 1, 0, true, { 0x0 }, { 0xf2, 0x81, 0x82, 0x83 } }, // Valid 4-byte sequence truncated at the end of the chunk
- { 2, 0, true, { 0x0 }, { 0xf2, 0x81, 0x82, 0x83 } }, // Valid 4-byte sequence truncated at the end of the chunk
- { 3, 0, true, { 0x0 }, { 0xf2, 0x81, 0x82, 0x83 } }, // Valid 4-byte sequence truncated at the end of the chunk
- { 4, 4, false, { 0xfffd, 0x79, 0xfffd, 0xfffd }, { 0xf2, 0x79, 0x82, 0x83 } }, // Invalid 4-byte sequence
- { 1, 0, true, { 0x0 }, { 0xf2, 0x79, 0x82, 0x83 } }, // Invalid 4-byte sequence truncated at the end of the chunk
- { 2, 0, true, { 0x0 }, { 0xf2, 0x79, 0x82, 0x83 } }, // Invalid 4-byte sequence truncated at the end of the chunk
- { 3, 0, true, { 0x0 }, { 0xf2, 0x79, 0x82, 0x83 } }, // Invalid 4-byte sequence truncated at the end of the chunk
- { 4, 4, false, { 0xfffd, 0xfffd, 0x79, 0xfffd }, { 0xf2, 0x81, 0x79, 0x83 } }, // Invalid 4-byte sequence
- { 1, 0, true, { 0x0 }, { 0xf2, 0x81, 0x79, 0x83 } }, // Invalid 4-byte sequence truncated at the end of the chunk
- { 2, 0, true, { 0x0 }, { 0xf2, 0x81, 0x79, 0x83 } }, // Invalid 4-byte sequence truncated at the end of the chunk
- { 3, 0, true, { 0x0 }, { 0xf2, 0x81, 0x79, 0x83 } }, // Invalid 4-byte sequence truncated at the end of the chunk
- { 4, 4, false, { 0xfffd, 0xfffd, 0xfffd, 0x79 }, { 0xf2, 0x81, 0x82, 0x79 } }, // Invalid 4-byte sequence
- { 1, 0, true, { 0x0 }, { 0xf2, 0x81, 0x82, 0x79 } }, // Invalid 4-byte sequence truncated at the end of the chunk
- { 2, 0, true, { 0x0 }, { 0xf2, 0x81, 0x82, 0x79 } }, // Invalid 4-byte sequence truncated at the end of the chunk
- { 3, 0, true, { 0x0 }, { 0xf2, 0x81, 0x82, 0x79 } }, // Invalid 4-byte sequence truncated at the end of the chunk
- };
-
- RunUtf8DecodeTestCase(testCases, utf8::DecodeUnitsIntoAndNullTerminateNoAdvance);
- }
- };
|