il y a 9 ans · 4cf0a41594
--- a/bin/NativeTests/CodexTests.cpp
+++ b/bin/NativeTests/CodexTests.cpp
@@ -0,0 +1,215 @@
 
				+//-------------------------------------------------------------------------------------------------------
			
 
				+// Copyright (C) Microsoft. All rights reserved.
			
 
				+// Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
			
 
				+//-------------------------------------------------------------------------------------------------------
			
 
				+#include "stdafx.h"
			
 
				+#include "catch.hpp"
			
 
				+#include <process.h>
			
 
				+#include "Codex\Utf8Codex.h"
			
 
				+
			
 
				+#pragma warning(disable:4100) // unreferenced formal parameter
			
 
				+#pragma warning(disable:6387) // suppressing preFAST which raises warning for passing null to the JsRT APIs
			
 
				+#pragma warning(disable:6262) // CATCH is using stack variables to report errors, suppressing the preFAST warning.
			
 
				+
			
 
				+namespace CodexTest
			
 
				+{
			
 
				+    ///
			
 
				+    /// The following test verifies that for invalid characters, we replace them
			
 
				+    /// with the unicode replacement character
			
 
				+    ///
			
 
				+
			
 
				+    // Verify that encodedBuffer starts with EF BF BD, the utf8 encoding of U+FFFD (the unicode replacement character)
			
 
				+    void CheckIsUnicodeReplacementChar(const utf8char_t* encodedBuffer)
			
 
				+    {
			
 
				+        CHECK(encodedBuffer[0] == 0xEF);
			
 
				+        CHECK(encodedBuffer[1] == 0xBF);
			
 
				+        CHECK(encodedBuffer[2] == 0xBD);
			
 
				+    }
			
 
				+
			
 
				+    //
			
 
				+    // Following test cases are based on the Utf-8 decoder tests 
			
 
				+    // suggested by Markus Kuhn at https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
			
 
				+    //
			
 
				+    TEST_CASE("CodexTest_EncodeTrueUtf8_SingleSurrogates", "[CodexTest]")
			
 
				+    {
			
 
				+        // Every entry is a surrogate code unit that is encoded on its own, i.e. without
			
 
				+        // a partner, so each one must come back as the replacement character
			
 
				+        char16 testValues[] = { 0xD800, 0xDB7F, 0xDB80, 0xDBFF, 0xDC00, 0xDF80, 0xDFFF };
			
 
				+        const char16* const endOfTestValues = testValues + _countof(testValues);
			
 
				+
			
 
				+        const charcount_t charCount = 1;
			
 
				+        utf8char_t encodedBuffer[(charCount + 1) * 3]; // +1 leaves room for the null terminator
			
 
				+
			
 
				+        for (const char16* codeUnit = testValues; codeUnit != endOfTestValues; codeUnit++)
			
 
				+        {
			
 
				+            size_t numEncodedBytes = utf8::EncodeTrueUtf8IntoAndNullTerminate(encodedBuffer, codeUnit, charCount);
			
 
				+            CHECK(numEncodedBytes == 3);
			
 
				+            CheckIsUnicodeReplacementChar(encodedBuffer);
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    // 
			
 
				+    // Test encoding of given utf16-encoded strings into another encoding
			
 
				+    //
			
 
				+    // In the expected encoded string, extra bytes are represented as 0
			
 
				+    //
			
 
				+
			
 
				+    // testCases - array of structs that each expose a `surrogatePair` array (the utf16 input)
			
 
				+    //             and a `utf8Encoding` array (the expected bytes, padded with 0)
			
 
				+    // func      - encoder under test, called as func(destination, source, charCount); its
			
 
				+    //             return value is used as the number of bytes that were written
			
 
				+    template <typename TTestCase, typename TEncodingFunc>
			
 
				+    void RunUtf8EncodingTestCase(const TTestCase &testCases, const TEncodingFunc func)
			
 
				+    {
			
 
				+        const int numTestCases = _countof(testCases);
			
 
				+        const charcount_t charCount = _countof(testCases[0].surrogatePair);
			
 
				+        const charcount_t maxEncodedByteCount = _countof(testCases[0].utf8Encoding);
			
 
				+        utf8char_t encodedBuffer[maxEncodedByteCount + 1]; // +1 in case a null-terminating func is passed in
			
 
				+
			
 
				+        for (int i = 0; i < numTestCases; i++)
			
 
				+        {
			
 
				+            const size_t numEncodedBytes = func(encodedBuffer, testCases[i].surrogatePair, charCount);
			
 
				+
			
 
				+            // REQUIRE instead of CHECK: CHECK keeps running after a failure, and the loop
			
 
				+            // below would then index past the end of both encodedBuffer and utf8Encoding
			
 
				+            REQUIRE(numEncodedBytes <= maxEncodedByteCount);
			
 
				+
			
 
				+            for (size_t j = 0; j < numEncodedBytes; j++)
			
 
				+            {
			
 
				+                CHECK(encodedBuffer[j] == testCases[i].utf8Encoding[j]);
			
 
				+            }
			
 
				+
			
 
				+            // Check and make sure there were no other bytes expected in the encoded string
			
 
				+            // (the loop is simply empty when the encoding used every expected byte)
			
 
				+            for (size_t j = numEncodedBytes; j < maxEncodedByteCount; j++)
			
 
				+            {
			
 
				+                CHECK(testCases[i].utf8Encoding[j] == 0);
			
 
				+            }
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    TEST_CASE("CodexTest_EncodeCesu_PairedSurrogates", "[CodexTest]")
			
 
				+    {
			
 
				+        // Each of these test cases verifies the encoding 
			
 
				+        // of a single surrogate pair into a 6 byte CESU string
			
 
				+        // Each surrogate-pair unit is encoded separately into utf8
			
 
				+        struct TestCase
			
 
				+        {
			
 
				+            char16     surrogatePair[2];
			
 
				+            utf8char_t utf8Encoding[6];
			
 
				+        };
			
 
				+
			
 
				+        TestCase testCases[] = {
			
 
				+            { { 0xD800, 0xDC00 }, { 0xED, 0xA0, 0x80, 0xED, 0xB0, 0x80 } }, //  U+010000 LINEAR B SYLLABLE B008 A character
			
 
				+            { { 0xD800, 0xDFFF }, { 0xED, 0xA0, 0x80, 0xED, 0xBF, 0xBF } }, //  U+0103FF
			
 
				+            { { 0xDB7F, 0xDC00 }, { 0xED, 0xAD, 0xBF, 0xED, 0xB0, 0x80 } }, //  U+0EFC00
			
 
				+            { { 0xDB7F, 0xDFFF }, { 0xED, 0xAD, 0xBF, 0xED, 0xBF, 0xBF } }, //  U+0EFFFF
			
 
				+            { { 0xDB80, 0xDC00 }, { 0xED, 0xAE, 0x80, 0xED, 0xB0, 0x80 } }, //  U+0F0000 Plane 15 Private Use First
			
 
				+            { { 0xDB80, 0xDFFF }, { 0xED, 0xAE, 0x80, 0xED, 0xBF, 0xBF } }, //  U+0F03FF
			
 
				+            { { 0xDBFF, 0xDC00 }, { 0xED, 0xAF, 0xBF, 0xED, 0xB0, 0x80 } }, //  U+10FC00
			
 
				+            { { 0xDBFF, 0xDFFF }, { 0xED, 0xAF, 0xBF, 0xED, 0xBF, 0xBF } }  //  U+10FFFF
			
 
				+        };
			
 
				+
			
 
				+        RunUtf8EncodingTestCase(testCases, static_cast<size_t (*)(utf8char_t*, const char16*, charcount_t)>(utf8::EncodeInto));
			
 
				+    }
			
 
				+
			
 
				+    TEST_CASE("CodexTest_EncodeUtf8_PairedSurrogates", "[CodexTest]")
			
 
				+    {
			
 
				+        // Each of these test cases verifies the encoding 
			
 
				+        // of a single surrogate pair into a 4 byte utf8 string
			
 
				+        // Each surrogate-pair unit is decoded to its original codepoint
			
 
				+        // and then encoded into utf8
			
 
				+        struct TestCase
			
 
				+        {
			
 
				+            char16     surrogatePair[2];
			
 
				+            utf8char_t utf8Encoding[4];
			
 
				+        };
			
 
				+
			
 
				+        TestCase testCases[] = {
			
 
				+            { { 0xD800, 0xDC00 }, { 0xF0, 0x90, 0x80, 0x80 } }, //  U+010000 LINEAR B SYLLABLE B008 A character
			
 
				+            { { 0xD800, 0xDFFF }, { 0xF0, 0x90, 0x8F, 0xBF } }, //  U+0103FF
			
 
				+            { { 0xDB7F, 0xDC00 }, { 0xF3, 0xAF, 0xB0, 0x80 } }, //  U+0EFC00
			
 
				+            { { 0xDB7F, 0xDFFF }, { 0xF3, 0xAF, 0xBF, 0xBF } }, //  U+0EFFFF
			
 
				+            { { 0xDB80, 0xDC00 }, { 0xF3, 0xB0, 0x80, 0x80 } }, //  U+0F0000 Plane 15 Private Use First
			
 
				+            { { 0xDB80, 0xDFFF }, { 0xF3, 0xB0, 0x8F, 0xBF } }, //  U+0F03FF
			
 
				+            { { 0xDBFF, 0xDC00 }, { 0xF4, 0x8F, 0xB0, 0x80 } }, //  U+10FC00
			
 
				+            { { 0xDBFF, 0xDFFF }, { 0xF4, 0x8F, 0xBF, 0xBF } }  //  U+10FFFF
			
 
				+        };
			
 
				+
			
 
				+        RunUtf8EncodingTestCase(testCases, utf8::EncodeTrueUtf8IntoAndNullTerminate);
			
 
				+    }
			
 
				+
			
 
				+    TEST_CASE("CodexTest_EncodeUtf8_NonCharacters", "[CodexTest]")
			
 
				+    {
			
 
				+        // Each of these test cases verifies the encoding 
			
 
				+        // of certain problematic codepoints that do not represent
			
 
				+        // characters
			
 
				+        struct TestCase
			
 
				+        {
			
 
				+            char16     surrogatePair[1];
			
 
				+            utf8char_t utf8Encoding[3];
			
 
				+        };
			
 
				+
			
 
				+        TestCase testCases[] = {
			
 
				+            { { 0xFFFE }, { 0xEF, 0xBF, 0xBE } }, //  U+FFFE
			
 
				+            { { 0xFFFF }, { 0xEF, 0xBF, 0xBF } }  //  U+FFFF
			
 
				+        };
			
 
				+
			
 
				+        RunUtf8EncodingTestCase(testCases, utf8::EncodeTrueUtf8IntoAndNullTerminate);
			
 
				+    }
			
 
				+
			
 
				+    TEST_CASE("CodexTest_EncodeUtf8_BoundaryChars", "[CodexTest]")
			
 
				+    {
			
 
				+        // Each of these test cases verifies the encoding 
			
 
				+        // of boundary conditions
			
 
				+        struct SingleChar16TestCase
			
 
				+        {
			
 
				+            char16     surrogatePair[1];
			
 
				+            utf8char_t utf8Encoding[3];
			
 
				+        };
			
 
				+
			
 
				+        SingleChar16TestCase testCases[] = {
			
 
				+            { { 0xD7FF }, { 0xED, 0x9F, 0xBF } }, //  U+D7FF
			
 
				+            { { 0xE000 }, { 0xEE, 0x80, 0x80 } }, //  U+E000
			
 
				+            { { 0xFFFD }, { 0xEF, 0xBF, 0xBD } }  //  U+FFFD
			
 
				+        };
			
 
				+
			
 
				+        struct TwoChar16TestCase
			
 
				+        {
			
 
				+            char16     surrogatePair[2];
			
 
				+            utf8char_t utf8Encoding[4];
			
 
				+        };
			
 
				+
			
 
				+        TwoChar16TestCase testCases2[] = {
			
 
				+            { { 0xDBFF, 0xDFFF }, { 0xF4, 0x8F, 0xBF, 0xBF } } //  U+10FFFF
			
 
				+        };
			
 
				+
			
 
				+        RunUtf8EncodingTestCase(testCases, utf8::EncodeTrueUtf8IntoAndNullTerminate);
			
 
				+        RunUtf8EncodingTestCase(testCases2, utf8::EncodeTrueUtf8IntoAndNullTerminate);
			
 
				+    }
			
 
				+
			
 
				+    TEST_CASE("CodexTest_EncodeUtf8_SimpleCharacters", "[CodexTest]")
			
 
				+    {
			
 
				+        // Each of these test cases verifies the encoding 
			
 
				+        // of simple characters whose utf8 encodings are one, two
			
 
				+        // and three bytes long
			
 
				+        struct TestCase
			
 
				+        {
			
 
				+            char16     surrogatePair[1];
			
 
				+            utf8char_t utf8Encoding[3];
			
 
				+        };
			
 
				+
			
 
				+        TestCase testCases[] = {
			
 
				+            { { 0x0024 }, { 0x24 } },              //  U+0024 - Dollar Symbol
			
 
				+            { { 0x00A2 }, { 0xC2, 0xA2 } },        //  U+00A2 - Cent symbol
			
 
				+            { { 0x20AC }, { 0xE2, 0x82, 0xAC } }   //  U+20AC - Euro symbol
			
 
				+        };
			
 
				+
			
 
				+        RunUtf8EncodingTestCase(testCases, utf8::EncodeTrueUtf8IntoAndNullTerminate);
			
 
				+    }
			
 
				+
			
 
				+    TEST_CASE("CodexTest_EncodeTrueUtf8_SimpleString", "[CodexTest]")
			
 
				+    {
			
 
				+        // The input is plain ASCII, so every utf16 code unit is expected to be
			
 
				+        // encoded as exactly one byte holding the same value
			
 
				+        const charcount_t charCount = 3;
			
 
				+        utf8char_t encodedBuffer[(charCount + 1) * 3]; // +1 since the buffer will be null terminated
			
 
				+        const char16* sourceBuffer = L"abc"; // string literals are immutable, so bind them as const
			
 
				+        size_t numEncodedBytes = utf8::EncodeTrueUtf8IntoAndNullTerminate(encodedBuffer, sourceBuffer, charCount);
			
 
				+        CHECK(numEncodedBytes == charCount);
			
 
				+
			
 
				+        // The index uses the same type as charCount so the loop condition does not
			
 
				+        // mix integer types
			
 
				+        for (charcount_t i = 0; i < charCount; i++)
			
 
				+        {
			
 
				+            CHECK(sourceBuffer[i] == static_cast<char16>(encodedBuffer[i]));
			
 
				+        }
			
 
				+    }
			
 
				+};
			
--- a/bin/NativeTests/NativeTests.vcxproj
+++ b/bin/NativeTests/NativeTests.vcxproj
@@ -43,6 +43,7 @@
 
				     <ClInclude Include="stdafx.h" />
			
 
				   </ItemGroup>
			
 
				   <ItemGroup>
			
 
				+    <ClCompile Include="CodexTests.cpp" />
			
 
				     <ClCompile Include="FileLoadHelpers.cpp" />
			
 
				     <ClCompile Include="CodexAssert.cpp" />
			
 
				     <ClCompile Include="JsRTApiTest.cpp" />
			
--- a/lib/Common/Codex/Utf8Codex.cpp
+++ b/lib/Common/Codex/Utf8Codex.cpp
@@ -313,6 +313,37 @@ LFourByte:
 
				         return ptr;
			
 
				     }
			
 
				 
			
 
				+    // Decodes the utf16 surrogate pair (surrogateHigh, surrogateLow) back to its codepoint and
			
 
				+    // writes the 4 byte utf8 encoding of that codepoint to ptr.
			
 
				+    // Returns a pointer just past the last byte written.
			
 
				+    // NOTE(review): four bytes are stored below although ptr is annotated __out_ecount(3); the
			
 
				+    // annotation here and on the declaration in Utf8Codex.h probably needs revisiting - confirm.
			
 
				+    LPUTF8 EncodeSurrogatePair(char16 surrogateHigh, char16 surrogateLow, __out_ecount(3) LPUTF8 ptr)
			
 
				+    {
			
 
				+        // The arithmetic below is only meaningful for a well formed pair; anything else would
			
 
				+        // underflow the unsigned subtractions and silently produce garbage, so catch it here
			
 
				+        CodexAssert(surrogateHigh >= 0xD800 && surrogateHigh <= 0xDBFF);
			
 
				+        CodexAssert(surrogateLow >= 0xDC00 && surrogateLow <= 0xDFFF);
			
 
				+
			
 
				+        // A unicode codepoint is encoded into a surrogate pair by doing the following:
			
 
				+        //  subtract 0x10000 from the codepoint
			
 
				+        //  Split the resulting value into the high-ten bits and low-ten bits
			
 
				+        //  Add 0xD800 to the high ten bits, and 0xDC00 to the low ten bits
			
 
				+        // Below, we want to decode the surrogate pair to its original codepoint
			
 
				+        // So we do the above process in reverse
			
 
				+        uint32 highTen = (surrogateHigh - 0xD800);
			
 
				+        uint32 lowTen  = (surrogateLow - 0xDC00);
			
 
				+        uint32 codepoint = 0x10000 + ((highTen << 10) | lowTen);
			
 
				+
			
 
				+        // This is the maximum valid unicode codepoint
			
 
				+        // This should be ensured anyway since you can't encode a value higher
			
 
				+        // than this as a surrogate pair, so we assert this here
			
 
				+        CodexAssert(codepoint <= 0x10FFFF);
			
 
				+
			
 
				+        // Now we need to encode the code point into utf-8
			
 
				+        // Codepoints in the range that gets encoded into a surrogate pair
			
 
				+        // gets encoded into 4 bytes under utf8
			
 
				+        // Since the codepoint can be represented by 21 bits, the encoding
			
 
				+        // does the following: first 3 bits in the first byte, the next 6 in the
			
 
				+        // second, the next six in the third, and the last six in the 4th byte
			
 
				+        *ptr++ = static_cast<utf8char_t>(codepoint >> 18) | 0xF0;
			
 
				+        *ptr++ = static_cast<utf8char_t>((codepoint >> 12) & 0x3F) | 0x80;
			
 
				+        *ptr++ = static_cast<utf8char_t>((codepoint >> 6) & 0x3F) | 0x80;
			
 
				+        *ptr++ = static_cast<utf8char_t>(codepoint & 0x3F) | 0x80;
			
 
				+
			
 
				+        return ptr;
			
 
				+    }
			
 
				+
			
 
				     LPCUTF8 NextCharFull(LPCUTF8 ptr)
			
 
				     {
			
 
				         return ptr + EncodedBytes(*ptr);
			
@@ -430,8 +461,9 @@ LSlowPath:
 
				         return true;
			
 
				     }
			
 
				 
			
 
				+    template <bool cesu8Encoding>
			
 
				     __range(0, cch * 3)
			
 
				-    size_t EncodeInto(__out_ecount(cch * 3) LPUTF8 buffer, __in_ecount(cch) const char16 *source, charcount_t cch)
			
 
				+    size_t EncodeIntoImpl(__out_ecount(cch * 3) LPUTF8 buffer, __in_ecount(cch) const char16 *source, charcount_t cch)
			
 
				     {
			
 
				         LPUTF8 dest = buffer;
			
 
				 
			
@@ -451,15 +483,36 @@ LFastPath:
 
				         }
			
 
				 
			
 
				 LSlowPath:
			
 
				-        while( cch-- > 0 )
			
 
				+        if (cesu8Encoding)
			
 
				         {
			
 
				-            dest = Encode(*source++, dest);
			
 
				-            if (ShouldFastPath(dest, source)) goto LFastPath;
			
 
				+            while (cch-- > 0)
			
 
				+            {
			
 
				+                dest = Encode(*source++, dest);
			
 
				+                if (ShouldFastPath(dest, source)) goto LFastPath;
			
 
				+            }
			
 
				+        }
			
 
				+        else
			
 
				+        {
			
 
				+            while (cch-- > 0)
			
 
				+            {
			
 
				+                // We increment the source pointer here since at least one utf16 code unit is read here
			
 
				+                // If the code unit turns out to be the high surrogate in a surrogate pair, then 
			
 
				+                // EncodeTrueUtf8 will consume the low surrogate code unit too by decrementing cch 
			
 
				+                // and incrementing source
			
 
				+                dest = EncodeTrueUtf8(*source++, &source, &cch, dest);
			
 
				+                if (ShouldFastPath(dest, source)) goto LFastPath;
			
 
				+            }
			
 
				         }
			
 
				 
			
 
				         return dest - buffer;
			
 
				     }
			
 
				 
			
 
				+    // Public entry point that keeps the previous behaviour: the shared implementation runs
			
 
				+    // with cesu8Encoding switched on, so every utf16 code unit is passed to Encode by itself
			
 
				+    __range(0, cch * 3)
			
 
				+    size_t EncodeInto(__out_ecount(cch * 3) LPUTF8 buffer, __in_ecount(cch) const char16 *source, charcount_t cch)
			
 
				+    {
			
 
				+        const size_t cbEncoded = EncodeIntoImpl<true>(buffer, source, cch);
			
 
				+        return cbEncoded;
			
 
				+    }
			
 
				+
			
 
				     __range(0, cch * 3)
			
 
				     size_t EncodeIntoAndNullTerminate(__out_ecount(cch * 3 + 1) utf8char_t *buffer, __in_ecount(cch) const char16 *source, charcount_t cch)
			
 
				     {
			
@@ -468,7 +521,13 @@ LSlowPath:
 
				         return result;
			
 
				     }
			
 
				 
			
 
				-
			
 
				+    // Runs the shared implementation with cesu8Encoding switched off (code units go through
			
 
				+    // EncodeTrueUtf8) and stores a terminating 0 right after the encoded bytes.
			
 
				+    // Returns the number of bytes written, not counting the terminator.
			
 
				+    __range(0, cch * 3)
			
 
				+    size_t EncodeTrueUtf8IntoAndNullTerminate(__out_ecount(cch * 3 + 1) utf8char_t *buffer, __in_ecount(cch) const char16 *source, charcount_t cch)
			
 
				+    {
			
 
				+        const size_t cbEncoded = EncodeIntoImpl<false>(buffer, source, cch);
			
 
				+        buffer[cbEncoded] = 0;
			
 
				+        return cbEncoded;
			
 
				+    }
			
 
				 
			
 
				     // Convert the character index into a byte index.
			
 
				     size_t CharacterIndexToByteIndex(__in_ecount(cbLength) LPCUTF8 pch, size_t cbLength, charcount_t cchIndex, DecodeOptions options)
			
--- a/lib/Common/Codex/Utf8Codex.h
+++ b/lib/Common/Codex/Utf8Codex.h
@@ -161,6 +161,9 @@ namespace utf8
 
				     // special cases ASCII to avoid a call the most common characters.
			
 
				     LPUTF8 EncodeFull(char16 ch, __out_ecount(3) LPUTF8 ptr);
			
 
				 
			
 
				+    // Encode a surrogate pair into a utf8 sequence 
			
 
				+    LPUTF8 EncodeSurrogatePair(char16 surrogateHigh, char16 surrogateLow, __out_ecount(3) LPUTF8 ptr);
			
 
				+
			
 
				     // Encode ch into a UTF8 sequence ignoring surrogate pairs (which are encoded as two
			
 
				     // separate code points).
			
 
				     inline LPUTF8 Encode(char16 ch, __out_ecount(3) LPUTF8 ptr)
			
@@ -173,6 +176,46 @@ namespace utf8
 
				         return EncodeFull(ch, ptr);
			
 
				     }
			
 
				 
			
 
				+    // Encode ch into a UTF8 sequence while being aware of surrogate pairs.
			
 
				+    //
			
 
				+    //   ch     - utf16 code unit to encode; the caller has already advanced *source past it
			
 
				+    //   source - in/out: points at the code unit that follows ch; advanced by one when that
			
 
				+    //            unit is consumed as the low half of a surrogate pair
			
 
				+    //   cch    - in/out: number of code units still available at *source; decremented when
			
 
				+    //            the low surrogate is consumed
			
 
				+    //   ptr    - destination for the encoded bytes
			
 
				+    //
			
 
				+    // Returns a pointer just past the last byte written: one byte for ASCII, whatever
			
 
				+    // EncodeFull produces for other non-surrogate units, the result of EncodeSurrogatePair
			
 
				+    // for a valid pair, and the 3 byte replacement character for an unpaired surrogate.
			
 
				+    //
			
 
				+    // NOTE(review): the surrogate pair path goes through EncodeSurrogatePair, which writes 4
			
 
				+    // bytes, so the __out_ecount(3) annotation on ptr looks too small - confirm.
			
 
				+    inline LPUTF8 EncodeTrueUtf8(char16 ch, const char16** source, charcount_t* cch, __out_ecount(3) LPUTF8 ptr)
			
 
				+    {
			
 
				+        if (ch < 0x80)
			
 
				+        {
			
 
				+            *ptr = static_cast<utf8char_t>(ch);
			
 
				+            return ptr + 1;
			
 
				+        }
			
 
				+
			
 
				+        // Anything outside the surrogate range 0xD800-0xDFFF is a complete codepoint by itself.
			
 
				+        // No upper bound test is needed since a utf16 code unit cannot exceed 0xFFFF.
			
 
				+        if (ch < 0xD800 || ch >= 0xE000)
			
 
				+        {
			
 
				+            return EncodeFull(ch, ptr);
			
 
				+        }
			
 
				+
			
 
				+        // ch is a surrogate. The input is only well formed if ch is a high surrogate
			
 
				+        // (0xD800-0xDBFF) and the next code unit is a low surrogate (0xDC00-0xDFFF).
			
 
				+        // The next unit is read only when there is one left and ch can start a pair.
			
 
				+        if (ch <= 0xDBFF && *cch > 0)
			
 
				+        {
			
 
				+            const char16 surrogateLow = **source;
			
 
				+
			
 
				+            if (surrogateLow >= 0xDC00 && surrogateLow <= 0xDFFF)
			
 
				+            {
			
 
				+                // Consume the low surrogate
			
 
				+                *source = *source + 1;
			
 
				+                *cch = *cch - 1;
			
 
				+
			
 
				+                return EncodeSurrogatePair(ch, surrogateLow, ptr);
			
 
				+            }
			
 
				+        }
			
 
				+
			
 
				+        // Invalid input (eg. low surrogate is absent): insert the unicode replacement
			
 
				+        // character instead
			
 
				+        ptr[0] = 0xEF;
			
 
				+        ptr[1] = 0xBF;
			
 
				+        ptr[2] = 0xBD;
			
 
				+        return ptr + 3;
			
 
				+    }
			
 
				+
			
 
				     // Return true if ch is a lead byte of a UTF8 multi-unit sequence.
			
 
				     inline bool IsLeadByte(utf8char_t ch)
			
 
				     {
			
@@ -263,6 +306,10 @@ namespace utf8
 
				     __range(0, cch * 3)
			
 
				     size_t EncodeIntoAndNullTerminate(__out_ecount(cch * 3 + 1) utf8char_t *buffer, __in_ecount(cch) const char16 *source, charcount_t cch);
			
 
				 
			
 
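				+    // Like EncodeInto but ensures that buffer[return value] == 0 and, instead of encoding each
			
 
				+    // surrogate separately, encodes a valid surrogate pair as a single 4 byte utf8 sequence
			
 
				+    // and an unpaired surrogate as the unicode replacement character.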
			
 
				+    __range(0, cch * 3)
			
 
				+    size_t EncodeTrueUtf8IntoAndNullTerminate(__out_ecount(cch * 3 + 1) utf8char_t *buffer, __in_ecount(cch) const char16 *source, charcount_t cch);
			
 
				+
			
 
				     // Returns true if the pch refers to a UTF-16LE encoding of the given UTF-8 encoding bch.
			
 
				     bool CharsAreEqual(__in_ecount(cch) LPCOLESTR pch, LPCUTF8 bch, size_t cch, DecodeOptions options = doDefault);
			
 
				 
			
--- a/lib/Common/Codex/Utf8Helper.h
+++ b/lib/Common/Codex/Utf8Helper.h
@@ -39,7 +39,7 @@ namespace utf8
 
				             return E_OUTOFMEMORY;
			
 
				         }
			
 
				 
			
 
				-        size_t cbEncoded = utf8::EncodeIntoAndNullTerminate(destString, sourceString, (charcount_t) cchSourceString);
			
 
				+        size_t cbEncoded = utf8::EncodeTrueUtf8IntoAndNullTerminate(destString, sourceString, (charcount_t) cchSourceString);
			
 
				         Assert(cbEncoded <= cbDestString);
			
 
				         static_assert(sizeof(utf8char_t) == sizeof(char), "Needs to be valid for cast");
			
 
				         *destStringPtr = (char*)destString;
			
@@ -74,7 +74,11 @@ namespace utf8
 
				             return E_OUTOFMEMORY;
			
 
				         }
			
 
				 
			
 
				-        utf8::DecodeIntoAndNullTerminate(destString, (LPCUTF8) sourceString, cchDestString);
			
 
				+        // Some node tests depend on the utf8 decoder not swallowing invalid unicode characters
			
 
				+        // instead of replacing them with the "replacement" character. Pass a flag to our 
			
 
				+        // decoder to require such behavior
			
 
				+        utf8::DecodeIntoAndNullTerminate(destString, (LPCUTF8) sourceString, cchDestString,
			
 
				+            DecodeOptions::doAllowInvalidWCHARs);
			
 
				         Assert(destString[cchDestString] == 0);
			
 
				         static_assert(sizeof(utf8char_t) == sizeof(char), "Needs to be valid for cast");
			
 
				         *destStringPtr = destString;
			
--- a/lib/Common/Core/CodexAssert.cpp
+++ b/lib/Common/Core/CodexAssert.cpp
@@ -10,4 +10,3 @@ void CodexAssert(bool condition)
 
				 {
			
 
				     Assert(condition);
			
 
				 }
			
 
				-
			
--- a/lib/Jsrt/JsrtSourceHolder.cpp
+++ b/lib/Jsrt/JsrtSourceHolder.cpp
@@ -64,7 +64,7 @@ namespace Js
 
				                 *utf8Script = HeapNewArray(utf8char_t, cbUtf8Buffer);
			
 
				             }
			
 
				 
			
 
				-            *utf8Length = utf8::EncodeIntoAndNullTerminate(*utf8Script, script, static_cast<charcount_t>(length));
			
 
				+            *utf8Length = utf8::EncodeTrueUtf8IntoAndNullTerminate(*utf8Script, script, static_cast<charcount_t>(length));
			
 
				             *scriptLength = length;
			
 
				 
			
 
				             if (utf8AllocLength != nullptr)