Parcourir la source

Add support in our Unicode library to encode surrogate pairs as UTF8

Hitesh Kanwathirtha il y a 9 ans
Parent
commit
4cf0a41594

+ 215 - 0
bin/NativeTests/CodexTests.cpp

@@ -0,0 +1,215 @@
+//-------------------------------------------------------------------------------------------------------
+// Copyright (C) Microsoft. All rights reserved.
+// Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
+//-------------------------------------------------------------------------------------------------------
+#include "stdafx.h"
+#include "catch.hpp"
+#include <process.h>
+#include "Codex\Utf8Codex.h"
+
+#pragma warning(disable:4100) // unreferenced formal parameter
+#pragma warning(disable:6387) // suppressing preFAST which raises warning for passing null to the JsRT APIs
+#pragma warning(disable:6262) // CATCH is using stack variables to report errors, suppressing the preFAST warning.
+
+namespace CodexTest
+{
+    ///
+    /// The following test verifies that for invalid characters, we replace them
+    /// with the unicode replacement character
+    ///
+
+    // Checks that the first three bytes of encodedBuffer are EF BF BD,
+    // ie. the utf8 encoding of the unicode replacement character (U+FFFD)
+    void CheckIsUnicodeReplacementChar(const utf8char_t* encodedBuffer)
+    {
+        const utf8char_t leadByte   = encodedBuffer[0];
+        const utf8char_t secondByte = encodedBuffer[1];
+        const utf8char_t thirdByte  = encodedBuffer[2];
+
+        CHECK(leadByte == 0xEF);
+        CHECK(secondByte == 0xBF);
+        CHECK(thirdByte == 0xBD);
+    }
+
+    //
+    // Following test cases are based on the Utf-8 decoder tests 
+    // suggested by Markus Kuhn at https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
+    //
+    TEST_CASE("CodexTest_EncodeTrueUtf8_SingleSurrogates", "[CodexTest]")
+    {
+        // Unpaired surrogate code units; each one is encoded on its own and is
+        // expected to come out as the 3 byte unicode replacement character
+        char16 loneSurrogates[] = { 0xD800, 0xDB7F, 0xDB80, 0xDBFF, 0xDC00, 0xDF80, 0xDFFF };
+        const charcount_t unitsPerCase = 1;
+
+        // +1 since the buffer will be null-terminated
+        utf8char_t utf8Out[(unitsPerCase + 1) * 3];
+
+        for (char16& loneUnit : loneSurrogates)
+        {
+            const size_t byteCount = utf8::EncodeTrueUtf8IntoAndNullTerminate(utf8Out, &loneUnit, unitsPerCase);
+            CHECK(byteCount == 3);
+            CheckIsUnicodeReplacementChar(utf8Out);
+        }
+    }
+
+    //
+    // Test encoding of given utf16-encoded strings into another encoding
+    //
+    // testCases is an array of structs that each expose two array members:
+    //   surrogatePair - the utf16 code units handed to the encoder
+    //   utf8Encoding  - the expected encoded bytes; extra (unused) trailing
+    //                   bytes are represented as 0
+    // func is invoked as func(destination, source, charCount) and its return
+    // value is treated as the number of bytes it encoded
+    //
+
+    template <typename TTestCase, typename TEncodingFunc>
+    void RunUtf8EncodingTestCase(const TTestCase &testCases, const TEncodingFunc func)
+    {
+        const int numTestCases = _countof(testCases);
+        const charcount_t charCount = _countof(testCases[0].surrogatePair);
+        const charcount_t maxEncodedByteCount = _countof(testCases[0].utf8Encoding);
+
+        // The encoders are annotated as writing up to charCount * 3 bytes (+1 for the
+        // null-terminating variants), which can exceed the expected encoding length
+        // (eg. 2 code units -> 4 expected bytes). Size the buffer for the larger of the
+        // two so that a misbehaving encoder fails a CHECK instead of overrunning the stack.
+        const charcount_t maxWrittenByteCount = charCount * 3;
+        const charcount_t bufferByteCount =
+            (maxWrittenByteCount > maxEncodedByteCount ? maxWrittenByteCount : maxEncodedByteCount) + 1;
+        utf8char_t encodedBuffer[bufferByteCount];
+
+        for (int i = 0; i < numTestCases; i++)
+        {
+            size_t numEncodedBytes = func(encodedBuffer, testCases[i].surrogatePair, charCount);
+            CHECK(numEncodedBytes <= maxEncodedByteCount);
+
+            // CHECK does not abort the test on failure, so clamp the comparison range to
+            // avoid reading past the end of the expected encoding
+            const size_t numComparableBytes =
+                numEncodedBytes < maxEncodedByteCount ? numEncodedBytes : maxEncodedByteCount;
+            for (size_t j = 0; j < numComparableBytes; j++)
+            {
+                CHECK(encodedBuffer[j] == testCases[i].utf8Encoding[j]);
+            }
+
+            // Check and make sure there were no other bytes expected in the encoded string
+            for (size_t j = numEncodedBytes; j < maxEncodedByteCount; j++)
+            {
+                CHECK(testCases[i].utf8Encoding[j] == 0);
+            }
+        }
+    }
+
+    TEST_CASE("CodexTest_EncodeCesu_PairedSurrogates", "[CodexTest]")
+    {
+        // Each of these test cases verifies the encoding
+        // of a single surrogate pair into a 6 byte CESU string:
+        // each half of the pair is encoded separately into utf8
+        struct CesuPairCase
+        {
+            char16     surrogatePair[2];
+            utf8char_t utf8Encoding[6];
+        };
+
+        CesuPairCase cesuCases[] = {
+            { { 0xD800, 0xDC00 }, { 0xED, 0xA0, 0x80, 0xED, 0xB0, 0x80 } }, //  U+010000 LINEAR B SYLLABLE B008 A character
+            { { 0xD800, 0xDFFF }, { 0xED, 0xA0, 0x80, 0xED, 0xBF, 0xBF } }, //  U+0103FF
+            { { 0xDB7F, 0xDC00 }, { 0xED, 0xAD, 0xBF, 0xED, 0xB0, 0x80 } }, //  U+0EFC00
+            { { 0xDB7F, 0xDFFF }, { 0xED, 0xAD, 0xBF, 0xED, 0xBF, 0xBF } }, //  U+0EFFFF
+            { { 0xDB80, 0xDC00 }, { 0xED, 0xAE, 0x80, 0xED, 0xB0, 0x80 } }, //  U+0F0000 Plane 15 Private Use First
+            { { 0xDB80, 0xDFFF }, { 0xED, 0xAE, 0x80, 0xED, 0xBF, 0xBF } }, //  U+0F03FF
+            { { 0xDBFF, 0xDC00 }, { 0xED, 0xAF, 0xBF, 0xED, 0xB0, 0x80 } }, //  U+10FC00
+            { { 0xDBFF, 0xDFFF }, { 0xED, 0xAF, 0xBF, 0xED, 0xBF, 0xBF } }  //  U+10FFFF
+        };
+
+        // Spell out the function-pointer signature that utf8::EncodeInto is cast to
+        typedef size_t (*EncodeFunc)(utf8char_t*, const char16*, charcount_t);
+        const EncodeFunc encodeAsCesu = static_cast<EncodeFunc>(utf8::EncodeInto);
+
+        RunUtf8EncodingTestCase(cesuCases, encodeAsCesu);
+    }
+
+    TEST_CASE("CodexTest_EncodeUtf8_PairedSurrogates", "[CodexTest]")
+    {
+        // Each of these test cases verifies the encoding
+        // of a single surrogate pair into a 4 byte utf8 string:
+        // the pair is decoded back to its original codepoint,
+        // and that codepoint is then encoded into utf8
+        struct Utf8PairCase
+        {
+            char16     surrogatePair[2];
+            utf8char_t utf8Encoding[4];
+        };
+
+        Utf8PairCase utf8Cases[] = {
+            { { 0xD800, 0xDC00 }, { 0xF0, 0x90, 0x80, 0x80 } }, //  U+010000 LINEAR B SYLLABLE B008 A character
+            { { 0xD800, 0xDFFF }, { 0xF0, 0x90, 0x8F, 0xBF } }, //  U+0103FF
+            { { 0xDB7F, 0xDC00 }, { 0xF3, 0xAF, 0xB0, 0x80 } }, //  U+0EFC00
+            { { 0xDB7F, 0xDFFF }, { 0xF3, 0xAF, 0xBF, 0xBF } }, //  U+0EFFFF
+            { { 0xDB80, 0xDC00 }, { 0xF3, 0xB0, 0x80, 0x80 } }, //  U+0F0000 Plane 15 Private Use First
+            { { 0xDB80, 0xDFFF }, { 0xF3, 0xB0, 0x8F, 0xBF } }, //  U+0F03FF
+            { { 0xDBFF, 0xDC00 }, { 0xF4, 0x8F, 0xB0, 0x80 } }, //  U+10FC00
+            { { 0xDBFF, 0xDFFF }, { 0xF4, 0x8F, 0xBF, 0xBF } }  //  U+10FFFF
+        };
+
+        RunUtf8EncodingTestCase(utf8Cases, utf8::EncodeTrueUtf8IntoAndNullTerminate);
+    }
+
+    TEST_CASE("CodexTest_EncodeUtf8_NonCharacters", "[CodexTest]")
+    {
+        // U+FFFE and U+FFFF are codepoints that do not represent characters;
+        // each is a single utf16 code unit that is expected to be encoded
+        // as an ordinary 3 byte utf8 sequence
+        struct NonCharacterCase
+        {
+            char16     surrogatePair[1];
+            utf8char_t utf8Encoding[3];
+        };
+
+        NonCharacterCase nonCharacterCases[] = {
+            { { 0xFFFE }, { 0xEF, 0xBF, 0xBE } }, //  U+FFFE
+            { { 0xFFFF }, { 0xEF, 0xBF, 0xBF } }  //  U+FFFF
+        };
+
+        RunUtf8EncodingTestCase(nonCharacterCases, utf8::EncodeTrueUtf8IntoAndNullTerminate);
+    }
+
+    TEST_CASE("CodexTest_EncodeUtf8_BoundaryChars", "[CodexTest]")
+    {
+        // Boundary conditions, part 1: single code units sitting right next to
+        // the surrogate range (U+D7FF, U+E000), plus the replacement character itself
+        struct SingleUnitCase
+        {
+            char16     surrogatePair[1];
+            utf8char_t utf8Encoding[3];
+        };
+
+        SingleUnitCase singleUnitCases[] = {
+            { { 0xD7FF }, { 0xED, 0x9F, 0xBF } }, //  U+D7FF
+            { { 0xE000 }, { 0xEE, 0x80, 0x80 } }, //  U+E000
+            { { 0xFFFD }, { 0xEF, 0xBF, 0xBD } }  //  U+FFFD
+        };
+
+        // Boundary conditions, part 2: the highest codepoint, which needs a surrogate pair
+        struct SurrogatePairCase
+        {
+            char16     surrogatePair[2];
+            utf8char_t utf8Encoding[4];
+        };
+
+        SurrogatePairCase surrogatePairCases[] = {
+            { { 0xDBFF, 0xDFFF }, { 0xF4, 0x8F, 0xBF, 0xBF } } //  U+10FFFF
+        };
+
+        RunUtf8EncodingTestCase(singleUnitCases, utf8::EncodeTrueUtf8IntoAndNullTerminate);
+        RunUtf8EncodingTestCase(surrogatePairCases, utf8::EncodeTrueUtf8IntoAndNullTerminate);
+    }
+
+    TEST_CASE("CodexTest_EncodeUtf8_SimpleCharacters", "[CodexTest]")
+    {
+        // Each of these test cases verifies the encoding of an ordinary
+        // character whose utf8 form is 1, 2 or 3 bytes long. Expected
+        // encodings shorter than 3 bytes leave the remaining bytes as 0
+        struct SimpleCharCase
+        {
+            char16     surrogatePair[1];
+            utf8char_t utf8Encoding[3];
+        };
+
+        SimpleCharCase simpleCharCases[] = {
+            { { 0x0024 }, { 0x24 } },              //  U+0024 - Dollar Symbol
+            { { 0x00A2 }, { 0xC2, 0xA2 } },        //  U+00A2 - Cent symbol
+            { { 0x20AC }, { 0xE2, 0x82, 0xAC } }   //  U+20AC - Euro symbol
+        };
+
+        RunUtf8EncodingTestCase(simpleCharCases, utf8::EncodeTrueUtf8IntoAndNullTerminate);
+    }
+
+    TEST_CASE("CodexTest_EncodeTrueUtf8_SimpleString", "[CodexTest]")
+    {
+        // An ASCII-only string is expected to be encoded as one byte per utf16
+        // code unit, with each byte equal to the code unit it came from
+        const charcount_t charCount = 3;
+        utf8char_t encodedBuffer[(charCount + 1) * 3]; // +1 since the buffer will be null terminated
+
+        // String literals are immutable, so refer to the source through a pointer-to-const
+        const char16* sourceBuffer = L"abc";
+        size_t numEncodedBytes = utf8::EncodeTrueUtf8IntoAndNullTerminate(encodedBuffer, sourceBuffer, charCount);
+        CHECK(numEncodedBytes == charCount);
+
+        // Index with the same type as charCount so the loop condition doesn't mix types
+        for (charcount_t i = 0; i < charCount; i++)
+        {
+            CHECK(sourceBuffer[i] == (char16)encodedBuffer[i]);
+        }
+    }
+};

+ 1 - 0
bin/NativeTests/NativeTests.vcxproj

@@ -43,6 +43,7 @@
     <ClInclude Include="stdafx.h" />
   </ItemGroup>
   <ItemGroup>
+    <ClCompile Include="CodexTests.cpp" />
     <ClCompile Include="FileLoadHelpers.cpp" />
     <ClCompile Include="CodexAssert.cpp" />
     <ClCompile Include="JsRTApiTest.cpp" />

+ 64 - 5
lib/Common/Codex/Utf8Codex.cpp

@@ -313,6 +313,37 @@ LFourByte:
         return ptr;
     }
 
+    // Encodes the codepoint represented by the given utf16 surrogate pair as a
+    // 4 byte utf8 sequence.
+    //   surrogateHigh - expected to be in the range [0xD800, 0xDBFF]
+    //   surrogateLow  - expected to be in the range [0xDC00, 0xDFFF]
+    //   ptr           - destination; exactly 4 bytes are written
+    // Returns the pointer just past the last byte written.
+    LPUTF8 EncodeSurrogatePair(char16 surrogateHigh, char16 surrogateLow, __out_ecount(4) LPUTF8 ptr)
+    {
+        // The pair must already be validated: a value below the start of its range
+        // would make the unsigned subtractions below wrap around
+        CodexAssert(surrogateHigh >= 0xD800 && surrogateHigh <= 0xDBFF);
+        CodexAssert(surrogateLow >= 0xDC00 && surrogateLow <= 0xDFFF);
+
+        // A unicode codepoint is encoded into a surrogate pair by doing the following:
+        //  subtract 0x10000 from the codepoint
+        //  Split the resulting value into the high-ten bits and low-ten bits
+        //  Add 0xD800 to the high ten bits, and 0xDC00 to the low ten bits
+        // Below, we want to decode the surrogate pair to its original codepoint
+        // So we do the above process in reverse
+        uint32 highTen = (surrogateHigh - 0xD800);
+        uint32 lowTen  = (surrogateLow - 0xDC00);
+        uint32 codepoint = 0x10000 + ((highTen << 10) | lowTen);
+
+        // This is the maximum valid unicode codepoint
+        // This should be ensured anyway since you can't encode a value higher
+        // than this as a surrogate pair, so we assert this here
+        CodexAssert(codepoint <= 0x10FFFF);
+
+        // Now we need to encode the code point into utf-8
+        // Codepoints in the range that gets encoded into a surrogate pair
+        // gets encoded into 4 bytes under utf8
+        // Since the codepoint can be represented by 21 bits, the encoding
+        // does the following: first 3 bits in the first byte, the next 6 in the
+        // second, the next six in the third, and the last six in the 4th byte
+        *ptr++ = static_cast<utf8char_t>((codepoint >> 18) | 0xF0);
+        *ptr++ = static_cast<utf8char_t>(((codepoint >> 12) & 0x3F) | 0x80);
+        *ptr++ = static_cast<utf8char_t>(((codepoint >> 6) & 0x3F) | 0x80);
+        *ptr++ = static_cast<utf8char_t>((codepoint & 0x3F) | 0x80);
+
+        return ptr;
+    }
+
     LPCUTF8 NextCharFull(LPCUTF8 ptr)
     {
         return ptr + EncodedBytes(*ptr);
@@ -430,8 +461,9 @@ LSlowPath:
         return true;
     }
 
+    template <bool cesu8Encoding>
     __range(0, cch * 3)
-    size_t EncodeInto(__out_ecount(cch * 3) LPUTF8 buffer, __in_ecount(cch) const char16 *source, charcount_t cch)
+    size_t EncodeIntoImpl(__out_ecount(cch * 3) LPUTF8 buffer, __in_ecount(cch) const char16 *source, charcount_t cch)
     {
         LPUTF8 dest = buffer;
 
@@ -451,15 +483,36 @@ LFastPath:
         }
 
 LSlowPath:
-        while( cch-- > 0 )
+        if (cesu8Encoding)
         {
-            dest = Encode(*source++, dest);
-            if (ShouldFastPath(dest, source)) goto LFastPath;
+            while (cch-- > 0)
+            {
+                dest = Encode(*source++, dest);
+                if (ShouldFastPath(dest, source)) goto LFastPath;
+            }
+        }
+        else
+        {
+            while (cch-- > 0)
+            {
+                // We increment the source pointer here since at least one utf16 code unit is read here
+                // If the code unit turns out to be the high surrogate in a surrogate pair, then 
+                // EncodeTrueUtf8 will consume the low surrogate code unit too by decrementing cch 
+                // and incrementing source
+                dest = EncodeTrueUtf8(*source++, &source, &cch, dest);
+                if (ShouldFastPath(dest, source)) goto LFastPath;
+            }
         }
 
         return dest - buffer;
     }
 
+    // Encodes cch utf16 code units from source into buffer using the CESU-8 flavor of
+    // the shared implementation. Returns the number of bytes written to buffer.
+    __range(0, cch * 3)
+    size_t EncodeInto(__out_ecount(cch * 3) LPUTF8 buffer, __in_ecount(cch) const char16 *source, charcount_t cch)
+    {
+        const size_t cbEncoded = EncodeIntoImpl<true>(buffer, source, cch);
+        return cbEncoded;
+    }
+
     __range(0, cch * 3)
     size_t EncodeIntoAndNullTerminate(__out_ecount(cch * 3 + 1) utf8char_t *buffer, __in_ecount(cch) const char16 *source, charcount_t cch)
     {
@@ -468,7 +521,13 @@ LSlowPath:
         return result;
     }
 
-
+    // Encodes cch utf16 code units from source into buffer using the non-CESU
+    // (surrogate pair aware) flavor of the shared implementation, then writes a
+    // 0 byte right after the encoded output.
+    // Returns the number of encoded bytes, not counting the terminator.
+    __range(0, cch * 3)
+    size_t EncodeTrueUtf8IntoAndNullTerminate(__out_ecount(cch * 3 + 1) utf8char_t *buffer, __in_ecount(cch) const char16 *source, charcount_t cch)
+    {
+        const size_t cbEncoded = EncodeIntoImpl<false>(buffer, source, cch);
+        buffer[cbEncoded] = 0;
+        return cbEncoded;
+    }
 
     // Convert the character index into a byte index.
     size_t CharacterIndexToByteIndex(__in_ecount(cbLength) LPCUTF8 pch, size_t cbLength, charcount_t cchIndex, DecodeOptions options)

+ 47 - 0
lib/Common/Codex/Utf8Codex.h

@@ -161,6 +161,9 @@ namespace utf8
     // special cases ASCII to avoid a call the most common characters.
     LPUTF8 EncodeFull(char16 ch, __out_ecount(3) LPUTF8 ptr);
 
+    // Encode a surrogate pair into a utf8 sequence. The sequence is always
+    // 4 bytes long, so ptr must have room for 4 bytes.
+    LPUTF8 EncodeSurrogatePair(char16 surrogateHigh, char16 surrogateLow, __out_ecount(4) LPUTF8 ptr);
+
     // Encode ch into a UTF8 sequence ignoring surrogate pairs (which are encoded as two
     // separate code points).
     inline LPUTF8 Encode(char16 ch, __out_ecount(3) LPUTF8 ptr)
@@ -173,6 +176,46 @@ namespace utf8
         return EncodeFull(ch, ptr);
     }
 
+    // Encode ch into a UTF8 sequence while being aware of surrogate pairs.
+    //   ch     - the utf16 code unit to encode
+    //   source - in/out pointer to the code unit that follows ch; advanced by one
+    //            when that code unit is consumed as the low half of a surrogate pair
+    //   cch    - in/out count of code units available at *source; decremented
+    //            whenever *source is advanced
+    //   ptr    - destination; up to 4 bytes are written (a valid surrogate pair)
+    // A surrogate that is not part of a valid high/low pair is encoded as the unicode
+    // replacement character (U+FFFD). Returns the pointer just past the last byte written.
+    inline LPUTF8 EncodeTrueUtf8(char16 ch, const char16** source, charcount_t* cch, __out_ecount(4) LPUTF8 ptr)
+    {
+        if (ch < 0x80)
+        {
+            *ptr = static_cast<utf8char_t>(ch);
+            return ptr + 1;
+        }
+
+        // Everything outside of the surrogate range [0xD800, 0xDFFF] takes the regular path
+        if (ch < 0xD800 || ch >= 0xE000)
+        {
+            return EncodeFull(ch, ptr);
+        }
+
+        // ch is a surrogate. It only forms a valid pair when it is a high surrogate that
+        // is immediately followed by a low surrogate. If the input is malformed (eg. low
+        // surrogate is absent) we'll instead encode the unicode replacement character as utf8
+        if (ch <= 0xDBFF && *cch > 0)
+        {
+            const char16 surrogateLow = **source;
+            if (surrogateLow >= 0xDC00 && surrogateLow <= 0xDFFF)
+            {
+                // Consume the low surrogate
+                *source = *source + 1;
+                *cch = *cch - 1;
+
+                return EncodeSurrogatePair(ch, surrogateLow, ptr);
+            }
+        }
+
+        // Invalid input: insert the unicode replacement character instead
+        ptr[0] = 0xEF;
+        ptr[1] = 0xBF;
+        ptr[2] = 0xBD;
+        return ptr + 3;
+    }
+
     // Return true if ch is a lead byte of a UTF8 multi-unit sequence.
     inline bool IsLeadByte(utf8char_t ch)
     {
@@ -263,6 +306,10 @@ namespace utf8
     __range(0, cch * 3)
     size_t EncodeIntoAndNullTerminate(__out_ecount(cch * 3 + 1) utf8char_t *buffer, __in_ecount(cch) const char16 *source, charcount_t cch);
 
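+    // Like EncodeIntoAndNullTerminate, but encodes true utf8: a valid surrogate pair is
+    // encoded as a single 4 byte sequence. Ensures that buffer[return value] == 0.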
+    __range(0, cch * 3)
+    size_t EncodeTrueUtf8IntoAndNullTerminate(__out_ecount(cch * 3 + 1) utf8char_t *buffer, __in_ecount(cch) const char16 *source, charcount_t cch);
+
     // Returns true if the pch refers to a UTF-16LE encoding of the given UTF-8 encoding bch.
     bool CharsAreEqual(__in_ecount(cch) LPCOLESTR pch, LPCUTF8 bch, size_t cch, DecodeOptions options = doDefault);
 

+ 6 - 2
lib/Common/Codex/Utf8Helper.h

@@ -39,7 +39,7 @@ namespace utf8
             return E_OUTOFMEMORY;
         }
 
-        size_t cbEncoded = utf8::EncodeIntoAndNullTerminate(destString, sourceString, (charcount_t) cchSourceString);
+        size_t cbEncoded = utf8::EncodeTrueUtf8IntoAndNullTerminate(destString, sourceString, (charcount_t) cchSourceString);
         Assert(cbEncoded <= cbDestString);
         static_assert(sizeof(utf8char_t) == sizeof(char), "Needs to be valid for cast");
         *destStringPtr = (char*)destString;
@@ -74,7 +74,11 @@ namespace utf8
             return E_OUTOFMEMORY;
         }
 
-        utf8::DecodeIntoAndNullTerminate(destString, (LPCUTF8) sourceString, cchDestString);
+        // Some node tests depend on the utf8 decoder not swallowing invalid unicode
+        // characters, ie. not replacing them with the "replacement" character. Pass a
+        // flag to our decoder to require such behavior
+        utf8::DecodeIntoAndNullTerminate(destString, (LPCUTF8) sourceString, cchDestString,
+            DecodeOptions::doAllowInvalidWCHARs);
         Assert(destString[cchDestString] == 0);
         static_assert(sizeof(utf8char_t) == sizeof(char), "Needs to be valid for cast");
         *destStringPtr = destString;

+ 0 - 1
lib/Common/Core/CodexAssert.cpp

@@ -10,4 +10,3 @@ void CodexAssert(bool condition)
 {
     Assert(condition);
 }
-

+ 1 - 1
lib/Jsrt/JsrtSourceHolder.cpp

@@ -64,7 +64,7 @@ namespace Js
                 *utf8Script = HeapNewArray(utf8char_t, cbUtf8Buffer);
             }
 
-            *utf8Length = utf8::EncodeIntoAndNullTerminate(*utf8Script, script, static_cast<charcount_t>(length));
+            *utf8Length = utf8::EncodeTrueUtf8IntoAndNullTerminate(*utf8Script, script, static_cast<charcount_t>(length));
             *scriptLength = length;
 
             if (utf8AllocLength != nullptr)