Ver Fonte

Add unit tests, refactor to remove DecodeInto variants from Utf8Codex

Taylor Woll há 9 anos atrás
pai
commit
f85beb7c04

+ 2 - 2
bin/NativeTests/FileLoadHelpers.cpp

@@ -9,7 +9,7 @@ HRESULT FileLoadHelpers::LoadScriptFromFile(LPCSTR filename, LPCWSTR& contents,
 {
     HRESULT hr = S_OK;
     LPCWSTR contentsRaw = nullptr;
-    byte * pRawBytes = nullptr;
+    LPCUTF8 pRawBytes = nullptr;
     UINT lengthBytes = 0;
     bool isUtf8 = false;
     contents = nullptr;
@@ -119,7 +119,7 @@ HRESULT FileLoadHelpers::LoadScriptFromFile(LPCSTR filename, LPCWSTR& contents,
             IfFailGo(E_OUTOFMEMORY);
         }
 
-        utf8::DecodeIntoAndNullTerminate((char16*) contents, pRawBytes, pRawBytes + lengthBytes, cUtf16Chars, decodeOptions);
+        utf8::DecodeUnitsIntoAndNullTerminate((char16*)contents, pRawBytes, pRawBytes + lengthBytes, decodeOptions);
     }
 
 Error:

+ 11 - 35
lib/Common/Codex/Utf8Codex.cpp

@@ -376,40 +376,9 @@ LFourByte:
         else
             return ptr;
     }
-
-    void DecodeInto(__out_ecount_full(cch) char16 *buffer, LPCUTF8 ptr, LPCUTF8 end, size_t cch, DecodeOptions options)
-    {
-        DecodeOptions localOptions = options;
-
-        if (!ShouldFastPath(ptr, buffer)) goto LSlowPath;
-
-LFastPath:
-        while (cch >= 4)
-        {
-            uint32 bytes = *(uint32 *)ptr;
-            if ((bytes & 0x80808080) != 0) goto LSlowPath;
-            ((uint32 *)buffer)[0] = (bytes & 0x7F) | ((bytes << 8) & 0x7F0000);
-            ((uint32 *)buffer)[1] = ((bytes >> 16) & 0x7F) | ((bytes >> 8) & 0x7F0000);
-            ptr += 4;
-            buffer += 4;
-            cch -= 4;
-        }
-LSlowPath:
-        while (cch-- > 0)
-        {
-            *buffer++ = Decode(ptr, end, localOptions);
-            if (ShouldFastPath(ptr, buffer)) goto LFastPath;
-        }
-    }
-
-    void DecodeIntoAndNullTerminate(__out_ecount(cch+1) __nullterminated char16 *buffer, LPCUTF8 ptr, LPCUTF8 end, size_t cch, DecodeOptions options)
-    {
-        DecodeInto(buffer, ptr, end, cch, options);
-        buffer[cch] = 0;
-    }
-
-    _Ret_range_(0, pbEnd - _Old_(pbUtf8))
-    size_t DecodeUnitsInto(_Out_writes_(pbEnd - pbUtf8) char16 *buffer, LPCUTF8& pbUtf8, LPCUTF8 pbEnd, DecodeOptions options)
+    
+    _Use_decl_annotations_
+    size_t DecodeUnitsInto(char16 *buffer, LPCUTF8& pbUtf8, LPCUTF8 pbEnd, DecodeOptions options)
     {
         DecodeOptions localOptions = options;
 
@@ -454,13 +423,20 @@ LSlowPath:
         return dest - buffer;
     }
 
-    size_t DecodeUnitsIntoAndNullTerminate(__out_ecount(pbEnd - pbUtf8 + 1) __nullterminated char16 *buffer, LPCUTF8& pbUtf8, LPCUTF8 pbEnd, DecodeOptions options)
+    _Use_decl_annotations_
+    size_t DecodeUnitsIntoAndNullTerminate(char16 *buffer, LPCUTF8& pbUtf8, LPCUTF8 pbEnd, DecodeOptions options)
     {
         size_t result = DecodeUnitsInto(buffer, pbUtf8, pbEnd, options);
         buffer[(int)result] = 0;
         return result;
     }
 
+    _Use_decl_annotations_
+    size_t DecodeUnitsIntoAndNullTerminateNoAdvance(char16 *buffer, LPCUTF8 pbUtf8, LPCUTF8 pbEnd, DecodeOptions options)
+    {
+        return DecodeUnitsIntoAndNullTerminate(buffer, pbUtf8, pbEnd, options);
+    }
+
     bool CharsAreEqual(__in_ecount(cch) LPCOLESTR pch, LPCUTF8 bch, LPCUTF8 end, size_t cch, DecodeOptions options)
     {
         DecodeOptions localOptions = options;

+ 2 - 15
lib/Common/Codex/Utf8Codex.h

@@ -273,21 +273,6 @@ namespace utf8
         return PrevCharFull(ptr, start);
     }
 
-    // Decode a UTF-8 sequence of cch UTF-16 characters into buffer. ptr could advance up to 3 times
-    // longer than cch so DecodeInto should only be used when it is already known that
-    // ptr refers to at least cch number of UTF-8 sequences.
-    void DecodeInto(__out_ecount_full(cch) char16 *buffer, LPCUTF8 ptr, LPCUTF8 end, size_t cch, DecodeOptions options = doDefault);
-
-    // Provided for dual-mode templates
-    inline void DecodeInto(__out_ecount_full(cch) char16 *buffer, const char16 *ptr, const char16 *end, size_t cch, DecodeOptions /* options */ = doDefault)
-    {
-        Unused(end);
-        memcpy_s(buffer, cch * sizeof(char16), ptr, cch * sizeof(char16));
-    }
-
-    // Like DecodeInto but ensures buffer ends with a NULL at buffer[cch].
-    void DecodeIntoAndNullTerminate(__out_ecount(cch+1) __nullterminated char16 *buffer, LPCUTF8 ptr, LPCUTF8 end, size_t cch, DecodeOptions options = doDefault);
-
     // Decode cb bytes from ptr into buffer, returning the number of characters converted and written to buffer
     _Ret_range_(0, pbEnd - _Old_(pbUtf8))
     size_t DecodeUnitsInto(_Out_writes_(pbEnd - pbUtf8) char16 *buffer, LPCUTF8& pbUtf8, LPCUTF8 pbEnd, DecodeOptions options = doDefault);
@@ -295,6 +280,8 @@ namespace utf8
     // Decode cb bytes from ptr into buffer, returning the number of characters converted and written to buffer (excluding the null terminator)
     size_t DecodeUnitsIntoAndNullTerminate(__out_ecount(pbEnd - pbUtf8 + 1) __nullterminated char16 *buffer, LPCUTF8& pbUtf8, LPCUTF8 pbEnd, DecodeOptions options = doDefault);
 
+    size_t DecodeUnitsIntoAndNullTerminateNoAdvance(__out_ecount(pbEnd - pbUtf8 + 1) __nullterminated char16 *buffer, LPCUTF8 pbUtf8, LPCUTF8 pbEnd, DecodeOptions options = doDefault);
+
     // Encode a UTF-8 sequence into a UTF-8 sequence (which is just a memcpy). This is included for convenience in templates
     // when the character encoding is a template parameter.
     __range(cch, cch)

+ 1 - 1
lib/Common/Codex/Utf8Helper.h

@@ -73,7 +73,7 @@ namespace utf8
         // Some node tests depend on the utf8 decoder not swallowing invalid unicode characters
         // instead of replacing them with the "replacement" character. Pass a flag to our 
         // decoder to require such behavior
-        utf8::DecodeIntoAndNullTerminate(destString, (LPCUTF8) sourceString, (LPCUTF8) sourceString + cbSourceString, cchDestString, DecodeOptions::doAllowInvalidWCHARs);
+        utf8::DecodeUnitsIntoAndNullTerminateNoAdvance(destString, (LPCUTF8) sourceString, (LPCUTF8) sourceString + cbSourceString, DecodeOptions::doAllowInvalidWCHARs);
         Assert(destString[cchDestString] == 0);
         static_assert(sizeof(utf8char_t) == sizeof(char), "Needs to be valid for cast");
         *destStringPtr = destString;

+ 9 - 7
lib/Jsrt/JsrtDebugUtils.cpp

@@ -61,17 +61,19 @@ void JsrtDebugUtils::AddSourceLengthAndTextToObject(Js::DynamicObject* object, J
     LPCUTF8 source = functionBody->GetStartOfDocument(_u("Source for debugging"));
     size_t cbLength = functionBody->GetUtf8SourceInfo()->GetCbLength();
     size_t startByte = utf8::CharacterIndexToByteIndex(source, cbLength, (const charcount_t)statementMap->sourceSpan.begin);
+    size_t endByte = utf8::CharacterIndexToByteIndex(source, cbLength, (const charcount_t)statementMap->sourceSpan.end);
+    int cch = statementMap->sourceSpan.end - statementMap->sourceSpan.begin;
 
-    int byteLength = statementMap->sourceSpan.end - statementMap->sourceSpan.begin;
+    JsrtDebugUtils::AddPropertyToObject(object, JsrtDebugPropertyId::sourceLength, (double)cch, functionBody->GetScriptContext());
 
-    JsrtDebugUtils::AddPropertyToObject(object, JsrtDebugPropertyId::sourceLength, (double)byteLength, functionBody->GetScriptContext());
-
-    AutoArrayPtr<char16> sourceContent(HeapNewNoThrowArray(char16, byteLength + 1), byteLength + 1);
+    AutoArrayPtr<char16> sourceContent(HeapNewNoThrowArray(char16, cch + 1), cch + 1);
     if (sourceContent != nullptr)
     {
+        LPCUTF8 pbStart = source + startByte;
+        LPCUTF8 pbEnd = pbStart + (endByte - startByte);
         utf8::DecodeOptions options = functionBody->GetUtf8SourceInfo()->IsCesu8() ? utf8::doAllowThreeByteSurrogates : utf8::doDefault;
-        utf8::DecodeIntoAndNullTerminate(sourceContent, source + startByte, source + startByte + cbLength, byteLength, options);
-        JsrtDebugUtils::AddPropertyToObject(object, JsrtDebugPropertyId::sourceText, sourceContent, byteLength, functionBody->GetScriptContext());
+        utf8::DecodeUnitsIntoAndNullTerminate(sourceContent, pbStart, pbEnd, options);
+        JsrtDebugUtils::AddPropertyToObject(object, JsrtDebugPropertyId::sourceText, sourceContent, cch, functionBody->GetScriptContext());
     }
     else
     {
@@ -96,7 +98,7 @@ void JsrtDebugUtils::AddSouceToObject(Js::DynamicObject * object, Js::Utf8Source
         LPCUTF8 source = utf8SourceInfo->GetSource();
         size_t cbLength = utf8SourceInfo->GetCbLength();
         utf8::DecodeOptions options = utf8SourceInfo->IsCesu8() ? utf8::doAllowThreeByteSurrogates : utf8::doDefault;
-        utf8::DecodeIntoAndNullTerminate(sourceContent, source, source + cbLength, cchLength, options);
+        utf8::DecodeUnitsIntoAndNullTerminate(sourceContent, source, source + cbLength, options);
         JsrtDebugUtils::AddPropertyToObject(object, JsrtDebugPropertyId::source, sourceContent, cchLength, utf8SourceInfo->GetScriptContext());
     }
     else

+ 2 - 1
lib/Parser/Hash.h

@@ -434,7 +434,8 @@ private:
     }
     static void CopyString(__in_ecount(cch + 1) LPOLESTR psz1, LPCUTF8 psz2, LPCUTF8 psz2end, int32 cch)
     {
-        utf8::DecodeIntoAndNullTerminate(psz1, psz2, psz2end, cch);
+        Unused(cch);
+        utf8::DecodeUnitsIntoAndNullTerminate(psz1, psz2, psz2end);
     }
     static void CopyString(__in_ecount(cch + 1) LPOLESTR psz1, __in_ecount(cch) char const * psz2, char const * psz2end, int32 cch)
     {

+ 3 - 2
lib/Parser/Scan.cpp

@@ -2516,12 +2516,13 @@ HRESULT Scanner<EncodingPolicy>::SysAllocErrorLine(int32 ichMinLine, __out BSTR*
     }
 
     typename EncodingPolicy::EncodedCharPtr pStart = static_cast<size_t>(ichMinLine) == IchMinLine() ? m_pchMinLine : m_pchBase + this->CharacterOffsetToUnitOffset(m_pchBase, m_currentCharacter, m_pchLast, ichMinLine);
-    typename EncodingPolicy::EncodedCharPtr pEnd = AdjustedLast();
 
     // Determine the length by scanning for the next newline
-    charcount_t cch = LineLength(pStart, pEnd);
+    charcount_t cch = LineLength(pStart, m_pchLast);
     Assert(cch <= LONG_MAX);
 
+    typename EncodingPolicy::EncodedCharPtr pEnd = static_cast<size_t>(ichMinLine) == IchMinLine() ? m_pchMinLine + cch : m_pchBase + this->CharacterOffsetToUnitOffset(m_pchBase, m_currentCharacter, m_pchLast, cch);
+
     *pbstrLine = SysAllocStringLen(NULL, cch);
     if (!*pbstrLine)
     {

+ 1 - 1
lib/Parser/Scan.h

@@ -294,7 +294,7 @@ protected:
     void ConvertToUnicode(__out_ecount_full(cch) LPOLESTR pch, charcount_t cch, EncodedCharPtr start, EncodedCharPtr end)
     {
         m_decodeOptions = (utf8::DecodeOptions)(m_decodeOptions & ~utf8::doSecondSurrogatePair);
-        utf8::DecodeInto(pch, start, end, cch, m_decodeOptions);
+        utf8::DecodeUnitsInto(pch, start, end, m_decodeOptions);
     }
 
 

+ 15 - 3
lib/Runtime/Base/Utf8SourceInfo.h

@@ -78,10 +78,22 @@ namespace Js
         void RetrieveSourceText(__out_ecount_full(cchLim - cchMin) LPOLESTR cpText, charcount_t cchMin, charcount_t cchLim) const
         {
             size_t cbLength = GetCbLength(_u("Utf8SourceInfo::RetrieveSourceText"));
-            LPCUTF8 pSource = GetSource(_u("Utf8SourceInfo::RetrieveSourceText"));
-            size_t cbMin = cbLength == GetCchLength() ? cchMin : utf8::CharacterIndexToByteIndex(pSource, cbLength, cchMin, utf8::doAllowThreeByteSurrogates);
+            LPCUTF8 source = GetSource(_u("Utf8SourceInfo::RetrieveSourceText"));
+            LPCUTF8 pbStart = nullptr;
+            LPCUTF8 pbEnd = nullptr;
             
-            utf8::DecodeInto(cpText, pSource + cbMin, pSource + cbMin + cbLength, cchLim - cchMin, utf8::doAllowThreeByteSurrogates);
+            if (cbLength == GetCchLength())
+            {
+                pbStart = source + cchMin;
+                pbEnd = source + cchLim;
+            }
+            else
+            {
+                pbStart = source + utf8::CharacterIndexToByteIndex(source, cbLength, cchMin, utf8::doAllowThreeByteSurrogates);
+                pbEnd = source + utf8::CharacterIndexToByteIndex(source, cbLength, cchLim, utf8::doAllowThreeByteSurrogates);
+            }
+            
+            utf8::DecodeUnitsInto(cpText, pbStart, pbEnd, utf8::doAllowThreeByteSurrogates);
         }
 
         size_t CharacterIndexToByteIndex(charcount_t cchIndex) const

+ 2 - 2
lib/Runtime/Language/DynamicProfileStorage.cpp

@@ -110,7 +110,7 @@ _Success_(return) bool DynamicProfileStorageReaderWriter::ReadUtf8String(__deref
         return false;
     }
 
-    utf8char_t * tempBuffer = NoCheckHeapNewArray(utf8char_t, urllen);
+    utf8char_t* tempBuffer = NoCheckHeapNewArray(utf8char_t, urllen);
     if (tempBuffer == nullptr)
     {
         Output::Print(_u("ERROR: DynamicProfileStorage: Out of memory reading '%s'\n"), filename);
@@ -133,7 +133,7 @@ _Success_(return) bool DynamicProfileStorageReaderWriter::ReadUtf8String(__deref
         HeapDeleteArray(urllen, tempBuffer);
         return false;
     }
-    utf8::DecodeIntoAndNullTerminate(name, tempBuffer, tempBuffer + urllen, length);
+    utf8::DecodeUnitsIntoAndNullTerminateNoAdvance(name, tempBuffer, tempBuffer + urllen);
     NoCheckHeapDeleteArray(urllen, tempBuffer);
     *str = name;
     *len = length;

+ 3 - 1
lib/Runtime/Library/JavascriptFunction.cpp

@@ -3058,7 +3058,9 @@ LABEL1:
                     charcount_t count = min(DIAG_MAX_FUNCTION_STRING, func->LengthInChars());
                     utf8::DecodeOptions options = sourceInfo->IsCesu8() ? utf8::doAllowThreeByteSurrogates : utf8::doDefault;
                     LPCUTF8 source = func->GetSource(_u("JavascriptFunction::GetDiagValueString"));
-                    utf8::DecodeInto(stringBuilder->AllocBufferSpace(count), source, source + sourceInfo->GetCbLength(_u("JavascriptFunction::GetDiagValueString")), count, options);
+                    size_t cbLength = sourceInfo->GetCbLength(_u("JavascriptFunction::GetDiagValueString"));
+                    size_t cbIndex = utf8::CharacterIndexToByteIndex(source, cbLength, count, options);
+                    utf8::DecodeUnitsInto(stringBuilder->AllocBufferSpace(count), source, source + cbIndex, options);
                     stringBuilder->IncreaseCount(count);
                     return TRUE;
                 }

+ 5 - 5
lib/Runtime/Library/ScriptFunction.cpp

@@ -483,12 +483,12 @@ namespace Js
             // Consider: Should we have a JavascriptUtf8Substring class which defers decoding
             // until it's needed?
 
-            BufferStringBuilder builder(pFuncBody->LengthInChars(), scriptContext);
-            // TODO: What about surrogate pairs?
+            charcount_t cch = pFuncBody->LengthInChars();
+            size_t cbLength = pFuncBody->LengthInBytes();
+            LPCUTF8 pbStart = pFuncBody->GetSource(_u("ScriptFunction::EnsureSourceString"));
+            BufferStringBuilder builder(cch, scriptContext);
             utf8::DecodeOptions options = pFuncBody->GetUtf8SourceInfo()->IsCesu8() ? utf8::doAllowThreeByteSurrogates : utf8::doDefault;
-            LPCUTF8 ptr = pFuncBody->GetSource(_u("ScriptFunction::EnsureSourceString"));
-            size_t cbLength = pFuncBody->GetUtf8SourceInfo()->GetCbLength(_u("ScriptFunction::EnsureSourceString"));
-            utf8::DecodeInto(builder.DangerousGetWritableBuffer(), ptr, ptr + cbLength, pFuncBody->LengthInChars(), options);
+            utf8::DecodeUnitsInto(builder.DangerousGetWritableBuffer(), pbStart, pbStart + cbLength, options);
             if (pFuncBody->IsLambda() || isActiveScript || this->GetFunctionInfo()->IsClassConstructor()
 #ifdef ENABLE_PROJECTION
                 || scriptContext->GetConfig()->IsWinRTEnabled()

+ 1 - 1
lib/WasmReader/WasmBinaryReader.cpp

@@ -1019,7 +1019,7 @@ WasmBinaryReader::CvtUtf8Str(LPCUTF8 name, uint32 nameLen, charcount_t* dstLengt
     {
         Js::Throw::OutOfMemory();
     }
-    utf8::DecodeIntoAndNullTerminate(contents, name, name + nameLen, utf16Len, decodeOptions);
+    utf8::DecodeUnitsIntoAndNullTerminate(contents, name, name + nameLen, decodeOptions);
     if (dstLength)
     {
         *dstLength = utf16Len;

+ 74 - 0
test/utf8/bugGH2386.js

@@ -0,0 +1,74 @@
+//-------------------------------------------------------------------------------------------------------
+// Copyright (C) Microsoft Corporation and contributors. All rights reserved.
+// Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
+//-------------------------------------------------------------------------------------------------------
+
+function toHexCP(c, cp) {
+    var hex = "0123456789abcdef";
+    return String.fromCharCode(hex.charCodeAt((c >> (cp * 4)) & 0xf));
+}
+
+function toHex(str) {
+    var result = "";
+    for(var i = 0; i < str.length; i++) {
+        var c = str.charCodeAt(i);
+        for (var cp = 3; cp >= 0; cp--) {
+            result += toHexCP(c, cp);
+        }
+    }
+    return "0x" + result;
+}
+
+var CHECK = function(h)
+{
+    var hex_str = String.fromCharCode(h);
+    var pattern = eval("/" + hex_str + "/");
+    if (toHex(hex_str) != toHex(pattern.source)) {
+        throw new Error("String encoding has failed? "
+          + toHex(hex_str) + " != " + toHex(pattern.source));
+    }
+}
+
+CHECK("0x0000");
+CHECK("0x0080");
+CHECK("0x0800");
+CHECK("0xFF80");
+CHECK("0xFFFD");
+CHECK("0xFFFFFF");
+CHECK("0xFFFFFF80");
+CHECK("0xFFFFFF80FF");
+
+function CHECK_EVAL(s)
+{
+    var eval_s = new RegExp( s ).source;
+    if (s !== eval_s) throw new Error(
+      "String Encoding is broken ? ->" + s);
+}
+
+var CH1 = String.fromCharCode('0xe4b8ad');
+var CH2 = String.fromCharCode('0xe69687');
+var CH3 = String.fromCharCode('0xe336b2');
+var CH4 = String.fromCharCode('0xe336b2aa');
+var CHX = String.fromCharCode("0x80808080");
+
+var BUFF = '';
+for(var i = 0; i < 16; i++)
+{
+    var str = CH1;
+
+    CHECK_EVAL(str + CHX + BUFF)
+    CHECK_EVAL(str + BUFF + CHX)
+    CHECK_EVAL(str + BUFF + CHX + '1')
+    str += BUFF + CH2 + CHX;
+    BUFF += '1';
+
+    CHECK_EVAL(str + '1' + CH3);
+    CHECK_EVAL(str + '12' + CH3);
+    CHECK_EVAL(str + '123' + CH3);
+
+    CHECK_EVAL(str + '1' + CH4);
+    CHECK_EVAL(str + '12' + CH4);
+    CHECK_EVAL(str + '123' + CH4)
+}
+
+console.log("PASS");

+ 13 - 0
test/utf8/rlexe.xml

@@ -20,4 +20,17 @@
       <baseline />
     </default>
   </test>
+  <test>
+    <default>
+      <files>bugGH2386.js</files>
+      <baseline />
+    </default>
+  </test>
+  <test>
+    <default>
+      <files>unicode_sequence_serialized.js</files>
+      <baseline />
+      <compile-flags>-forceserialized -oopjit-</compile-flags>
+    </default>
+  </test>
 </regress-exe>

+ 6 - 1
test/utf8/surrogatepair.js

@@ -7,4 +7,9 @@
 // For this test case to work, please save this file with UTF-8 encoding
 var y = "function () { '鄏𡄻�莞�遲���屢���箋成鄏賴旭鄑温收鄏擒�鄏賴忖 鄏兒江鄏眇成鄑温戍鄍�' ;WScript.Echo('hello'); }"
 var x = function () { '鄏𡄻�莞�遲���屢���箋成鄏賴旭鄑温收鄏擒�鄏賴忖 鄏兒江鄏眇成鄑温戍鄍�' ;WScript.Echo('hello'); }
-WScript.Echo(x.toString() === y ? "PASS" : "FAIL");
+
+// 2 bytes 
+var y2 = "function () { '𥌓 kugu' ;WScript.Echo('hello'); }" 
+var x2 = function () { '𥌓 kugu' ;WScript.Echo('hello'); } 
+
+WScript.Echo((x.toString() === y && x2.toString() === y2) ? "PASS" : "FAIL");

+ 3 - 0
test/utf8/unicode_sequence_serialized.js

@@ -0,0 +1,3 @@
+(function () {
+       /(오)/ ;
+})();