| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236 |
- //-------------------------------------------------------------------------------------------------------
- // Copyright (C) Microsoft. All rights reserved.
- // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
- //-------------------------------------------------------------------------------------------------------
- #pragma once
- #include "Core/CommonTypedefs.h"
- #include "ChakraICU.h"
- #include "sal.h"
- namespace PlatformAgnostic
- {
- namespace UnicodeText
- {
- // Error codes reported by the subset of APIs in this namespace where
- // errors are expected. Those APIs take an ApiError* as their last
- // (out) parameter, e.g. NormalizeString and ChangeStringLinguisticCase.
- enum ApiError
- {
- NoError, // First enumerator (value 0); what pErrorOut is set to when no error occurred
- InvalidParameter,
- InvalidUnicodeText,
- InsufficientBuffer,
- OutOfMemory,
- UntranslatedError // NOTE(review): presumably a backend error with no direct equivalent above - confirm against the implementations
- };
- // The Unicode normalization form to request from NormalizeString / IsNormalizedString.
- // Enumerator values are intentionally compatible with the backing library's normalization enums, if available.
- // Conceptual shape of the enum (the concrete values are selected by the #if block that follows):
- // {
- // C, // Each base plus combining characters to the canonical precomposed equivalent.
- // D, // Each precomposed character to its canonical decomposed equivalent.
- // KC, // Each base plus combining characters to the canonical precomposed
- // // equivalents and all compatibility characters to their equivalents.
- // KD, // Each precomposed character to its canonical decomposed equivalent
- // // and all compatibility characters to their equivalents.
- // Other, // Not supported
- // };
- #if !defined(HAS_ICU) && !_WIN32 // No ICU and not Windows: no backing enum, plain sequential values
- enum NormalizationForm
- {
- C,
- D,
- KC,
- KD,
- Other
- };
- #elif !defined(HAS_ICU) && _WIN32 // No ICU, Windows: each value mirrors the matching NORM_FORM enumerator
- enum NormalizationForm
- {
- C = NORM_FORM::NormalizationC,
- D = NORM_FORM::NormalizationD,
- KC = NORM_FORM::NormalizationKC,
- KD = NORM_FORM::NormalizationKD,
- Other = NORM_FORM::NormalizationOther
- };
- #else // Only reached when HAS_ICU is defined
- // ICU does not have specific enum values for KC and KD.
- // Instead, its API takes a string argument, "nfc" or "nfkc",
- // coupled with the COMPOSE or DECOMPOSE enum variant.
- // ICU does not have an explicit "other" either, but the static_asserts
- // in UnicodeText.ICU.cpp ensure that Other is different from C/D/KC/KD.
- static const char * const ICU_NORMALIZATION_NFC = "nfc";
- static const char * const ICU_NORMALIZATION_NFKC = "nfkc";
- enum NormalizationForm
- {
- C = UNORM2_COMPOSE,
- D = UNORM2_DECOMPOSE,
- KC = INT_MIN + UNORM2_COMPOSE, // Same ICU mode as C, offset by INT_MIN so the enumerator stays distinct from C
- KD = INT_MIN + UNORM2_DECOMPOSE, // Same ICU mode as D, offset by INT_MIN so the enumerator stays distinct from D
- Other = INT_MAX
- };
- #endif
- // Coarse class that a Unicode codepoint is mapped to.
- // Used by the legacy API (returned by GetLegacyCharacterClassificationType).
- enum class CharacterClassificationType
- {
- Invalid,
- Letter,
- Whitespace,
- NewLine,
- DigitOrPunct
- };
- // Bit flags that the legacy API maps a codepoint to
- // (returned by GetLegacyCharacterTypeFlags); stored in a single byte.
- // This remains a plain enum rather than an enum class
- // because the unscoped names are referenced widely in CharClassifier,
- // so it is left as is to keep that code readable.
- enum CharacterTypeFlags : byte
- {
- UnknownChar = 0x0,
- IdChar = 0x01,
- IdLeadChar = 0x02,
- HexChar = 0x04,
- DecimalChar = 0x08,
- SpaceChar = 0x10,
- LineFeedChar = 0x20,
- LineCharGroup = SpaceChar | LineFeedChar, // 0x30
- LetterCharGroup = IdChar | IdLeadChar, // 0x03
- HexCharGroup = IdChar | IdLeadChar | HexChar, // 0x07
- DecimalCharGroup = IdChar | DecimalChar, // 0x09
- };
- // Parameter values for APIs that change a string's case. NOTE(review): no declaration visible in this part of the header takes a CaseFlags - confirm where it is used.
- enum CaseFlags
- {
- CaseFlagsUpper,
- CaseFlagsLower
- };
- // Subset of the Unicode general categories listed in Table 4-9 of the Unicode 8.0 Spec.
- // The Unicode categories are mapped into "category classes" at the granularity that the
- // consumer of this data needs to discriminate at. Returned by GetGeneralCategoryClass.
- enum UnicodeGeneralCategoryClass
- {
- CategoryClassLetter,
- CategoryClassDigit,
- CategoryClassLineSeparator,
- CategoryClassParagraphSeparator,
- CategoryClassSpaceSeparator,
- CategoryClassSpacingCombiningMark,
- CategoryClassNonSpacingMark,
- CategoryClassConnectorPunctuation,
- CategoryClassOther
- };
- //
- // Normalizes the characters of a given UTF16 string according to the rules of Unicode 4.0 TR#15.
- // This is needed for the implementation of the ES6 method String.prototype.normalize.
- //
- // Params:
- // normalizationForm: the Unicode Normalization Form
- // sourceString: The string to normalize
- // sourceLength: The number of characters in the source string (unsigned, unlike destLength). This must be provided; the function does not assume null-termination. Length should be greater than 0.
- // destString: Optional pointer to the destination string buffer. It can be null if destLength is 0.
- // destLength: Size in characters of the destination buffer, or 0 if the function should just return the required character count for the dest buffer.
- // pErrorOut: Set to NoError, or the actual error if one occurred.
- //
- // Return Value:
- // Length of the normalized string in the destination buffer.
- // If the return value is less than or equal to 0, inspect *pErrorOut to understand the error.
- //
- int32 NormalizeString(NormalizationForm normalizationForm, const char16* sourceString, uint32 sourceLength, char16* destString, int32 destLength, ApiError* pErrorOut);
- //
- // Verifies that a given UTF16 string is normalized according to the rules of Unicode 4.0 TR#15.
- //
- // Params:
- // normalizationForm: the Unicode Normalization Form (spelled normalizatingForm in the declaration below)
- // testString: The string to test
- // testStringLength: The number of characters in the test string. If the string is null-terminated and the API should calculate the length itself, set testStringLength to 0.
- //
- // Return Value:
- // true if the input string is already normalized, false if it isn't.
- // No error codes are returned since they're not used by the caller.
- //
- bool IsNormalizedString(NormalizationForm normalizatingForm, const char16* testString, int32 testStringLength);
- //
- // Lets the caller know whether an external Unicode helper library is being used by the PAL,
- // for example ICU, Windows.Globalization.dll or JsIntl.dll.
- //
- // Return Value:
- // true if Windows.Globalization or ICU is available and being used,
- // false otherwise.
- //
- bool IsExternalUnicodeLibraryAvailable();
- //
- // Returns whether a codepoint is considered a whitespace character according to the Unicode spec.
- //
- bool IsWhitespace(codepoint_t ch);
- //
- // Returns whether a codepoint is in the ID_Start class according to Unicode Standard Annex 31.
- // These characters are considered valid to start identifiers in programming languages.
- //
- bool IsIdStart(codepoint_t ch);
- //
- // Returns whether a codepoint is in the ID_Continue class according to Unicode Standard Annex 31.
- // These characters are considered valid inside identifiers in programming languages.
- // They are also called identifier characters, as they are a superset of the ID_Start characters.
- //
- bool IsIdContinue(codepoint_t ch);
- //
- // Returns the general category class of a Unicode codepoint, based on Chapter 4, Section 4.5 of the Unicode 8 Spec.
- //
- UnicodeGeneralCategoryClass GetGeneralCategoryClass(codepoint_t ch);
- //
- // Change the case of a string using linguistic rules.
- // Template params: toUpper, useInvariant - NOTE(review): presumably select upper- vs. lower-casing and locale-invariant rules; confirm against the implementations.
- // Params:
- // sourceString: The string to convert
- // sourceLength: The number of characters in the source string. This must be provided; the function does not assume null-termination. Length should be greater than 0.
- // destString: Optional pointer to the destination string buffer. It can be null if destLength is 0, i.e. when only the required buffer size is wanted.
- // destLength: Size in characters of the destination buffer, or 0 if the function should just return the required character count for the dest buffer.
- // pErrorOut: Set to NoError, or the actual error if one occurred.
- // Return Value:
- // The length required to convert sourceString to the given case, even if destString was not large enough to hold it, including the null terminator
- //
- template<bool toUpper, bool useInvariant>
- charcount_t ChangeStringLinguisticCase(_In_count_(sourceLength) const char16* sourceString, _In_ charcount_t sourceLength, _Out_writes_(destLength) char16* destString, _In_ charcount_t destLength, _Out_ ApiError* pErrorOut);
- //
- // Returns the classification type of a UTF16 code unit using Unicode 2.0 rules.
- // Used for ES5 compat.
- //
- CharacterClassificationType GetLegacyCharacterClassificationType(char16 character);
- //
- // Returns the CharacterTypeFlags associated with a UTF16 code unit using Unicode 2.0 rules.
- // Used for ES5 compat.
- //
- CharacterTypeFlags GetLegacyCharacterTypeFlags(char16 character);
- //
- // Compares two Unicode strings, except that runs of digits are compared
- // numerically rather than as text.
- // For example, test2 comes before test11.
- //
- // Return Value:
- // 0 - The two strings are equal
- // -1 - string1 is greater than string2
- // +1 - string1 is lesser than string2
- // NOTE(review): the -1/+1 meanings documented above are the reverse of the usual strcmp-style convention - confirm against the implementations before relying on them.
- int LogicalStringCompare(const char16* string1, int str1size, const char16* string2, int str2size);
- };
- };
|