UnicodeText.h 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236
  1. //-------------------------------------------------------------------------------------------------------
  2. // Copyright (C) Microsoft. All rights reserved.
  3. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
  4. //-------------------------------------------------------------------------------------------------------
  5. #pragma once
  6. #include "Core/CommonTypedefs.h"
  7. #include "ChakraICU.h"
  8. #include "sal.h"
  9. namespace PlatformAgnostic
  10. {
  11. namespace UnicodeText
  12. {
  // Error codes reported by the subset of APIs where failures are expected.
  // On those APIs an ApiError* is the last out parameter.
  enum ApiError
  {
      NoError            = 0,
      InvalidParameter   = 1,
      InvalidUnicodeText = 2,
      InsufficientBuffer = 3,
      OutOfMemory        = 4,
      UntranslatedError  = 5
  };
  // The form to use for NormalizeString.
  // The enumerator values are intentionally compatible with the backing
  // normalization-kind enums when a backing library is available.
  //
  //   C     - Each base plus combining characters to the canonical precomposed equivalent.
  //   D     - Each precomposed character to its canonical decomposed equivalent.
  //   KC    - Each base plus combining characters to the canonical precomposed
  //           equivalents and all compatibility characters to their equivalents.
  //   KD    - Each precomposed character to its canonical decomposed equivalent
  //           and all compatibility characters to their equivalents.
  //   Other - Not supported
  #if defined(HAS_ICU)
  // ICU has no dedicated enums for KC and KD. Instead it takes a string
  // argument, "nfc" or "nfkc", coupled with the COMPOSE or DECOMPOSE enum
  // variant. ICU also has no explicit "other"; the static_asserts in
  // UnicodeText.ICU.cpp ensure that Other is different from C/D/KC/KD.
  static const char * const ICU_NORMALIZATION_NFC = "nfc";
  static const char * const ICU_NORMALIZATION_NFKC = "nfkc";
  enum NormalizationForm
  {
      C     = UNORM2_COMPOSE,
      D     = UNORM2_DECOMPOSE,
      KC    = INT_MIN + UNORM2_COMPOSE,
      KD    = INT_MIN + UNORM2_DECOMPOSE,
      Other = INT_MAX
  };
  #elif _WIN32
  // No ICU, Windows build: mirror the NORM_FORM values.
  enum NormalizationForm
  {
      C     = NORM_FORM::NormalizationC,
      D     = NORM_FORM::NormalizationD,
      KC    = NORM_FORM::NormalizationKC,
      KD    = NORM_FORM::NormalizationKD,
      Other = NORM_FORM::NormalizationOther
  };
  #else
  // No ICU and not Windows: there is no backing enum to mirror.
  enum NormalizationForm
  {
      C,
      D,
      KC,
      KD,
      Other
  };
  #endif
  // Classes of characters that a unicode codepoint can be mapped to.
  // Used by the legacy API.
  enum class CharacterClassificationType
  {
      Invalid      = 0,
      Letter       = 1,
      Whitespace   = 2,
      NewLine      = 3,
      DigitOrPunct = 4
  };
  82. // Used by the legacy API for mapping
  83. // a codepoint to a set of flags
  84. // This remains as an enum rather than an enum class
  85. // because the global names are referenced widely in CharClassifier
  86. // so leaving this as is to make the code more readable
  87. enum CharacterTypeFlags : byte
  88. {
  89. UnknownChar = 0x0,
  90. IdChar = 0x01,
  91. IdLeadChar = 0x02,
  92. HexChar = 0x04,
  93. DecimalChar = 0x08,
  94. SpaceChar = 0x10,
  95. LineFeedChar = 0x20,
  96. LineCharGroup = SpaceChar | LineFeedChar,
  97. LetterCharGroup = IdChar | IdLeadChar,
  98. HexCharGroup = IdChar | IdLeadChar | HexChar,
  99. DecimalCharGroup = IdChar | DecimalChar,
  100. };
  // Parameters for the APIs that change the case of a string
  enum CaseFlags
  {
      CaseFlagsUpper = 0,
      CaseFlagsLower = 1
  };
  // Subset of the unicode general categories listed in Table 4-9 of the Unicode 8.0 Spec.
  // The unicode categories are folded into "category classes" at exactly the
  // granularity at which the consumer of this data needs to tell them apart.
  enum UnicodeGeneralCategoryClass
  {
      CategoryClassLetter               = 0,
      CategoryClassDigit                = 1,
      CategoryClassLineSeparator        = 2,
      CategoryClassParagraphSeparator   = 3,
      CategoryClassSpaceSeparator       = 4,
      CategoryClassSpacingCombiningMark = 5,
      CategoryClassNonSpacingMark       = 6,
      CategoryClassConnectorPunctuation = 7,
      CategoryClassOther                = 8
  };
  122. //
  123. // This method normalizes the characters of a given UTF16 string according to the rules of Unicode 4.0 TR#15
  124. // This is needed for implementation of the ES6 method String.prototoype.normalize
  125. //
  126. // Params:
  127. // normalizationForm: the Unicode Normalization Form
  128. // sourceString: The string to normalize
  129. // sourceLength: The number of characters in the source string. This must be provided, the function does not assume null-termination etc. Length should be greater than 0.
  130. // destString: Optional pointer to the destination string buffer. It can be null if destLength is 0.
  131. // destLength: Size in characters of the destination buffer, or 0 if the function shuld just return the required character count for the dest buffer.
  132. // pErrorOut: Set to NoError, or the actual error if one occurred.
  133. //
  134. // Return Value:
  135. // length of the normalized string in the destination buffer
  136. // If the return value is less than or equal to 0, then see the value of pErrorOut to understand the error
  137. //
  138. int32 NormalizeString(NormalizationForm normalizationForm, const char16* sourceString, uint32 sourceLength, char16* destString, int32 destLength, ApiError* pErrorOut);
  139. //
  140. // This method verifies that a given UTF16 string is normalized according to the rules of Unicode 4.0 TR#15.
  141. //
  142. // Params:
  143. // normalizationForm: the Unicode Normalization Form
  144. // testString: The string to test
  145. // testStringLength: The number of characters in the test string. If the string is null-terminated, and the API should calculate the length, set testStringLength to 0.
  146. //
  147. // Return Value:
  148. // true if the input string is already normalized, false if it isn't
  149. // No error codes are returned since they're not used by the caller.
  150. //
  151. bool IsNormalizedString(NormalizationForm normalizatingForm, const char16* testString, int32 testStringLength);
  //
  // Tells the caller whether the PAL is using an external Unicode helper library,
  // for example ICU / Windows.Globalization.dll / JsIntl.dll.
  //
  // Return Value:
  //   true  if Windows.Globalization or the ICU is available and being used
  //   false otherwise
  //
  bool IsExternalUnicodeLibraryAvailable();
  161. //
  162. // Return if a codepoint is considered a whitespace character according to the Unicode spec
  163. //
  164. bool IsWhitespace(codepoint_t ch);
  165. //
  166. // Return if a codepoint is in the ID_START class according to the Unicode Standard Annex 31.
  167. // These characters are considered valid to start identifiers for programming languages.
  168. //
  169. bool IsIdStart(codepoint_t ch);
  170. //
  171. // Return if a codepoint is in the ID_CONTINUE class according to the Unicode Standard Annex 31.
  172. // These characters are considered valid to be in identifiers for programming languages
  173. // These are also called identifier characters as they are a superset of ID_Start characters
  174. //
  175. bool IsIdContinue(codepoint_t ch);
  176. //
  177. // Return the General Category of the unicode code point based on Chapter 4, Section 4.5 of the Unicode 8 Spec
  178. //
  179. UnicodeGeneralCategoryClass GetGeneralCategoryClass(codepoint_t ch);
  180. //
  181. // Change the case of a string using linguistic rules
  182. // Params:
  183. // sourceString: The string to convert
  184. // sourceLength: The number of characters in the source string. This must be provided, the function does not assume null-termination. Length should be greater than 0.
  185. // destString: Optional pointer to the destination string buffer. It can be null if destLength is 0, if you want the required buffer size
  186. // destLength: Size in characters of the destination buffer, or 0 if the function shuld just return the required character count for the dest buffer.
  187. // pErrorOut: Set to NoError, or the actual error if one occurred.
  188. //
  189. // Return Value:
  190. // The length required to convert sourceString to the given case, even if destString was not large enough to hold it, including the null terminator
  191. //
  192. template<bool toUpper, bool useInvariant>
  193. charcount_t ChangeStringLinguisticCase(_In_count_(sourceLength) const char16* sourceString, _In_ charcount_t sourceLength, _Out_writes_(destLength) char16* destString, _In_ charcount_t destLength, _Out_ ApiError* pErrorOut);
  194. //
  195. // Return the classification type of the character using Unicode 2.0 rules
  196. // Used for ES5 compat
  197. //
  198. CharacterClassificationType GetLegacyCharacterClassificationType(char16 character);
  199. //
  200. // Return the flags associated with the character using Unicode 2.0 rules
  201. // Used for ES5 compat
  202. //
  203. CharacterTypeFlags GetLegacyCharacterTypeFlags(char16 character);
  204. //
  205. // Compares two unicode strings but numbers are compared
  206. // numerically rather than as text.
  207. // For example, test2 comes before test11
  208. //
  209. // Return Value:
  210. // 0 - The two strings are equal
  211. // -1 - string1 is greater than string2
  212. // +1 - string1 is lesser than string2
  213. //
  214. int LogicalStringCompare(const char16* string1, int str1size, const char16* string2, int str2size);
  215. };
  216. };