filter_utf8.cc 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105
  1. // Copyright (c) 2018-present, Facebook, Inc.
  2. // All rights reserved.
  3. //
  4. // This source code is licensed under the MIT license found in the
  5. // LICENSE file in the root directory of this source tree.
  6. #include <cstdint>
  7. #include <iostream>
  8. #include <string>
  // Tell whether the first n bytes at str are all UTF-8 continuation bytes,
  // i.e. bytes whose two top bits are 10 (pattern 10.xxxxxx).
  // Returns true when n is zero or negative, since no byte is inspected.
  bool continuation(uint8_t* str, int n)
  {
    const uint8_t kTagMask = 0xc0;          // selects the two top bits
    const uint8_t kContinuationTag = 0x80;  // 10.xxxxxx

    for (const uint8_t* cursor = str; n > 0; ++cursor, --n) {
      if ((*cursor & kTagMask) != kContinuationTag) {
        return false;
      }
    }
    return true;
  }
  // Detect a sequence that encodes a code point above U+10FFFF, the largest
  // value Unicode allows. U+10FFFF itself is encoded in UTF-8 as:
  //   * 11110.100 10.001111 10.111111 10.111111   (f4 8f bf bf)
  // Anything beyond it therefore has a first byte greater than 0xf4, or a
  // first byte equal to 0xf4 followed by a second byte greater than 0x8f.
  // The second byte is only read when the first byte is exactly 0xf4.
  bool invalid(uint8_t* str)
  {
    const uint8_t lead = str[0];
    if (lead > 0xf4) {
      return true;
    }
    if (lead < 0xf4) {
      return false;
    }
    return str[1] > 0x8f;
  }
  // Detect an encoded surrogate half. Surrogate halves correspond to the
  // range U+D800 through U+DFFF, whose UTF-8 encodings span:
  //   * 1110.1101 10.100000 10.000000   (ed a0 80)
  //   * 1110.1101 10.111111 10.111111   (ed bf bf)
  // So the first byte must equal 0xed and the sixth bit (0x20) of the
  // second byte must be set. The second byte is only read when the first
  // byte is 0xed.
  bool surrogate(uint8_t* str)
  {
    if (str[0] != 0xed) {
      return false;
    }
    return (str[1] & 0x20) != 0;
  }
  // A 2-byte sequence has the layout 110.yyyyx 10.xxxxxx. It is overlong
  // when the four y bits of the first byte (mask 0x1e) are all zero.
  // Only the first byte is examined.
  bool overlong_2(uint8_t* str)
  {
    const uint8_t high_payload = str[0] & 0x1e;
    return !high_payload;
  }
  // A 3-byte sequence has the layout 1110.yyyy 10.yxxxxx 10.xxxxxx. It is
  // overlong when all five y bits are zero: the low nibble of the first
  // byte (mask 0x0f) and bit 0x20 of the second byte.
  // The second byte is only read when the first byte's y bits are zero.
  bool overlong_3(uint8_t* str)
  {
    if ((str[0] & 0x0f) != 0) {
      return false;
    }
    return (str[1] & 0x20) == 0;
  }
  // A 4-byte sequence has the layout
  //   11110.yyy 10.yyxxxx 10.xxxxxx 10.xxxxxx
  // It is overlong when all five y bits are zero: the low three bits of the
  // first byte (mask 0x07) and bits 0x30 of the second byte.
  // The second byte is only read when the first byte's y bits are zero.
  bool overlong_4(uint8_t* str)
  {
    if ((str[0] & 0x07) != 0) {
      return false;
    }
    return (str[1] & 0x30) == 0;
  }
  54. bool valid_utf8(uint8_t* str, size_t length)
  55. {
  56. uint8_t* end = str + length;
  57. while (str < end) {
  58. if (str[0] < 0x80) {
  59. // 0.xxxxxxx
  60. str += 1;
  61. } else if ((str[0] & 0xe0) == 0xc0) {
  62. // 110.xxxxx 10.xxxxxx
  63. if (str + 1 >= end) return false;
  64. if (!continuation(str + 1, 1)) return false;
  65. if (overlong_2(str)) return false;
  66. str += 2;
  67. } else if ((str[0] & 0xf0) == 0xe0) {
  68. // 1110.xxxx 10.xxxxxx 10.xxxxxx
  69. if (str + 2 >= end) return false;
  70. if (!continuation(str + 1, 2)) return false;
  71. if (overlong_3(str)) return false;
  72. if (surrogate(str)) return false;
  73. str += 3;
  74. } else if ((str[0] & 0xf8) == 0xf0) {
  75. // 11110.xxx 10.xxxxxx 10.xxxxxx 10.xxxxxx
  76. if (str + 3 >= end) return false;
  77. if (!continuation(str + 1, 3)) return false;
  78. if (overlong_4(str)) return false;
  79. if (invalid(str)) return false;
  80. str += 4;
  81. } else {
  82. return false;
  83. }
  84. }
  85. return true;
  86. }
  87. int main(int argc, char** argv)
  88. {
  89. std::ios_base::sync_with_stdio(false);
  90. for (std::string line; std::getline(std::cin, line);) {
  91. if (valid_utf8((uint8_t*) line.data(), line.length())) {
  92. std::cout << line << std::endl;
  93. }
  94. }
  95. return 0;
  96. }