// text.js
  var text = text || {};

  /**
   * Factory that inspects its input and returns the decoder matching its encoding.
   * Every decoder returned exposes `decode()` and an `encoding` getter.
   */
  text.Decoder = class {

      /**
       * Chooses a decoder for `data`.
       *
       * Strings are wrapped directly. For byte input the encoding is detected, in
       * order, from a byte-order mark, from a zero-byte distribution heuristic for
       * BOM-less UTF-16, and finally from the `encoding` hint (defaulting to UTF-8).
       *
       * @param {string|Uint8Array|Object} data - Text, raw bytes, or an object whose
       *     `peek()` method returns the raw bytes.
       * @param {string} [encoding] - Expected encoding name ('utf-8', 'utf-16',
       *     'utf-32', 'iso-8859-*', 'latin-*'). When given, it must agree with what
       *     is detected.
       * @returns {Object} A decoder instance.
       * @throws {text.Error} If the detected encoding contradicts `encoding`, or the
       *     data carries a UTF-7 or GB-18030 signature.
       */
      static open(data, encoding) {
          if (typeof data === 'string') {
              return new text.Decoder.String(data);
          }
          // Rejects a caller-supplied hint that disagrees with the detected encoding.
          const expect = (detected) => {
              if (encoding && encoding !== detected) {
                  throw new text.Error(`Invalid encoding '${encoding}'.`);
              }
          };
          const buffer = data instanceof Uint8Array ? data : data.peek();
          const length = buffer.length;
          const startsWith = (...signature) => {
              return length >= signature.length && signature.every((value, index) => buffer[index] === value);
          };
          if (startsWith(0xef, 0xbb, 0xbf)) {
              expect('utf-8');
              return new text.Decoder.Utf8(buffer, 3, true);
          }
          // The UTF-32 marks are tested before the UTF-16 marks because the UTF-32 LE
          // mark (FF FE 00 00) begins with the UTF-16 LE mark (FF FE).
          if (startsWith(0xff, 0xfe, 0x00, 0x00)) {
              expect('utf-32');
              return new text.Decoder.Utf32LE(buffer, 4);
          }
          if (startsWith(0x00, 0x00, 0xfe, 0xff)) {
              expect('utf-32');
              return new text.Decoder.Utf32BE(buffer, 4);
          }
          if (startsWith(0xff, 0xfe)) {
              expect('utf-16');
              return new text.Decoder.Utf16LE(buffer, 2);
          }
          if (startsWith(0xfe, 0xff)) {
              expect('utf-16');
              return new text.Decoder.Utf16BE(buffer, 2);
          }
          // UTF-7 signature: '+/v' followed by one of '8', '9', '+' or '/'.
          if (startsWith(0x2B, 0x2F, 0x76) && length >= 4 &&
              (buffer[3] === 0x38 || buffer[3] === 0x39 || buffer[3] === 0x2B || buffer[3] === 0x2F)) {
              throw new text.Error("Unsupported UTF-7 encoding.");
          }
          if (startsWith(0x84, 0x31, 0x95, 0x33)) {
              throw new text.Error("Unsupported GB-18030 encoding.");
          }
          // BOM-less UTF-16 heuristic: one byte of each pair is never zero while the
          // other byte is zero in more than half of the pairs.
          if (length > 4 && (length % 2) === 0 && (buffer[0] === 0x00 || buffer[1] === 0x00 || buffer[2] === 0x00 || buffer[3] === 0x00)) {
              const lo = new Uint32Array(256);
              const hi = new Uint32Array(256);
              for (let i = 0; i < length; i += 2) {
                  lo[buffer[i]]++;
                  hi[buffer[i + 1]]++;
              }
              const pairs = length >> 1;
              if (lo[0x00] === 0 && (hi[0x00] / pairs) > 0.5) {
                  expect('utf-16');
                  return new text.Decoder.Utf16LE(buffer, 0);
              }
              if (hi[0x00] === 0 && (lo[0x00] / pairs) > 0.5) {
                  expect('utf-16');
                  return new text.Decoder.Utf16BE(buffer, 0);
              }
          }
          if (encoding && (encoding.startsWith('iso-8859-') || encoding.startsWith('latin-'))) {
              return new text.Decoder.Latin1(buffer, 0);
          }
          expect('utf-8');
          // Invalid sequences are fatal only when UTF-8 was explicitly requested.
          return new text.Decoder.Utf8(buffer, 0, encoding === 'utf-8');
      }
  };
  /**
   * Decoder over an in-memory JavaScript string. The string is split up front into
   * an array with one entry per code point (a surrogate pair stays together);
   * unpaired surrogates do not match the pattern and are therefore left out.
   */
  text.Decoder.String = class {

      /**
       * @param {string} buffer - The text to iterate over.
       */
      constructor(buffer) {
          // match() yields null when nothing matches (for example a string made only
          // of unpaired surrogates); fall back to an empty list so `length` is valid.
          const units = buffer ? buffer.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]|[^\uD800-\uDFFF]/g) : null;
          this.buffer = units || [];
          this.position = 0;
          this.length = this.buffer.length;
      }

      /** @returns {null} A string source has no byte encoding. */
      get encoding() {
          return null;
      }

      /**
       * @returns {string|undefined} The next code point as a string, or `undefined`
       *     once the input is exhausted.
       */
      decode() {
          if (this.position < this.length) {
              return this.buffer[this.position++];
          }
          return undefined;
      }
  };
  /**
   * UTF-8 decoder over a byte buffer. Each decode() call returns one code point.
   * A malformed sequence consumes only its lead byte and then either throws (fatal
   * mode) or yields U+FFFD.
   */
  text.Decoder.Utf8 = class {

      /**
       * @param {Uint8Array} buffer - Bytes to decode.
       * @param {number} [position] - Index of the first byte to decode (defaults to 0).
       * @param {boolean} [fatal] - Throw on malformed input instead of substituting U+FFFD.
       */
      constructor(buffer, position, fatal) {
          this.position = position || 0;
          this.buffer = buffer;
          this.fatal = fatal;
      }

      /** @returns {string} Always 'utf-8'. */
      get encoding() {
          return 'utf-8';
      }

      /**
       * @returns {string|undefined} The next code point as a string, or `undefined`
       *     at the end of the buffer.
       * @throws {text.Error} On a malformed sequence when `fatal` is set.
       */
      decode() {
          const buffer = this.buffer;
          const c = buffer[this.position];
          if (c === undefined) {
              return c;
          }
          this.position++;
          if (c < 0x80) {
              return String.fromCodePoint(c);
          }
          // Reading past the end gives undefined, which fails both comparisons, so
          // truncated sequences are rejected without a separate length check.
          const between = (value, min, max) => value >= min && value <= max;
          const c2 = buffer[this.position];
          if (c >= 0xC2 && c <= 0xDF) {
              if (between(c2, 0x80, 0xBF)) {
                  this.position++;
                  return String.fromCharCode(((c & 0x1F) << 6) | (c2 & 0x3F));
              }
          } else if (c >= 0xE0 && c <= 0xEF) {
              // E0 must be followed by A0..BF (no overlong forms) and ED by 80..9F
              // (no encoded surrogates).
              const min = c === 0xE0 ? 0xA0 : 0x80;
              const max = c === 0xED ? 0x9F : 0xBF;
              const c3 = buffer[this.position + 1];
              if (between(c2, min, max) && between(c3, 0x80, 0xBF)) {
                  this.position += 2;
                  return String.fromCharCode(((c & 0x0F) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
              }
          } else if (c >= 0xF0 && c <= 0xF4) {
              // F0 must be followed by 90..BF (no overlong forms) and F4 by 80..8F
              // (nothing above U+10FFFF).
              const min = c === 0xF0 ? 0x90 : 0x80;
              const max = c === 0xF4 ? 0x8F : 0xBF;
              const c3 = buffer[this.position + 1];
              const c4 = buffer[this.position + 2];
              if (between(c2, min, max) && between(c3, 0x80, 0xBF) && between(c4, 0x80, 0xBF)) {
                  this.position += 3;
                  return String.fromCodePoint(((c & 0x07) << 18) | ((c2 & 0x3F) << 12) | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
              }
          }
          if (this.fatal) {
              throw new text.Error('Invalid utf-8 character.');
          }
          return String.fromCharCode(0xfffd);
      }
  };
  /**
   * Single-byte decoder: every byte is returned as the code point with the same
   * numeric value.
   */
  text.Decoder.Latin1 = class {

      /**
       * @param {Uint8Array} buffer - Bytes to decode.
       * @param {number} [position] - Index of the first byte to decode (defaults to 0).
       */
      constructor(buffer, position) {
          this.position = position || 0;
          this.buffer = buffer;
      }

      /** @returns {string} Always 'latin-1'. */
      get encoding() {
          return 'latin-1';
      }

      /**
       * @returns {string|undefined} The next character, or `undefined` at the end
       *     of the buffer.
       */
      decode() {
          const value = this.buffer[this.position];
          if (value !== undefined) {
              this.position += 1;
              return String.fromCodePoint(value);
          }
          return undefined;
      }
  };
  /**
   * Little-endian UTF-16 decoder over a byte buffer. Each decode() call returns one
   * code point; surrogate pairs are combined and unpaired surrogates become U+FFFD.
   */
  text.Decoder.Utf16LE = class {

      /**
       * @param {Uint8Array} buffer - Bytes to decode.
       * @param {number} [position] - Index of the first byte to decode (defaults to 0).
       */
      constructor(buffer, position) {
          this.buffer = buffer;
          this.position = position || 0;
          this.length = buffer.length;
      }

      /** @returns {string} Always 'utf-16'. */
      get encoding() {
          return 'utf-16';
      }

      /**
       * @returns {string|undefined} The next code point as a string, or `undefined`
       *     when fewer than two bytes remain.
       */
      decode() {
          const buffer = this.buffer;
          if (this.position + 1 < this.length) {
              const c = buffer[this.position] | (buffer[this.position + 1] << 8);
              this.position += 2;
              if (c < 0xD800 || c > 0xDFFF) {
                  return String.fromCharCode(c);
              }
              // A high surrogate is only valid when a low surrogate follows. The next
              // unit is peeked and consumed only if it completes the pair, so a
              // mismatched unit is decoded on its own by the following call.
              if (c <= 0xDBFF && this.position + 1 < this.length) {
                  const c2 = buffer[this.position] | (buffer[this.position + 1] << 8);
                  if (c2 >= 0xDC00 && c2 <= 0xDFFF) {
                      this.position += 2;
                      return String.fromCodePoint(0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff));
                  }
              }
              return String.fromCharCode(0xfffd);
          }
          return undefined;
      }
  };
  /**
   * Big-endian UTF-16 decoder over a byte buffer. Each decode() call returns one
   * code point; surrogate pairs are combined and unpaired surrogates become U+FFFD.
   */
  text.Decoder.Utf16BE = class {

      /**
       * @param {Uint8Array} buffer - Bytes to decode.
       * @param {number} [position] - Index of the first byte to decode (defaults to 0).
       */
      constructor(buffer, position) {
          this.buffer = buffer;
          this.position = position || 0;
          this.length = buffer.length;
      }

      /** @returns {string} Always 'utf-16'. */
      get encoding() {
          return 'utf-16';
      }

      /**
       * @returns {string|undefined} The next code point as a string, or `undefined`
       *     when fewer than two bytes remain.
       */
      decode() {
          const buffer = this.buffer;
          if (this.position + 1 < this.length) {
              const c = (buffer[this.position] << 8) | buffer[this.position + 1];
              this.position += 2;
              if (c < 0xD800 || c > 0xDFFF) {
                  return String.fromCharCode(c);
              }
              // A high surrogate is only valid when a low surrogate follows. The next
              // unit is peeked and consumed only if it completes the pair, so a
              // mismatched unit is decoded on its own by the following call.
              if (c <= 0xDBFF && this.position + 1 < this.length) {
                  const c2 = (buffer[this.position] << 8) | buffer[this.position + 1];
                  if (c2 >= 0xDC00 && c2 <= 0xDFFF) {
                      this.position += 2;
                      return String.fromCodePoint(0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff));
                  }
              }
              return String.fromCharCode(0xfffd);
          }
          return undefined;
      }
  };
  /**
   * Little-endian UTF-32 decoder over a byte buffer. Each decode() call consumes
   * four bytes and returns one code point; values that are not Unicode scalar
   * values become U+FFFD.
   */
  text.Decoder.Utf32LE = class {

      /**
       * @param {Uint8Array} buffer - Bytes to decode.
       * @param {number} [position] - Index of the first byte to decode (defaults to 0).
       */
      constructor(buffer, position) {
          this.buffer = buffer;
          this.position = position || 0;
          this.length = buffer.length;
      }

      /** @returns {string} Always 'utf-32'. */
      get encoding() {
          return 'utf-32';
      }

      /**
       * @returns {string|undefined} The next code point as a string, or `undefined`
       *     when fewer than four bytes remain.
       */
      decode() {
          if (this.position + 3 < this.length) {
              const buffer = this.buffer;
              const offset = this.position;
              this.position += 4;
              // `>>> 0` keeps the value unsigned when the top byte has its high bit set.
              const c = (buffer[offset] | (buffer[offset + 1] << 8) | (buffer[offset + 2] << 16) | (buffer[offset + 3] << 24)) >>> 0;
              const surrogate = c >= 0xD800 && c <= 0xDFFF;
              if (c <= 0x10FFFF && !surrogate) {
                  return String.fromCodePoint(c);
              }
              return String.fromCharCode(0xfffd);
          }
          return undefined;
      }
  };
  /**
   * Big-endian UTF-32 decoder over a byte buffer. Each decode() call consumes four
   * bytes and returns one code point; values that are not Unicode scalar values
   * become U+FFFD.
   */
  text.Decoder.Utf32BE = class {

      /**
       * @param {Uint8Array} buffer - Bytes to decode.
       * @param {number} [position] - Index of the first byte to decode (defaults to 0).
       */
      constructor(buffer, position) {
          this.buffer = buffer;
          this.position = position || 0;
          this.length = buffer.length;
      }

      /** @returns {string} Always 'utf-32'. */
      get encoding() {
          return 'utf-32';
      }

      /**
       * @returns {string|undefined} The next code point as a string, or `undefined`
       *     when fewer than four bytes remain.
       */
      decode() {
          if (this.position + 3 < this.length) {
              const buffer = this.buffer;
              const offset = this.position;
              this.position += 4;
              // `>>> 0` keeps the value unsigned when the top byte has its high bit set.
              const c = ((buffer[offset] << 24) | (buffer[offset + 1] << 16) | (buffer[offset + 2] << 8) | buffer[offset + 3]) >>> 0;
              const surrogate = c >= 0xD800 && c <= 0xDFFF;
              if (c <= 0x10FFFF && !surrogate) {
                  return String.fromCodePoint(c);
              }
              return String.fromCharCode(0xfffd);
          }
          return undefined;
      }
  };
  /**
   * Line reader on top of text.Decoder: each read() call returns the text up to
   * (and excluding) the next '\n'.
   */
  text.Reader = class {

      /**
       * @param {string|Uint8Array|Object} data - Input handed to text.Decoder.open().
       * @param {number} [length] - Maximum number of decoded characters to read;
       *     a falsy value means Number.MAX_SAFE_INTEGER.
       */
      constructor(data, length) {
          this._decoder = text.Decoder.open(data);
          this._position = 0;
          this._length = length || Number.MAX_SAFE_INTEGER;
      }

      /**
       * Convenience factory, equivalent to `new text.Reader(data, length)`.
       * @param {string|Uint8Array|Object} data
       * @param {number} [length]
       * @returns {text.Reader}
       */
      static open(data, length) {
          return new text.Reader(data, length);
      }

      /**
       * @returns {string|undefined} The next line without its terminating '\n', or
       *     `undefined` once the limit or the end of the input has been reached.
       */
      read() {
          if (this._position >= this._length) {
              return undefined;
          }
          // Text is gathered in pieces of at least 32 UTF-16 units and joined once at
          // the end.
          const pieces = [];
          let tail = '';
          let done = false;
          while (!done) {
              const c = this._decoder.decode();
              if (c === undefined) {
                  // Input exhausted: clamp the limit so later calls return undefined.
                  this._length = this._position;
                  done = true;
              } else {
                  this._position += 1;
                  if (this._position > this._length || c === '\n') {
                      done = true;
                  } else {
                      tail += c;
                      if (tail.length >= 32) {
                          pieces.push(tail);
                          tail = '';
                      }
                  }
              }
          }
          pieces.push(tail);
          return pieces.join('');
      }
  };
  /**
   * Error type thrown by the text decoders; instances report the name 'Text Error'.
   */
  text.Error = class extends Error {

      /**
       * @param {string} message - Description of the failure.
       */
      constructor(message) {
          super(message);
          this.name = 'Text Error';
      }
  };
  // Publish Decoder and Reader when a CommonJS `module` object is present.
  if (typeof module !== 'undefined' && typeof module.exports === 'object') {
      const { Decoder, Reader } = text;
      module.exports.Decoder = Decoder;
      module.exports.Reader = Reader;
  }