// sentencepiece-proto.js (25 KB)
  /**
   * Namespace object that the message classes in this module attach to
   * (`sentencepiece.TrainerSpec`, `sentencepiece.ModelProto`, ...).
   */
  export const sentencepiece = {};
  /**
   * TrainerSpec message: the options a SentencePiece model was trained with.
   *
   * Repeated fields are created per instance in the constructor. Scalar fields
   * are only assigned when they are present in the input; otherwise reads fall
   * through to the defaults placed on the prototype after the class body.
   */
  sentencepiece.TrainerSpec = class TrainerSpec {

      constructor() {
          this.input = [];
          this.accept_language = [];
          this.control_symbols = [];
          this.user_defined_symbols = [];
      }

      /**
       * Decodes a TrainerSpec from a binary reader.
       *
       * @param {object} reader - Caller-supplied reader exposing `position`,
       *     `length`, the typed read methods used below and `skipType()`.
       * @param {number} [length] - Number of reader positions this message
       *     spans; when omitted, decoding runs until `reader.length`.
       * @returns {sentencepiece.TrainerSpec} The populated message.
       */
      static decode(reader, length) {
          const message = new sentencepiece.TrainerSpec();
          const end = length === undefined ? reader.length : reader.position + length;
          while (reader.position < end) {
              const tag = reader.uint32();
              // Upper bits of the tag select the field; the low three bits are
              // only needed to skip fields this class does not know about.
              switch (tag >>> 3) {
                  case 1:
                      message.input.push(reader.string());
                      break;
                  case 7:
                      message.input_format = reader.string();
                      break;
                  case 2:
                      message.model_prefix = reader.string();
                      break;
                  case 3:
                      // Stored as the raw number; see TrainerSpec.ModelType.
                      message.model_type = reader.int32();
                      break;
                  case 4:
                      message.vocab_size = reader.int32();
                      break;
                  case 5:
                      message.accept_language.push(reader.string());
                      break;
                  case 6:
                      message.self_test_sample_size = reader.int32();
                      break;
                  case 50:
                      message.enable_differential_privacy = reader.bool();
                      break;
                  case 51:
                      message.differential_privacy_noise_level = reader.float();
                      break;
                  case 52:
                      message.differential_privacy_clipping_threshold = reader.uint64();
                      break;
                  case 10:
                      message.character_coverage = reader.float();
                      break;
                  case 11:
                      message.input_sentence_size = reader.uint64();
                      break;
                  case 19:
                      message.shuffle_input_sentence = reader.bool();
                      break;
                  case 12:
                      message.mining_sentence_size = reader.int32();
                      break;
                  case 13:
                      message.training_sentence_size = reader.int32();
                      break;
                  case 14:
                      message.seed_sentencepiece_size = reader.int32();
                      break;
                  case 15:
                      message.shrinking_factor = reader.float();
                      break;
                  case 18:
                      message.max_sentence_length = reader.int32();
                      break;
                  case 16:
                      message.num_threads = reader.int32();
                      break;
                  case 17:
                      message.num_sub_iterations = reader.int32();
                      break;
                  case 20:
                      message.max_sentencepiece_length = reader.int32();
                      break;
                  case 21:
                      message.split_by_unicode_script = reader.bool();
                      break;
                  case 23:
                      message.split_by_number = reader.bool();
                      break;
                  case 22:
                      message.split_by_whitespace = reader.bool();
                      break;
                  case 24:
                      message.treat_whitespace_as_suffix = reader.bool();
                      break;
                  case 26:
                      message.allow_whitespace_only_pieces = reader.bool();
                      break;
                  case 25:
                      message.split_digits = reader.bool();
                      break;
                  case 53:
                      message.pretokenization_delimiter = reader.string();
                      break;
                  case 30:
                      message.control_symbols.push(reader.string());
                      break;
                  case 31:
                      message.user_defined_symbols.push(reader.string());
                      break;
                  case 36:
                      message.required_chars = reader.string();
                      break;
                  case 35:
                      message.byte_fallback = reader.bool();
                      break;
                  case 32:
                      message.vocabulary_output_piece_score = reader.bool();
                      break;
                  case 33:
                      message.hard_vocab_limit = reader.bool();
                      break;
                  case 34:
                      message.use_all_vocab = reader.bool();
                      break;
                  case 40:
                      message.unk_id = reader.int32();
                      break;
                  case 41:
                      message.bos_id = reader.int32();
                      break;
                  case 42:
                      message.eos_id = reader.int32();
                      break;
                  case 43:
                      message.pad_id = reader.int32();
                      break;
                  case 45:
                      message.unk_piece = reader.string();
                      break;
                  case 46:
                      message.bos_piece = reader.string();
                      break;
                  case 47:
                      message.eos_piece = reader.string();
                      break;
                  case 48:
                      message.pad_piece = reader.string();
                      break;
                  case 44:
                      message.unk_surface = reader.string();
                      break;
                  case 49:
                      message.train_extremely_large_corpus = reader.bool();
                      break;
                  case 54:
                      message.seed_sentencepieces_file = reader.string();
                      break;
                  default:
                      reader.skipType(tag & 7);
                      break;
              }
          }
          return message;
      }

      /**
       * Decodes a TrainerSpec from a text reader, dispatching on field names.
       *
       * @param {object} reader - Caller-supplied reader exposing `start()`,
       *     `end()`, `tag()`, the typed read methods used below, `enum()`,
       *     `array()` and `field()`.
       * @returns {sentencepiece.TrainerSpec} The populated message.
       */
      static decodeText(reader) {
          const message = new sentencepiece.TrainerSpec();
          reader.start();
          while (!reader.end()) {
              const tag = reader.tag();
              switch (tag) {
                  case "input":
                      reader.array(message.input, () => reader.string());
                      break;
                  case "input_format":
                      message.input_format = reader.string();
                      break;
                  case "model_prefix":
                      message.model_prefix = reader.string();
                      break;
                  case "model_type":
                      message.model_type = reader.enum(sentencepiece.TrainerSpec.ModelType);
                      break;
                  case "vocab_size":
                      message.vocab_size = reader.int32();
                      break;
                  case "accept_language":
                      reader.array(message.accept_language, () => reader.string());
                      break;
                  case "self_test_sample_size":
                      message.self_test_sample_size = reader.int32();
                      break;
                  case "enable_differential_privacy":
                      message.enable_differential_privacy = reader.bool();
                      break;
                  case "differential_privacy_noise_level":
                      message.differential_privacy_noise_level = reader.float();
                      break;
                  case "differential_privacy_clipping_threshold":
                      message.differential_privacy_clipping_threshold = reader.uint64();
                      break;
                  case "character_coverage":
                      message.character_coverage = reader.float();
                      break;
                  case "input_sentence_size":
                      message.input_sentence_size = reader.uint64();
                      break;
                  case "shuffle_input_sentence":
                      message.shuffle_input_sentence = reader.bool();
                      break;
                  case "mining_sentence_size":
                      message.mining_sentence_size = reader.int32();
                      break;
                  case "training_sentence_size":
                      message.training_sentence_size = reader.int32();
                      break;
                  case "seed_sentencepiece_size":
                      message.seed_sentencepiece_size = reader.int32();
                      break;
                  case "shrinking_factor":
                      message.shrinking_factor = reader.float();
                      break;
                  case "max_sentence_length":
                      message.max_sentence_length = reader.int32();
                      break;
                  case "num_threads":
                      message.num_threads = reader.int32();
                      break;
                  case "num_sub_iterations":
                      message.num_sub_iterations = reader.int32();
                      break;
                  case "max_sentencepiece_length":
                      message.max_sentencepiece_length = reader.int32();
                      break;
                  case "split_by_unicode_script":
                      message.split_by_unicode_script = reader.bool();
                      break;
                  case "split_by_number":
                      message.split_by_number = reader.bool();
                      break;
                  case "split_by_whitespace":
                      message.split_by_whitespace = reader.bool();
                      break;
                  case "treat_whitespace_as_suffix":
                      message.treat_whitespace_as_suffix = reader.bool();
                      break;
                  case "allow_whitespace_only_pieces":
                      message.allow_whitespace_only_pieces = reader.bool();
                      break;
                  case "split_digits":
                      message.split_digits = reader.bool();
                      break;
                  case "pretokenization_delimiter":
                      message.pretokenization_delimiter = reader.string();
                      break;
                  case "control_symbols":
                      reader.array(message.control_symbols, () => reader.string());
                      break;
                  case "user_defined_symbols":
                      reader.array(message.user_defined_symbols, () => reader.string());
                      break;
                  case "required_chars":
                      message.required_chars = reader.string();
                      break;
                  case "byte_fallback":
                      message.byte_fallback = reader.bool();
                      break;
                  case "vocabulary_output_piece_score":
                      message.vocabulary_output_piece_score = reader.bool();
                      break;
                  case "hard_vocab_limit":
                      message.hard_vocab_limit = reader.bool();
                      break;
                  case "use_all_vocab":
                      message.use_all_vocab = reader.bool();
                      break;
                  case "unk_id":
                      message.unk_id = reader.int32();
                      break;
                  case "bos_id":
                      message.bos_id = reader.int32();
                      break;
                  case "eos_id":
                      message.eos_id = reader.int32();
                      break;
                  case "pad_id":
                      message.pad_id = reader.int32();
                      break;
                  case "unk_piece":
                      message.unk_piece = reader.string();
                      break;
                  case "bos_piece":
                      message.bos_piece = reader.string();
                      break;
                  case "eos_piece":
                      message.eos_piece = reader.string();
                      break;
                  case "pad_piece":
                      message.pad_piece = reader.string();
                      break;
                  case "unk_surface":
                      message.unk_surface = reader.string();
                      break;
                  case "train_extremely_large_corpus":
                      message.train_extremely_large_corpus = reader.bool();
                      break;
                  case "seed_sentencepieces_file":
                      message.seed_sentencepieces_file = reader.string();
                      break;
                  default:
                      // Names not listed above are handed back to the reader.
                      reader.field(tag, message);
                      break;
              }
          }
          return message;
      }
  };

  // Defaults for scalar fields; an instance only shadows one of these when the
  // corresponding field was present in the decoded input.
  sentencepiece.TrainerSpec.prototype.input_format = "";
  sentencepiece.TrainerSpec.prototype.model_prefix = "";
  sentencepiece.TrainerSpec.prototype.model_type = 1;
  sentencepiece.TrainerSpec.prototype.vocab_size = 8000;
  sentencepiece.TrainerSpec.prototype.self_test_sample_size = 0;
  sentencepiece.TrainerSpec.prototype.enable_differential_privacy = false;
  sentencepiece.TrainerSpec.prototype.differential_privacy_noise_level = 0;
  sentencepiece.TrainerSpec.prototype.differential_privacy_clipping_threshold = 0n;
  sentencepiece.TrainerSpec.prototype.character_coverage = 0.9995;
  sentencepiece.TrainerSpec.prototype.input_sentence_size = 0n;
  sentencepiece.TrainerSpec.prototype.shuffle_input_sentence = true;
  sentencepiece.TrainerSpec.prototype.mining_sentence_size = 0;
  sentencepiece.TrainerSpec.prototype.training_sentence_size = 0;
  sentencepiece.TrainerSpec.prototype.seed_sentencepiece_size = 1000000;
  sentencepiece.TrainerSpec.prototype.shrinking_factor = 0.75;
  sentencepiece.TrainerSpec.prototype.max_sentence_length = 4192;
  sentencepiece.TrainerSpec.prototype.num_threads = 16;
  sentencepiece.TrainerSpec.prototype.num_sub_iterations = 2;
  sentencepiece.TrainerSpec.prototype.max_sentencepiece_length = 16;
  sentencepiece.TrainerSpec.prototype.split_by_unicode_script = true;
  sentencepiece.TrainerSpec.prototype.split_by_number = true;
  sentencepiece.TrainerSpec.prototype.split_by_whitespace = true;
  sentencepiece.TrainerSpec.prototype.treat_whitespace_as_suffix = false;
  sentencepiece.TrainerSpec.prototype.allow_whitespace_only_pieces = false;
  sentencepiece.TrainerSpec.prototype.split_digits = false;
  sentencepiece.TrainerSpec.prototype.pretokenization_delimiter = "";
  sentencepiece.TrainerSpec.prototype.required_chars = "";
  sentencepiece.TrainerSpec.prototype.byte_fallback = false;
  sentencepiece.TrainerSpec.prototype.vocabulary_output_piece_score = true;
  sentencepiece.TrainerSpec.prototype.hard_vocab_limit = true;
  sentencepiece.TrainerSpec.prototype.use_all_vocab = false;
  sentencepiece.TrainerSpec.prototype.unk_id = 0;
  sentencepiece.TrainerSpec.prototype.bos_id = 1;
  sentencepiece.TrainerSpec.prototype.eos_id = 2;
  sentencepiece.TrainerSpec.prototype.pad_id = -1;
  sentencepiece.TrainerSpec.prototype.unk_piece = "<unk>";
  sentencepiece.TrainerSpec.prototype.bos_piece = "<s>";
  sentencepiece.TrainerSpec.prototype.eos_piece = "</s>";
  sentencepiece.TrainerSpec.prototype.pad_piece = "<pad>";
  // The schema declares this default as the UTF-8 bytes E2 81 87 between two
  // spaces, i.e. U+2047; the earlier literal spelled the hex digits out as text.
  sentencepiece.TrainerSpec.prototype.unk_surface = " \u2047 ";
  sentencepiece.TrainerSpec.prototype.train_extremely_large_corpus = false;
  sentencepiece.TrainerSpec.prototype.seed_sentencepieces_file = "";

  // Name-to-number table used by decodeText() for the `model_type` field.
  sentencepiece.TrainerSpec.ModelType = {
      "UNIGRAM": 1,
      "BPE": 2,
      "WORD": 3,
      "CHAR": 4
  };
  /**
   * NormalizerSpec message: text normalization settings.
   *
   * All fields are scalar, so there is no constructor; fields absent from the
   * input resolve to the prototype defaults assigned after the class body.
   */
  sentencepiece.NormalizerSpec = class NormalizerSpec {

      /**
       * Decodes a NormalizerSpec from a binary reader.
       *
       * @param {object} reader - Caller-supplied reader exposing `position`,
       *     `length`, `uint32()`, `string()`, `bytes()`, `bool()` and `skipType()`.
       * @param {number} [length] - Number of reader positions this message
       *     spans; when omitted, decoding runs until `reader.length`.
       * @returns {sentencepiece.NormalizerSpec} The populated message.
       */
      static decode(reader, length) {
          const message = new sentencepiece.NormalizerSpec();
          const end = length === undefined ? reader.length : reader.position + length;
          while (reader.position < end) {
              const tag = reader.uint32();
              switch (tag >>> 3) {
                  case 1:
                      message.name = reader.string();
                      break;
                  case 2:
                      message.precompiled_charsmap = reader.bytes();
                      break;
                  case 3:
                      message.add_dummy_prefix = reader.bool();
                      break;
                  case 4:
                      message.remove_extra_whitespaces = reader.bool();
                      break;
                  case 5:
                      message.escape_whitespaces = reader.bool();
                      break;
                  case 6:
                      message.normalization_rule_tsv = reader.string();
                      break;
                  default:
                      // Unrecognized field: skip it using the low tag bits.
                      reader.skipType(tag & 7);
                      break;
              }
          }
          return message;
      }

      /**
       * Decodes a NormalizerSpec from a text reader, dispatching on field names.
       *
       * @param {object} reader - Caller-supplied reader exposing `start()`,
       *     `end()`, `tag()`, `string()`, `bytes()`, `bool()` and `field()`.
       * @returns {sentencepiece.NormalizerSpec} The populated message.
       */
      static decodeText(reader) {
          const message = new sentencepiece.NormalizerSpec();
          reader.start();
          while (!reader.end()) {
              const tag = reader.tag();
              switch (tag) {
                  case "name":
                      message.name = reader.string();
                      break;
                  case "precompiled_charsmap":
                      message.precompiled_charsmap = reader.bytes();
                      break;
                  case "add_dummy_prefix":
                      message.add_dummy_prefix = reader.bool();
                      break;
                  case "remove_extra_whitespaces":
                      message.remove_extra_whitespaces = reader.bool();
                      break;
                  case "escape_whitespaces":
                      message.escape_whitespaces = reader.bool();
                      break;
                  case "normalization_rule_tsv":
                      message.normalization_rule_tsv = reader.string();
                      break;
                  default:
                      // Names not listed above are handed back to the reader.
                      reader.field(tag, message);
                      break;
              }
          }
          return message;
      }
  };

  // Defaults for fields that were not present in the decoded input. The empty
  // Uint8Array is a single object shared by every instance that does not set
  // its own precompiled_charsmap.
  sentencepiece.NormalizerSpec.prototype.name = "";
  sentencepiece.NormalizerSpec.prototype.precompiled_charsmap = new Uint8Array([]);
  sentencepiece.NormalizerSpec.prototype.add_dummy_prefix = true;
  sentencepiece.NormalizerSpec.prototype.remove_extra_whitespaces = true;
  sentencepiece.NormalizerSpec.prototype.escape_whitespaces = true;
  sentencepiece.NormalizerSpec.prototype.normalization_rule_tsv = "";
  /**
   * SelfTestData message: a list of `Sample` entries (input / expected pairs).
   */
  sentencepiece.SelfTestData = class SelfTestData {

      constructor() {
          this.samples = [];
      }

      /**
       * Decodes a SelfTestData from a binary reader.
       *
       * @param {object} reader - Caller-supplied reader exposing `position`,
       *     `length`, `uint32()` and `skipType()`.
       * @param {number} [length] - Number of reader positions this message
       *     spans; when omitted, decoding runs until `reader.length`.
       * @returns {sentencepiece.SelfTestData} The populated message.
       */
      static decode(reader, length) {
          const message = new sentencepiece.SelfTestData();
          const end = length === undefined ? reader.length : reader.position + length;
          while (reader.position < end) {
              const tag = reader.uint32();
              switch (tag >>> 3) {
                  case 1:
                      // The nested message is preceded by its own length.
                      message.samples.push(sentencepiece.SelfTestData.Sample.decode(reader, reader.uint32()));
                      break;
                  default:
                      reader.skipType(tag & 7);
                      break;
              }
          }
          return message;
      }

      /**
       * Decodes a SelfTestData from a text reader.
       *
       * @param {object} reader - Caller-supplied reader exposing `start()`,
       *     `end()`, `tag()` and `field()`.
       * @returns {sentencepiece.SelfTestData} The populated message.
       */
      static decodeText(reader) {
          const message = new sentencepiece.SelfTestData();
          reader.start();
          while (!reader.end()) {
              const tag = reader.tag();
              switch (tag) {
                  case "samples":
                      message.samples.push(sentencepiece.SelfTestData.Sample.decodeText(reader));
                      break;
                  default:
                      reader.field(tag, message);
                      break;
              }
          }
          return message;
      }
  };

  /**
   * One self-test entry: an `input` string and the `expected` string for it.
   * Both default to "" through the prototype when absent from the input.
   */
  sentencepiece.SelfTestData.Sample = class Sample {

      /**
       * Decodes a Sample from a binary reader.
       *
       * @param {object} reader - Caller-supplied reader exposing `position`,
       *     `length`, `uint32()`, `string()` and `skipType()`.
       * @param {number} [length] - Number of reader positions this message
       *     spans; when omitted, decoding runs until `reader.length`.
       * @returns {sentencepiece.SelfTestData.Sample} The populated message.
       */
      static decode(reader, length) {
          const message = new sentencepiece.SelfTestData.Sample();
          const end = length === undefined ? reader.length : reader.position + length;
          while (reader.position < end) {
              const tag = reader.uint32();
              switch (tag >>> 3) {
                  case 1:
                      message.input = reader.string();
                      break;
                  case 2:
                      message.expected = reader.string();
                      break;
                  default:
                      reader.skipType(tag & 7);
                      break;
              }
          }
          return message;
      }

      /**
       * Decodes a Sample from a text reader.
       *
       * @param {object} reader - Caller-supplied reader exposing `start()`,
       *     `end()`, `tag()`, `string()` and `field()`.
       * @returns {sentencepiece.SelfTestData.Sample} The populated message.
       */
      static decodeText(reader) {
          const message = new sentencepiece.SelfTestData.Sample();
          reader.start();
          while (!reader.end()) {
              const tag = reader.tag();
              switch (tag) {
                  case "input":
                      message.input = reader.string();
                      break;
                  case "expected":
                      message.expected = reader.string();
                      break;
                  default:
                      reader.field(tag, message);
                      break;
              }
          }
          return message;
      }
  };

  sentencepiece.SelfTestData.Sample.prototype.input = "";
  sentencepiece.SelfTestData.Sample.prototype.expected = "";
  /**
   * ModelProto message: the top-level container holding the vocabulary
   * (`pieces`) together with the optional trainer, normalizer, self-test and
   * denormalizer sub-messages. Sub-messages absent from the input read as
   * `null` through the prototype defaults assigned after the class body.
   */
  sentencepiece.ModelProto = class ModelProto {

      constructor() {
          this.pieces = [];
      }

      /**
       * Decodes a ModelProto from a binary reader.
       *
       * @param {object} reader - Caller-supplied reader exposing `position`,
       *     `length`, `uint32()` and `skipType()`.
       * @param {number} [length] - Number of reader positions this message
       *     spans; when omitted, decoding runs until `reader.length`.
       * @returns {sentencepiece.ModelProto} The populated message.
       */
      static decode(reader, length) {
          const message = new sentencepiece.ModelProto();
          const end = length === undefined ? reader.length : reader.position + length;
          while (reader.position < end) {
              const tag = reader.uint32();
              // Every known field is a nested message preceded by its length.
              switch (tag >>> 3) {
                  case 1:
                      message.pieces.push(sentencepiece.ModelProto.SentencePiece.decode(reader, reader.uint32()));
                      break;
                  case 2:
                      message.trainer_spec = sentencepiece.TrainerSpec.decode(reader, reader.uint32());
                      break;
                  case 3:
                      message.normalizer_spec = sentencepiece.NormalizerSpec.decode(reader, reader.uint32());
                      break;
                  case 4:
                      message.self_test_data = sentencepiece.SelfTestData.decode(reader, reader.uint32());
                      break;
                  case 5:
                      // Same message type as normalizer_spec.
                      message.denormalizer_spec = sentencepiece.NormalizerSpec.decode(reader, reader.uint32());
                      break;
                  default:
                      reader.skipType(tag & 7);
                      break;
              }
          }
          return message;
      }

      /**
       * Decodes a ModelProto from a text reader, dispatching on field names.
       *
       * @param {object} reader - Caller-supplied reader exposing `start()`,
       *     `end()`, `tag()` and `field()`.
       * @returns {sentencepiece.ModelProto} The populated message.
       */
      static decodeText(reader) {
          const message = new sentencepiece.ModelProto();
          reader.start();
          while (!reader.end()) {
              const tag = reader.tag();
              switch (tag) {
                  case "pieces":
                      message.pieces.push(sentencepiece.ModelProto.SentencePiece.decodeText(reader));
                      break;
                  case "trainer_spec":
                      message.trainer_spec = sentencepiece.TrainerSpec.decodeText(reader);
                      break;
                  case "normalizer_spec":
                      message.normalizer_spec = sentencepiece.NormalizerSpec.decodeText(reader);
                      break;
                  case "self_test_data":
                      message.self_test_data = sentencepiece.SelfTestData.decodeText(reader);
                      break;
                  case "denormalizer_spec":
                      message.denormalizer_spec = sentencepiece.NormalizerSpec.decodeText(reader);
                      break;
                  default:
                      reader.field(tag, message);
                      break;
              }
          }
          return message;
      }
  };

  sentencepiece.ModelProto.prototype.trainer_spec = null;
  sentencepiece.ModelProto.prototype.normalizer_spec = null;
  sentencepiece.ModelProto.prototype.self_test_data = null;
  sentencepiece.ModelProto.prototype.denormalizer_spec = null;

  /**
   * One vocabulary entry: the piece string, its score and its type.
   * Absent fields read as "", 0 and 1 respectively via the prototype defaults.
   */
  sentencepiece.ModelProto.SentencePiece = class SentencePiece {

      /**
       * Decodes a SentencePiece from a binary reader.
       *
       * @param {object} reader - Caller-supplied reader exposing `position`,
       *     `length`, `uint32()`, `string()`, `float()`, `int32()` and `skipType()`.
       * @param {number} [length] - Number of reader positions this message
       *     spans; when omitted, decoding runs until `reader.length`.
       * @returns {sentencepiece.ModelProto.SentencePiece} The populated message.
       */
      static decode(reader, length) {
          const message = new sentencepiece.ModelProto.SentencePiece();
          const end = length === undefined ? reader.length : reader.position + length;
          while (reader.position < end) {
              const tag = reader.uint32();
              switch (tag >>> 3) {
                  case 1:
                      message.piece = reader.string();
                      break;
                  case 2:
                      message.score = reader.float();
                      break;
                  case 3:
                      // Stored as the raw number; see SentencePiece.Type.
                      message.type = reader.int32();
                      break;
                  default:
                      reader.skipType(tag & 7);
                      break;
              }
          }
          return message;
      }

      /**
       * Decodes a SentencePiece from a text reader.
       *
       * @param {object} reader - Caller-supplied reader exposing `start()`,
       *     `end()`, `tag()`, `string()`, `float()`, `enum()` and `field()`.
       * @returns {sentencepiece.ModelProto.SentencePiece} The populated message.
       */
      static decodeText(reader) {
          const message = new sentencepiece.ModelProto.SentencePiece();
          reader.start();
          while (!reader.end()) {
              const tag = reader.tag();
              switch (tag) {
                  case "piece":
                      message.piece = reader.string();
                      break;
                  case "score":
                      message.score = reader.float();
                      break;
                  case "type":
                      message.type = reader.enum(sentencepiece.ModelProto.SentencePiece.Type);
                      break;
                  default:
                      reader.field(tag, message);
                      break;
              }
          }
          return message;
      }
  };

  sentencepiece.ModelProto.SentencePiece.prototype.piece = "";
  sentencepiece.ModelProto.SentencePiece.prototype.score = 0;
  sentencepiece.ModelProto.SentencePiece.prototype.type = 1;

  // Name-to-number table used by SentencePiece.decodeText() for `type`.
  // BYTE (6) is listed before UNUSED (5), matching the original ordering.
  sentencepiece.ModelProto.SentencePiece.Type = {
      "NORMAL": 1,
      "UNKNOWN": 2,
      "CONTROL": 3,
      "USER_DEFINED": 4,
      "BYTE": 6,
      "UNUSED": 5
  };