# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import multiprocessing

# This script represents a collection of integration tests.
# Each integration test comes with a full set of parameters,
# a dataset, and expected metrics.
# These configurations can be used by various fastText APIs
# to confirm some level of correctness.
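#
# Each configuration is a plain dict with this shape (keys taken from
# the builders below):
#
#     {
#         "dataset": ...,       # dataset name
#         "args": {...},        # training parameters, including "input"
#         "quant_args": {...},  # quantization parameters
#         "test": {...},        # expected n, p1, r1, size, and test "data"
#         "quant_test": {...},  # same keys; thresholds for the quantized model
#     }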


def max_thread():
    # Use all but one of the available CPUs.
    return multiprocessing.cpu_count() - 1


def check_supervised_configuration(configuration, verbose=1):
    # Propagate the requested verbosity into both the training and the
    # quantization arguments of a single configuration.
    configuration["args"]["verbose"] = verbose
    configuration["quant_args"]["verbose"] = verbose
    return configuration


def check_supervised_configurations(configurations, verbose=1):
    for i in range(len(configurations)):
        configurations[i] = check_supervised_configuration(
            configurations[i], verbose=verbose
        )
    return configurations


def flickr_job(thread=None):
    # Large-scale tag prediction on the YFCC100M (Flickr) dataset.
    if thread is None:
        thread = max_thread()
    config = {}
    config["dataset"] = "YFCC100M"
    config["args"] = {
        "dim": 256,
        "wordNgrams": 2,
        "minCount": 10,
        "bucket": 10000000,
        "epoch": 20,
        "loss": "hs",
        "minCountLabel": 100,
        "thread": thread,
    }
    config["args"]["input"] = "YFCC100M/train"
    config["quant_args"] = {
        "dsub": 2,
        "lr": 0.1,
        "epoch": 5,
        "cutoff": 100000,
        "qnorm": True,
        "retrain": True,
        "qout": True,
    }
    config["quant_args"]["input"] = config["args"]["input"]
    config["test"] = {
        "n": 647224,
        "p1": 0.470,
        "r1": 0.071,
        "size": 12060039727,
        "data": "YFCC100M/test",
    }
    # One quant example (to illustrate slack): 0.344, 0.0528, 64506972
    config["quant_test"] = {
        "n": 647224,
        "p1": 0.300,
        "r1": 0.0450,
        "size": 70000000,
        "data": "YFCC100M/test",
    }
    return config
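

def _example_quant_run(config):
    # Illustrative sketch only, not used by the tests themselves: train a
    # model from config["args"], compress it with config["quant_args"],
    # and compare the result against the "quant_test" thresholds. Assumes
    # the `fasttext` Python module is installed and the dataset referenced
    # by config["args"]["input"] exists on disk. The "size" threshold
    # refers to the saved model file and is not checked here.
    import fasttext

    model = fasttext.train_supervised(**config["args"])
    model.quantize(**config["quant_args"])
    n, p1, r1 = model.test(config["quant_test"]["data"])
    return p1 >= config["quant_test"]["p1"] and r1 >= config["quant_test"]["r1"]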


def langid_job1(thread=None):
    if thread is None:
        thread = max_thread()
    config = {}
    config["dataset"] = "langid"
    config["args"] = {"dim": 16, "minn": 2, "maxn": 4, "thread": thread}
    config["args"]["input"] = "langid.train"
    config["quant_args"] = {"qnorm": True, "cutoff": 50000, "retrain": True}
    config["quant_args"]["input"] = config["args"]["input"]
    config["test"] = {
        "n": 10000,
        "p1": 0.985,
        "r1": 0.985,
        "size": 368132610,
        "data": "langid.valid",
    }
    # One quant example (to illustrate slack): 0.984 0.984 932793
    config["quant_test"] = {
        "p1": 0.97,
        "r1": 0.97,
        "size": 1000000,
    }
    config["quant_test"]["n"] = config["test"]["n"]
    config["quant_test"]["data"] = config["test"]["data"]
    return config


def langid_job2(thread=None):
    # Same configuration as langid_job1, but trained with the
    # hierarchical softmax loss.
    if thread is None:
        thread = max_thread()
    config = langid_job1(thread).copy()
    config["args"]["loss"] = "hs"
    return config


def cooking_job1(thread=None):
    if thread is None:
        thread = max_thread()
    config = {}
    config["dataset"] = "cooking"
    config["args"] = {
        "epoch": 25,
        "lr": 1.0,
        "wordNgrams": 2,
        "minCount": 1,
        "thread": thread,
    }
    config["args"]["input"] = "cooking.train"
    config["quant_args"] = {"qnorm": True, "cutoff": 50000, "retrain": True}
    config["quant_args"]["input"] = config["args"]["input"]
    config["test"] = {
        "n": 3000,
        "p1": 0.59,
        "r1": 0.25,
        "size": 804047585,
        "data": "cooking.valid",
    }
    # One quant example (to illustrate slack): 0.602 0.26 3439172
    config["quant_test"] = {
        "p1": 0.55,
        "r1": 0.20,
        "size": 4000000,
    }
    config["quant_test"]["n"] = config["test"]["n"]
    config["quant_test"]["data"] = config["test"]["data"]
    return config


def cooking_job2(thread=None):
    # Same configuration as cooking_job1, but trained with the
    # hierarchical softmax loss.
    if thread is None:
        thread = max_thread()
    config = cooking_job1(thread).copy()
    config["args"]["loss"] = "hs"
    return config


# Supervised models
# See https://fasttext.cc/docs/en/supervised-models.html
def get_supervised_models(thread=None, verbose=1):
    if thread is None:
        thread = max_thread()
    sup_job_dataset = [
        "ag_news", "sogou_news", "dbpedia", "yelp_review_polarity",
        "yelp_review_full", "yahoo_answers", "amazon_review_full",
        "amazon_review_polarity"
    ]
    sup_params = {
        "dim": 10,
        "wordNgrams": 2,
        "minCount": 1,
        "bucket": 10000000,
        "epoch": 5,
        "thread": thread,
        "verbose": 1,
    }
    quant_params = {
        "retrain": True,
        "cutoff": 100000,
        "qnorm": True,
        "verbose": 1,
    }
    # Per-dataset values, index-aligned with sup_job_dataset.
    sup_job_lr = [0.25, 0.5, 0.5, 0.1, 0.1, 0.1, 0.05, 0.05]
    sup_job_n = [7600, 60000, 70000, 38000, 50000, 60000, 650000, 400000]
    sup_job_p1 = [0.915, 0.968, 0.983, 0.956, 0.638, 0.723, 0.600, 0.940]
    sup_job_r1 = [0.915, 0.968, 0.983, 0.956, 0.638, 0.723, 0.600, 0.940]
    sup_job_size = [
        405607193, 421445471, 447481878, 427867393, 431292576, 517549567,
        483742593, 493604598
    ]
    sup_job_quant_p1 = [0.918, 0.965, 0.983, 0.950, 0.625, 0.707, 0.58, 0.920]
    sup_job_quant_r1 = [0.918, 0.965, 0.983, 0.950, 0.625, 0.707, 0.58, 0.920]
    sup_job_quant_size = [
        1600000, 1500000, 1700000, 1600000, 1600000, 1700000, 1600000, 1600000
    ]
    configurations = []
    for i in range(len(sup_job_dataset)):
        configuration = {}
        configuration["dataset"] = sup_job_dataset[i]
        args = sup_params.copy()
        quant_args = quant_params.copy()
        args["lr"] = sup_job_lr[i]
        args["input"] = sup_job_dataset[i] + ".train"
        quant_args["lr"] = sup_job_lr[i]
        quant_args["input"] = sup_job_dataset[i] + ".train"
        configuration["args"] = args
        configuration["quant_args"] = quant_args
        test = {
            "n": sup_job_n[i],
            "p1": sup_job_p1[i],
            "r1": sup_job_r1[i],
            "size": sup_job_size[i],
            "data": sup_job_dataset[i] + ".test",
        }
        quant_test = {
            "n": sup_job_n[i],
            "p1": sup_job_quant_p1[i],
            "r1": sup_job_quant_r1[i],
            "size": sup_job_quant_size[i],
            "data": sup_job_dataset[i] + ".test",
        }
        configuration["test"] = test
        configuration["quant_test"] = quant_test
        configurations.append(configuration)
    configurations.append(flickr_job())
    configurations.append(langid_job1())
    configurations.append(langid_job2())
    configurations.append(cooking_job1())
    configurations.append(cooking_job2())
    configurations = check_supervised_configurations(
        configurations, verbose=verbose
    )
    return configurations
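

# Hypothetical end-to-end usage (a sketch, not the actual test runner;
# assumes the `fasttext` Python module and local copies of the datasets
# named above):
#
#     import fasttext
#     for config in get_supervised_models(verbose=0):
#         model = fasttext.train_supervised(**config["args"])
#         n, p1, r1 = model.test(config["test"]["data"])
#         assert p1 >= config["test"]["p1"] and r1 >= config["test"]["r1"]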