# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree. An additional grant
# of patent rights can be found in the PATENTS file in the same directory.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import multiprocessing

# This script represents a collection of integration tests.
# Each integration test comes with a full set of parameters,
# a dataset, and expected metrics.
# These configurations can be used by various fastText APIs
# to confirm some level of correctness.
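#
# A configuration produced by the job builders below has the following
# shape (values abbreviated; see cooking_job1() for a concrete instance):
#
#     {
#         "dataset": "cooking",  # dataset identifier
#         "args": {...},         # training hyperparameters
#         "quant_args": {...},   # quantization parameters
#         "test": {...},         # expected metrics: n, p1, r1, size, data
#         "quant_test": {...},   # expected metrics after quantization
#     }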


def max_thread():
    # Leave one core free. Note that this is evaluated once at import time
    # when used as a default argument value below.
    return multiprocessing.cpu_count() - 1


def check_supervised_configuration(configuration, verbose=1):
    # Propagate the requested verbosity into both the training and the
    # quantization parameter sets.
    configuration["args"]["verbose"] = verbose
    configuration["quant_args"]["verbose"] = verbose
    return configuration


def check_supervised_configurations(configurations, verbose=1):
    for i, configuration in enumerate(configurations):
        configurations[i] = check_supervised_configuration(
            configuration, verbose=verbose
        )
    return configurations


def flickr_job(thread=max_thread()):
    config = {}
    config["dataset"] = "YFCC100M"
    config["args"] = {
        "dim": 256,
        "wordNgrams": 2,
        "minCount": 10,
        "bucket": 10000000,
        "epoch": 20,
        "loss": "hs",
        "minCountLabel": 100,
        "thread": thread,
    }
    config["args"]["input"] = "YFCC100M/train"
    config["quant_args"] = {
        "dsub": 2,
        "lr": 0.1,  # numeric, consistent with the other learning rates
        "epoch": 5,
        "cutoff": 100000,
        "qnorm": True,
        "retrain": True,
        "qout": True,
    }
    config["quant_args"]["input"] = config["args"]["input"]
    config["test"] = {
        "n": 647224,
        "p1": 0.471,
        "r1": 0.0722,
        "size": 12060039727,
        "data": "YFCC100M/test",
    }
    # One quant example (to illustrate slack): 0.344, 0.0528, 64506972
    config["quant_test"] = {
        "n": 647224,
        "p1": 0.300,
        "r1": 0.0450,
        "size": 70000000,
        "data": "YFCC100M/test",
    }
    return config
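

# The "One quant example" comments in this file record metrics actually
# observed for a quantized model; the "quant_test" blocks leave slack around
# them. A minimal sketch of how such bounds might be checked (a hypothetical
# helper, not part of fastText itself): p1/r1 are lower bounds, size is an
# upper bound, so the observed flickr example (0.344, 0.0528, 64506972) fits
# within the thresholds above.
def metrics_within_bounds(n, p1, r1, size, expected):
    return (
        n == expected["n"]
        and p1 >= expected["p1"]
        and r1 >= expected["r1"]
        and size <= expected["size"]
    )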


def langid_job1(thread=max_thread()):
    config = {}
    config["dataset"] = "langid"
    config["args"] = {"dim": 16, "minn": 2, "maxn": 4, "thread": thread}
    config["args"]["input"] = "langid.train"
    config["quant_args"] = {"qnorm": True, "cutoff": 50000, "retrain": True}
    config["quant_args"]["input"] = config["args"]["input"]
    config["test"] = {
        "n": 10000,
        "p1": 0.985,
        "r1": 0.985,
        "size": 368369579,
        "data": "langid.valid",
    }
    # One quant example (to illustrate slack): 0.984 0.984 932793
    config["quant_test"] = {
        "p1": 0.97,
        "r1": 0.97,
        "size": 1000000,
    }
    config["quant_test"]["n"] = config["test"]["n"]
    config["quant_test"]["data"] = config["test"]["data"]
    return config


def langid_job2(thread=max_thread()):
    # langid_job1 builds fresh dicts on every call, so the shallow copy is
    # safe to mutate here.
    config = langid_job1(thread).copy()
    config["args"]["loss"] = "hs"
    return config


def cooking_job1(thread=max_thread()):
    config = {}
    config["dataset"] = "cooking"
    config["args"] = {
        "epoch": 25,
        "lr": 1.0,
        "wordNgrams": 2,
        "minCount": 1,
        "thread": thread,
    }
    config["args"]["input"] = "cooking.train"
    config["quant_args"] = {"qnorm": True, "cutoff": 50000, "retrain": True}
    config["quant_args"]["input"] = config["args"]["input"]
    config["test"] = {
        "n": 3000,
        "p1": 0.59,
        "r1": 0.25,
        "size": 804047585,
        "data": "cooking.valid",
    }
    # One quant example (to illustrate slack): 0.602 0.26 3439172
    config["quant_test"] = {
        "p1": 0.55,
        "r1": 0.20,
        "size": 4000000,
    }
    config["quant_test"]["n"] = config["test"]["n"]
    config["quant_test"]["data"] = config["test"]["data"]
    return config


def cooking_job2(thread=max_thread()):
    config = cooking_job1(thread).copy()
    config["args"]["loss"] = "hs"
    return config


# Supervised models
# See https://fasttext.cc/docs/en/supervised-models.html
def get_supervised_models(thread=max_thread(), verbose=1):
    sup_job_dataset = [
        "ag_news", "sogou_news", "dbpedia", "yelp_review_polarity",
        "yelp_review_full", "yahoo_answers", "amazon_review_full",
        "amazon_review_polarity"
    ]
    sup_params = {
        "dim": 10,
        "wordNgrams": 2,
        "minCount": 1,
        "bucket": 10000000,
        "epoch": 5,
        "thread": thread,
        "verbose": 1,
    }
    quant_params = {
        "retrain": True,
        "cutoff": 100000,
        "qnorm": True,
        "verbose": 1,
    }
    # The lists below are parallel to sup_job_dataset: entry i holds the
    # learning rate and expected metrics for dataset i.
    sup_job_lr = [0.25, 0.5, 0.5, 0.1, 0.1, 0.1, 0.05, 0.05]
    sup_job_n = [7600, 60000, 70000, 38000, 50000, 60000, 650000, 400000]
    sup_job_p1 = [0.915, 0.968, 0.984, 0.956, 0.638, 0.723, 0.603, 0.946]
    sup_job_r1 = [0.915, 0.968, 0.984, 0.956, 0.638, 0.723, 0.603, 0.946]
    sup_job_size = [
        405607193, 421445471, 447481878, 427867393, 431292576, 517549567,
        483742593, 493604598
    ]
    sup_job_quant_p1 = [0.918, 0.965, 0.984, 0.950, 0.625, 0.707, 0.58, 0.940]
    sup_job_quant_r1 = [0.918, 0.965, 0.984, 0.950, 0.625, 0.707, 0.58, 0.940]
    sup_job_quant_size = [
        1600000, 1457000, 1690000, 1550000, 1567896, 1655000, 1600000, 1575000
    ]

    configurations = []
    for i, dataset in enumerate(sup_job_dataset):
        configuration = {}
        configuration["dataset"] = dataset
        args = sup_params.copy()
        quant_args = quant_params.copy()
        args["lr"] = sup_job_lr[i]
        args["input"] = dataset + ".train"
        quant_args["lr"] = sup_job_lr[i]
        quant_args["input"] = dataset + ".train"
        configuration["args"] = args
        configuration["quant_args"] = quant_args
        test = {
            "n": sup_job_n[i],
            "p1": sup_job_p1[i],
            "r1": sup_job_r1[i],
            "size": sup_job_size[i],
            "data": dataset + ".test",
        }
        quant_test = {
            "n": sup_job_n[i],
            "p1": sup_job_quant_p1[i],
            "r1": sup_job_quant_r1[i],
            "size": sup_job_quant_size[i],
            "data": dataset + ".test",
        }
        configuration["test"] = test
        configuration["quant_test"] = quant_test
        configurations.append(configuration)
    # Forward the thread count so these jobs honor the caller's setting.
    configurations.append(flickr_job(thread))
    configurations.append(langid_job1(thread))
    configurations.append(langid_job2(thread))
    configurations.append(cooking_job1(thread))
    configurations.append(cooking_job2(thread))
    configurations = check_supervised_configurations(
        configurations, verbose=verbose
    )
    return configurations
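

if __name__ == "__main__":
    # Minimal usage sketch: train, evaluate, and quantize a single
    # configuration with the fastText Python bindings. This assumes the
    # `fasttext` module is installed and that the dataset files referenced
    # above (here, cooking.train / cooking.valid) exist locally; it is an
    # illustration, not part of the test suite itself.
    import fasttext

    config = check_supervised_configuration(cooking_job1())
    model = fasttext.train_supervised(**config["args"])
    # model.test returns (number of examples, precision@1, recall@1).
    n, p1, r1 = model.test(config["test"]["data"])
    print("test:  n=%d p@1=%.3f r@1=%.3f" % (n, p1, r1))
    model.quantize(**config["quant_args"])
    n, p1, r1 = model.test(config["quant_test"]["data"])
    print("quant: n=%d p@1=%.3f r@1=%.3f" % (n, p1, r1))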