english-vectors.html 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102
  1. <!DOCTYPE html><html lang="en"><head><meta charSet="utf-8"/><meta http-equiv="X-UA-Compatible" content="IE=edge"/><title>English word vectors · fastText</title><meta name="viewport" content="width=device-width, initial-scale=1.0"/><meta name="generator" content="Docusaurus"/><meta name="description" content="This page gathers several pre-trained word vectors trained using fastText."/><meta name="docsearch:language" content="en"/><meta property="og:title" content="English word vectors · fastText"/><meta property="og:type" content="website"/><meta property="og:url" content="https://fasttext.cc/index.html"/><meta property="og:description" content="This page gathers several pre-trained word vectors trained using fastText."/><meta property="og:image" content="https://fasttext.cc/img/ogimage.png"/><meta name="twitter:card" content="summary"/><link rel="shortcut icon" href="/img/fasttext-icon-bg-web.png"/><link rel="stylesheet" href="//cdnjs.cloudflare.com/ajax/libs/highlight.js/9.12.0/styles/default.min.css"/><link rel="alternate" type="application/atom+xml" href="https://fasttext.cc/blog/atom.xml" title="fastText Blog ATOM Feed"/><link rel="alternate" type="application/rss+xml" href="https://fasttext.cc/blog/feed.xml" title="fastText Blog RSS Feed"/><script>
  2. (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ // loader IIFE: stores the global name r on i, then defines i[r] as a stub function unless it already exists
  3. (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), // the stub appends each call's arguments to the queue i[r].q; i[r].l = current time as a number; a = newly created o element
  4. m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) // m = first existing o element; a is flagged async, pointed at g and inserted just before m
  5. })(window,document,'script','https://www.google-analytics.com/analytics.js','ga'); // i=window, s=document, o='script', g=analytics.js URL, r='ga'; a and m are not passed and serve as locals
  6. ga('create', 'UA-44373548-30', 'auto'); // presumably registers the tracker for this ID with automatic cookie-domain selection — confirm against the analytics.js docs
  7. ga('send', 'pageview'); // presumably reports a page view; while ga is still the stub above, the call is only queued in ga.q
  8. </script><script type="text/javascript" src="/tabber.js"></script><script src="/js/scrollSpy.js"></script><link rel="stylesheet" href="/css/main.css"/><script src="/js/codetabs.js"></script></head><body class="sideNavVisible"><div class="fixedHeaderContainer"><div class="headerWrapper wrapper"><header><a href="/"><img class="logo" src="/img/fasttext-icon-white-web.png" alt="fastText"/></a><div class="navigationWrapper navigationSlider"><nav class="slidingNav"><ul class="nav-site nav-site-internal"><li class=""><a href="/docs/en/support.html" target="_self">Docs</a></li><li class="siteNavGroupActive siteNavItemActive"><a href="/docs/en/english-vectors.html" target="_self">Resources</a></li><li class=""><a href="/blog/" target="_self">Blog</a></li><li class=""><a href="https://github.com/facebookresearch/fastText/" target="_blank">GitHub</a></li></ul></nav></div></header></div></div><div class="navPusher"><div class="docMainWrapper wrapper"><div class="docsNavContainer" id="docsNav"><nav class="toc"><div class="toggleNav"><section class="navWrapper wrapper"><div class="navBreadcrumb wrapper"><div class="navToggle" id="navToggler"><div class="hamburger-menu"><div class="line1"></div><div class="line2"></div><div class="line3"></div></div></div><h2><i>›</i><span>Resources</span></h2></div><div class="navGroups"><div class="navGroup"><h3 class="navGroupCategoryTitle">Resources</h3><ul class=""><li class="navListItem navListItemActive"><a class="navItem" href="/docs/en/english-vectors.html">English word vectors</a></li><li class="navListItem"><a class="navItem" href="/docs/en/crawl-vectors.html">Word vectors for 157 languages</a></li><li class="navListItem"><a class="navItem" href="/docs/en/pretrained-vectors.html">Wiki word vectors</a></li><li class="navListItem"><a class="navItem" href="/docs/en/aligned-vectors.html">Aligned word vectors</a></li><li class="navListItem"><a class="navItem" href="/docs/en/supervised-models.html">Supervised models</a></li><li 
class="navListItem"><a class="navItem" href="/docs/en/language-identification.html">Language identification</a></li><li class="navListItem"><a class="navItem" href="/docs/en/dataset.html">Datasets</a></li></ul></div></div></section></div><script>
  9. var coll = document.getElementsByClassName('collapsible'); // every element with class 'collapsible'; each is treated as a header whose next sibling element is its content. NOTE(review): no element in this page's visible markup has that class, so the loop below appears to be a no-op here
  10. var checkActiveCategory = true; // stays true until the first group containing the active nav item has been handled
  11. for (var i = 0; i < coll.length; i++) {
  12. var links = coll[i].nextElementSibling.getElementsByTagName('*'); // all descendant elements of the node following this header
  13. if (checkActiveCategory){
  14. for (var j = 0; j < links.length; j++) {
  15. if (links[j].classList.contains('navListItemActive')){ // this group holds the item marked as active
  16. coll[i].nextElementSibling.classList.toggle('hide'); // flip 'hide' on the group's content (presumably revealing a group that starts hidden — depends on markup/CSS not shown here)
  17. coll[i].childNodes[1].classList.toggle('rotate'); // flip 'rotate' on the header's second child node, the same node the click handler below calls `arrow`
  18. checkActiveCategory = false; // only the first matching group gets this initial toggle
  19. break;
  20. }
  21. }
  22. }
  23. coll[i].addEventListener('click', function() { // clicking a header flips the same two classes
  24. var arrow = this.childNodes[1];
  25. arrow.classList.toggle('rotate');
  26. var content = this.nextElementSibling;
  27. content.classList.toggle('hide');
  28. });
  29. }
  30. document.addEventListener('DOMContentLoaded', function() {
  31. createToggler('#navToggler', '#docsNav', 'docsSliderActive');
  32. createToggler('#tocToggler', 'body', 'tocActive');
  33. var headings = document.querySelector('.toc-headings');
  34. headings && headings.addEventListener('click', function(event) {
  35. var el = event.target;
  36. while(el !== headings){
  37. if (el.tagName === 'A') {
  38. document.body.classList.remove('tocActive');
  39. break;
  40. } else{
  41. el = el.parentNode;
  42. }
  43. }
  44. }, false);
  45. function createToggler(togglerSelector, targetSelector, className) {
  46. var toggler = document.querySelector(togglerSelector);
  47. var target = document.querySelector(targetSelector);
  48. if (!toggler) {
  49. return;
  50. }
  51. toggler.onclick = function(event) {
  52. event.preventDefault();
  53. target.classList.toggle(className);
  54. };
  55. }
  56. });
  57. </script></nav></div><div class="container mainContainer docsContainer"><div class="wrapper"><div class="post"><header class="postHeader"><h1 id="__docusaurus" class="postHeaderTitle">English word vectors</h1></header><article><div><span><p>This page gathers several pre-trained word vectors trained using fastText.</p>
  58. <h3><a class="anchor" aria-hidden="true" id="download-pre-trained-word-vectors"></a><a href="#download-pre-trained-word-vectors" aria-hidden="true" class="hash-link"><svg class="hash-link-icon" aria-hidden="true" height="16" version="1.1" viewBox="0 0 16 16" width="16"><path fill-rule="evenodd" d="M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"></path></svg></a>Download pre-trained word vectors</h3>
  59. <p>Pre-trained word vectors learned on different sources can be downloaded below:</p>
  60. <ol>
  61. <li><a href="https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip">wiki-news-300d-1M.vec.zip</a>: 1 million word vectors trained on Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset (16B tokens).</li>
  62. <li><a href="https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M-subword.vec.zip">wiki-news-300d-1M-subword.vec.zip</a>: 1 million word vectors trained with subword information on Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset (16B tokens).</li>
  63. <li><a href="https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip">crawl-300d-2M.vec.zip</a>: 2 million word vectors trained on Common Crawl (600B tokens).</li>
  64. <li><a href="https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M-subword.zip">crawl-300d-2M-subword.zip</a>: 2 million word vectors trained with subword information on Common Crawl (600B tokens).</li>
  65. </ol>
  66. <h3><a class="anchor" aria-hidden="true" id="format"></a><a href="#format" aria-hidden="true" class="hash-link"><svg class="hash-link-icon" aria-hidden="true" height="16" version="1.1" viewBox="0 0 16 16" width="16"><path fill-rule="evenodd" d="M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"></path></svg></a>Format</h3>
  67. <p>The first line of the file contains the number of words in the vocabulary and the size of the vectors.
  68. Each subsequent line contains a word followed by its vector, as in the default fastText text format.
  69. Each value is space separated. Words are ordered by descending frequency.
  70. These text models can easily be loaded in Python using the following code:</p>
  71. <!-- Python 3 sample: reads a .vec text file into a dict mapping each word to a list of floats --><pre><code class="hljs css language-python"><span class="hljs-keyword">import</span> io
  72. <span class="hljs-function"><span class="hljs-keyword">def</span> <span class="hljs-title">load_vectors</span><span class="hljs-params">(fname)</span>:</span>
  73.     <span class="hljs-keyword">with</span> io.open(fname, <span class="hljs-string">'r'</span>, encoding=<span class="hljs-string">'utf-8'</span>, newline=<span class="hljs-string">'\n'</span>, errors=<span class="hljs-string">'ignore'</span>) <span class="hljs-keyword">as</span> fin:
  74.         n, d = map(int, fin.readline().split())  <span class="hljs-comment"># header line: vocabulary size and vector dimension</span>
  75.         data = {}
  76.         <span class="hljs-keyword">for</span> line <span class="hljs-keyword">in</span> fin:
  77.             tokens = line.rstrip().split(<span class="hljs-string">' '</span>)
  78.             data[tokens[<span class="hljs-number">0</span>]] = list(map(float, tokens[<span class="hljs-number">1</span>:]))  <span class="hljs-comment"># materialize: map() is lazy in Python 3</span>
  79.     <span class="hljs-keyword">return</span> data
  80. </code></pre>
  81. <h3><a class="anchor" aria-hidden="true" id="license"></a><a href="#license" aria-hidden="true" class="hash-link"><svg class="hash-link-icon" aria-hidden="true" height="16" version="1.1" viewBox="0 0 16 16" width="16"><path fill-rule="evenodd" d="M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"></path></svg></a>License</h3>
  82. <p>These word vectors are distributed under the <a href="https://creativecommons.org/licenses/by-sa/3.0/"><em>Creative Commons Attribution-Share-Alike License 3.0</em></a>.</p>
  83. <h3><a class="anchor" aria-hidden="true" id="references"></a><a href="#references" aria-hidden="true" class="hash-link"><svg class="hash-link-icon" aria-hidden="true" height="16" version="1.1" viewBox="0 0 16 16" width="16"><path fill-rule="evenodd" d="M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"></path></svg></a>References</h3>
  84. <p>If you use these word vectors, please cite the following paper:</p>
  85. <p>T. Mikolov, E. Grave, P. Bojanowski, C. Puhrsch, A. Joulin. <a href="https://arxiv.org/abs/1712.09405"><em>Advances in Pre-Training Distributed Word Representations</em></a></p>
  86. <pre><code class="hljs css language-markup">@inproceedings{mikolov2018advances,
  87. title={Advances in Pre-Training <span class="hljs-keyword">Distributed </span>Word Representations},
  88. author={Mikolov, Tomas <span class="hljs-keyword">and </span>Grave, Edouard <span class="hljs-keyword">and </span><span class="hljs-keyword">Bojanowski, </span>Piotr <span class="hljs-keyword">and </span>Puhrsch, Christian <span class="hljs-keyword">and </span><span class="hljs-keyword">Joulin, </span>Armand},
  89. <span class="hljs-keyword">booktitle={Proceedings </span>of the International Conference on Language Resources <span class="hljs-keyword">and </span>Evaluation (LREC <span class="hljs-number">2018</span>)},
  90. year={<span class="hljs-number">2018</span>}
  91. }
  92. </code></pre>
  93. </span></div></article></div><div class="docs-prevnext"><a class="docs-next button" href="/docs/en/crawl-vectors.html"><span>Word vectors for 157 languages</span><span class="arrow-next"> →</span></a></div></div></div></div><footer class="nav-footer" id="footer"><section class="sitemap"><a href="/" class="nav-home"><img src="/img/fasttext-icon-white-web.png" alt="fastText"/></a><div><h5>Support</h5><a href="/docs/en/support.html">Getting Started</a><a href="/docs/en/supervised-tutorial.html">Tutorials</a><a href="/docs/en/faqs.html">FAQs</a><a href="/docs/en/api.html">API</a></div><div><h5>Community</h5><a href="https://www.facebook.com/groups/1174547215919768/" target="_blank">Facebook Group</a><a href="http://stackoverflow.com/questions/tagged/fasttext" target="_blank">Stack Overflow</a><a href="https://groups.google.com/forum/#!forum/fasttext-library" target="_blank">Google Group</a></div><div><h5>More</h5><a href="/blog">Blog</a><a href="https://github.com/facebookresearch/fastText" target="_blank">GitHub</a><a class="github-button" href="https://github.com/facebookresearch/fastText/" data-icon="octicon-star" data-count-href="/fastText/stargazers" data-count-api="/repos/fastText#stargazers_count" data-count-aria-label="# stargazers on GitHub" aria-label="Star this project on GitHub">Star</a></div></section><a href="https://code.facebook.com/projects/" target="_blank" class="fbOpenSource"><img src="/img/oss_logo.png" alt="Facebook Open Source" width="170" height="45"/></a><section class="copyright">Copyright © 2022 Facebook Inc.</section></footer></div></body></html>