| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102 |
- <!DOCTYPE html><html lang="en"><head><meta charSet="utf-8"/><meta http-equiv="X-UA-Compatible" content="IE=edge"/><title>English word vectors · fastText</title><meta name="viewport" content="width=device-width, initial-scale=1.0"/><meta name="generator" content="Docusaurus"/><meta name="description" content="This page gathers several pre-trained word vectors trained using fastText."/><meta name="docsearch:language" content="en"/><meta property="og:title" content="English word vectors · fastText"/><meta property="og:type" content="website"/><meta property="og:url" content="https://fasttext.cc/index.html"/><meta property="og:description" content="This page gathers several pre-trained word vectors trained using fastText."/><meta property="og:image" content="https://fasttext.cc/img/ogimage.png"/><meta name="twitter:card" content="summary"/><link rel="shortcut icon" href="/img/fasttext-icon-bg-web.png"/><link rel="stylesheet" href="//cdnjs.cloudflare.com/ajax/libs/highlight.js/9.12.0/styles/default.min.css"/><link rel="alternate" type="application/atom+xml" href="https://fasttext.cc/blog/atom.xml" title="fastText Blog ATOM Feed"/><link rel="alternate" type="application/rss+xml" href="https://fasttext.cc/blog/feed.xml" title="fastText Blog RSS Feed"/><script>
- (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ // Google Analytics loader: stores the global name r under i['GoogleAnalyticsObject'] and, unless i[r] already exists, installs a stub function there
- (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), // the stub appends each call's arguments to the i[r].q array; i[r].l holds the current time coerced to a number; a is a newly created o element
- m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) // m is the first existing o element; a is flagged async, pointed at g, and inserted just before m
- })(window,document,'script','https://www.google-analytics.com/analytics.js','ga'); // i=window, s=document, o='script', g=the analytics.js URL, r='ga'; a and m are not passed and serve as locals
- ga('create', 'UA-44373548-30', 'auto'); // calls ga with a 'create' command, the id string 'UA-44373548-30' and 'auto'
- ga('send', 'pageview'); // calls ga with a 'send' command for 'pageview'
- </script><script type="text/javascript" src="/tabber.js"></script><script src="/js/scrollSpy.js"></script><link rel="stylesheet" href="/css/main.css"/><script src="/js/codetabs.js"></script></head><body class="sideNavVisible"><div class="fixedHeaderContainer"><div class="headerWrapper wrapper"><header><a href="/"><img class="logo" src="/img/fasttext-icon-white-web.png" alt="fastText"/></a><div class="navigationWrapper navigationSlider"><nav class="slidingNav"><ul class="nav-site nav-site-internal"><li class=""><a href="/docs/en/support.html" target="_self">Docs</a></li><li class="siteNavGroupActive siteNavItemActive"><a href="/docs/en/english-vectors.html" target="_self">Resources</a></li><li class=""><a href="/blog/" target="_self">Blog</a></li><li class=""><a href="https://github.com/facebookresearch/fastText/" target="_blank">GitHub</a></li></ul></nav></div></header></div></div><div class="navPusher"><div class="docMainWrapper wrapper"><div class="docsNavContainer" id="docsNav"><nav class="toc"><div class="toggleNav"><section class="navWrapper wrapper"><div class="navBreadcrumb wrapper"><div class="navToggle" id="navToggler"><div class="hamburger-menu"><div class="line1"></div><div class="line2"></div><div class="line3"></div></div></div><h2><i>›</i><span>Resources</span></h2></div><div class="navGroups"><div class="navGroup"><h3 class="navGroupCategoryTitle">Resources</h3><ul class=""><li class="navListItem navListItemActive"><a class="navItem" href="/docs/en/english-vectors.html">English word vectors</a></li><li class="navListItem"><a class="navItem" href="/docs/en/crawl-vectors.html">Word vectors for 157 languages</a></li><li class="navListItem"><a class="navItem" href="/docs/en/pretrained-vectors.html">Wiki word vectors</a></li><li class="navListItem"><a class="navItem" href="/docs/en/aligned-vectors.html">Aligned word vectors</a></li><li class="navListItem"><a class="navItem" href="/docs/en/supervised-models.html">Supervised models</a></li><li 
class="navListItem"><a class="navItem" href="/docs/en/language-identification.html">Language identification</a></li><li class="navListItem"><a class="navItem" href="/docs/en/dataset.html">Datasets</a></li></ul></div></div></section></div><script>
- var coll = document.getElementsByClassName('collapsible'); // Sidebar setup: each element with class 'collapsible' is treated as a header whose next element sibling is its content. NOTE(review): no element with that class appears in the markup visible on this page - confirm the loop is expected to be a no-op here.
- var checkActiveCategory = true; // stays true until the first header whose content holds the active nav item has been handled
- for (var i = 0; i < coll.length; i++) {
- var links = coll[i].nextElementSibling.getElementsByTagName('*'); // every descendant element of the header's next sibling
- if (checkActiveCategory){
- for (var j = 0; j < links.length; j++) {
- if (links[j].classList.contains('navListItemActive')){ // this content contains the active nav item
- coll[i].nextElementSibling.classList.toggle('hide'); // flip the 'hide' class on the content element
- coll[i].childNodes[1].classList.toggle('rotate'); // flip 'rotate' on the header's second child node (the node the click handler below calls arrow)
- checkActiveCategory = false; // only the first match is toggled; later headers skip this scan
- break;
- }
- }
- }
- coll[i].addEventListener('click', function() { // clicking a header flips the same two classes
- var arrow = this.childNodes[1]; // 'this' is the clicked header element
- arrow.classList.toggle('rotate');
- var content = this.nextElementSibling;
- content.classList.toggle('hide');
- });
- }
- document.addEventListener('DOMContentLoaded', function() { // Once the DOM is parsed, wire up the nav and table-of-contents togglers
- createToggler('#navToggler', '#docsNav', 'docsSliderActive'); // createToggler is a function declaration further down in this callback, so it is hoisted and callable here
- createToggler('#tocToggler', 'body', 'tocActive'); // NOTE(review): no element with id tocToggler appears in the markup visible on this page; createToggler returns early in that case
- var headings = document.querySelector('.toc-headings'); // null when the page has no .toc-headings element
- headings && headings.addEventListener('click', function(event) { // listener is only attached when headings exists
- var el = event.target;
- while(el !== headings){ // walk from the clicked node up through its ancestors until reaching headings
- if (el.tagName === 'A') {
- document.body.classList.remove('tocActive'); // a link inside headings was clicked: drop the 'tocActive' class from body
- break;
- } else{
- el = el.parentNode;
- }
- }
- }, false);
- function createToggler(togglerSelector, targetSelector, className) { // Makes a click on the first element matching togglerSelector toggle className on the first element matching targetSelector
- var toggler = document.querySelector(togglerSelector);
- var target = document.querySelector(targetSelector);
- if (!toggler) { // nothing to wire up when the toggler element is absent
- return;
- }
- toggler.onclick = function(event) { // assigned via onclick, so it replaces any handler previously set on that property
- event.preventDefault();
- target.classList.toggle(className); // NOTE(review): target is not null-checked; if targetSelector matches nothing this line throws on click
- };
- }
- });
- </script></nav></div><div class="container mainContainer docsContainer"><div class="wrapper"><div class="post"><header class="postHeader"><h1 id="__docusaurus" class="postHeaderTitle">English word vectors</h1></header><article><div><span><p>This page gathers several pre-trained word vectors trained using fastText.</p>
- <h3><a class="anchor" aria-hidden="true" id="download-pre-trained-word-vectors"></a><a href="#download-pre-trained-word-vectors" aria-hidden="true" class="hash-link"><svg class="hash-link-icon" aria-hidden="true" height="16" version="1.1" viewBox="0 0 16 16" width="16"><path fill-rule="evenodd" d="M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"></path></svg></a>Download pre-trained word vectors</h3>
- <p>Pre-trained word vectors learned on different sources can be downloaded below:</p>
- <ol>
- <li><a href="https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip">wiki-news-300d-1M.vec.zip</a>: 1 million word vectors trained on Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset (16B tokens).</li>
- <li><a href="https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M-subword.vec.zip">wiki-news-300d-1M-subword.vec.zip</a>: 1 million word vectors trained with subword information on Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset (16B tokens).</li>
- <li><a href="https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip">crawl-300d-2M.vec.zip</a>: 2 million word vectors trained on Common Crawl (600B tokens).</li>
- <li><a href="https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M-subword.zip">crawl-300d-2M-subword.zip</a>: 2 million word vectors trained with subword information on Common Crawl (600B tokens).</li>
- </ol>
- <h3><a class="anchor" aria-hidden="true" id="format"></a><a href="#format" aria-hidden="true" class="hash-link"><svg class="hash-link-icon" aria-hidden="true" height="16" version="1.1" viewBox="0 0 16 16" width="16"><path fill-rule="evenodd" d="M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"></path></svg></a>Format</h3>
- <p>The first line of the file contains the number of words in the vocabulary and the size of the vectors.
- Each line contains a word followed by its vector, as in the default fastText text format.
- Each value is space separated. Words are ordered by descending frequency.
- These text models can easily be loaded in Python using the following code:</p>
- <pre><code class="hljs css language-python"><span class="hljs-keyword">import</span> io
- <span class="hljs-function"><span class="hljs-keyword">def</span> <span class="hljs-title">load_vectors</span><span class="hljs-params">(fname)</span>:</span>
- fin = io.open(fname, <span class="hljs-string">'r'</span>, encoding=<span class="hljs-string">'utf-8'</span>, newline=<span class="hljs-string">'\n'</span>, errors=<span class="hljs-string">'ignore'</span>)
- n, d = map(int, fin.readline().split())
- data = {}
- <span class="hljs-keyword">for</span> line <span class="hljs-keyword">in</span> fin:
- tokens = line.rstrip().split(<span class="hljs-string">' '</span>)
- data[tokens[<span class="hljs-number">0</span>]] = map(float, tokens[<span class="hljs-number">1</span>:])
- <span class="hljs-keyword">return</span> data
- </code></pre>
- <h3><a class="anchor" aria-hidden="true" id="license"></a><a href="#license" aria-hidden="true" class="hash-link"><svg class="hash-link-icon" aria-hidden="true" height="16" version="1.1" viewBox="0 0 16 16" width="16"><path fill-rule="evenodd" d="M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"></path></svg></a>License</h3>
- <p>These word vectors are distributed under the <a href="https://creativecommons.org/licenses/by-sa/3.0/"><em>Creative Commons Attribution-Share-Alike License 3.0</em></a>.</p>
- <h3><a class="anchor" aria-hidden="true" id="references"></a><a href="#references" aria-hidden="true" class="hash-link"><svg class="hash-link-icon" aria-hidden="true" height="16" version="1.1" viewBox="0 0 16 16" width="16"><path fill-rule="evenodd" d="M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"></path></svg></a>References</h3>
- <p>If you use these word vectors, please cite the following paper:</p>
- <p>T. Mikolov, E. Grave, P. Bojanowski, C. Puhrsch, A. Joulin. <a href="https://arxiv.org/abs/1712.09405"><em>Advances in Pre-Training Distributed Word Representations</em></a></p>
- <pre><code class="hljs css language-markup">@inproceedings{mikolov2018advances,
- title={Advances in Pre-Training <span class="hljs-keyword">Distributed </span>Word Representations},
- author={Mikolov, Tomas <span class="hljs-keyword">and </span>Grave, Edouard <span class="hljs-keyword">and </span><span class="hljs-keyword">Bojanowski, </span>Piotr <span class="hljs-keyword">and </span>Puhrsch, Christian <span class="hljs-keyword">and </span><span class="hljs-keyword">Joulin, </span>Armand},
- <span class="hljs-keyword">booktitle={Proceedings </span>of the International Conference on Language Resources <span class="hljs-keyword">and </span>Evaluation (LREC <span class="hljs-number">2018</span>)},
- year={<span class="hljs-number">2018</span>}
- }
- </code></pre>
- </span></div></article></div><div class="docs-prevnext"><a class="docs-next button" href="/docs/en/crawl-vectors.html"><span>Word vectors for 157 languages</span><span class="arrow-next"> →</span></a></div></div></div></div><footer class="nav-footer" id="footer"><section class="sitemap"><a href="/" class="nav-home"><img src="/img/fasttext-icon-white-web.png" alt="fastText"/></a><div><h5>Support</h5><a href="/docs/en/support.html">Getting Started</a><a href="/docs/en/supervised-tutorial.html">Tutorials</a><a href="/docs/en/faqs.html">FAQs</a><a href="/docs/en/api.html">API</a></div><div><h5>Community</h5><a href="https://www.facebook.com/groups/1174547215919768/" target="_blank">Facebook Group</a><a href="http://stackoverflow.com/questions/tagged/fasttext" target="_blank">Stack Overflow</a><a href="https://groups.google.com/forum/#!forum/fasttext-library" target="_blank">Google Group</a></div><div><h5>More</h5><a href="/blog">Blog</a><a href="https://github.com/facebookresearch/fastText" target="_blank">GitHub</a><a class="github-button" href="https://github.com/facebookresearch/fastText/" data-icon="octicon-star" data-count-href="/fastText/stargazers" data-count-api="/repos/fastText#stargazers_count" data-count-aria-label="# stargazers on GitHub" aria-label="Star this project on GitHub">Star</a></div></section><a href="https://code.facebook.com/projects/" target="_blank" class="fbOpenSource"><img src="/img/oss_logo.png" alt="Facebook Open Source" width="170" height="45"/></a><section class="copyright">Copyright © 2022 Facebook Inc.</section></footer></div></body></html>
|