fastText source code walkthrough

 

Purpose: to record a code walkthrough that combines several references with my own understanding.

https://heleifz.github.io/14732610572844.html

http://www.cnblogs.com/peghoty/p/3857839.html

Part One: overall module relationship diagram

The core modules are fasttext.cc and model.cc, but the auxiliary modules matter as well: they are the nuts and bolts of the code and determine which data structures are used to organize the data. They are well worth studying, and you will find that the structures used to hold the training data are fairly standard techniques, so it is also worth comparing later how several code bases organize their training data.

Next part: walkthrough of the supporting ("nuts and bolts") code

Part Two: the dictionary module (dictionary.cc)

  1 /**
2 * Copyright (c) 2016-present, Facebook, Inc.
3 * All rights reserved.
4 *
5 * This source code is licensed under the BSD-style license found in the
6 * LICENSE file in the root directory of this source tree. An additional grant
7 * of patent rights can be found in the PATENTS file in the same directory.
8 */
9
10 #include "dictionary.h"
11
12 #include <assert.h>
13
14 #include <iostream>
15 #include <algorithm>
16 #include <iterator>
17 #include <unordered_map>
18
19 namespace fasttext {
20
21 const std::string Dictionary::EOS = "</s>";
22 const std::string Dictionary::BOW = "<";
23 const std::string Dictionary::EOW = ">";
24
25 Dictionary::Dictionary(std::shared_ptr<Args> args) {
26 args_ = args;
27 size_ = 0;
28 nwords_ = 0;
29 nlabels_ = 0;
30 ntokens_ = 0;
31 word2int_.resize(MAX_VOCAB_SIZE);//建立全词的索引,hash值在0~MAX_VOCAB_SIZE-1之间
32 for (int32_t i = 0; i < MAX_VOCAB_SIZE; i++) {
33 word2int_[i] = -1;
34 }
35 }
36 //根据字符串,进行hash,hash后若是冲突则线性探索,找到其对应的hash位置
37 int32_t Dictionary::find(const std::string& w) const {
38 int32_t h = hash(w) % MAX_VOCAB_SIZE;
39 while (word2int_[h] != -1 && words_[word2int_[h]].word != w) {
40 h = (h + 1) % MAX_VOCAB_SIZE;
41 }
42 return h;
43 }
44 //向words_添加词,词可能是标签词
45 void Dictionary::add(const std::string& w) {
46 int32_t h = find(w);
47 ntokens_++;//已处理的词
48 if (word2int_[h] == -1) {
49 entry e;
50 e.word = w;
51 e.count = 1;
52 e.type = (w.find(args_->label) == 0) ? entry_type::label : entry_type::word;//与给出标签相同,则表示标签词
53 words_.push_back(e);
54 word2int_[h] = size_++;
55 } else {
56 words_[word2int_[h]].count++;
57 }
58 }
59 //返回纯词个数--去重
60 int32_t Dictionary::nwords() const {
61 return nwords_;
62 }
63 //标签词个数---去重
64 int32_t Dictionary::nlabels() const {
65 return nlabels_;
66 }
67 //返回已经处理的词数---可以重复
68 int64_t Dictionary::ntokens() const {
69 return ntokens_;
70 }
71 //获取纯词的ngram
72 const std::vector<int32_t>& Dictionary::getNgrams(int32_t i) const {
73 assert(i >= 0);
74 assert(i < nwords_);
75 return words_[i].subwords;
76 }
77 //获取纯词的ngram,根据词串
78 const std::vector<int32_t> Dictionary::getNgrams(const std::string& word) const {
79 int32_t i = getId(word);
80 if (i >= 0) {
81 return getNgrams(i);
82 }
83 //若是该词没有被入库词典中,未知词,则计算ngram
84 //这就可以通过其他词的近似ngram来获取该词的ngram
85 std::vector<int32_t> ngrams;
86 computeNgrams(BOW + word + EOW, ngrams);
87 return ngrams;
88 }
89 //是否丢弃的判断标准---这是由于无用词会出现过多的词频,需要被丢弃,
90 bool Dictionary::discard(int32_t id, real rand) const {
91 assert(id >= 0);
92 assert(id < nwords_);
93 if (args_->model == model_name::sup) return false;//非词向量不需要丢弃
94 return rand > pdiscard_[id];
95 }
96 //获取词的id号
97 int32_t Dictionary::getId(const std::string& w) const {
98 int32_t h = find(w);
99 return word2int_[h];
100 }
101 //词的类型
102 entry_type Dictionary::getType(int32_t id) const {
103 assert(id >= 0);
104 assert(id < size_);
105 return words_[id].type;
106 }
107 //根据词id获取词串
108 std::string Dictionary::getWord(int32_t id) const {
109 assert(id >= 0);
110 assert(id < size_);
111 return words_[id].word;
112 }
113 //hash规则
114 uint32_t Dictionary::hash(const std::string& str) const {
115 uint32_t h = 2166136261;
116 for (size_t i = 0; i < str.size(); i++) {
117 h = h ^ uint32_t(str[i]);
118 h = h * 16777619;
119 }
120 return h;
121 }
122 //根据词计算其ngram情况
123 void Dictionary::computeNgrams(const std::string& word,
124 std::vector<int32_t>& ngrams) const {
125 for (size_t i = 0; i < word.size(); i++) {
126 std::string ngram;
127 if ((word[i] & 0xC0) == 0x80) continue;
128 for (size_t j = i, n = 1; j < word.size() && n <= args_->maxn; n++) {//n-1个词背景
129 ngram.push_back(word[j++]);
130 while (j < word.size() && (word[j] & 0xC0) == 0x80) {
131 ngram.push_back(word[j++]);
132 }
133 if (n >= args_->minn && !(n == 1 && (i == 0 || j == word.size()))) {
134 int32_t h = hash(ngram) % args_->bucket;//hash余数值
135 ngrams.push_back(nwords_ + h);
136 }
137 }
138 }
139 }
140 //初始化ngram值
141 void Dictionary::initNgrams() {
142 for (size_t i = 0; i < size_; i++) {
143 std::string word = BOW + words_[i].word + EOW;
144 words_[i].subwords.push_back(i);
145 computeNgrams(word, words_[i].subwords);
146 }
147 }
148 //读取词
149 bool Dictionary::readWord(std::istream& in, std::string& word) const
150 {
151 char c;
152 std::streambuf& sb = *in.rdbuf();
153 word.clear();
154 while ((c = sb.sbumpc()) != EOF) {
155 if (c == ' ' || c == '\n' || c == '\r' || c == '\t' || c == '\v' || c == '\f' || c == '\0') {
156 if (word.empty()) {
157 if (c == '\n') {//若是空行,则增加一个EOS
158 word += EOS;
159 return true;
160 }
161 continue;
162 } else {
163 if (c == '\n')
164 sb.sungetc();//放回,体现对于换行符会用EOS替换
165 return true;
166 }
167 }
168 word.push_back(c);
169 }
170 // trigger eofbit
171 in.get();
172 return !word.empty();
173 }
174 //读取文件---获取词典;初始化舍弃规则,初始化ngram
175 void Dictionary::readFromFile(std::istream& in) {
176 std::string word;
177 int64_t minThreshold = 1;//阈值
178 while (readWord(in, word)) {
179 add(word);
180 if (ntokens_ % 1000000 == 0 && args_->verbose > 1) {
181 std::cout << "\rRead " << ntokens_ / 1000000 << "M words" << std::flush;
182 }
183 if (size_ > 0.75 * MAX_VOCAB_SIZE) {//词保证是不超过75%
184 minThreshold++;
185 threshold(minThreshold, minThreshold);//过滤小于minThreshold的词,顺便排序了
186 }
187 }
188 threshold(args_->minCount, args_->minCountLabel);//目的是排序,顺带过滤词,指定过滤
189
190 initTableDiscard();
191 initNgrams();
192 if (args_->verbose > 0) {
193 std::cout << "\rRead " << ntokens_ / 1000000 << "M words" << std::endl;
194 std::cout << "Number of words: " << nwords_ << std::endl;
195 std::cout << "Number of labels: " << nlabels_ << std::endl;
196 }
197 if (size_ == 0) {
198 std::cerr << "Empty vocabulary. Try a smaller -minCount value." << std::endl;
199 exit(EXIT_FAILURE);
200 }
201 }
202 //缩减词,且排序词
203 void Dictionary::threshold(int64_t t, int64_t tl) {
204 sort(words_.begin(), words_.end(), [](const entry& e1, const entry& e2) {
205 if (e1.type != e2.type) return e1.type < e2.type;//不同类型词,将标签词排在后面
206 return e1.count > e2.count;//同类则词频降序排
207 });//排序,根据词频
208 words_.erase(remove_if(words_.begin(), words_.end(), [&](const entry& e) {
209 return (e.type == entry_type::word && e.count < t) ||
210 (e.type == entry_type::label && e.count < tl);
211 }), words_.end());//删除阈值以下的词
212 words_.shrink_to_fit();//剔除
213 //更新词典的信息
214 size_ = 0;
215 nwords_ = 0;
216 nlabels_ = 0;
217 for (int32_t i = 0; i < MAX_VOCAB_SIZE; i++) {
218 word2int_[i] = -1;//重置
219 }
220 for (auto it = words_.begin(); it != words_.end(); ++it) {
221 int32_t h = find(it->word);//重新构造hash
222 word2int_[h] = size_++;
223 if (it->type == entry_type::word) nwords_++;
224 if (it->type == entry_type::label) nlabels_++;
225 }
226 }
227 //初始化丢弃规则---
228 void Dictionary::initTableDiscard() {//t采样的阈值,0表示全部舍弃,1表示不采样
229 pdiscard_.resize(size_);
230 for (size_t i = 0; i < size_; i++) {
231 real f = real(words_[i].count) / real(ntokens_);//f概率高
232 pdiscard_[i] = sqrt(args_->t / f) + args_->t / f;//与论文貌似不一样?????
233 }
234 }
235 //返回词的频数--所以词的词频和
236 std::vector<int64_t> Dictionary::getCounts(entry_type type) const {
237 std::vector<int64_t> counts;
238 for (auto& w : words_) {
239 if (w.type == type) counts.push_back(w.count);
240 }
241 return counts;
242 }
243 //增加ngram,
244 void Dictionary::addNgrams(std::vector<int32_t>& line, int32_t n) const {
245 int32_t line_size = line.size();
246 for (int32_t i = 0; i < line_size; i++) {
247 uint64_t h = line[i];
248 for (int32_t j = i + 1; j < line_size && j < i + n; j++) {
249 h = h * 116049371 + line[j];
250 line.push_back(nwords_ + (h % args_->bucket));
251 }
252 }
253 }
254 //获取词行
255 int32_t Dictionary::getLine(std::istream& in,
256 std::vector<int32_t>& words,
257 std::vector<int32_t>& labels,
258 std::minstd_rand& rng) const {
259 std::uniform_real_distribution<> uniform(0, 1);//均匀随机0~1
260 std::string token;
261 int32_t ntokens = 0;
262 words.clear();
263 labels.clear();
264 if (in.eof()) {
265 in.clear();
266 in.seekg(std::streampos(0));
267 }
268 while (readWord(in, token)) {
269 if (token == EOS) break;//表示一行的结束
270 int32_t wid = getId(token);
271 if (wid < 0) continue;//表示词的id木有,代表未知词,则跳过
272 entry_type type = getType(wid);
273 ntokens++;//已经获取词数
274 if (type == entry_type::word && !discard(wid, uniform(rng))) {//随机采取样,表示是否取该词
275 words.push_back(wid);//词的收集--词肯定在nwords_以下
276 }
277 if (type == entry_type::label) {//标签词全部采取,肯定在nwords_以上
278 labels.push_back(wid - nwords_);//也就是labels的值需要加上nwords才能够寻找到标签词
279 }
280 if (words.size() > MAX_LINE_SIZE && args_->model != model_name::sup) break;//词向量则有限制句子长度
281 }
282 return ntokens;
283 }
284 //获取标签词,根据的是标签词的lid
285 std::string Dictionary::getLabel(int32_t lid) const {//标签词
286 assert(lid >= 0);
287 assert(lid < nlabels_);
288 return words_[lid + nwords_].word;
289 }
290 //保存词典
291 void Dictionary::save(std::ostream& out) const {
292 out.write((char*) &size_, sizeof(int32_t));
293 out.write((char*) &nwords_, sizeof(int32_t));
294 out.write((char*) &nlabels_, sizeof(int32_t));
295 out.write((char*) &ntokens_, sizeof(int64_t));
296 for (int32_t i = 0; i < size_; i++) {//词
297 entry e = words_[i];
298 out.write(e.word.data(), e.word.size() * sizeof(char));//词
299 out.put(0);//字符串结束标志位
300 out.write((char*) &(e.count), sizeof(int64_t));
301 out.write((char*) &(e.type), sizeof(entry_type));
302 }
303 }
304 //加载词典
305 void Dictionary::load(std::istream& in) {
306 words_.clear();
307 for (int32_t i = 0; i < MAX_VOCAB_SIZE; i++) {
308 word2int_[i] = -1;
309 }
310 in.read((char*) &size_, sizeof(int32_t));
311 in.read((char*) &nwords_, sizeof(int32_t));
312 in.read((char*) &nlabels_, sizeof(int32_t));
313 in.read((char*) &ntokens_, sizeof(int64_t));
314 for (int32_t i = 0; i < size_; i++) {
315 char c;
316 entry e;
317 while ((c = in.get()) != 0) {
318 e.word.push_back(c);
319 }
320 in.read((char*) &e.count, sizeof(int64_t));
321 in.read((char*) &e.type, sizeof(entry_type));
322 words_.push_back(e);
323 word2int_[find(e.word)] = i;//建立索引
324 }
325 initTableDiscard();//初始化抛弃规则
326 initNgrams();//初始化ngram词
327 }
328
329 }

A few points I think are worth spelling out:

1: How a string is mapped to its entry, i.e. how the index is built (see the figure below). The key function is find; internally it relies on the hash function to define the hashing rule and uses two vectors to chain everything together: string to hash (the find function), hash to index (the word2int_ array), index to entry struct (the words_ array). A small sketch of this chain follows below.
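
A minimal sketch of that chain, assuming a much smaller table than the real MAX_VOCAB_SIZE (30M in dictionary.h); the member names mirror the source, but these free functions are illustrative only, not the class itself.

#include <cstdint>
#include <string>
#include <vector>

struct entry { std::string word; int64_t count; };

const int32_t TABLE_SIZE = 1024;                 // stand-in for MAX_VOCAB_SIZE
std::vector<int32_t> word2int_(TABLE_SIZE, -1);  // hash slot -> index into words_
std::vector<entry> words_;                       // index -> word record

uint32_t hash(const std::string& str) {          // same FNV-1a scheme as Dictionary::hash
  uint32_t h = 2166136261;
  for (char c : str) { h = (h ^ uint32_t(c)) * 16777619; }
  return h;
}

int32_t find(const std::string& w) {             // StrToHash, with linear probing on collisions
  int32_t h = hash(w) % TABLE_SIZE;
  while (word2int_[h] != -1 && words_[word2int_[h]].word != w) {
    h = (h + 1) % TABLE_SIZE;
  }
  return h;                                      // slot of w, or the first free slot if w is absent
}

void add(const std::string& w) {                 // HashToIndex + IndexToStruct, as in Dictionary::add
  int32_t h = find(w);
  if (word2int_[h] == -1) {
    words_.push_back({w, 1});
    word2int_[h] = int32_t(words_.size()) - 1;
  } else {
    words_[word2int_[h]].count++;
  }
}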

2: Several helper tables are initialized up front in order to speed up training:

1) The ngram table: every word gets a list of ids for its character ngrams. For example, for the word "我想你", computeNgrams produces the corresponding ngram indices; assuming a minimum ngram length of 2 and a maximum of 3, the subwords are "<我", "我想", "想你", "你>", "<我想", "我想你", "想你>". The "<" and ">" appear because a begin-of-word and an end-of-word marker are added automatically. Note the check "(word[j] & 0xC0) == 0x80" in the implementation: it detects UTF-8 continuation bytes, so that a multi-byte character (such as a Chinese character) is always taken whole as one "character". A sketch of the extraction follows below.
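
A minimal sketch of the extraction, assuming minn/maxn are passed in as parameters (the real code additionally skips 1-grams consisting only of "<" or ">" and hashes every ngram into one of args_->bucket slots):

#include <iostream>
#include <string>
#include <vector>

std::vector<std::string> charNgrams(const std::string& word, int minn, int maxn) {
  std::vector<std::string> out;
  for (size_t i = 0; i < word.size(); i++) {
    if ((word[i] & 0xC0) == 0x80) continue;          // continuation byte: not a character start
    std::string ngram;
    for (size_t j = i, n = 1; j < word.size() && n <= size_t(maxn); n++) {
      ngram.push_back(word[j++]);
      while (j < word.size() && (word[j] & 0xC0) == 0x80) {
        ngram.push_back(word[j++]);                  // pull in the rest of the multi-byte character
      }
      if (n >= size_t(minn)) out.push_back(ngram);
    }
  }
  return out;
}

int main() {
  for (auto& g : charNgrams("<我想你>", 2, 3)) std::cout << g << ' ';
  std::cout << std::endl;                            // prints: <我 <我想 我想 我想你 想你 想你> 你>
}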

2) The initTableDiscard table: each word gets a value derived from its frequency, and an occurrence of the word is dropped whenever a uniform random draw exceeds that value. Very frequent words ("的", "是", ...) are mostly uninformative, so most of their occurrences are thrown away. Note that the stored value is the probability of keeping the word: with p = sqrt(t/f), the table holds p + p^2 = sqrt(t/f) + t/f instead of the paper's sqrt(t/f). This is the same formula used in the original word2vec C implementation, and since p + p^2 >= p it keeps slightly more occurrences, i.e. the subsampling here is a little gentler than the paper's formula. A worked example follows below.
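
A worked sketch, assuming the default t = 1e-4 and a word that makes up 1% of the corpus; "keep" here is the pdiscard_ value, and the comparison is the same one Dictionary::discard uses:

#include <cmath>
#include <iostream>
#include <random>

int main() {
  double t = 1e-4;
  double f = 0.01;                          // relative frequency of a very common word
  double keep = std::sqrt(t / f) + t / f;   // = 0.11, so roughly 89% of its occurrences are dropped
  std::minstd_rand rng(1);
  std::uniform_real_distribution<> uniform(0, 1);
  int kept = 0, trials = 100000;
  for (int i = 0; i < trials; i++) {
    if (!(uniform(rng) > keep)) kept++;     // discard() returns rand > pdiscard
  }
  std::cout << "keep prob " << keep
            << ", empirically kept " << 100.0 * kept / trials << "%" << std::endl;
}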

3: The two entry points that the rest of the code uses are readFromFile, which builds the vocabulary, and getLine, which returns the words of one sentence. A usage sketch follows below.
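
A usage sketch of those two entry points, assuming it is compiled inside the fastText source tree; "corpus.txt" is only a placeholder. This is essentially what FastText::train and FastText::trainThread do (see Part Four):

#include <cstdint>
#include <fstream>
#include <memory>
#include <random>
#include <vector>
#include "args.h"
#include "dictionary.h"

int main() {
  auto args = std::make_shared<fasttext::Args>();
  fasttext::Dictionary dict(args);

  std::ifstream ifs("corpus.txt");          // placeholder training file
  dict.readFromFile(ifs);                   // pass 1: build vocabulary, discard table, ngram table
  ifs.close();

  std::ifstream in("corpus.txt");
  std::minstd_rand rng(0);
  std::vector<int32_t> words, labels;
  while (in.peek() != EOF) {
    dict.getLine(in, words, labels, rng);   // pass 2: one subsampled sentence per call
    // words  hold ids < nwords(): fed to cbow/skipgram as context
    // labels hold ids offset by nwords(): fed to the supervised update
  }
  return 0;
}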

The similar vector.cc, matrix.cc and args.cc modules are annotated below:

  1 /**
2 * Copyright (c) 2016-present, Facebook, Inc.
3 * All rights reserved.
4 *
5 * This source code is licensed under the BSD-style license found in the
6 * LICENSE file in the root directory of this source tree. An additional grant
7 * of patent rights can be found in the PATENTS file in the same directory.
8 */
9
10 #include "matrix.h"
11
12 #include <assert.h>
13
14 #include <random>
15
16 #include "utils.h"
17 #include "vector.h"
18
19 namespace fasttext {
20
21 Matrix::Matrix() {
22 m_ = 0;
23 n_ = 0;
24 data_ = nullptr;
25 }
26
27 Matrix::Matrix(int64_t m, int64_t n) {
28 m_ = m;
29 n_ = n;
30 data_ = new real[m * n];
31 }
32
33 Matrix::Matrix(const Matrix& other) {
34 m_ = other.m_;
35 n_ = other.n_;
36 data_ = new real[m_ * n_];
37 for (int64_t i = 0; i < (m_ * n_); i++) {
38 data_[i] = other.data_[i];
39 }
40 }
41
42 Matrix& Matrix::operator=(const Matrix& other) {
43 Matrix temp(other);
44 m_ = temp.m_;
45 n_ = temp.n_;
46 std::swap(data_, temp.data_);
47 return *this;
48 }
49
50 Matrix::~Matrix() {
51 delete[] data_;
52 }
53
54 void Matrix::zero() {
55 for (int64_t i = 0; i < (m_ * n_); i++) {
56 data_[i] = 0.0;
57 }
58 }
59 //随机初始化矩阵-均匀随机
60 void Matrix::uniform(real a) {
61 std::minstd_rand rng(1);
62 std::uniform_real_distribution<> uniform(-a, a);
63 for (int64_t i = 0; i < (m_ * n_); i++) {
64 data_[i] = uniform(rng);
65 }
66 }
67 //加向量
68 void Matrix::addRow(const Vector& vec, int64_t i, real a) {
69 assert(i >= 0);
70 assert(i < m_);
71 assert(vec.m_ == n_);
72 for (int64_t j = 0; j < n_; j++) {
73 data_[i * n_ + j] += a * vec.data_[j];
74 }
75 }
76 //点乘向量
77 real Matrix::dotRow(const Vector& vec, int64_t i) {
78 assert(i >= 0);
79 assert(i < m_);
80 assert(vec.m_ == n_);
81 real d = 0.0;
82 for (int64_t j = 0; j < n_; j++) {
83 d += data_[i * n_ + j] * vec.data_[j];
84 }
85 return d;
86 }
87 //存储
88 void Matrix::save(std::ostream& out) {
89 out.write((char*) &m_, sizeof(int64_t));
90 out.write((char*) &n_, sizeof(int64_t));
91 out.write((char*) data_, m_ * n_ * sizeof(real));
92 }
93 //加载
94 void Matrix::load(std::istream& in) {
95 in.read((char*) &m_, sizeof(int64_t));
96 in.read((char*) &n_, sizeof(int64_t));
97 delete[] data_;
98 data_ = new real[m_ * n_];
99 in.read((char*) data_, m_ * n_ * sizeof(real));
100 }
101
102 }
/**
* Copyright (c) 2016-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree. An additional grant
* of patent rights can be found in the PATENTS file in the same directory.
*/ #include "vector.h" #include <assert.h> #include <iomanip> #include "matrix.h"
#include "utils.h" namespace fasttext { Vector::Vector(int64_t m) {
m_ = m;
data_ = new real[m];
}

Vector::~Vector() {
delete[] data_;
}

int64_t Vector::size() const {
return m_;
}

void Vector::zero() {
for (int64_t i = 0; i < m_; i++) {
data_[i] = 0.0;
}
}
//数乘向量
void Vector::mul(real a) {
for (int64_t i = 0; i < m_; i++) {
data_[i] *= a;
}
}
//向量相加
void Vector::addRow(const Matrix& A, int64_t i) {
assert(i >= 0);
assert(i < A.m_);
assert(m_ == A.n_);
for (int64_t j = 0; j < A.n_; j++) {
data_[j] += A.data_[i * A.n_ + j];
}
}
//加数乘向量
void Vector::addRow(const Matrix& A, int64_t i, real a) {
assert(i >= 0);
assert(i < A.m_);
assert(m_ == A.n_);
for (int64_t j = 0; j < A.n_; j++) {
data_[j] += a * A.data_[i * A.n_ + j];
}
}
//向量与矩阵相乘得到的向量
void Vector::mul(const Matrix& A, const Vector& vec) {
assert(A.m_ == m_);
assert(A.n_ == vec.m_);
for (int64_t i = 0; i < m_; i++) {
data_[i] = 0.0;
for (int64_t j = 0; j < A.n_; j++) {
data_[i] += A.data_[i * A.n_ + j] * vec.data_[j];
}
}
}
//最大分量
int64_t Vector::argmax() {
real max = data_[0];
int64_t argmax = 0;
for (int64_t i = 1; i < m_; i++) {
if (data_[i] > max) {
max = data_[i];
argmax = i;
}
}
return argmax;
}

real& Vector::operator[](int64_t i) {
return data_[i];
}

const real& Vector::operator[](int64_t i) const {
return data_[i];
}

std::ostream& operator<<(std::ostream& os, const Vector& v)
{
os << std::setprecision(5);
for (int64_t j = 0; j < v.m_; j++) {
os << v.data_[j] << ' ';
}
return os;
}

}
  1 /**
2 * Copyright (c) 2016-present, Facebook, Inc.
3 * All rights reserved.
4 *
5 * This source code is licensed under the BSD-style license found in the
6 * LICENSE file in the root directory of this source tree. An additional grant
7 * of patent rights can be found in the PATENTS file in the same directory.
8 */
9
10 #include "args.h"
11
12 #include <stdlib.h>
13 #include <string.h>
14
15 #include <iostream>
16
17 namespace fasttext {
18
19 Args::Args() {
20 lr = 0.05;
21 dim = 100;
22 ws = 5;
23 epoch = 5;
24 minCount = 5;
25 minCountLabel = 0;
26 neg = 5;
27 wordNgrams = 1;
28 loss = loss_name::ns;
29 model = model_name::sg;
30 bucket = 2000000;//允许的ngram词典大小2M
31 minn = 3;
32 maxn = 6;
33 thread = 12;
34 lrUpdateRate = 100;
35 t = 1e-4;//默认
36 label = "__label__";
37 verbose = 2;
38 pretrainedVectors = "";
39 }
40
41 void Args::parseArgs(int argc, char** argv) {
42 std::string command(argv[1]);
43 if (command == "supervised") {
44 model = model_name::sup;
45 loss = loss_name::softmax;
46 minCount = 1;
47 minn = 0;
48 maxn = 0;
49 lr = 0.1;
50 } else if (command == "cbow") {
51 model = model_name::cbow;
52 }
53 int ai = 2;
54 while (ai < argc) {
55 if (argv[ai][0] != '-') {
56 std::cout << "Provided argument without a dash! Usage:" << std::endl;
57 printHelp();
58 exit(EXIT_FAILURE);
59 }
60 if (strcmp(argv[ai], "-h") == 0) {
61 std::cout << "Here is the help! Usage:" << std::endl;
62 printHelp();
63 exit(EXIT_FAILURE);
64 } else if (strcmp(argv[ai], "-input") == 0) {
65 input = std::string(argv[ai + 1]);
66 } else if (strcmp(argv[ai], "-test") == 0) {
67 test = std::string(argv[ai + 1]);
68 } else if (strcmp(argv[ai], "-output") == 0) {
69 output = std::string(argv[ai + 1]);
70 } else if (strcmp(argv[ai], "-lr") == 0) {
71 lr = atof(argv[ai + 1]);
72 } else if (strcmp(argv[ai], "-lrUpdateRate") == 0) {
73 lrUpdateRate = atoi(argv[ai + 1]);
74 } else if (strcmp(argv[ai], "-dim") == 0) {
75 dim = atoi(argv[ai + 1]);
76 } else if (strcmp(argv[ai], "-ws") == 0) {
77 ws = atoi(argv[ai + 1]);
78 } else if (strcmp(argv[ai], "-epoch") == 0) {
79 epoch = atoi(argv[ai + 1]);
80 } else if (strcmp(argv[ai], "-minCount") == 0) {
81 minCount = atoi(argv[ai + 1]);
82 } else if (strcmp(argv[ai], "-minCountLabel") == 0) {
83 minCountLabel = atoi(argv[ai + 1]);
84 } else if (strcmp(argv[ai], "-neg") == 0) {
85 neg = atoi(argv[ai + 1]);
86 } else if (strcmp(argv[ai], "-wordNgrams") == 0) {
87 wordNgrams = atoi(argv[ai + 1]);
88 } else if (strcmp(argv[ai], "-loss") == 0) {
89 if (strcmp(argv[ai + 1], "hs") == 0) {
90 loss = loss_name::hs;
91 } else if (strcmp(argv[ai + 1], "ns") == 0) {
92 loss = loss_name::ns;
93 } else if (strcmp(argv[ai + 1], "softmax") == 0) {
94 loss = loss_name::softmax;
95 } else {
96 std::cout << "Unknown loss: " << argv[ai + 1] << std::endl;
97 printHelp();
98 exit(EXIT_FAILURE);
99 }
100 } else if (strcmp(argv[ai], "-bucket") == 0) {
101 bucket = atoi(argv[ai + 1]);
102 } else if (strcmp(argv[ai], "-minn") == 0) {
103 minn = atoi(argv[ai + 1]);
104 } else if (strcmp(argv[ai], "-maxn") == 0) {
105 maxn = atoi(argv[ai + 1]);
106 } else if (strcmp(argv[ai], "-thread") == 0) {
107 thread = atoi(argv[ai + 1]);
108 } else if (strcmp(argv[ai], "-t") == 0) {
109 t = atof(argv[ai + 1]);
110 } else if (strcmp(argv[ai], "-label") == 0) {
111 label = std::string(argv[ai + 1]);
112 } else if (strcmp(argv[ai], "-verbose") == 0) {
113 verbose = atoi(argv[ai + 1]);
114 } else if (strcmp(argv[ai], "-pretrainedVectors") == 0) {
115 pretrainedVectors = std::string(argv[ai + 1]);
116 } else {
117 std::cout << "Unknown argument: " << argv[ai] << std::endl;
118 printHelp();
119 exit(EXIT_FAILURE);
120 }
121 ai += 2;
122 }
123 if (input.empty() || output.empty()) {
124 std::cout << "Empty input or output path." << std::endl;
125 printHelp();
126 exit(EXIT_FAILURE);
127 }
128 if (wordNgrams <= 1 && maxn == 0) {
129 bucket = 0;
130 }
131 }
132
133 void Args::printHelp() {
134 std::string lname = "ns";
135 if (loss == loss_name::hs) lname = "hs";
136 if (loss == loss_name::softmax) lname = "softmax";
137 std::cout
138 << "\n"
139 << "The following arguments are mandatory:\n"
140 << " -input training file path\n"
141 << " -output output file path\n\n"
142 << "The following arguments are optional:\n"
143 << " -lr learning rate [" << lr << "]\n"
144 << " -lrUpdateRate change the rate of updates for the learning rate [" << lrUpdateRate << "]\n"
145 << " -dim size of word vectors [" << dim << "]\n"
146 << " -ws size of the context window [" << ws << "]\n"
147 << " -epoch number of epochs [" << epoch << "]\n"
148 << " -minCount minimal number of word occurences [" << minCount << "]\n"
149 << " -minCountLabel minimal number of label occurences [" << minCountLabel << "]\n"
150 << " -neg number of negatives sampled [" << neg << "]\n"
151 << " -wordNgrams max length of word ngram [" << wordNgrams << "]\n"
152 << " -loss loss function {ns, hs, softmax} [ns]\n"
153 << " -bucket number of buckets [" << bucket << "]\n"
154 << " -minn min length of char ngram [" << minn << "]\n"
155 << " -maxn max length of char ngram [" << maxn << "]\n"
156 << " -thread number of threads [" << thread << "]\n"
157 << " -t sampling threshold [" << t << "]\n"
158 << " -label labels prefix [" << label << "]\n"
159 << " -verbose verbosity level [" << verbose << "]\n"
160 << " -pretrainedVectors pretrained word vectors for supervised learning []"
161 << std::endl;
162 }
163
164 void Args::save(std::ostream& out) {
165 out.write((char*) &(dim), sizeof(int));
166 out.write((char*) &(ws), sizeof(int));
167 out.write((char*) &(epoch), sizeof(int));
168 out.write((char*) &(minCount), sizeof(int));
169 out.write((char*) &(neg), sizeof(int));
170 out.write((char*) &(wordNgrams), sizeof(int));
171 out.write((char*) &(loss), sizeof(loss_name));
172 out.write((char*) &(model), sizeof(model_name));
173 out.write((char*) &(bucket), sizeof(int));
174 out.write((char*) &(minn), sizeof(int));
175 out.write((char*) &(maxn), sizeof(int));
176 out.write((char*) &(lrUpdateRate), sizeof(int));
177 out.write((char*) &(t), sizeof(double));
178 }
179
180 void Args::load(std::istream& in) {
181 in.read((char*) &(dim), sizeof(int));
182 in.read((char*) &(ws), sizeof(int));
183 in.read((char*) &(epoch), sizeof(int));
184 in.read((char*) &(minCount), sizeof(int));
185 in.read((char*) &(neg), sizeof(int));
186 in.read((char*) &(wordNgrams), sizeof(int));
187 in.read((char*) &(loss), sizeof(loss_name));
188 in.read((char*) &(model), sizeof(model_name));
189 in.read((char*) &(bucket), sizeof(int));
190 in.read((char*) &(minn), sizeof(int));
191 in.read((char*) &(maxn), sizeof(int));
192 in.read((char*) &(lrUpdateRate), sizeof(int));
193 in.read((char*) &(t), sizeof(double));
194 }
195
196 }

Part Three: model.cc

/**
* Copyright (c) 2016-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree. An additional grant
* of patent rights can be found in the PATENTS file in the same directory.
*/ #include "model.h" #include <assert.h> #include <algorithm> #include "utils.h" namespace fasttext { Model::Model(std::shared_ptr<Matrix> wi,
std::shared_ptr<Matrix> wo,
std::shared_ptr<Args> args,
int32_t seed)
: hidden_(args->dim), output_(wo->m_), grad_(args->dim), rng(seed)
{
wi_ = wi;//输入--上下文
wo_ = wo;//参数矩阵,行对应于某个词的参数集合
args_ = args;//参数
isz_ = wi->m_;
osz_ = wo->m_;
hsz_ = args->dim;
negpos = 0;
loss_ = 0.0;
nexamples_ = 1;
initSigmoid();
initLog();
}

Model::~Model() {
delete[] t_sigmoid;
delete[] t_log;
}
//小型逻辑回归
real Model::binaryLogistic(int32_t target, bool label, real lr) {
real score = sigmoid(wo_->dotRow(hidden_, target));//获取sigmod,某一行的-target==== q
real alpha = lr * (real(label) - score);//若是正样本,则1,否则是0================= g
grad_.addRow(*wo_, target, alpha);//更新中间值 == e
wo_->addRow(hidden_, target, alpha);//更新参数
if (label) {//记录损失值----根据公式来的,L=log(1/p(x)) ,p(x)是概率值
return -log(score);//p(x)=score
} else {
return -log(1.0 - score);//p(x)=1-score score表示为1的概率
}
}
//负采样的方式
real Model::negativeSampling(int32_t target, real lr) {//target表示目标词的index
real loss = 0.0;
grad_.zero();//e值的设置为0
for (int32_t n = 0; n <= args_->neg; n++) {//负采样的比例,这里数目
if (n == 0) {//正样例
loss += binaryLogistic(target, true, lr);
} else {//负样例--neg 个
loss += binaryLogistic(getNegative(target), false, lr);
}
}
return loss;
}
//层次softmax
real Model::hierarchicalSoftmax(int32_t target, real lr) {
real loss = 0.0;
grad_.zero();
const std::vector<bool>& binaryCode = codes[target];
const std::vector<int32_t>& pathToRoot = paths[target];
for (int32_t i = 0; i < pathToRoot.size(); i++) {//根据编码路劲搞,词到根目录的
loss += binaryLogistic(pathToRoot[i], binaryCode[i], lr);
}
return loss;
}
//计算softmax值,存入output中
void Model::computeOutputSoftmax(Vector& hidden, Vector& output) const {
output.mul(*wo_, hidden);//向量乘以矩阵---输出=参数转移矩阵*输入
real max = output[0], z = 0.0;
for (int32_t i = 0; i < osz_; i++) {//获取最大的内积值
max = std::max(output[i], max);
}
for (int32_t i = 0; i < osz_; i++) {//求出每个内积值相对最大值的情况
output[i] = exp(output[i] - max);
z += output[i];//累计和,用于归一化
}
for (int32_t i = 0; i < osz_; i++) {//求出softmax值
output[i] /= z;
}
}

void Model::computeOutputSoftmax() {
computeOutputSoftmax(hidden_, output_);
}
//普通softmax计算
real Model::softmax(int32_t target, real lr) {
grad_.zero();
computeOutputSoftmax();
for (int32_t i = 0; i < osz_; i++) {//遍历所有词---此次操作只是针对一个词的更新
real label = (i == target) ? 1.0 : 0.0;
real alpha = lr * (label - output_[i]);//中间参数
grad_.addRow(*wo_, i, alpha);//更新e值
wo_->addRow(hidden_, i, alpha);//更新参数
}
return -log(output_[target]);//损失值
}
//计算映射层的向量
void Model::computeHidden(const std::vector<int32_t>& input, Vector& hidden) const {
assert(hidden.size() == hsz_);
hidden.zero();
for (auto it = input.cbegin(); it != input.cend(); ++it) {//指定的行进行累加,也就是上下文的词向量
hidden.addRow(*wi_, *it);
}
hidden.mul(1.0 / input.size());//求均值为Xw
}
//比较,按照第一个降序
bool Model::comparePairs(const std::pair<real, int32_t> &l,
const std::pair<real, int32_t> &r) {
return l.first > r.first;
}
//模型预测函数
void Model::predict(const std::vector<int32_t>& input, int32_t k,
std::vector<std::pair<real, int32_t>>& heap,
Vector& hidden, Vector& output) const {
assert(k > 0);
heap.reserve(k + 1);
computeHidden(input, hidden);//计算映射层,input是上下文
if (args_->loss == loss_name::hs) {//层次softmax,遍历树结构
dfs(k, 2 * osz_ - 2, 0.0, heap, hidden);
} else {//其他则通过数组寻最大
findKBest(k, heap, hidden, output);
}
std::sort_heap(heap.begin(), heap.end(), comparePairs);//堆排序,得到最终的排序的值,降序排
}

void Model::predict(const std::vector<int32_t>& input, int32_t k,
std::vector<std::pair<real, int32_t>>& heap) {
predict(input, k, heap, hidden_, output_);
}
//vector寻找topk---获得一个最小堆
void Model::findKBest(int32_t k, std::vector<std::pair<real, int32_t>>& heap,
Vector& hidden, Vector& output) const {
computeOutputSoftmax(hidden, output);//计算soft值
for (int32_t i = 0; i < osz_; i++) {//输出的大小
if (heap.size() == k && log(output[i]) < heap.front().first) {//小于topk中最小的那个,最小堆,损失值
continue;
}
heap.push_back(std::make_pair(log(output[i]), i));//加入堆中
std::push_heap(heap.begin(), heap.end(), comparePairs);//做对排序
if (heap.size() > k) {//
std::pop_heap(heap.begin(), heap.end(), comparePairs);//移动最小的那个到最后面,且堆排序
heap.pop_back();//删除最后一个元素
}
}
}
//层次softmax的topk获取
void Model::dfs(int32_t k, int32_t node, real score,
std::vector<std::pair<real, int32_t>>& heap,
Vector& hidden) const {//从根开始
if (heap.size() == k && score < heap.front().first) {//跳过
return;
}
if (tree[node].left == -1 && tree[node].right == -1) {//表示为叶子节点
heap.push_back(std::make_pair(score, node));//根到叶子的损失总值,叶子也就是词了
std::push_heap(heap.begin(), heap.end(), comparePairs);//维持最小堆,以损失值
if (heap.size() > k) {
std::pop_heap(heap.begin(), heap.end(), comparePairs);
heap.pop_back();
}
return;
}
real f = sigmoid(wo_->dotRow(hidden, node - osz_));//计算出sigmod值,用于计算损失
dfs(k, tree[node].left, score + log(1.0 - f), heap, hidden);//左侧为1损失
dfs(k, tree[node].right, score + log(f), heap, hidden);
}
//更新操作
void Model::update(const std::vector<int32_t>& input, int32_t target, real lr) {
assert(target >= 0);
assert(target < osz_);
if (input.size() == 0) return;
computeHidden(input, hidden_);//计算映射层值
if (args_->loss == loss_name::ns) {//负采样的更新
loss_ += negativeSampling(target, lr);
} else if (args_->loss == loss_name::hs) {//层次soft
loss_ += hierarchicalSoftmax(target, lr);
} else {//普通soft
loss_ += softmax(target, lr);
}
nexamples_ += 1;//处理的样例数
if (args_->model == model_name::sup) {//分类
grad_.mul(1.0 / input.size());
}
for (auto it = input.cbegin(); it != input.cend(); ++it) {//获取指向常数的指针
wi_->addRow(grad_, *it, 1.0);//迭代加上上下文的词向量,来更新上下文的词向量
}
}
//根据词频的向量,构建哈夫曼树或者初始化负采样的表
void Model::setTargetCounts(const std::vector<int64_t>& counts) {
assert(counts.size() == osz_);
if (args_->loss == loss_name::ns) {
initTableNegatives(counts);
}
if (args_->loss == loss_name::hs) {
buildTree(counts);
}
}
//负采样的采样表获取
void Model::initTableNegatives(const std::vector<int64_t>& counts) {
real z = 0.0;
for (size_t i = 0; i < counts.size(); i++) {
z += pow(counts[i], 0.5);//采取是词频的0.5次方
}
for (size_t i = 0; i < counts.size(); i++) {
real c = pow(counts[i], 0.5);//c值
//0,0,0,1,1,1,1,1,1,1,2,2类似这种有序的,0表示第一个词,占个坑,随机读取时,越多则概率越大。所有词的随机化
//最多重复次数,若是c/z足够小,会导致重复次数很少,最小是1次
//NEGATIVE_TABLE_SIZE含义是一个词最多重复不能够超过的值
for (size_t j = 0; j < c * NEGATIVE_TABLE_SIZE / z; j++) {//该词映射到表的维度上的取值情况,也就是不等分区映射到等区分段上
negatives.push_back(i);
}
}
std::shuffle(negatives.begin(), negatives.end(), rng);//随机化一下,均匀随机化,
}
//对于词target获取负采样的值
int32_t Model::getNegative(int32_t target) {
int32_t negative;
do {
negative = negatives[negpos];//由于表是随机化的,取值就是随机采的
negpos = (negpos + 1) % negatives.size();//下一个,不断的累加的,由于表格随机的,所以不需要pos随机了
} while (target == negative);//若是遇到为正样本则跳过
return negative;
}
//构建哈夫曼树过程
void Model::buildTree(const std::vector<int64_t>& counts) {
tree.resize(2 * osz_ - 1);
for (int32_t i = 0; i < 2 * osz_ - 1; i++) {
tree[i].parent = -1;
tree[i].left = -1;
tree[i].right = -1;
tree[i].count = 1e15;
tree[i].binary = false;
}
for (int32_t i = 0; i < osz_; i++) {
tree[i].count = counts[i];
}
int32_t leaf = osz_ - 1;
int32_t node = osz_;
for (int32_t i = osz_; i < 2 * osz_ - 1; i++) {
int32_t mini[2];
for (int32_t j = 0; j < 2; j++) {
if (leaf >= 0 && tree[leaf].count < tree[node].count) {
mini[j] = leaf--;
} else {
mini[j] = node++;
}
}
tree[i].left = mini[0];
tree[i].right = mini[1];
tree[i].count = tree[mini[0]].count + tree[mini[1]].count;
tree[mini[0]].parent = i;
tree[mini[1]].parent = i;
tree[mini[1]].binary = true;
}
for (int32_t i = 0; i < osz_; i++) {
std::vector<int32_t> path;
std::vector<bool> code;
int32_t j = i;
while (tree[j].parent != -1) {
path.push_back(tree[j].parent - osz_);
code.push_back(tree[j].binary);
j = tree[j].parent;
}
paths.push_back(path);
codes.push_back(code);
}
}
//获取均匀损失值,平均每个样本的损失
real Model::getLoss() const {
return loss_ / nexamples_;
}
//初始化sigmod表
void Model::initSigmoid() {
t_sigmoid = new real[SIGMOID_TABLE_SIZE + 1];
for (int i = 0; i < SIGMOID_TABLE_SIZE + 1; i++) {
real x = real(i * 2 * MAX_SIGMOID) / SIGMOID_TABLE_SIZE - MAX_SIGMOID;
t_sigmoid[i] = 1.0 / (1.0 + std::exp(-x));
}
}
//初始化log函数的表,对于0~1之间的值
void Model::initLog() {
t_log = new real[LOG_TABLE_SIZE + 1];
for (int i = 0; i < LOG_TABLE_SIZE + 1; i++) {
real x = (real(i) + 1e-5) / LOG_TABLE_SIZE;
t_log[i] = std::log(x);
}
}
//log的处理
real Model::log(real x) const {
if (x > 1.0) {
return 0.0;
}
int i = int(x * LOG_TABLE_SIZE);
return t_log[i];
}
//获取sigmod值
real Model::sigmoid(real x) const {
if (x < -MAX_SIGMOID) {
return 0.0;
} else if (x > MAX_SIGMOID) {
return 1.0;
} else {
int i = int((x + MAX_SIGMOID) * SIGMOID_TABLE_SIZE / MAX_SIGMOID / 2);
return t_sigmoid[i];
}
}

}

Notes:

1: The heart of the model is the update function: depending on the loss option it dispatches to one of the three training methods provided (negative sampling, hierarchical softmax, or plain softmax).

2: The first two methods share a common routine, binaryLogistic, because they perform the same kind of binary update; they differ only in whether the targets are a few sampled words (negative sampling) or the internal nodes on the path from the word to the root of the Huffman tree (hierarchical softmax). A sketch of the shared step follows below.
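
A restatement of that shared step with plain loops (not the class method itself; woRow is one row of the output matrix, hidden/grad are dim-sized buffers), showing how the word2vec-style quantities q, g and e map onto the code above:

#include <cmath>
#include <cstddef>

float binaryLogisticStep(float* woRow, const float* hidden, float* grad,
                         std::size_t dim, bool label, float lr) {
  float dot = 0.f;
  for (std::size_t j = 0; j < dim; j++) dot += woRow[j] * hidden[j];
  float score = 1.f / (1.f + std::exp(-dot));                // q = sigmoid(wo . hidden)
  float alpha = lr * (float(label) - score);                 // g = lr * (label - q)
  for (std::size_t j = 0; j < dim; j++) {
    grad[j]  += alpha * woRow[j];                            // e: accumulated, applied to the input rows later
    woRow[j] += alpha * hidden[j];                           // immediate update of the output parameters
  }
  return label ? -std::log(score) : -std::log(1.f - score);  // the loss term that update() sums up
}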

Part Four: fasttext.cc

/**
* Copyright (c) 2016-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree. An additional grant
* of patent rights can be found in the PATENTS file in the same directory.
*/ #include "fasttext.h" #include <math.h> #include <iostream>
#include <iomanip>
#include <thread>
#include <string>
#include <vector>
#include <algorithm>

namespace fasttext {
//获取词向量
void FastText::getVector(Vector& vec, const std::string& word) {
const std::vector<int32_t>& ngrams = dict_->getNgrams(word);
vec.zero();
for (auto it = ngrams.begin(); it != ngrams.end(); ++it) {
vec.addRow(*input_, *it);//ngram的累加
}
if (ngrams.size() > 0) {//ngram均值,来体现词向量
vec.mul(1.0 / ngrams.size());
}
}
//保存词向量
void FastText::saveVectors() {
std::ofstream ofs(args_->output + ".vec");
if (!ofs.is_open()) {
std::cout << "Error opening file for saving vectors." << std::endl;
exit(EXIT_FAILURE);
}
ofs << dict_->nwords() << " " << args_->dim << std::endl;
Vector vec(args_->dim);
for (int32_t i = 0; i < dict_->nwords(); i++) {
std::string word = dict_->getWord(i);//获取词
getVector(vec, word);//获取词的向量
ofs << word << " " << vec << std::endl;
}
ofs.close();
}
//保存模型
void FastText::saveModel() {
std::ofstream ofs(args_->output + ".bin", std::ofstream::binary);
if (!ofs.is_open()) {
std::cerr << "Model file cannot be opened for saving!" << std::endl;
exit(EXIT_FAILURE);
}
args_->save(ofs);
dict_->save(ofs);
input_->save(ofs);
output_->save(ofs);
ofs.close();
}
//加载模型
void FastText::loadModel(const std::string& filename) {
std::ifstream ifs(filename, std::ifstream::binary);
if (!ifs.is_open()) {
std::cerr << "Model file cannot be opened for loading!" << std::endl;
exit(EXIT_FAILURE);
}
loadModel(ifs);
ifs.close();
}

void FastText::loadModel(std::istream& in) {
args_ = std::make_shared<Args>();
dict_ = std::make_shared<Dictionary>(args_);
input_ = std::make_shared<Matrix>();
output_ = std::make_shared<Matrix>();
args_->load(in);
dict_->load(in);
input_->load(in);
output_->load(in);
model_ = std::make_shared<Model>(input_, output_, args_, 0);//传的是指针,改变可以带回
if (args_->model == model_name::sup) {//构建模型的过程
model_->setTargetCounts(dict_->getCounts(entry_type::label));
} else {
model_->setTargetCounts(dict_->getCounts(entry_type::word));
}
}
//打印提示信息
void FastText::printInfo(real progress, real loss) {
real t = real(clock() - start) / CLOCKS_PER_SEC;//多少秒
real wst = real(tokenCount) / t;//每秒处理词数
real lr = args_->lr * (1.0 - progress);//学习率
int eta = int(t / progress * (1 - progress) / args_->thread);
int etah = eta / 3600;
int etam = (eta - etah * 3600) / 60;
std::cout << std::fixed;
std::cout << "\rProgress: " << std::setprecision(1) << 100 * progress << "%";//完成度
std::cout << " words/sec/thread: " << std::setprecision(0) << wst;//每秒每线程处理个数
std::cout << " lr: " << std::setprecision(6) << lr;//学习率
std::cout << " loss: " << std::setprecision(6) << loss;//损失度
std::cout << " eta: " << etah << "h" << etam << "m ";
std::cout << std::flush;
}

void FastText::supervised(Model& model, real lr,
const std::vector<int32_t>& line,
const std::vector<int32_t>& labels) {
if (labels.size() == 0 || line.size() == 0) return;
std::uniform_int_distribution<> uniform(0, labels.size() - 1);
int32_t i = uniform(model.rng);
model.update(line, labels[i], lr);
}
//cbow模型
void FastText::cbow(Model& model, real lr,
const std::vector<int32_t>& line) {
std::vector<int32_t> bow;
std::uniform_int_distribution<> uniform(1, args_->ws);
for (int32_t w = 0; w < line.size(); w++) {
int32_t boundary = uniform(model.rng);//随机取个窗口--每个词的窗口不一样
bow.clear();
for (int32_t c = -boundary; c <= boundary; c++) {
if (c != 0 && w + c >= 0 && w + c < line.size()) {
const std::vector<int32_t>& ngrams = dict_->getNgrams(line[w + c]);//ngrams语言
bow.insert(bow.end(), ngrams.cbegin(), ngrams.cend());//加入上下文中
}
}
model.update(bow, line[w], lr);//根据上下文更新
}
}
//skipgram模型
void FastText::skipgram(Model& model, real lr,
const std::vector<int32_t>& line) {
std::uniform_int_distribution<> uniform(1, args_->ws);
for (int32_t w = 0; w < line.size(); w++) {
int32_t boundary = uniform(model.rng);//窗口随机
const std::vector<int32_t>& ngrams = dict_->getNgrams(line[w]);
for (int32_t c = -boundary; c <= boundary; c++) {//每个预测词的更新
if (c != 0 && w + c >= 0 && w + c < line.size()) {
model.update(ngrams, line[w + c], lr);//ngram作为上下文
}
}
}
}
//测试模型
void FastText::test(std::istream& in, int32_t k) {
int32_t nexamples = 0, nlabels = 0;
double precision = 0.0;
std::vector<int32_t> line, labels;
while (in.peek() != EOF) {
dict_->getLine(in, line, labels, model_->rng);//获取句子
dict_->addNgrams(line, args_->wordNgrams);//对句子增加其ngram
if (labels.size() > 0 && line.size() > 0) {
std::vector<std::pair<real, int32_t>> modelPredictions;
model_->predict(line, k, modelPredictions);//预测
for (auto it = modelPredictions.cbegin(); it != modelPredictions.cend(); it++) {
if (std::find(labels.begin(), labels.end(), it->second) != labels.end()) {
precision += 1.0;//准确数
}
}
nexamples++;
nlabels += labels.size();
}
}
std::cout << std::setprecision(3);
std::cout << "P@" << k << ": " << precision / (k * nexamples) << std::endl;
std::cout << "R@" << k << ": " << precision / nlabels << std::endl;
std::cout << "Number of examples: " << nexamples << std::endl;
}
//预测
void FastText::predict(std::istream& in, int32_t k,
std::vector<std::pair<real,std::string>>& predictions) const {
std::vector<int32_t> words, labels;
dict_->getLine(in, words, labels, model_->rng);
dict_->addNgrams(words, args_->wordNgrams);
if (words.empty()) return;
Vector hidden(args_->dim);
Vector output(dict_->nlabels());
std::vector<std::pair<real,int32_t>> modelPredictions;
model_->predict(words, k, modelPredictions, hidden, output);
predictions.clear();
for (auto it = modelPredictions.cbegin(); it != modelPredictions.cend(); it++) {
predictions.push_back(std::make_pair(it->first, dict_->getLabel(it->second)));//不同标签的预测分
}
}
//预测
void FastText::predict(std::istream& in, int32_t k, bool print_prob) {
std::vector<std::pair<real,std::string>> predictions;
while (in.peek() != EOF) {
predict(in, k, predictions);
if (predictions.empty()) {
std::cout << "n/a" << std::endl;
continue;
}
for (auto it = predictions.cbegin(); it != predictions.cend(); it++) {
if (it != predictions.cbegin()) {
std::cout << ' ';
}
std::cout << it->second;
if (print_prob) {
std::cout << ' ' << exp(it->first);
}
}
std::cout << std::endl;
}
}
//获取词向量
void FastText::wordVectors() {
std::string word;
Vector vec(args_->dim);
while (std::cin >> word) {
getVector(vec, word);//获取一个词的词向量,不仅仅是对已知的,还能对未知进行预测
std::cout << word << " " << vec << std::endl;
}
}
//句子的向量
void FastText::textVectors() {
std::vector<int32_t> line, labels;
Vector vec(args_->dim);
while (std::cin.peek() != EOF) {
dict_->getLine(std::cin, line, labels, model_->rng);//句子
dict_->addNgrams(line, args_->wordNgrams);//对应ngram
vec.zero();
for (auto it = line.cbegin(); it != line.cend(); ++it) {//句子的词以及ngram的索引
vec.addRow(*input_, *it);//将词的向量求出和
}
if (!line.empty()) {//求均值
vec.mul(1.0 / line.size());
}
std::cout << vec << std::endl;//表示句子的词向量
}
}

void FastText::printVectors() {
if (args_->model == model_name::sup) {
textVectors();
} else {//词向量
wordVectors();
}
}
//训练线程
void FastText::trainThread(int32_t threadId) {
std::ifstream ifs(args_->input);
utils::seek(ifs, threadId * utils::size(ifs) / args_->thread);

Model model(input_, output_, args_, threadId);
if (args_->model == model_name::sup) {
model.setTargetCounts(dict_->getCounts(entry_type::label));
} else {
model.setTargetCounts(dict_->getCounts(entry_type::word));
}

const int64_t ntokens = dict_->ntokens();
int64_t localTokenCount = 0;
std::vector<int32_t> line, labels;
while (tokenCount < args_->epoch * ntokens) {//epoch迭代次数
real progress = real(tokenCount) / (args_->epoch * ntokens);//进度
real lr = args_->lr * (1.0 - progress);
localTokenCount += dict_->getLine(ifs, line, labels, model.rng);
if (args_->model == model_name::sup) {//分不同函数进行处理
dict_->addNgrams(line, args_->wordNgrams);
supervised(model, lr, line, labels);
} else if (args_->model == model_name::cbow) {
cbow(model, lr, line);
} else if (args_->model == model_name::sg) {
skipgram(model, lr, line);
}
if (localTokenCount > args_->lrUpdateRate) {//修正学习率
tokenCount += localTokenCount;
localTokenCount = 0;
if (threadId == 0 && args_->verbose > 1) {
printInfo(progress, model.getLoss());
}
}
}
if (threadId == 0 && args_->verbose > 0) {
printInfo(1.0, model.getLoss());
std::cout << std::endl;
}
ifs.close();
}
//加载Vectors过程, 字典
void FastText::loadVectors(std::string filename) {
std::ifstream in(filename);
std::vector<std::string> words;
std::shared_ptr<Matrix> mat; // temp. matrix for pretrained vectors
int64_t n, dim;
if (!in.is_open()) {
std::cerr << "Pretrained vectors file cannot be opened!" << std::endl;
exit(EXIT_FAILURE);
}
in >> n >> dim;
if (dim != args_->dim) {
std::cerr << "Dimension of pretrained vectors does not match -dim option"
<< std::endl;
exit(EXIT_FAILURE);
}
mat = std::make_shared<Matrix>(n, dim);
for (size_t i = 0; i < n; i++) {
std::string word;
in >> word;
words.push_back(word);
dict_->add(word);
for (size_t j = 0; j < dim; j++) {
in >> mat->data_[i * dim + j];
}
}
in.close();
dict_->threshold(1, 0);
input_ = std::make_shared<Matrix>(dict_->nwords()+args_->bucket, args_->dim);
input_->uniform(1.0 / args_->dim);
for (size_t i = 0; i < n; i++) {
int32_t idx = dict_->getId(words[i]);
if (idx < 0 || idx >= dict_->nwords()) continue;
for (size_t j = 0; j < dim; j++) {
input_->data_[idx * dim + j] = mat->data_[i * dim + j];
}
}
}
//训练
void FastText::train(std::shared_ptr<Args> args) {
args_ = args;
dict_ = std::make_shared<Dictionary>(args_);
if (args_->input == "-") {
// manage expectations
std::cerr << "Cannot use stdin for training!" << std::endl;
exit(EXIT_FAILURE);
}
std::ifstream ifs(args_->input);
if (!ifs.is_open()) {
std::cerr << "Input file cannot be opened!" << std::endl;
exit(EXIT_FAILURE);
}
dict_->readFromFile(ifs);
ifs.close();
if (args_->pretrainedVectors.size() != 0) {
loadVectors(args_->pretrainedVectors);
} else {
input_ = std::make_shared<Matrix>(dict_->nwords()+args_->bucket, args_->dim);
input_->uniform(1.0 / args_->dim);
}
if (args_->model == model_name::sup) {
output_ = std::make_shared<Matrix>(dict_->nlabels(), args_->dim);
} else {
output_ = std::make_shared<Matrix>(dict_->nwords(), args_->dim);
}
output_->zero();
start = clock();
tokenCount = 0;
std::vector<std::thread> threads;
for (int32_t i = 0; i < args_->thread; i++) {
threads.push_back(std::thread([=]() { trainThread(i); }));
}
for (auto it = threads.begin(); it != threads.end(); ++it) {
it->join();
}
model_ = std::make_shared<Model>(input_, output_, args_, 0);
saveModel();
if (args_->model != model_name::sup) {
saveVectors();
}
}

}
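
To close the walkthrough, a minimal sketch of how everything above is driven. The real entry point lives in main.cc (not covered in this post); the pattern below — build an Args, parse argv, call FastText::train — only follows the signatures shown above, and the command line in the comment is just an example:

#include <memory>
#include "args.h"
#include "fasttext.h"

int main(int argc, char** argv) {
  // e.g.  ./fasttext skipgram -input corpus.txt -output vec -minn 3 -maxn 6 -dim 100
  auto a = std::make_shared<fasttext::Args>();
  a->parseArgs(argc, argv);     // argv[1] selects supervised/cbow/skipgram, then "-flag value" pairs
  fasttext::FastText fasttext;
  fasttext.train(a);            // builds the dictionary, spawns trainThread workers, saves the model
  return 0;
}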
