
  1. 串行读入并统计词频
    // LoadDocsInUbuntu.cpp
    #include <iostream>
    #include <stdio.h>
    #include <vector>
    using namespace std;
    /**
     * Serial baseline: for every document ..//data/doc_<d>.txt (d = 1..36),
     * read whitespace-separated integer word ids, count how often each id in
     * 1..25 occurs, and write the 25 counts (space separated) to
     * ..//result/freqUbuntuSerial_<d>.txt.
     *
     * A file that cannot be opened is reported on stderr and that document is
     * skipped; word ids outside 1..25 are reported and ignored.
     *
     * @return 0 always.
     */
    int main()
    {
        const int kNumDocs = 36;    // documents are numbered 1..kNumDocs
        const int kVocabSize = 25;  // word ids in the files are 1-based: 1..kVocabSize

        char filename[100];
        // Never filled by this program; kept only so the size report below
        // still prints what it always printed.
        vector< vector<int> > corpus;

        printf("load data ...\n");
        for (int d = 1; d <= kNumDocs; d++) {
            snprintf(filename, sizeof(filename), "..//data/doc_%d.txt", d);
            FILE *in = fopen(filename, "r");
            if (in == NULL) {
                fprintf(stderr, "cannot open %s for reading, skipping\n", filename);
                continue;
            }

            int ff[kVocabSize] = { 0 };
            int word;
            // Compare against 1 rather than EOF: on a non-numeric token fscanf
            // returns 0 without consuming input, which would loop forever.
            while (fscanf(in, "%d", &word) == 1) {
                if (word < 1 || word > kVocabSize) {
                    fprintf(stderr, "%s: word id %d out of range, ignored\n", filename, word);
                    continue;
                }
                ff[word - 1]++;
            }
            fclose(in);

            snprintf(filename, sizeof(filename), "..//result/freqUbuntuSerial_%d.txt", d);
            FILE *out = fopen(filename, "w");
            if (out == NULL) {
                fprintf(stderr, "cannot open %s for writing, skipping\n", filename);
                continue;
            }
            for (int f = 0; f < kVocabSize; f++)
                fprintf(out, "%d ", ff[f]);
            // Closing flushes the buffered counts to disk and releases the handle.
            fclose(out);
        }
        cout << "corpus.size()=" << corpus.size() << endl;
        return 0;
    }
  2. 并行化有三种思路：一，按文档序号分组，各线程分别读入并统计各自的文档；二，在单个文档内按单词数目分组进行统计；三，将统计与读写操作并行处理。下面的 OpenMP 程序采用第一种思路。



// LoadDocsByOpenMP.cpp
#include <omp.h>
#include <iostream>
#include <stdio.h>
#include <vector>
#include <stdlib.h>
#include <time.h>
#include <string>
using namespace std;

/** Id of the calling OpenMP thread, or 0 when the file is built without OpenMP. */
static int current_thread_id()
{
#ifdef _OPENMP
    return omp_get_thread_num();
#else
    return 0;
#endif
}

/**
 * Seconds elapsed from an arbitrary origin.
 *
 * With OpenMP this is wall-clock time; clock() would instead add up the CPU
 * time of all threads and hide any parallel speed-up. Without OpenMP the run
 * is serial, so clock() is used as the fallback.
 */
static double wall_seconds()
{
#ifdef _OPENMP
    return omp_get_wtime();
#else
    return static_cast<double>(clock()) / CLOCKS_PER_SEC;
#endif
}

/**
 * Parallel word-frequency count, one document per loop iteration.
 *
 * For every document ..//data/doc_<d>.txt (d = 1..360) read whitespace-separated
 * integer word ids, count how often each id in 1..25 occurs, and write the 25
 * counts (space separated) to ..//result/freqByOpenMP_<d>.txt. Iterations are
 * distributed over 4 OpenMP threads. Finally the elapsed time is printed.
 *
 * A file that cannot be opened is reported on stderr and that document is
 * skipped; word ids outside 1..25 are reported and ignored.
 *
 * @return 0 always.
 */
int main()
{
    const int kNumDocs = 360;   // documents are numbered 1..kNumDocs
    const int kVocabSize = 25;  // word ids in the files are 1-based: 1..kVocabSize

    const double start = wall_seconds();
    printf("load data ...\n");

#pragma omp parallel for num_threads(4)
    for (int d = 1; d <= kNumDocs; d++) {
        // Everything below is declared inside the loop body, so each iteration
        // (and therefore each thread) works on its own copy. Sharing one name
        // buffer or one `word` between threads is a data race.
        char filename[100];
        char resultname[100];
        int freq[kVocabSize] = { 0 };
        int word;

        printf("Hello world, I am %d, docs index %d.\n", current_thread_id(), d);

        snprintf(filename, sizeof(filename), "..//data/doc_%d.txt", d);
        FILE *in = fopen(filename, "r");
        if (in == NULL) {
            fprintf(stderr, "cannot open %s for reading, skipping\n", filename);
            continue;
        }
        // Compare against 1 rather than EOF: on a non-numeric token fscanf
        // returns 0 without consuming input, which would loop forever.
        while (fscanf(in, "%d", &word) == 1) {
            if (word < 1 || word > kVocabSize) {
                fprintf(stderr, "%s: word id %d out of range, ignored\n", filename, word);
                continue;
            }
            freq[word - 1]++;
        }
        fclose(in);

        snprintf(resultname, sizeof(resultname), "..//result/freqByOpenMP_%d.txt", d);
        FILE *out = fopen(resultname, "w");
        if (out == NULL) {
            fprintf(stderr, "cannot open %s for writing, skipping\n", resultname);
            continue;
        }
        for (int w = 0; w < kVocabSize; w++)
            fprintf(out, "%d ", freq[w]);
        // Closing flushes the buffered counts to disk and releases the handle.
        fclose(out);
    }

    const double finish = wall_seconds();
    cout << "time cost : " << (finish - start) << endl;
    return 0;
}



