update:

  • Did a light cleanup of the code organization.
  • The word being processed is now wrapped in a class; word correction and information display are implemented as its methods.

The code is still rough: the tool itself could be further encapsulated, and word inflections are barely handled. I'll revisit it when I have time.

The project is now hosted on GitHub: https://github.com/MorpheusDong/TextVocabularyAnalyzer
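As a quick illustration of the new organization, here is a minimal usage sketch of the CLetters class. It is illustrative only and not part of the repository — the real driving loop is in main.cpp below, and the sample letters are made up:

//minimal usage sketch, not part of the repository
vector<char> letters = { 'w', 'o', 'r', 'd', 's' };
CLetters word_readed;

word_readed.fill(letters);              //store the letters as a lower-case string
word_readed.processing();               //print the "Finding word ..." message

if (word_readed.usual_recheck())        //a usual word that COCA does not list?
{
    //count it as a high-frequency word
}
else if (word_readed.form_recheck())    //strip a simple -s/-ed/-ing ending
{
    //look the stripped form up in the COCA map again
}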

TypeDefine.h

#ifndef _TYPE_DEFINE_H_
#define _TYPE_DEFINE_H_

#include <iostream>
#include <fstream>
#include <string>
#include <array>
#include <vector>
#include <iterator>
#include <map>

using namespace std;

#define COCA_WORDS_NUM 20201U
#define WORDS_HEAD_NUM 26U

#define WORDS_HEAD_A 0U
#define WORDS_HEAD_B 1U
#define WORDS_HEAD_C 2U
#define WORDS_HEAD_D 3U
#define WORDS_HEAD_E 4U
#define WORDS_HEAD_F 5U
#define WORDS_HEAD_G 6U
#define WORDS_HEAD_H 7U
#define WORDS_HEAD_I 8U
#define WORDS_HEAD_J 9U
#define WORDS_HEAD_K 10U
#define WORDS_HEAD_L 11U
#define WORDS_HEAD_M 12U
#define WORDS_HEAD_N 13U
#define WORDS_HEAD_O 14U
#define WORDS_HEAD_P 15U
#define WORDS_HEAD_Q 16U
#define WORDS_HEAD_R 17U
#define WORDS_HEAD_S 18U
#define WORDS_HEAD_T 19U
#define WORDS_HEAD_U 20U
#define WORDS_HEAD_V 21U
#define WORDS_HEAD_W 22U
#define WORDS_HEAD_X 23U
#define WORDS_HEAD_Y 24U
#define WORDS_HEAD_Z 25U

#define USUAL_WORD_NUM 17U

typedef enum WordFrequencyType
{
WORD_UNDER_4000 = 0,
WORD_4000_6000,
WORD_6000_8000,
WORD_8000_10000,
WORD_10000_12000,
WORD_12000_14000,
WORD_14000_16000,
WORD_OVER_16000,
WORD_NOT_FOUND_COCA,
WORD_LEVEL_NUM
}TagWordFrequencyType;

const string alphabet_str = "abcdefghijklmnopqrstuvwxyz";

const string report_str[WORD_LEVEL_NUM] = {
"UNDER 4000: ",
"4000-6000: ",
"6000-8000: ",
"8000-10000: ",
"10000-12000: ",
"12000-14000: ",
"14000-16000: ",
"16000-20000+: ",
"\nNot found in COCA:"
};

//for usual words not included in COCA
const string usual_w_out_of_COCA_str[USUAL_WORD_NUM] =
{
"s","is","are","re","was","were",
"an","won","t","has","had","been",
"did","does","cannot","got","men"
};

#endif

TextVocabularyAnalyzer.h

#ifndef _TEXT_VOCABULARY_ANALYZER_H_
#define _TEXT_VOCABULARY_ANALYZER_H_

#include "TypeDefine.h"

extern TagWordFrequencyType frequency_classify(const int wfrq);
extern void word_frequency_analyze(array<int, WORD_LEVEL_NUM>& wfrq_array, TagWordFrequencyType wfrq_tag);
extern bool isaletter(const char& c);

class CLetters
{
private:
string m_word;

public:
CLetters();
~CLetters();
void fill(vector<char>& vw);
const string word();
const char firstletter();
void processing();
bool usual_recheck();
bool form_recheck();
};

#endif // !_TEXT_VOCABULARY_ANALYZER_H_

TextVocabularyAnalyzer.cpp

/* TextVocabularyAnalyzer.cpp */

#include <algorithm>
#include "TextVocabularyAnalyzer.h" TagWordFrequencyType frequency_classify(const int wfrq)
{
if (wfrq == 0)
{
return WORD_NOT_FOUND_COCA;
}
else if (wfrq > 0 && wfrq <= 4000)
{
return WORD_UNDER_4000;
}
else if (wfrq > 4000 && wfrq <= 6000)
{
return WORD_4000_6000;
}
else if (wfrq > 6000 && wfrq <= 8000)
{
return WORD_6000_8000;
}
else if (wfrq > 8000 && wfrq <= 10000)
{
return WORD_8000_10000;
}
else if (wfrq > 10000 && wfrq <= 12000)
{
return WORD_10000_12000;
}
else if (wfrq > 12000 && wfrq <= 14000)
{
return WORD_12000_14000;
}
else if (wfrq > 14000 && wfrq <= 16000)
{
return WORD_14000_16000;
}
else
{
return WORD_OVER_16000;
}
}

void word_frequency_analyze(array<int, WORD_LEVEL_NUM>& wfrq_array, TagWordFrequencyType wfrq_tag)
{
switch (wfrq_tag)
{
case WORD_UNDER_4000:
{
wfrq_array[WORD_UNDER_4000] += 1;
break;
}
case WORD_4000_6000:
{
wfrq_array[WORD_4000_6000] += 1;
break;
}
case WORD_6000_8000:
{
wfrq_array[WORD_6000_8000] += 1;
break;
}
case WORD_8000_10000:
{
wfrq_array[WORD_8000_10000] += 1;
break;
}
case WORD_10000_12000:
{
wfrq_array[WORD_10000_12000] += 1;
break;
}
case WORD_12000_14000:
{
wfrq_array[WORD_12000_14000] += 1;
break;
}
case WORD_14000_16000:
{
wfrq_array[WORD_14000_16000] += 1;
break;
}
case WORD_OVER_16000:
{
wfrq_array[WORD_OVER_16000] += 1;
break;
}
default:
{
wfrq_array[WORD_NOT_FOUND_COCA] += 1;
break;
}
}
}

bool isaletter(const char& c)
{
if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
{
return true;
}
else
{
return false;
}
}

//Class CLetters implementation
CLetters::CLetters()
{
m_word = "";
}

CLetters::~CLetters()
{
//do nothing
}

void CLetters::fill(vector<char>& vw)
{
//store the word with lower form
m_word.assign(vw.begin(), vw.end());
transform(m_word.begin(), m_word.end(), m_word.begin(), tolower);
}

const string CLetters::word()
{
return m_word;
}

const char CLetters::firstletter()
{
return m_word[0];
}

void CLetters::processing()
{
cout << "Finding word \"" << m_word << "\"...\t";
}

bool CLetters::usual_recheck()
{
//check if the word is usual
bool RetVal = false;
for (int i = 0; i < USUAL_WORD_NUM; i++)
{
if (m_word == usual_w_out_of_COCA_str[i])
{
RetVal = true;
}
else
{
//do nothing
}
}
return RetVal;
}

bool CLetters::form_recheck()
{
bool RetVal = false;
if (m_word.length() > 3)
{
char e1, e2, e3;
e3 = m_word[m_word.length() - 3]; //last but two letter
e2 = m_word[m_word.length() - 2]; //last but one letter
e1 = m_word[m_word.length() - 1]; //last letter

if (e1 == 's')
{
m_word.erase(m_word.length() - 1);
RetVal = true;
}
else if (e2 == 'e' && e1 == 'd')
{
m_word.erase(m_word.length() - 1);
m_word.erase(m_word.length() - 1);
RetVal = true;
}
else if (e3 == 'i' && e2 == 'n' && e1 == 'g')
{
m_word.erase(m_word.length() - 1);
m_word.erase(m_word.length() - 1);
m_word.erase(m_word.length() - 1);
RetVal = true;
}
else
{
//do nothing
}
}
return RetVal;
}

main.cpp

/* main.cpp */

#include <numeric>
#include <iomanip>
#include <ctime>
#include "TextVocabularyAnalyzer.h" int main()
{
//file init
ifstream COCA_txt("D:\\COCA.txt");
ifstream USER_txt("D:\\JobsSpeech.txt");

//time init
clock_t startTime, endTime;
double build_map_time = 0;
double process_time = 0;

startTime = clock();    //build time start

//build COCA words map
map<string, int> COCA_WordsList[WORDS_HEAD_NUM];
int readlines = 0;

while (readlines < COCA_WORDS_NUM)
{
int frequency = 0;
string word = "";
COCA_txt >> frequency;
COCA_txt >> word;

//transform to lower uniformly
transform(word.begin(), word.end(), word.begin(), tolower);

//import every word
for (int whead = WORDS_HEAD_A; whead < WORDS_HEAD_NUM; whead++)
{
//check word head
if (word[0] == alphabet_str[whead])
{
//if a word already exists, only load its lower frequency
if (COCA_WordsList[whead].find(word) == COCA_WordsList[whead].end())
{
COCA_WordsList[whead].insert(make_pair(word, frequency));
}
else
{
COCA_WordsList[whead][word] = frequency < COCA_WordsList[whead][word] ? frequency : COCA_WordsList[whead][word];
}
}
else
{
// do nothing
}
}
readlines++;
}

endTime = clock();    //build time stop
build_map_time = (double)(endTime - startTime) / CLOCKS_PER_SEC;

//user prompt
cout << "COCA words list imported.\nPress any key to start frequency analysis...\n";
cin.get();

startTime = clock();    //process time start

//find text words
vector<char> content_read;
CLetters word_readed;
vector<int> frequecy_processed = { 0 };
array<int, WORD_LEVEL_NUM> words_analysis_array{ 0 };
char char_read = ' ';

//get text char one by one
while (USER_txt.get(char_read))
{
//only letters and '-' between letters will be received
if (isaletter(char_read) || char_read == '-')
{
content_read.push_back(char_read);
}
else
{
//char which is not a letter marks the end of a word
if (!content_read.empty()) //skip single letter
{
int current_word_frequency = 0;

//assign letters to make the word
word_readed.fill(content_read);
word_readed.processing();
cout << "Frequency:";
//check the word's head and find its frequency in COCA list
for (int whead = WORDS_HEAD_A; whead < WORDS_HEAD_NUM; whead++)
{
if (word_readed.firstletter() == alphabet_str[whead])
{
cout << COCA_WordsList[whead][word_readed.word()];
current_word_frequency = COCA_WordsList[whead][word_readed.word()];

//check if the word has been processed
if (current_word_frequency == 0)
{
//additional check
if (word_readed.usual_recheck())
{
word_frequency_analyze(words_analysis_array, WORD_UNDER_4000);
}
else if (word_readed.form_recheck())
{
current_word_frequency = COCA_WordsList[whead][word_readed.word()]; //try again
if (current_word_frequency > 0)
{
frequecy_processed.push_back(current_word_frequency);
word_frequency_analyze(words_analysis_array, frequency_classify(current_word_frequency));
}
else
{
// do nothing
}
}
else
{
word_frequency_analyze(words_analysis_array, WORD_NOT_FOUND_COCA);
}
}
else if (find(frequecy_processed.begin(), frequecy_processed.end(), current_word_frequency)
== frequecy_processed.end())
{
//classify this word and make statistics
frequecy_processed.push_back(current_word_frequency);
word_frequency_analyze(words_analysis_array, frequency_classify(current_word_frequency));
}
else
{
// do nothing
}
}
else
{
//do nothing
}
}
cout << endl;
content_read.clear();
}
else
{
//do nothing
}
}
}

endTime = clock();    //process time stop
process_time = (double)(endTime - startTime) / CLOCKS_PER_SEC;

//calc whole words processed
int whole_words_analyzed = 0;
whole_words_analyzed = accumulate(words_analysis_array.begin(), words_analysis_array.end(), 0);

//report result
cout << "\n////////// Report ////////// \n";
for (int i = 0; i < words_analysis_array.size(); i++)
{
cout << report_str[i] << "\t" << words_analysis_array[i] << " (";
cout << fixed << setprecision(2) << (float)words_analysis_array[i] * 100 / whole_words_analyzed << "%)" << endl;
}
cout << "\nWords totally analyzed: " << whole_words_analyzed << endl; //show run time
cout << "Map build time: " << build_map_time*1000 << "ms.\n";
cout << "Process time: " << process_time*1000 << "ms.\n";
cout << "////////////////////////////" << endl; //close file
COCA_txt.close();
USER_txt.close();

return 0;
}
