基于COCA词频表的文本词汇分布测试工具v0.2
update:
- 简单整理了一下代码的组织。
- 处理的单词封装成类,单词的修正,信息的显示都作为其内的方法。
写得还比较糙,工具本身可以封装,还有对于单词的变形基本没什么处理,以后有时间再改。
项目托管到github上了。https://github.com/MorpheusDong/TextVocabularyAnalyzer
TypeDefine.h
#ifndef _TYPE_DEFINE_H_
#define _TYPE_DEFINE_H_

#include <iostream>
#include <fstream>
#include <string>
#include <array>
#include <vector>
#include <iterator>
#include <map>

// NOTE(review): a using-directive in a header leaks into every includer;
// kept because the rest of the project relies on it.
using namespace std;

// Number of entries read from the COCA frequency list file.
#define COCA_WORDS_NUM 20201U
// One lookup bucket per initial letter 'a'..'z'.
#define WORDS_HEAD_NUM 26U

// Bucket indices by initial letter.
#define WORDS_HEAD_A 0U
#define WORDS_HEAD_B 1U
#define WORDS_HEAD_C 2U
#define WORDS_HEAD_D 3U
#define WORDS_HEAD_E 4U
#define WORDS_HEAD_F 5U
#define WORDS_HEAD_G 6U
#define WORDS_HEAD_H 7U
#define WORDS_HEAD_I 8U
#define WORDS_HEAD_J 9U
#define WORDS_HEAD_K 10U
#define WORDS_HEAD_L 11U
#define WORDS_HEAD_M 12U
#define WORDS_HEAD_N 13U
#define WORDS_HEAD_O 14U
#define WORDS_HEAD_P 15U
#define WORDS_HEAD_Q 16U
#define WORDS_HEAD_R 17U
#define WORDS_HEAD_S 18U
#define WORDS_HEAD_T 19U
#define WORDS_HEAD_U 20U
#define WORDS_HEAD_V 21U
#define WORDS_HEAD_W 22U
#define WORDS_HEAD_X 23U
#define WORDS_HEAD_Y 24U
#define WORDS_HEAD_Z 25U

// Entry count of usual_w_out_of_COCA_str below.
#define USUAL_WORD_NUM 17U

// Frequency band a word's COCA rank falls into; WORD_LEVEL_NUM is the
// band count and doubles as the size of the statistics array.
typedef enum WordFrequencyType
{
    WORD_UNDER_4000 = 0,
    WORD_4000_6000,
    WORD_6000_8000,
    WORD_8000_10000,
    WORD_10000_12000,
    WORD_12000_14000,
    WORD_14000_16000,
    WORD_OVER_16000,
    WORD_NOT_FOUND_COCA,
    WORD_LEVEL_NUM
}TagWordFrequencyType;

// Used to map a word's first letter to its bucket index.
const string alphabet_str = "abcdefghijklmnopqrstuvwxyz";

// Report labels, indexed by TagWordFrequencyType.
const string report_str[WORD_LEVEL_NUM] = {
    "UNDER 4000: ",
    "4000-6000: ",
    "6000-8000: ",
    "8000-10000: ",
    "10000-12000: ",
    "12000-14000: ",
    "14000-16000: ",
    "16000-20000+: ",
    "\nNot found in COCA:"
};

//for usual words not included in COCA
const string usual_w_out_of_COCA_str[USUAL_WORD_NUM] =
{
    "s","is","are","re","was","were",
    "an","won","t","has","had","been",
    "did","does","cannot","got","men"
};

#endif
TextVocabularyAnalyzer.h
#ifndef _TEXT_VOCABULARY_ANALYZER_H_
#define _TEXT_VOCABULARY_ANALYZER_H_ #include "TypeDefine.h" extern TagWordFrequencyType frequency_classify(const int wfrq);
extern void word_frequency_analyze(array<int, WORD_LEVEL_NUM>& wfrq_array, TagWordFrequencyType wfrq_tag);
extern bool isaletter(const char& c); class CLetters
{
private:
string m_word; public:
CLetters();
~CLetters();
void fill(vector<char>& vw);
const string word();
const char firstletter();
void processing();
bool usual_recheck();
bool form_recheck();
}; #endif // !_TEXT_VOCABULARY_ANALYZER_H_
TextVocabularyAnalyzer.cpp
/* TextVocabularyAnalyzer.cpp */

#include <algorithm>
#include "TextVocabularyAnalyzer.h"

// Map a COCA frequency rank to its report band.
// wfrq: 1-based rank in the COCA list; 0 means the word was not found.
// Returns the band; ranks above 16000 (and any negative value, which
// should not occur) fall into WORD_OVER_16000, matching the old chain's
// final else.
TagWordFrequencyType frequency_classify(const int wfrq)
{
    if (wfrq == 0)
    {
        return WORD_NOT_FOUND_COCA;
    }
    if (wfrq < 0 || wfrq > 16000)
    {
        return WORD_OVER_16000;
    }
    if (wfrq <= 4000)
    {
        return WORD_UNDER_4000;
    }
    // 4000 < wfrq <= 16000: consecutive 2000-wide bands starting at
    // WORD_4000_6000 (enum values are contiguous).
    return static_cast<TagWordFrequencyType>(WORD_4000_6000 + (wfrq - 4001) / 2000);
}

// Increment the statistics slot selected by wfrq_tag.
// Every explicit band indexes its own slot; anything else (i.e.
// WORD_NOT_FOUND_COCA or an unexpected value) is counted as "not found",
// exactly like the old switch's default case.
void word_frequency_analyze(array<int, WORD_LEVEL_NUM>& wfrq_array, TagWordFrequencyType wfrq_tag)
{
    if (wfrq_tag >= WORD_UNDER_4000 && wfrq_tag <= WORD_OVER_16000)
    {
        wfrq_array[wfrq_tag] += 1;
    }
    else
    {
        wfrq_array[WORD_NOT_FOUND_COCA] += 1;
    }
}

// True when c is an ASCII letter (either case).
bool isaletter(const char& c)
{
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}

//Class CLetters realization
CLetters::CLetters()
{
m_word = "";
} CLetters::~CLetters()
{
//do nothing
} void CLetters::fill(vector<char>& vw)
{
//store the word with lower form
m_word.assign(vw.begin(), vw.end());
transform(m_word.begin(), m_word.end(), m_word.begin(), tolower);
} const string CLetters::word()
{
return m_word;
} const char CLetters::firstletter()
{
return m_word[0];
} void CLetters::processing()
{
cout << "Finding word \"" << m_word << "\"...\t";
} bool CLetters::usual_recheck()
{
//check if the word is usual
bool RetVal = false;
for (int i = 0; i < USUAL_WORD_NUM; i++)
{
if (m_word == usual_w_out_of_COCA_str[i])
{
RetVal = true;
}
else
{
//do nothing
}
}
return RetVal;
} bool CLetters::form_recheck()
{
bool RetVal = false;
if (m_word.length() > 3)
{
char e1, e2, e3;
e3 = m_word[m_word.length() - 3]; //last but two letter
e2 = m_word[m_word.length() - 2]; //last but one letter
e1 = m_word[m_word.length() - 1]; //last letter if (e1 == 's')
{
m_word.erase(m_word.length() - 1);
RetVal = true;
}
else if (e2 == 'e' && e1 == 'd')
{
m_word.erase(m_word.length() - 1);
m_word.erase(m_word.length() - 1);
RetVal = true;
}
else if (e3 == 'i' && e2 == 'n' && e1 == 'g')
{
m_word.erase(m_word.length() - 1);
m_word.erase(m_word.length() - 1);
m_word.erase(m_word.length() - 1);
RetVal = true;
}
else
{
//do nothing
}
}
return RetVal;
}
main.cpp
/* main.cpp */

#include <numeric>
#include <iomanip>
#include <ctime>
#include "TextVocabularyAnalyzer.h"

// Read the COCA frequency list into per-initial-letter maps, then classify
// every word of the user text into frequency bands and print a distribution
// report together with build/process timings.
int main()
{
    //file init
    ifstream COCA_txt("D:\\COCA.txt");
    ifstream USER_txt("D:\\JobsSpeech.txt");

    //time init
    clock_t startTime, endTime;
    double build_map_time = 0;
    double process_time = 0;

    startTime = clock();    //build time start

    //build COCA words map: one map per initial letter keeps each lookup small
    map<string, int> COCA_WordsList[WORDS_HEAD_NUM];
    int readlines = 0;

    while (readlines < static_cast<int>(COCA_WORDS_NUM))
    {
        int frequency = 0;
        string word = "";
        COCA_txt >> frequency;
        COCA_txt >> word;

        //transform to lower uniformly (unsigned char cast avoids UB on
        //negative chars and the tolower overload ambiguity)
        transform(word.begin(), word.end(), word.begin(),
                  [](unsigned char ch) { return static_cast<char>(tolower(ch)); });

        //import every word under its initial-letter bucket
        for (int whead = WORDS_HEAD_A; whead < static_cast<int>(WORDS_HEAD_NUM); whead++)
        {
            //check word head
            if (word[0] == alphabet_str[whead])
            {
                //if a word already exists, only keep its lower frequency rank
                if (COCA_WordsList[whead].find(word) == COCA_WordsList[whead].end())
                {
                    COCA_WordsList[whead].insert(make_pair(word, frequency));
                }
                else
                {
                    COCA_WordsList[whead][word] = frequency < COCA_WordsList[whead][word] ? frequency : COCA_WordsList[whead][word];
                }
            }
        }
        readlines++;
    }

    endTime = clock();    //build time stop
    build_map_time = (double)(endTime - startTime) / CLOCKS_PER_SEC;

    //user prompt
    cout << "COCA words list imported.\nPress any key to start frequency analysis...\n";
    cin.get();

    startTime = clock();    //process time start

    //find text words
    vector<char> content_read;                          // letters of the word being built
    CLetters word_readed;                               // current word under analysis
    vector<int> frequency_processed = { 0 };            // ranks already counted (dedup)
    array<int, WORD_LEVEL_NUM> words_analysis_array{ 0 };
    char char_read = ' ';

    //get text char one by one
    while (USER_txt.get(char_read))
    {
        //only letters and '-' between letters will be received
        if (isaletter(char_read) || char_read == '-')
        {
            content_read.push_back(char_read);
        }
        else
        {
            //char which is not a letter marks the end of a word
            if (!content_read.empty())
            {
                int current_word_frequency = 0;

                //assign letters to make the word
                word_readed.fill(content_read);
                word_readed.processing();
                cout << "Frequency:";

                //check the word's head and find its frequency in COCA list
                for (int whead = WORDS_HEAD_A; whead < static_cast<int>(WORDS_HEAD_NUM); whead++)
                {
                    if (word_readed.firstletter() == alphabet_str[whead])
                    {
                        // NOTE(review): operator[] default-inserts a 0 entry
                        // for unknown words; kept, since later lookups rely on it.
                        cout << COCA_WordsList[whead][word_readed.word()];
                        current_word_frequency = COCA_WordsList[whead][word_readed.word()];

                        //check if the word has been processed
                        if (current_word_frequency == 0)
                        {
                            //additional check
                            if (word_readed.usual_recheck())
                            {
                                word_frequency_analyze(words_analysis_array, WORD_UNDER_4000);
                            }
                            else if (word_readed.form_recheck())
                            {
                                //try again with the stripped base form
                                current_word_frequency = COCA_WordsList[whead][word_readed.word()];
                                if (current_word_frequency > 0)
                                {
                                    frequency_processed.push_back(current_word_frequency);
                                    word_frequency_analyze(words_analysis_array, frequency_classify(current_word_frequency));
                                }
                            }
                            else
                            {
                                word_frequency_analyze(words_analysis_array, WORD_NOT_FOUND_COCA);
                            }
                        }
                        else if (find(frequency_processed.begin(), frequency_processed.end(), current_word_frequency)
                                 == frequency_processed.end())
                        {
                            // NOTE(review): de-duplication is by frequency rank,
                            // so two different words sharing a rank are only
                            // counted once — confirm this is intended.
                            frequency_processed.push_back(current_word_frequency);
                            word_frequency_analyze(words_analysis_array, frequency_classify(current_word_frequency));
                        }
                    }
                }

                cout << endl;
                content_read.clear();
            }
        }
    }

    endTime = clock();    //process time stop
    process_time = (double)(endTime - startTime) / CLOCKS_PER_SEC;

    //calc whole words processed
    int whole_words_analyzed = 0;
    whole_words_analyzed = accumulate(words_analysis_array.begin(), words_analysis_array.end(), 0);

    //report result (guard the divisor so an empty text prints 0.00% instead of NaN)
    const int percent_divisor = (whole_words_analyzed > 0) ? whole_words_analyzed : 1;
    cout << "\n////////// Report ////////// \n";
    for (size_t i = 0; i < words_analysis_array.size(); i++)
    {
        cout << report_str[i] << "\t" << words_analysis_array[i] << " (";
        cout << fixed << setprecision(2) << (float)words_analysis_array[i] * 100 / percent_divisor << "%)" << endl;
    }
    cout << "\nWords totally analyzed: " << whole_words_analyzed << endl;

    //show run time
    cout << "Map build time: " << build_map_time * 1000 << "ms.\n";
    cout << "Process time: " << process_time * 1000 << "ms.\n";
    cout << "////////////////////////////" << endl;

    //close file
    COCA_txt.close();
    USER_txt.close();

    return 0;
}
基于COCA词频表的文本词汇分布测试工具v0.2的更多相关文章
- 基于COCA词频表的文本词汇分布测试工具v0.1
美国语言协会对美国人日常使用的英语单词做了一份详细的统计,按照日常使用的频率做成了一张表,称为COCA词频表.排名越低的单词使用频率越高,该表可以用来统计词汇量. 如果你的词汇量约为6000,那么这张 ...
- 基于Text-CNN模型的中文文本分类实战 流川枫 发表于AI星球订阅
Text-CNN 1.文本分类 转眼学生生涯就结束了,在家待就业期间正好有一段空闲期,可以对曾经感兴趣的一些知识点进行总结. 本文介绍NLP中文本分类任务中核心流程进行了系统的介绍,文末给出一个基于T ...
- 基于Text-CNN模型的中文文本分类实战
Text-CNN 1.文本分类 转眼学生生涯就结束了,在家待就业期间正好有一段空闲期,可以对曾经感兴趣的一些知识点进行总结. 本文介绍NLP中文本分类任务中核心流程进行了系统的介绍,文末给出一个基于T ...
- 基于jquery的bootstrap在线文本编辑器插件Summernote
Summernote是一个基于jquery的bootstrap超级简单WYSIWYG在线编辑器.Summernote非常的轻量级,大小只有30KB,支持Safari,Chrome,Firefox.Op ...
- Chinese-Text-Classification,用卷积神经网络基于 Tensorflow 实现的中文文本分类。
用卷积神经网络基于 Tensorflow 实现的中文文本分类 项目地址: https://github.com/fendouai/Chinese-Text-Classification 欢迎提问:ht ...
- Android版数据结构与算法(四):基于哈希表实现HashMap核心源码彻底分析
版权声明:本文出自汪磊的博客,未经作者允许禁止转载. 存储键值对我们首先想到HashMap,它的底层基于哈希表,采用数组存储数据,使用链表来解决哈希碰撞,它是线程不安全的,并且存储的key只能有一个为 ...
- HDFS的快照原理和Hbase基于快照的表修复
前一篇文章<HDFS和Hbase误删数据恢复>主要讲了hdfs的回收站机制和Hbase的删除策略.根据hbase的删除策略进行hbase的数据表恢复.本文主要介绍了hdfs的快照原理和根据 ...
- js语言评价--js 基于哈希表、原型链、作用域、属性类型可配置的多范式编程语言
js 基于哈希表.原型链.作用域.属性类型可配置的多范式编程语言 值类型.引用类型.直接赋值: 原型是以对象形式存在的类型信息. ECMA-262把对象定义为:无序属性的集合,其属性可以包含基本值,对 ...
- mysql中【update/Delete】update中无法用基于被更新表的子查询,You can't specify target table 'test1' for update in FROM clause.
关键词:mysql update,mysql delete update中无法用基于被更新表的子查询,You can't specify target table 'test1' for update ...
随机推荐
- OC基础--数据类型与表达式
前言 做iOS开发有3年了,从当初的小白到现在,断断续续看过很多资料,之前也写过一些博文来记录,但是感觉知识点都比较凌乱.所以最近准备抽时间把iOS开发的相关知识进行一个梳理,主要分为OC基础.UI控 ...
- vue学习笔记一
vue 颠覆式的开发方式 解疑 为什么要学习vue? 降低项目的复杂度 就业需要 vue难不难? 特别简单 相比于React,vue有什么优势? 前端三大框架:Angular.React.Vue Vu ...
- HDU-4417-Super Mario(线段树+离线处理)
Mario is world-famous plumber. His “burly” figure and amazing jumping ability reminded in our memory ...
- 我竟然才知道slf4j里还有个MDC
大家好久不见,我是walking.今天给大家带来一个日志方面的知识——MDC,不知道大家认识不,反正我是最近刚知道的 初见MDC 前两天看项目中的代码,无意中看到一个自定义的线程池 MDCThread ...
- Azure Storage 系列(四)在.Net 上使用Table Storage
一,引言 今天我们就不多说废话了,直接进入正题,Azure Table Storage.开始内容之前,我们先介绍一下Azure Table Storage. 1,什么是Azure Table Stor ...
- Linux系统小知识
换Linux系统快半年了,刚开始总是碰到各种各样的问题,虽然斗解决了,由于没有记录,过一段时间就忘了,故在这里记录一下. 选择国内镜像源: Manjaro有一个很好用的命令rankmirrors.ra ...
- 关于input框仿百度/google自动提示的方法
引入jquery-autocomplete文件 链接:https://pan.baidu.com/s/1hW0XBYH8ZgJgMSY1Ce6Pig 密码:tv5b $(function() { $( ...
- core的 Linq基本使用,简单模拟数据库多表的左右内连接的测试
1:先看效果: 2:部分代码截图 3:全部代码 1 using System; 2 using System.Collections.Generic; 3 using System.Linq; 4 n ...
- Win10更新后蓝牙出现故障的解决方法
昨天Win10自动更新后,我发现我的键盘突然就不管用了,检查了一下发现原来蓝牙没有打开,同时任务栏中的蓝牙图标也不见了. 不久之前,这样的情况已经出现过了一次,那次好像更新系统后就好了,但这次是系统更 ...
- 【深入理解JVM】学习笔记——-1、JVM基本结构
转载自:https://blog.csdn.net/singit/article/details/54920387?utm_source=blogkpcl11 什么是jvm?JVM的基本结构, 也就是 ...