update:

  • Did a light cleanup of the code organization.
  • The word being processed is now wrapped in a class; word correction and information display are implemented as its methods.

The code is still rough: the tool itself could be further encapsulated, and word inflections are barely handled. I'll revisit it when I have time.

The project is now hosted on GitHub: https://github.com/MorpheusDong/TextVocabularyAnalyzer
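As a quick illustration of the new organization, here is a minimal usage sketch of the CLetters class. It is illustrative only and not part of the repository — the real driving loop is in main.cpp below, and the sample letters are made up:

//minimal usage sketch, not part of the repository
vector<char> letters = { 'w', 'o', 'r', 'd', 's' };
CLetters word_readed;

word_readed.fill(letters);              //store the letters as a lower-case string
word_readed.processing();               //print the "Finding word ..." message

if (word_readed.usual_recheck())        //a usual word that COCA does not list?
{
    //count it as a high-frequency word
}
else if (word_readed.form_recheck())    //strip a simple -s/-ed/-ing ending
{
    //look the stripped form up in the COCA map again
}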

TypeDefine.h

#ifndef _TYPE_DEFINE_H_
#define _TYPE_DEFINE_H_

#include <iostream>
#include <fstream>
#include <string>
#include <array>
#include <vector>
#include <iterator>
#include <map>

using namespace std;

#define COCA_WORDS_NUM 20201U
#define WORDS_HEAD_NUM 26U

#define WORDS_HEAD_A 0U
#define WORDS_HEAD_B 1U
#define WORDS_HEAD_C 2U
#define WORDS_HEAD_D 3U
#define WORDS_HEAD_E 4U
#define WORDS_HEAD_F 5U
#define WORDS_HEAD_G 6U
#define WORDS_HEAD_H 7U
#define WORDS_HEAD_I 8U
#define WORDS_HEAD_J 9U
#define WORDS_HEAD_K 10U
#define WORDS_HEAD_L 11U
#define WORDS_HEAD_M 12U
#define WORDS_HEAD_N 13U
#define WORDS_HEAD_O 14U
#define WORDS_HEAD_P 15U
#define WORDS_HEAD_Q 16U
#define WORDS_HEAD_R 17U
#define WORDS_HEAD_S 18U
#define WORDS_HEAD_T 19U
#define WORDS_HEAD_U 20U
#define WORDS_HEAD_V 21U
#define WORDS_HEAD_W 22U
#define WORDS_HEAD_X 23U
#define WORDS_HEAD_Y 24U
#define WORDS_HEAD_Z 25U

#define USUAL_WORD_NUM 17U

typedef enum WordFrequencyType
{
WORD_UNDER_4000 = 0,
WORD_4000_6000,
WORD_6000_8000,
WORD_8000_10000,
WORD_10000_12000,
WORD_12000_14000,
WORD_14000_16000,
WORD_OVER_16000,
WORD_NOT_FOUND_COCA,
WORD_LEVEL_NUM
}TagWordFrequencyType;

const string alphabet_str = "abcdefghijklmnopqrstuvwxyz";

const string report_str[WORD_LEVEL_NUM] = {
"UNDER 4000: ",
"4000-6000: ",
"6000-8000: ",
"8000-10000: ",
"10000-12000: ",
"12000-14000: ",
"14000-16000: ",
"16000-20000+: ",
"\nNot found in COCA:"
};

//for usual words not included in COCA
const string usual_w_out_of_COCA_str[USUAL_WORD_NUM] =
{
"s","is","are","re","was","were",
"an","won","t","has","had","been",
"did","does","cannot","got","men"
};

#endif

TextVocabularyAnalyzer.h

#ifndef _TEXT_VOCABULARY_ANALYZER_H_
#define _TEXT_VOCABULARY_ANALYZER_H_

#include "TypeDefine.h"

extern TagWordFrequencyType frequency_classify(const int wfrq);
extern void word_frequency_analyze(array<int, WORD_LEVEL_NUM>& wfrq_array, TagWordFrequencyType wfrq_tag);
extern bool isaletter(const char& c);

class CLetters
{
private:
string m_word;

public:
CLetters();
~CLetters();
void fill(vector<char>& vw);
const string word();
const char firstletter();
void processing();
bool usual_recheck();
bool form_recheck();
};

#endif // !_TEXT_VOCABULARY_ANALYZER_H_

TextVocabularyAnalyzer.cpp

/* TextVocabularyAnalyzer.cpp */

#include <algorithm>
#include "TextVocabularyAnalyzer.h" TagWordFrequencyType frequency_classify(const int wfrq)
{
if (wfrq == 0)
{
return WORD_NOT_FOUND_COCA;
}
else if (wfrq > 0 && wfrq <= 4000)
{
return WORD_UNDER_4000;
}
else if (wfrq > 4000 && wfrq <= 6000)
{
return WORD_4000_6000;
}
else if (wfrq > 6000 && wfrq <= 8000)
{
return WORD_6000_8000;
}
else if (wfrq > 8000 && wfrq <= 10000)
{
return WORD_8000_10000;
}
else if (wfrq > 10000 && wfrq <= 12000)
{
return WORD_10000_12000;
}
else if (wfrq > 12000 && wfrq <= 14000)
{
return WORD_12000_14000;
}
else if (wfrq > 14000 && wfrq <= 16000)
{
return WORD_14000_16000;
}
else
{
return WORD_OVER_16000;
}
}

void word_frequency_analyze(array<int, WORD_LEVEL_NUM>& wfrq_array, TagWordFrequencyType wfrq_tag)
{
switch (wfrq_tag)
{
case WORD_UNDER_4000:
{
wfrq_array[WORD_UNDER_4000] += 1;
break;
}
case WORD_4000_6000:
{
wfrq_array[WORD_4000_6000] += 1;
break;
}
case WORD_6000_8000:
{
wfrq_array[WORD_6000_8000] += 1;
break;
}
case WORD_8000_10000:
{
wfrq_array[WORD_8000_10000] += 1;
break;
}
case WORD_10000_12000:
{
wfrq_array[WORD_10000_12000] += 1;
break;
}
case WORD_12000_14000:
{
wfrq_array[WORD_12000_14000] += 1;
break;
}
case WORD_14000_16000:
{
wfrq_array[WORD_14000_16000] += 1;
break;
}
case WORD_OVER_16000:
{
wfrq_array[WORD_OVER_16000] += 1;
break;
}
default:
{
wfrq_array[WORD_NOT_FOUND_COCA] += 1;
break;
}
}
}

bool isaletter(const char& c)
{
if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
{
return true;
}
else
{
return false;
}
}

//Class CLetters implementation
CLetters::CLetters()
{
m_word = "";
}

CLetters::~CLetters()
{
//do nothing
}

void CLetters::fill(vector<char>& vw)
{
//store the word with lower form
m_word.assign(vw.begin(), vw.end());
transform(m_word.begin(), m_word.end(), m_word.begin(), tolower);
}

const string CLetters::word()
{
return m_word;
}

const char CLetters::firstletter()
{
return m_word[0];
}

void CLetters::processing()
{
cout << "Finding word \"" << m_word << "\"...\t";
}

bool CLetters::usual_recheck()
{
//check if the word is usual
bool RetVal = false;
for (int i = 0; i < USUAL_WORD_NUM; i++)
{
if (m_word == usual_w_out_of_COCA_str[i])
{
RetVal = true;
}
else
{
//do nothing
}
}
return RetVal;
}

bool CLetters::form_recheck()
{
bool RetVal = false;
if (m_word.length() > 3)
{
char e1, e2, e3;
e3 = m_word[m_word.length() - 3]; //last but two letter
e2 = m_word[m_word.length() - 2]; //last but one letter
e1 = m_word[m_word.length() - 1]; //last letter

if (e1 == 's')
{
m_word.erase(m_word.length() - 1);
RetVal = true;
}
else if (e2 == 'e' && e1 == 'd')
{
m_word.erase(m_word.length() - 1);
m_word.erase(m_word.length() - 1);
RetVal = true;
}
else if (e3 == 'i' && e2 == 'n' && e1 == 'g')
{
m_word.erase(m_word.length() - 1);
m_word.erase(m_word.length() - 1);
m_word.erase(m_word.length() - 1);
RetVal = true;
}
else
{
//do nothing
}
}
return RetVal;
}

main.cpp

/* main.cpp */

#include <numeric>
#include <iomanip>
#include <ctime>
#include "TextVocabularyAnalyzer.h" int main()
{
//file init
ifstream COCA_txt("D:\\COCA.txt");
ifstream USER_txt("D:\\JobsSpeech.txt");

//time init
clock_t startTime, endTime;
double build_map_time = 0;
double process_time = 0;

startTime = clock();    //build time start

//build COCA words map
map<string, int> COCA_WordsList[WORDS_HEAD_NUM];
int readlines = 0;

while (readlines < COCA_WORDS_NUM)
{
int frequency = 0;
string word = "";
COCA_txt >> frequency;
COCA_txt >> word;

//transform to lower uniformly
transform(word.begin(), word.end(), word.begin(), tolower);

//import every word
for (int whead = WORDS_HEAD_A; whead < WORDS_HEAD_NUM; whead++)
{
//check word head
if (word[0] == alphabet_str[whead])
{
//if a word already exists, only load its lower frequency
if (COCA_WordsList[whead].find(word) == COCA_WordsList[whead].end())
{
COCA_WordsList[whead].insert(make_pair(word, frequency));
}
else
{
COCA_WordsList[whead][word] = frequency < COCA_WordsList[whead][word] ? frequency : COCA_WordsList[whead][word];
}
}
else
{
// do nothing
}
}
readlines++;
}

endTime = clock();    //build time stop
build_map_time = (double)(endTime - startTime) / CLOCKS_PER_SEC;

//user prompt
cout << "COCA words list imported.\nPress any key to start frequency analysis...\n";
cin.get();

startTime = clock();    //process time start

//find text words
vector<char> content_read;
CLetters word_readed;
vector<int> frequecy_processed = { 0 };
array<int, WORD_LEVEL_NUM> words_analysis_array{ 0 };
char char_read = ' ';

//get text char one by one
while (USER_txt.get(char_read))
{
//only letters and '-' between letters will be received
if (isaletter(char_read) || char_read == '-')
{
content_read.push_back(char_read);
}
else
{
//char which is not a letter marks the end of a word
if (!content_read.empty()) //skip single letter
{
int current_word_frequency = 0;

//assign letters to make the word
word_readed.fill(content_read);
word_readed.processing();
cout << "Frequency:";
//check the word's head and find its frequency in COCA list
for (int whead = WORDS_HEAD_A; whead < WORDS_HEAD_NUM; whead++)
{
if (word_readed.firstletter() == alphabet_str[whead])
{
cout << COCA_WordsList[whead][word_readed.word()];
current_word_frequency = COCA_WordsList[whead][word_readed.word()];

//check if the word has been processed
if (current_word_frequency == 0)
{
//additional check
if (word_readed.usual_recheck())
{
word_frequency_analyze(words_analysis_array, WORD_UNDER_4000);
}
else if (word_readed.form_recheck())
{
current_word_frequency = COCA_WordsList[whead][word_readed.word()]; //try again
if (current_word_frequency > 0)
{
frequecy_processed.push_back(current_word_frequency);
word_frequency_analyze(words_analysis_array, frequency_classify(current_word_frequency));
}
else
{
// do nothing
}
}
else
{
word_frequency_analyze(words_analysis_array, WORD_NOT_FOUND_COCA);
}
}
else if (find(frequecy_processed.begin(), frequecy_processed.end(), current_word_frequency)
== frequecy_processed.end())
{
//classify this word and make statistics
frequecy_processed.push_back(current_word_frequency);
word_frequency_analyze(words_analysis_array, frequency_classify(current_word_frequency));
}
else
{
// do nothing
}
}
else
{
//do nothing
}
}
cout << endl;
content_read.clear();
}
else
{
//do nothing
}
}
}

endTime = clock();    //process time stop
process_time = (double)(endTime - startTime) / CLOCKS_PER_SEC;

//calc whole words processed
int whole_words_analyzed = 0;
whole_words_analyzed = accumulate(words_analysis_array.begin(), words_analysis_array.end(), 0);

//report result
cout << "\n////////// Report ////////// \n";
for (int i = 0; i < words_analysis_array.size(); i++)
{
cout << report_str[i] << "\t" << words_analysis_array[i] << " (";
cout << fixed << setprecision(2) << (float)words_analysis_array[i] * 100 / whole_words_analyzed << "%)" << endl;
}
cout << "\nWords totally analyzed: " << whole_words_analyzed << endl; //show run time
cout << "Map build time: " << build_map_time*1000 << "ms.\n";
cout << "Process time: " << process_time*1000 << "ms.\n";
cout << "////////////////////////////" << endl; //close file
COCA_txt.close();
USER_txt.close();

return 0;
}
