编译原理之正则表达式转NFA

本文转载自http://chriszz.sinaapp.com/?p=257

输入一个正则表达式，输出一个NFA。

我的做法：输入一个字符串表示正则表达式，程序把生成的NFA写入一个.dot文件，并调用dot把该文件编译成pdf。Fedora下需要先安装Graphviz（sudo yum install graphviz，其中包含dot命令），然后evince XXX.pdf就可以查看生成的NFA了。

具体算法是按照龙书上的Thompson算法来的。

废话不多说，放码过来：

/*

Author：ChrisZZ(zchrissirhcz@gmail.com)

Time:2013-12-25 14:13:09

输入：正则表达式

输出：自动机

算法步骤：

1.把正则表达式转化为后缀表达式

2.把后缀表达式转化为NFA

3.用dot语言把NFA输出到PDF

参考：

1.Regular Expression Matching Can Be Simple And Fast

http://swtch.com/~rsc/regexp/regexp1.html

2.龙书 chap3.7.4 从正则表达式构造NFA

3.YCC学长的project中dot语言的使用

其他说明：

1.需要安装dot，并添加到系统path中

2.在windows下运行时，控制台因为编码不支持可能导致中文提示无法显示

*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <cctype>
#include <iostream>
#include <stack>
#include <stdexcept>
#include <string>
#include <vector>

using namespace std;

// State labels below 256 stand for a literal edge character; the two values
// here are the reserved labels at and above 256.
const int Match = 256;        // accepting state: the match succeeded
const int Split = Match + 1;  // 257: epsilon branch

// Parser counters saved when a '(' opens a nested sub-expression, so they can
// be restored when the matching ')' closes it.
struct Paren{
    int natom;  // operands of the enclosing expression still waiting for a '.'
    int nalt;   // '|' operators of the enclosing expression not yet emitted
};

/**
 * Converts an infix regular expression into postfix notation, making
 * concatenation explicit as the '.' operator.
 *
 * Supported syntax: alphabetic characters, grouping with '(' ')', Kleene star
 * '*' and alternation '|'. Whitespace is skipped.
 *
 * @param re  infix regular expression
 * @return    postfix expression; an empty input yields an empty string
 * @throws std::runtime_error  on unbalanced parentheses, an operator with a
 *         missing operand, or any unsupported character
 */
std::string re2post(std::string re){
    const std::string invalidRegExp = "非法的正则表达式";
    std::stack<Paren> parenStk;
    std::string postExpr;
    int nalt = 0;   // alternations seen at the current nesting level
    int natom = 0;  // operands at the current level not yet concatenated

    for(std::string::size_type i = 0; i < re.length(); i++){
        // <cctype> classifiers are only defined for unsigned char values / EOF.
        const unsigned char ch = static_cast<unsigned char>(re[i]);
        if(std::isspace(ch)) continue;

        if(std::isalpha(ch)){
            // Two operands are already pending: join them before adding a third.
            if(natom > 1){
                natom--;
                postExpr += '.';
            }
            natom++;
            postExpr += re[i];
        }
        else if(re[i] == '('){
            if(natom > 1){
                // The emitted '.' merges two pending operands into one, so the
                // count must drop as well; otherwise an extra '.' is produced
                // when the enclosing expression is closed (e.g. for "ab(c)").
                natom--;
                postExpr += '.';
            }
            Paren paren;
            paren.natom = natom;
            paren.nalt = nalt;
            parenStk.push(paren);
            nalt = 0;
            natom = 0;
        }
        else if(re[i] == ')'){
            if(natom == 0 || parenStk.empty())
                throw std::runtime_error(invalidRegExp + ":括号不匹配");
            // Flush what is pending inside the group, e.g. the second ')' of
            // "((a|b)(c|d))" still has two operands to concatenate.
            while(--natom > 0)
                postExpr += '.';
            while(nalt-- > 0)
                postExpr += '|';
            const Paren paren = parenStk.top();
            parenStk.pop();
            natom = paren.natom + 1;  // the whole group counts as one operand
            nalt = paren.nalt;
        }
        else if(re[i] == '*'){
            if(natom == 0)
                throw std::runtime_error(invalidRegExp + ":提前出现'*'");
            postExpr += re[i];
        }
        else if(re[i] == '|'){
            if(natom == 0)
                throw std::runtime_error(invalidRegExp + ":提前出现'|'");
            while(--natom > 0)
                postExpr += '.';
            nalt++;
        }
        else
            throw std::runtime_error(invalidRegExp);
    }

    if(!parenStk.empty())
        throw std::runtime_error(invalidRegExp + ":括号不匹配");
    // A '|' with nothing after it would emit an operator without its right operand.
    if(natom == 0 && nalt > 0)
        throw std::runtime_error(invalidRegExp + ":'|'后缺少操作数");

    while(--natom > 0)
        postExpr += '.';
    while(nalt-- > 0)
        postExpr += '|';
    return postExpr;
}

class NFA;

/*
 * One NFA state. The label c decides what kind of state it is:
 *   c <  256 : the outgoing edge is labelled with the character c
 *   c == 256 : accepting state, the match succeeded
 *   c == 257 : branch (split) state
 */
class State{
    friend class NFA;
    friend void nfa2graph(State* head, const std::string& re);

public:
    // A freshly built state always starts with id 0.
    State(int c=256, State* out=NULL, State* out1=NULL)
        : c(c), id(0), out(out), out1(out1) {}

    // Stores the number of this state.
    void setId(int id){ this->id = id; }

private:
    int c;        // state label, see the table above
    int id;       // number of this state
    State* out;   // first successor state
    State* out1;  // second successor, used when there are two branches
};

class NFA{

public:

    NFA(){

        head = NULL;

        tail = NULL;

    }

    NFA(const int& c){

        tail = new State(Match, NULL, NULL);

        head = new State(c, tail, NULL);

    }

    void doCat(NFA& nfa){

        tail->out = nfa.head;

        tail->c = Split;

        tail = nfa.tail;

        nfa.head = NULL;

        nfa.tail = NULL;

    }

    void doUnion(NFA& nfa){

        State* newHead = new State(Split, head, nfa.head);

        State* newTail = new State(Match, NULL, NULL);

        tail->c = Split;

        tail->out = newTail;

        nfa.tail->c = Split;

        nfa.tail->out = newTail;

        tail = newTail;

        head = newHead;

        nfa.head = NULL;

        nfa.tail = NULL;

    }

    void doStar(){

        State* newTail = new State(Match, NULL, NULL);

        State* newHead = new State(Split, head, newTail);

        tail->c = Split;

        tail->out = newTail;

        tail->out1 = head;

        tail = newTail;

        head = newHead;

    }

    void nfa2graph(const string& re){

        char myfile[100];

        printf("请输入一个文件名，用来保存生成的NFA-graph(不必提供后缀):\n");

        scanf("%s", myfile);

        printf("已将DOT文件存储在\"%s.dot\",\n", myfile);

        printf("PDF文件则存储在\"%s.dot.pdf\".\n", myfile);

        int i;

        while(myfile[i]!='\0')

            i++;

        myfile[i] = '.';

        myfile[i+1] = 'd';

        myfile[i+2] = 'o';

        myfile[i+3] = 't';

        myfile[i+4] = '\0';

        FILE *file = fopen(myfile, "w");

        fputs("digraph {\n", file);

        fputs("\t\"", file);

        int len=re.length();

        for(i=0; i<len; i++){

            fprintf(file, "%c", re[i]);

        }

        fputs("\" [shape = plaintext]\n", file);

        fputs("\trankdir = LR\n", file);

        fputs("\t\"\" [shape = point]\n", file);

        fputs("\t\"\" -> 1 [label = Start]\n\n", file);

        int id = 1;

        char circle[2000];

        memset(circle, 0, sizeof(circle));

        State* p;

        stack<State*> staStk;

        head->setId(id++);

        staStk.push(head);

        while(!staStk.empty()){

            p = staStk.top();

            staStk.pop();

            char flag = 1;

            cout << "p->c=" << p->c << endl;

            if(p->c < Match){

                cout << "p->out->id=" << p->out->id << endl;

                if(p->out->id==0){

                    p->out->id = id++;

                    cout << "id=" << id << endl;                }

                else

                    flag = 0;

                fprintf(file, "\t%d -> %d [label = \"%c\"]\n", p->id, (p->out)->id, p->c);

                State *what = p->out;

                if(flag) //push(*what);

                    staStk.push(what);

            } else if(p->c == Match){

                circle[p->id] = 1;

            } else{     //对应Split的情形

                if(p->out->id==0)

                    p->out->id = id++;

                else

                    flag = 0;

                fprintf(file, "\t%d -> %d [label = <ε>]\n", p->id, p->out->id);

                State *what = p->out;

                if(flag) staStk.push(what);

                if(p->out1!=NULL){

                    flag = 1;

                    if(p->out1->id==0)

                        p->out1->id = id++;

                    else

                        flag = 0;

                    fprintf(file, "\t%d -> %d [label = <ε>]\n", p->id, p->out1->id);

                    what = p->out1;

                    if(flag) staStk.push(what);

                }

            }

        }

        for(i=1; i<id; i++){

            fprintf(file, "\t%d [shape = circle", i);

            if(circle[i])

                fputs(", peripheries = 2", file);

            fprintf(file, "]\n");

        }

        fputs("}", file);

        fclose(file);

        char cmd[108];

        sprintf(cmd, "dot %s -O -Tpdf", myfile);

        if(system(cmd)==0){

            printf("成功生成pdf图像！\n");

            //printf("Linux用户可以使用evince file.pdf &命令打开~\n");

        }

        else

            printf("悲剧！生成pdf图像时出现错误..\n");

    }

private:

    State* head;

    State* tail;

};

// Removes and returns the top fragment; throws std::runtime_error when the
// stack is empty, i.e. an operator in the postfix expression lacks an operand.
static NFA popFragment(std::stack<NFA>& fragments){
    if(fragments.empty())
        throw std::runtime_error("非法的后缀表达式:操作数不足");
    NFA top = fragments.top();
    fragments.pop();
    return top;
}

/**
 * Builds an NFA from a postfix regular expression by evaluating it on a stack
 * of fragments.
 *
 * '.' concatenates the two topmost fragments, '|' unites them, '*' applies
 * the Kleene star to the topmost one; every other character becomes a
 * single-symbol fragment.
 *
 * @param postExpr  postfix expression
 * @return          the fragment left on the stack after evaluation
 * @throws std::runtime_error  if an operator lacks operands, if the expression
 *         is empty, or if more than one fragment remains at the end
 */
NFA post2nfa(const std::string& postExpr){
    std::stack<NFA> nfaStk;

    for(std::string::size_type i = 0; i < postExpr.length(); i++){
        switch(postExpr[i]){
        case '.': {
            NFA rhs = popFragment(nfaStk);  // the right operand is on top
            NFA lhs = popFragment(nfaStk);
            lhs.doCat(rhs);
            nfaStk.push(lhs);
            break;
        }
        case '|': {
            NFA rhs = popFragment(nfaStk);
            NFA lhs = popFragment(nfaStk);
            lhs.doUnion(rhs);
            nfaStk.push(lhs);
            break;
        }
        case '*': {
            NFA operand = popFragment(nfaStk);
            operand.doStar();
            nfaStk.push(operand);
            break;
        }
        default: {
            const int symbol = postExpr[i];
            nfaStk.push(NFA(symbol));
            break;
        }
        }
    }

    NFA result = popFragment(nfaStk);
    if(!nfaStk.empty())
        throw std::runtime_error("未知错误");
    return result;
}

/**
 * Interactive driver: repeatedly reads a regular expression, prints its
 * postfix form, builds the NFA and renders it, until the user answers 'n' or
 * standard input ends.
 *
 * An invalid expression is reported and the loop continues instead of
 * terminating the program with an uncaught exception.
 */
int main(){
    std::string re;
    while(true){
        std::cout << "请输入一个正则表达式:\n";
        if(!(std::cin >> re))
            break;  // end of input or stream error

        try{
            const std::string postExpr = re2post(re);
            std::cout << "postExpr is : " << postExpr << std::endl;
            NFA nfa = post2nfa(postExpr);
            nfa.nfa2graph(re);
        }catch(const std::exception& ex){
            std::cout << ex.what() << std::endl;
        }

        std::cout << "继续吗?(y/n)\n" << std::endl;
        char c = 'n';
        // Formatted extraction skips the newline left behind by earlier reads,
        // and the loop ends when the stream does.
        while(std::cin >> c && c != 'y' && c != 'n'){
            std::cout << "请输入'y'或'n'.\n";
        }
        if(!std::cin || c == 'n')
            break;
    }
    std::cout << "Bye~\n";
    return 0;
}

编译原理之正则表达式转NFA的更多相关文章

正则表达式引擎的构建——基于编译原理DFA（龙书第三章）——3 计算4个函数
整个引擎代码在github上,地址为:https://github.com/sun2043430/RegularExpression_Engine.git nullable, firstpos, la ...
编译原理-NFA构造DFA
本题摘自北邮的编译原理与技术. 首先,根据此图构造状态转换表表中第一列第一行表示从第一个符号B通过任意个空转换能到达的节点,Ia表示由此行的状态数组({B,5,1}可以看作0状态)经过一个a可以到达 ...
编译原理--NFA/DFA
现成的, 讲义: https://www.cnblogs.com/AndyEvans/p/10240790.html https://www.cnblogs.com/AndyEvans/p/10241 ...
编译原理-词法分析05-正则表达式到DFA-01
编译原理-词法分析05-正则表达式到DFA 要经历正则表达式 --> NFA --> DFA 的过程. 0. 术语 Thompson构造Thompson Construction 利用ε ...
Compiler Theory(编译原理)、词法/语法/AST/中间代码优化在Webshell检测上的应用
catalog . 引论 . 构建一个编译器的相关科学 . 程序设计语言基础 . 一个简单的语法制导翻译器 . 简单表达式的翻译器(源代码示例) . 词法分析 . 生成中间代码 . 词法分析器的实现 ...
Stanford公开课《编译原理》学习笔记(1~4课)
目录一. 编译的基本流程二. Lexical Analysis(词法分析阶段) 2.1 Lexical Specification(分词原则) 2.2 Finite Automata (典型分词算 ...
编译原理_P1004
龙书相关知识点总结 //*************************引论***********************************// 1. 编译器(compiler):从一中语言( ...
跟vczh看实例学编译原理——三：Tinymoe与无歧义语法分析
文章中引用的代码均来自https://github.com/vczh/tinymoe. 看了前面的三篇文章,大家应该基本对Tinymoe的代码有一个初步的感觉了.在正确分析"print ...
跟vczh看实例学编译原理——二：实现Tinymoe的词法分析
文章中引用的代码均来自https://github.com/vczh/tinymoe. 实现Tinymoe的第一步自然是一个词法分析器.词法分析其所作的事情很简单,就是把一份代码分割成若干个tok ...

随机推荐

python【数据类型：列表与元组】
python列表: 定义一个列表:cities=['北京','上海','广州','深圳'] 注意:列表的下标0表示第一个元素,下标-1表示最后一个元素列表增加元素在列表末尾添加一个元素:citie ...
进化论VS中性突变理论
进化论VS中性突变理论查尔斯·罗伯特·达尔文(英语:CharlesRobert Darwin,1809年2月12日-1882年4月19日),英国生物学家,其“进化论”被列为19世纪自然科学的三大发现 ...
P3007 [USACO11JAN]大陆议会The Continental Cowngress
P3007 [USACO11JAN]大陆议会The Continental Cowngress 题意: 给出 n 个法案, m 头牛的意见, 每头牛有两个表决格式为 "支持或反对某法案&q ...
Oracle数据库代码指令简介
重大提醒!!!oracle里面的查询,一定要把查询名大写!!!就算你创建的时候是小写字母,查询的时候也一定要大写!!! 这是oracle的课后作业,弄懂这些也差不多了吧,不懂的可以去看我的SQL se ...
前端如何使用easy-mock模拟接口
1. 如何使用easy-mock // 获取 easy-mock 的模拟数据 getData () { // 开发环境使用 easy-mock 数据,正式环境使用 json 文件 if (proces ...
python 12306 车次数据获取
ssl._create_default_https_context = ssl._create_default_https_context train_data = '2018-10-20' head ...
HDU 4311 Meeting point-1 求一个点到其它点的曼哈顿距离之和
题目链接:http://acm.hdu.edu.cn/showproblem.php?pid=4311 解题报告:在一个平面上有 n 个点,求一个点到其它的 n 个点的距离之和最小是多少. 首先不得不 ...
Uva 11549 - Calculator Conundrum 找规律加map
题目链接:http://uva.onlinejudge.org/index.php?option=com_onlinejudge&Itemid=8&page=show_problem& ...
【数据库】软件安全测试之SQL注入
这些年我们发现越来越多的公司开始注重安全测试了,为什么?因为安全测试可以在某种程度上可以排查掉你项目的一些安全漏洞,这样你的系统上线后才会相对安全,才有可能尽量避免来自外部的攻击.每一年互联网都会发生 ...
[转]QList内存释放
QList<T> 的释放分两种情况: 1.T的类型为非指针,这时候直接调用clear()方法就可以释放了,看如下测试代码 #include <QtCore/QCoreApplicat ...

编译原理之正则表达式转NFA

编译原理之正则表达式转NFA的更多相关文章

随机推荐

热门专题