编译器DIY——词法分析
在上一篇文章中已经介绍了读文件的操作,那么这一篇文章中将会细致解释词法分析。
在源文件里解析出的单词流必须识别为保留字,标识符,常量,操作符和界符五大类
1.显然我们需要列举出全部的保留字。与保留字相似的是标识符:在C语言中,保留字都以小写字母开头,并且其中只能包含小写字母;而标识符的第一个字符必须是字母(大小写皆可),后面可以接大小写字母、数字和字符 ‘_’。在我写的这个编译器中,标识符的长度必须小于100个字符(词法分析器使用长度为100的缓冲区),而C语言实际允许的标识符长度远大于此。
2.对于常量,这里须要注意的是整型和浮点型常量。
3.运算符依照的是以下的表:
C语言运算符表
运算符依照优先级大小由上向下排列,在同一行的运算符具有同样优先级。第二行是全部的一元运算符。
运算符 | 解释 | 结合方式 |
() [] -> . | 括号(函数等),数组,两种结构成员访问 | 由左向右 |
! ~ ++ -- + - * & | 否定,按位否定,增量,减量,正负号,间接,取地址 | 由右向左 |
* / % | 乘,除,取模 | 由左向右 |
+ - | 加,减 | 由左向右 |
<< >> | 左移,右移 | 由左向右 |
< <= >= > | 小于,小于等于,大于等于,大于 | 由左向右 |
== != | 等于,不等于 | 由左向右 |
& | 按位与 | 由左向右 |
^ | 按位异或 | 由左向右 |
| | 按位或 | 由左向右 |
&& | 逻辑与 | 由左向右 |
|| | 逻辑或 | 由左向右 |
? : | 条件 | 由右向左 |
= += -= *= /= %= &= ^= |= <<= >>= | 各种赋值 | 由右向左 |
, | 逗号(顺序) | 由左向右 |
4.界符:“;”“{}”,单引号,双引号
接下来我介绍的是对保留字的归类,为了查找方便,将保留字依照a-z的顺序排好,根据数组的下标定位,降低寻找的时间
/*
 * keyword.h
 *
 * Created on: Jun 12, 2014
 *
 * Reserved-word tables used by the lexer.
 *
 * The keywords are grouped into 27 buckets: bucket 0 holds words that start
 * with '_', buckets 1..26 hold words that start with 'a'..'z'.  Every bucket
 * is terminated by the sentinel entry {"end"}; "end" itself is therefore
 * never reported as a keyword.
 */
#ifndef KEYWORD_H_
#define KEYWORD_H_

/* One reserved word.  keyName points at a string literal and must not be
 * written through. */
struct keyword {
    const char *keyName;
};

static struct keyword key__[] = {
    {"__int64"},
    {"end"}
};

static struct keyword key_A[] = {
    {"auto"},
    {"end"}
};

static struct keyword key_B[] = {
    {"break"},
    {"end"}
};

static struct keyword key_C[] = {
    {"case"},
    {"char"},
    {"const"},
    {"continue"},
    {"end"}
};

static struct keyword key_D[] = {
    {"default"},
    {"do"},
    {"double"},
    {"end"}
};

static struct keyword key_E[] = {
    {"else"},
    {"enum"},
    {"extern"},
    {"end"}
};

static struct keyword key_F[] = {
    {"float"},
    {"for"},
    {"end"}
};

static struct keyword key_G[] = {
    {"goto"},
    {"end"}
};

static struct keyword key_H[] = {
    {"end"}
};

static struct keyword key_I[] = {
    {"if"},
    {"int"},
    {"end"}
};

static struct keyword key_J[] = {
    {"end"}
};

static struct keyword key_K[] = {
    {"end"}
};

static struct keyword key_L[] = {
    {"long"},
    {"end"}
};

static struct keyword key_M[] = {
    {"end"}
};

static struct keyword key_N[] = {
    {"end"}
};

static struct keyword key_O[] = {
    {"end"}
};

static struct keyword key_P[] = {
    {"end"}
};

static struct keyword key_Q[] = {
    {"end"}
};

static struct keyword key_R[] = {
    {"register"},
    {"return"},
    {"end"}
};

static struct keyword key_S[] = {
    {"short"},
    {"signed"},
    {"sizeof"},
    {"static"},
    {"struct"},
    {"switch"},
    {"end"}
};

static struct keyword key_T[] = {
    {"typedef"},
    {"end"}
};

static struct keyword key_U[] = {
    {"union"},
    {"unsigned"},
    {"end"}
};

static struct keyword key_V[] = {
    {"void"},
    {"volatile"},
    {"end"}
};

static struct keyword key_W[] = {
    {"while"},
    {"end"}
};

static struct keyword key_X[] = {
    {"end"}
};

static struct keyword key_Y[] = {
    {"end"}
};

static struct keyword key_Z[] = {
    {"end"}
};

/*
 * Bucket table, 27 entries: index 0 is the '_' bucket, index
 * (first_letter - 'a' + 1) is the bucket for a lowercase first letter.
 */
static struct keyword *keywords[] = {
    key__, key_A, key_B, key_C, key_D, key_E,
    key_F, key_G, key_H, key_I, key_J, key_K,
    key_L, key_M, key_N, key_O, key_P, key_Q,
    key_R, key_S, key_T, key_U, key_V, key_W,
    key_X, key_Y, key_Z
};

#endif /* KEYWORD_H_ */
以下是词法分析的源代码:
/*
* lex.h
*
* Created on: Jun 13, 2014
*
*/
#ifndef LEX_H_
#define LEX_H_

#include "input.h"
#include "keyword.h"

/*
 * ASCII character classification helpers used by the lexer.
 *
 * Each macro evaluates its argument more than once, so never pass an
 * expression with side effects (for example *p++).
 */
#define isDigit(c)       ((c) >= '0' && (c) <= '9')
#define isUpperLetter(c) ((c) >= 'A' && (c) <= 'Z')
#define isLowerLetter(c) ((c) >= 'a' && (c) <= 'z')
/* True for any ASCII letter; forwards the argument to both helpers. */
#define isLetter(c)      (isUpperLetter(c) || isLowerLetter(c))

#endif /* LEX_H_ */
/*
* lex.c
*
* Created on: Jun 13, 2014
*
*/
#include <stdio.h>
#include <string.h>

#include "zcc.h"
#include "lex.h"

/* Shorthand for the read position inside the loaded source buffer. */
#define curr source.cursor

/* Size of the lexeme buffer, including the terminating NUL. */
#define LEXEME_CAPACITY 100

/*
 * Consume the current source character and append it to `lexeme`.
 * Once the buffer is full the character is still consumed but dropped, so an
 * over-long token is truncated instead of overflowing the buffer.
 */
static void lexTake(char *lexeme, int *length)
{
    if (*length < LEXEME_CAPACITY - 1) {
        lexeme[*length] = *curr;
        *length += 1;
    }
    curr++;
}

/*
 * NUL-terminate `lexeme`, print it as "<category> is <lexeme>" and return
 * the "token found" status (1).
 */
static int lexReport(const char *category, char *lexeme, int length)
{
    lexeme[length] = '\0';
    printf("%s is %s\n", category, lexeme);
    return 1;
}

/*
 * Return 1 when `word` is listed in the keyword table, 0 otherwise.
 * Bucket 0 holds words starting with '_', buckets 1..26 words starting with
 * 'a'..'z'; words starting with anything else cannot be keywords.
 */
static int lexIsKeyword(const char *word)
{
    int bucket;
    int i;

    if (isLowerLetter(*word)) {
        bucket = *word - 'a' + 1;
    } else if (*word == '_') {
        bucket = 0;
    } else {
        return 0;
    }
    /* Each bucket ends with the sentinel entry "end". */
    for (i = 0; strcmp(keywords[bucket][i].keyName, "end") != 0; i++) {
        if (strcmp(keywords[bucket][i].keyName, word) == 0) {
            return 1;
        }
    }
    return 0;
}

/*
 * For an operator starting with `first`, return the set of characters that
 * may follow it to form a two-character operator ("" when the operator is
 * always a single character).  Returns NULL when `first` does not start an
 * operator handled here.  '%' is handled separately by the caller.
 */
static const char *lexOperatorFollowers(int first)
{
    switch (first) {
    case '(': case ')': case '[': case ']':
    case '.': case '~': case '?': case ':': case ',':
        return "";
    case '=': case '!': case '*': case '/': case '^':
        return "=";
    case '<':
        return "<=";
    case '>':
        return ">=";
    case '-':
        return ">-=";
    case '+':
        return "+=";
    case '&':
        return "&=";
    case '|':
        return "|=";
    case '\\':
        /* "\n" is reported as one operator because string literals are not
         * lexed as a unit. */
        return "n";
    default:
        return NULL;
    }
}

/*
 * Scan the next token starting at the source cursor, print its category and
 * text, and advance the cursor past it.
 *
 * Whitespace and both comment forms are skipped.  Characters that do not
 * start any known token are reported on stderr and skipped.
 *
 * Returns 1 when a token was printed, -1 when END_OF_FILE was reached
 * (including inside an unterminated comment).
 */
int getToken(void)
{
    char lexeme[LEXEME_CAPACITY];
    const char *followers;
    int length;
    int first;

    for (;;) {
        /* skip ' ', '\n', '\t' and '\r' */
        while (*curr == ' ' || *curr == '\n' || *curr == '\t' || *curr == '\r') {
            curr++;
        }
        if (*curr == END_OF_FILE) {
            return -1;
        }

        length = 0;
        first = *curr;

        /* identifier or keyword */
        if (isLowerLetter(*curr) || isUpperLetter(*curr) || *curr == '_') {
            do {
                lexTake(lexeme, &length);
            } while (isDigit(*curr) || isUpperLetter(*curr)
                    || isLowerLetter(*curr) || *curr == '_');
            lexeme[length] = '\0';
            return lexReport(lexIsKeyword(lexeme) ? "keyword" : "Identify",
                    lexeme, length);
        }

        /* integer or floating-point constant */
        if (isDigit(*curr)) {
            do {
                lexTake(lexeme, &length);
            } while (isDigit(*curr));
            if (*curr != '.') {
                return lexReport("number", lexeme, length);
            }
            do {
                lexTake(lexeme, &length);
            } while (isDigit(*curr));
            return lexReport("float number", lexeme, length);
        }

        /* line comment: skip up to the newline, then rescan */
        if (first == '/' && *(curr + 1) == '/') {
            while (*curr != '\n') {
                if (*curr == END_OF_FILE) {
                    return -1;
                }
                curr++;
            }
            continue;
        }

        /* block comment: skip past the closing marker, then rescan */
        if (first == '/' && *(curr + 1) == '*') {
            curr += 2;
            for (;;) {
                if (*curr == END_OF_FILE) {
                    return -1;
                }
                if (*curr == '*' && *(curr + 1) == '/') {
                    curr += 2;
                    break;
                }
                curr++;
            }
            continue;
        }

        /*
         * '%': modulo, "%=", or a printf conversion such as "%d".  The
         * conversions are reported as operators because string literals are
         * not lexed as a unit; as a consequence "a %d" with a variable d is
         * also read as "%d".
         */
        if (first == '%') {
            lexTake(lexeme, &length);
            if (*curr == 'd' || *curr == 'c' || *curr == 'f' || *curr == '=') {
                lexTake(lexeme, &length);
            } else if (*curr == 'l') {
                lexTake(lexeme, &length);
                /* "%l" takes whatever single character follows it */
                if (*curr != END_OF_FILE) {
                    lexTake(lexeme, &length);
                }
            }
            return lexReport("Operator", lexeme, length);
        }

        /* every other operator: one character plus an optional follower */
        followers = lexOperatorFollowers(first);
        if (followers != NULL) {
            lexTake(lexeme, &length);
            /* strchr would match the terminator for a NUL byte, so guard */
            if (*curr != '\0' && strchr(followers, *curr) != NULL) {
                lexTake(lexeme, &length);
                /* "<<=" and ">>=" */
                if ((first == '<' || first == '>') && lexeme[1] == first
                        && *curr == '=') {
                    lexTake(lexeme, &length);
                }
            }
            return lexReport("Operator", lexeme, length);
        }

        /* delimiters */
        if (first == '{' || first == '}' || first == ';'
                || first == '\'' || first == '"') {
            lexTake(lexeme, &length);
            return lexReport("Delimiter", lexeme, length);
        }

        /* Unknown character: consume it so the caller cannot loop forever. */
        fprintf(stderr, "Unknown character 0x%02x skipped\n",
                (unsigned)(unsigned char)*curr);
        curr++;
    }
}
这里实现了将源文件的单词流划分为五类,并把每个单词打印出来;后面的语法分析将会用到这里得到的单词流。
忘了说了,我将自己写的编译器命名为:ZCC,头文件都包括在zcc.h中(*^__^*) 嘻嘻……,想写个类似于gcc 一样奇妙的玩意。
最后看测试文档:
struct Student{
int a;
char* name;
}
int main()
{
int a=123;
float a2=1.2345677;
int b=1+3;
for(int i=0; i < 100; i++)
a+=i;
printf("%d\n", a);
return 0;
}
测试结果:
keyword is struct
Identify is Student
Delimiter is {
keyword is int
Identify is a
Delimiter is ;
keyword is char
Operator is *
Identify is name
Delimiter is ;
Delimiter is }
keyword is int
Identify is main
Operator is (
Operator is )
Delimiter is {
keyword is int
Identify is a
Operator is =
number is 123
Delimiter is ;
keyword is float
Identify is a2
Operator is =
float number is 1.2345677
Delimiter is ;
keyword is int
Identify is b
Operator is =
number is 1
Operator is +
number is 3
Delimiter is ;
keyword is for
Operator is (
keyword is int
Identify is i
Operator is =
number is 0
Delimiter is ;
Identify is i
Operator is <
number is 100
Delimiter is ;
Identify is i
Operator is ++
Operator is )
Identify is a
Operator is +=
Identify is i
Delimiter is ;
Identify is printf
Operator is (
Delimiter is "
Operator is %d
Operator is \n
Delimiter is "
Operator is ,
Identify is a
Operator is )
Delimiter is ;
keyword is return
number is 0
Delimiter is ;
Delimiter is }
做到这里,可以告一段落了,接下来要做的事情就是语法分析。
编译器DIY——词法分析的更多相关文章
- 编译器DIY——读文件
编译器的前端词法分析:将源文件解析成一个个的单词流.为语法分析做准备. 在词法分析阶段,我们要做的就是将词分出来,而且确定单词的类型,一般的程序设计语言的单词符号能够份为下面5种: 1.keyword ...
- atitit.词法分析的实现token attilax总结
atitit.词法分析的实现token attilax总结 1. 词法分析(英语:lexical analysis)跟token 1 1.1. 扫描器 2 2. 单词流必须识别为保留字,标识符(变量) ...
- Atitit.注解解析(1)---------词法分析 attilax总结 java .net
Atitit.注解解析(1)---------词法分析 attilax总结 java .net 1. 应用场景:::因为要使用ui化的注解 1 2. 流程如下::: 词法分析(生成token流) & ...
- Atitit.注解and属性解析(2)---------语法分析 生成AST attilax总结 java .net
Atitit.注解and属性解析(2)---------语法分析 生成AST attilax总结 java .net 1. 应用场景:::因为要使用ui化的注解 1 2. 使用解释器方式来实现生成 ...
- Atitit. 解释器模式框架选型 and应用场景attilax总结 oao
Atitit. 解释器模式框架选型 and应用场景attilax总结 oao 1. 解释器模式结构描述 1 2. 如何实现(简单的解释器模式,仅仅通过词法分析即可实现,而无需token流进行处理. 2 ...
- 翻译器DIY它———算在英文文本中的单词数,字符和行数
咳咳.这部分应该是序列化编译器DIY的,然而,在这样做DIY第一次使用前flex 为了练练手,对于后者的理解是有帮助. 在word 我经常看到一个字计数功能,因此,它是如何实现,当然,首先想到的是要经 ...
- atitit.词法分析原理 词法分析器 (Lexer)
atitit.词法分析原理 词法分析器 (Lexer) 1. 词法分析(英语:lexical analysis)1 2. :实现词法分析程序的常用途径:自动生成,手工生成.[1] 2 2.1. 词法分 ...
- 15个C++项目列表
实验楼上有很多C++的实战项目,从简单到进阶,学习每个项目都可以掌握相应的知识点. 如果你还是C++新手的话,那么这个C++的项目列表你可以拿去练手实战开发,毕竟学编程动手实践是少不了的! 如果你不知 ...
- 从零开始山寨Caffe·伍:Protocol Buffer简易指南
你为Class外访问private对象而苦恼嘛?你为设计序列化格式而头疼嘛? ——欢迎体验Google Protocol Buffer 面向对象之封装性 历史遗留问题 面向对象中最矛盾的一个特性,就是 ...
随机推荐
- PHP利用递归法获取多级类别的树状数组
数据结构:category(id, pid, name),对应:信息ID,父项ID,类别名 测试数据: $aryCate = array( array('id' => 1, 'pid' => ...
- 使用mysql_query()方法操纵数据库以及综合实例
1.利用insert 语句添加记录 <? require('conn.php'); mysql_query( "insert into lyb ( title, content, au ...
- python初学笔记(三)
Unicode字符串 字符串还有一个编码问题. 因为计算机只能处理数字,如果要处理文本,就必须先把文本转换为数字才能处理.最早的计算机在设计时采用8个比特(bit)作为一个字节 (byte),所以,一 ...
- Activity之间定时跳转
起源:很多应用在打开时,首先会加载欢迎页面,经过几秒后再跳转到主页面. 下面,我通过两种不同的方式来实现页面的定时跳转. 第一种方式: 通过Timer类的schedule方法. 实现从MainActi ...
- Android SD卡创建文件和文件夹失败
原文:Android SD卡创建文件和文件夹失败 功能需要,尝试在本地sd卡上创建文件和文件夹的时候,报错,程序崩溃. 一般情况下,是忘记给予sd卡的读写权限.但是这里面权限已经给了,还是报错. 在网 ...
- poj2521---lose money
#include <stdio.h> #include <stdlib.h> int rever(int n) { return -n; } int main() { int ...
- Python的迭代器(iterator)和生成器(constructor)
一.迭代器(iterator) 1.迭代器的概述 在Python中,for循环可以用于Python中的任何类型,包括列表.元祖等等,实际上,for循环可用于任何“可迭代对象”,这其实就是迭代器 迭代器 ...
- 07.19 Linux命令 cd
情景:在用compass编写sass,cd进入目录后,想退出, 解决: cd.. 回到上一层目录 cd\ 回到根目录 cd 进入具体目录
- poj 1149 PIGS(最大流经典构图)
题目描述:迈克在一个养猪场工作,养猪场里有M 个猪圈,每个猪圈都上了锁.由于迈克没有钥匙,所以他不能打开任何一个猪圈.要买猪的顾客一个接一个来到养猪场,每个顾客有一些猪圈的钥匙,而且他们要买一定数量的 ...
- VBA 开发学习--基础语法3
VBA字符串函数列表 Trim(string) 去掉string左右两端空白 Ltrim(string) 去掉string左端空白 Rtrim(string) 去掉string右端空白 Len(str ...