开篇
所谓词法分析,就是将源代码按照构词规则分解成一系列单词符号。单词是语言中具有独立意义的最小单位,包括关键字、标识符、运算符、界符和常量等。
分析
源程序分解的单词可以归为以下几类:
1. 关键字
c语言关键字有不少,这里只列举其中一部分:
for,while,do,continue,if,else,
char,int,double,return
2. 变量名
变量名的规则为:
由字母数字以及下划线组成,开头必须为下划线或字母
用正规表达式描述为
letter = {a|b|...|z|A|B|...|Z}
digit = {0|1|...|9}
id = (letter|_ )(letter| _| digit)*
转化为NFA图为:
3. 数字常量
变量名的规则为:
由字母数字以及下划线组成,开头必须为下划线或字母
用正规表达式描述为
digit = {0|1|...|9}
id = digit((digit)*|.(digit)*)(ℇ | (E|e)digit(digit)*)
转化为NFA图为:
4. 界符和运算符
()[]{};+-*/等等
本人这里只加入了+-和=,因为*/以及&^%等都是类似的,这里只是试验,不需要全部写。
思路
根据有限自动机的概念,将整个源代码分解的规则转化为一个一个状态,用switch或if/else将状态之间的转移描述出来即可。这里省略了将常量以及变量名插入到符号表返回指针的那一步。
简单一个伪代码描述这个过程
初始化 状态=0switch(状态):{case 0:接收源码if(符合1规则)处理,状态=1else if(符合2规则)处理,状态=2else...case 1:case 2:case ...}
代码
#include <iostream>
#include <algorithm>
#include <vector>
#include <stack>
#include <cstdlib>
#include <cstdio>
using namespace std;const int KEY_NUM = 11;
const char* KEY_SET[] = {"for","while","do","continue","if","else","char","int","double","float","return"
};const char* FILE_NAME = "/Users/shiyi/abc.c";
char strBuffer[1026];
char strBox[200];void retract(char* &str)
{str--;
}char getChar(char* &str)
{return *str++;
}bool isNum(char ch)
{if(ch >= '0' && ch <= '9')return true;return false;
}bool isLetter(char ch)
{if(ch >= 'a' && ch <= 'z')return true;if(ch >= 'A' && ch <= 'Z')return true;return false;
}int findKey(char* ch)
{for(int i=0; i<KEY_NUM; i++){if(0 == strcmp(ch, KEY_SET[i]))return i+1;}return 0;
}void output(char* key, char* value)
{printf("< %s, %s >\n", key, value);
}//扫描
void scanner(char* str)
{int state = 0;char ch = ' ';int pos = 0;while(ch != '\0'){switch (state) {case 0:{ch = getChar(str);switch (ch) {case ' ':pos = 0;break;case '[':case ']':case '(':case ')':case '{':case '}': {char temp[2];temp[0] = ch;output(temp, "-");pos = 0;break;}case '\'': {state = 0;while((ch = getChar(str)) != '\'' && ch != '\0'){strBox[pos++] = ch;}if(ch == '\0'){//error}strBox[pos] = '\0';output("string", strBox);pos = 0;break;}case '"': {state = 0;while((ch = getChar(str)) != '"' && ch != '\0'){strBox[pos++] = ch;}if(ch == '\0'){output("error", strBox);}else{strBox[pos] = '\0';output("string", strBox);pos = 0;}break;}case '+': {state = 0;ch = getChar(str);switch (ch) {case '+':output("++", "-");pos = 0;break;case '=':output("+=", "-");pos = 0;break;default:retract(str);output("+", "-");pos = 0;break;}}case '-': {state = 0;ch = getChar(str);switch (ch) {case '-':output("--", "-");pos = 0;break;case '=':output("-=", "-");pos = 0;break;default:retract(str);output("-", "-");pos = 0;break;}}case '=': {state = 0;ch = getChar(str);switch (ch) {case '=':output("==", "-");pos = 0;break;default:retract(str);output("=", "-");pos = 0;break;}break;}default: {if(isNum(ch)){strBox[pos++] = ch;state = 2;}else if(isLetter(ch) || ch == '_'){strBox[pos++] = ch;state = 1;}break;}}break;}case 1:{while(true){ch = getChar(str);if(isLetter(ch) || isNum(ch) || ch == '_')strBox[pos++] = ch;else{strBox[pos] = '\0';int id = findKey(strBox);if(id == 0)output("id", strBox);elseoutput("key", strBox);retract(str);pos = 0;state = 0;break;}}break;}case 2:{while(true){ch = getChar(str);if(isNum(ch))strBox[pos++] = ch;else if(ch == '.'){strBox[pos++] = ch;state = 3;break;}else if(ch == 'E' || ch == 'e'){strBox[pos++] = ch;state = 4;break;}else{strBox[pos] = '\0';output("number", strBox);retract(str);pos = 0;state = 0;break;}}break;}case 3:{while(true){ch = getChar(str);if(isNum(ch))strBox[pos++] = ch;else if(ch == 'E' || ch == 'e'){strBox[pos++] = ch;state = 4;}else{strBox[pos] = '\0';output("number", strBox);retract(str);pos = 0;state = 0;break;}}break;}case 4:{while(true){ch = getChar(str);if(isNum(ch))strBox[pos++] = ch;else{strBox[pos] = '\0';output("number", strBox);retract(str);pos = 0;state = 0;break;}}break;}default:output("error", strBox);break;}}
}int main()
{FILE* file = fopen(FILE_NAME, "r");while(NULL != fgets(strBuffer, 1024, file)){scanner(strBuffer);}return 0;
}
测试结果
源程序
int main()
{int a = 5;for(int i=0; i<5; i++) {a++;}return 0;
}
输出结果
< key, int >
< id, main >
< (, - >
< ), - >
< {, - >
< key, int >
< id, a >
< =, - >
< number, 5 >
< key, for >
< (, - >
< key, int >
< id, i >
< =, - >
< number, 0 >
< id, i >
< number, 5 >
< id, i >
< ++, - >
< -, - >
< =, - >
< ), - >
< {, - >
< id, a >
< ++, - >
< -, - >
< =, - >
< }, - >
< key, return >
< number, 0 >
< }, - >
Program ended with exit code: 0