Scanning and Parsing C Code
#pragma once
#include
#include
#include
#include
#include
#include
#include
using namespace std;
// Number of entries in RESERVED_WORDS and RVALS.
#define N_RESERVED_WORDS 9

// Token kinds stored in LexicalAnalyzer::Token. TYPE and PRINT are indexed by
// these values, so the three lists must stay in the same order.
enum {
    // reserved words (same order as RESERVED_WORDS / RVALS)
    ift, elset, whilet, floatt, integert, chart, breakt, continuet, voidt,
    // operator classes
    addopt, mulopt, assignopt, relopt,
    // punctuation
    lparent, rparent, lbracet, rbracet, lbrkt, rbrkt, commat, semit, periodt,
    // literals, identifiers and special markers
    valuet, valuert, idt, stringt, eoft, unknownt
};

// Enumerator name of every token kind, indexed by token value.
// The tables are `static const` so that including this header from several
// translation units does not produce duplicate definitions.
static const char *const TYPE[] = {
    "ift", "elset", "whilet", "floatt", "integert", "chart", "breakt", "continuet", "voidt",
    "addopt", "mulopt", "assignopt", "relopt",
    "lparent", "rparent", "lbracet", "rbracet", "lbrkt", "rbrkt", "commat", "semit", "periodt",
    "valuet", "valuert", "idt", "stringt", "eoft", "unknownt"
};

// Human readable text of every token kind, indexed by token value.
// These strings are written with operator<<, not printf, so '%' is not escaped.
static const char *const PRINT[] = {
    "if", "else", "while", "float", "int", "char", "break", "continue", "void",
    "+, - or ||", "*, /, % or &&", "=", "==, !=, <, <=, > or >=",
    "(", ")", "{", "}", "[", "]", ",", ";", ".",
    "valuet", "valuert", "idt", "stringt", "eoft", "unknownt"
};

// Spelling of each reserved word and, at the same index, its token kind.
static const char *const RESERVED_WORDS[] = {"if","else","while","float","int","char","break","continue","void"};
static const int RVALS[] = {ift,elset,whilet,floatt,integert,chart,breakt,continuet,voidt};

// class used to perform the lexical analysis
//
// The whole source file is loaded into memory by the constructor; every call
// to GetNextToken() then scans one token and publishes it through the public
// members Token, Lexeme, Value, ValueR and Literal. Lexical errors are reported
// on standard output and terminate the program with exit status 1.
class LexicalAnalyzer {
private:
    std::string code; // variable to hold the code to analyze
    int pos;          // current character position in code

    // Text used to describe a token kind; values outside the PRINT table are
    // described like unknownt instead of indexing out of bounds.
    static const char *tokenText(int symbol)
    {
        if (symbol < 0 || symbol > unknownt)
            return PRINT[unknownt];
        return PRINT[symbol];
    }

public:
    int nline;           // current line number in code (1-based)
    int Token;           // kind of the current token (one of the enum values above)
    std::string Lexeme;  // source text of the current token
    int Value;           // value of the last valuet token
    float ValueR;        // value of the last valuert token
    std::string Literal; // text of the last stringt token, surrounding quotes included

    // Loads the file called `filename` and positions the scanner at its start.
    // No token is available until GetNextToken() has been called once.
    LexicalAnalyzer(const std::string &filename)
        : pos(0), nline(1), Token(unknownt), Value(0), ValueR(0.0f)
    {
        code = loadFile(filename);
    }

    // function that returns true if the given character is an ASCII letter
    bool isLetter(char c) const
    {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    }

    // function that returns true if the given character is an ASCII digit
    bool isDigit(char c) const
    {
        return c >= '0' && c <= '9';
    }

    // get the next token in the code file
    //
    // Runs a small state machine over the characters starting at `pos`.
    // Positive states mean "inside a token", negative states mean "token
    // finished" and select how the token is published after the loop.
    // Once the end of the input has been reported, every further call keeps
    // reporting eoft.
    void GetNextToken()
    {
        const int length = static_cast<int>(code.length());

        // The end-of-file token has already been delivered: deliver it again
        // instead of scanning (and slicing the buffer) past its end.
        if (pos > length)
        {
            Token = eoft;
            Lexeme.clear();
            return;
        }

        int start = pos;     // index of the first character of the token
        int state = 0;
        int type = unknownt; // token kind for single character symbols

        while (state >= 0 && pos <= length)
        {
            // Position `length` acts as a terminating '\0' that marks end of file.
            const char ch = (pos < length) ? code[pos] : '\0';
            pos++;
            switch (state)
            {
            case 0: // initial state: skip blanks and classify the first character
                start = pos - 1;
                if (isLetter(ch))
                    state = 1;
                else if (isDigit(ch))
                    state = 2;
                else
                {
                    switch (ch)
                    {
                    case '/': state = 5; break;
                    case '=': state = 8; break;
                    case '!': state = 9; break;
                    case '<': state = 9; break;
                    case '>': state = 9; break;
                    case '+':
                    case '-': state = 10; break;
                    case '|': state = 11; break;
                    case '*': state = 12; break;
                    case '%': state = 12; break;
                    case '&': state = 13; break;
                    case '(': type = lparent; state = 14; break;
                    case ')': type = rparent; state = 14; break;
                    case '{': type = lbracet; state = 14; break;
                    case '}': type = rbracet; state = 14; break;
                    case '[': type = lbrkt; state = 14; break;
                    case ']': type = rbrkt; state = 14; break;
                    case ',': type = commat; state = 14; break;
                    case ';': type = semit; state = 14; break;
                    case '.': type = periodt; state = 14; break;
                    case '"': state = 15; break;
                    case '\n': nline++; break;
                    case ' ':
                    case '\t':
                    case '\r': break; // '\r' so that CRLF line endings are accepted
                    case 0: state = -100; break; // end of file
                    default:
                        std::cout << "Error: invalid character " << ch << " in line " << nline << std::endl;
                        std::exit(1);
                    }
                }
                break;
            case 1: // idt
                if (!isLetter(ch) && !isDigit(ch) && ch != '_')
                {
                    state = -1; // end reading token
                    pos--;      // try same char again
                }
                break;
            case 2: // digits
                if (!isDigit(ch))
                {
                    if (ch != '.')
                    {
                        state = -2;
                        pos--; // try same char again
                    }
                    else
                        state = 3;
                }
                break;
            case 3: // just after the decimal point: at least one digit is required
                if (!isDigit(ch))
                {
                    std::cout << "Error: unexpected character after . in line " << nline << std::endl;
                    std::exit(1);
                }
                else
                    state = 4;
                break;
            case 4: // real
                if (!isDigit(ch))
                {
                    state = -3;
                    pos--; // try same char again
                }
                break;
            case 5: // after '/': either a comment opener or the division operator
                if (ch == '*')
                    state = 6;
                else
                {
                    state = 12; // mulop /
                    pos--;      // try same char again
                }
                break;
            case 6: // comment
                if (ch == '\n')
                    nline++; // keep line numbers right across multi-line comments
                else if (ch == '*') // first char of ending comment
                    state = 7;
                break;
            case 7: // inside a comment, right after a '*'
                if (ch == '/') // end of comment
                    state = 0;
                else if (ch != '*') // another '*' may still be followed by '/'
                {
                    if (ch == '\n')
                        nline++;
                    state = 6;
                }
                break;
            case 8: // after '='
                if (ch == '=')
                    state = -4; // relop ==
                else
                {
                    type = assignopt;
                    state = -5; // assignop =
                    pos--;      // try same char again
                }
                break;
            case 9: // relop starting with '!', '<' or '>'
                if (ch == '=')
                    state = -4;
                else
                {
                    // '<' and '>' are operators on their own, '!' is not
                    if (code[start] == '!')
                    {
                        std::cout << "Error: invalid character ! in line " << nline << std::endl;
                        std::exit(1);
                    }
                    state = -4;
                    pos--;
                }
                break;
            case 10: // addop
                pos--; // try same char again
                state = -6;
                break;
            case 11: // after '|'
                if (ch == '|')
                    state = -6; // addop ||
                else
                {
                    std::cout << "Error: invalid character | in line " << nline << std::endl;
                    std::exit(1);
                }
                break;
            case 12: // mulop
                pos--; // try same char again
                state = -7;
                break;
            case 13: // after '&'
                if (ch == '&')
                    state = -7; // mulop &&
                else
                {
                    std::cout << "Error: invalid character & in line " << nline << std::endl;
                    std::exit(1);
                }
                break;
            case 14: // other symbols
                pos--;      // try same char again
                state = -8; // symbol
                break;
            case 15: // string literal
                if (ch == '"')
                    state = -9;
                else if (ch == '\n')
                {
                    std::cout << "Error: missing closing \" for string in line " << nline << std::endl;
                    std::exit(1);
                }
                break;
            }
        }

        // The input ended in the middle of a token, comment or string.
        if (state > 0)
        {
            std::cout << "Error: unexpected end of file found in line " << nline << std::endl;
            std::exit(1);
        }

        Lexeme = code.substr(start, pos - start);
        switch (state)
        {
        case -1: // idt
            Token = idt;
            // NOTE(review): this lookup was reconstructed from a damaged source
            // line; it maps a reserved word to its own token kind. Confirm
            // against the original file.
            for (int i = 0; i < N_RESERVED_WORDS; i++)
            {
                if (Lexeme == RESERVED_WORDS[i])
                {
                    Token = RVALS[i];
                    break;
                }
            }
            if (Lexeme.length() > 27)
            {
                std::cout << "Error: identifier " << Lexeme << " in line " << nline << " is too long." << std::endl;
                std::exit(1);
            }
            break;
        case -2: // integer
            Token = valuet;
            Value = std::atoi(Lexeme.c_str());
            break;
        case -3: // real
            Token = valuert;
            ValueR = static_cast<float>(std::atof(Lexeme.c_str()));
            break;
        case -4: // relop
            Token = relopt;
            break;
        case -5: // assignop
            Token = assignopt;
            break;
        case -6: // addop
            Token = addopt;
            break;
        case -7: // mulop
            Token = mulopt;
            break;
        case -8: // symbol
            Token = type;
            break;
        case -9: // string
            Token = stringt;
            Literal = Lexeme; // keeps the opening and closing quotes
            break;
        case -100: // eof
            Token = eoft;
            break;
        }
    }

    // read all the file and return it as a string
    // Prints an error and exits with status 1 when the file cannot be opened.
    std::string loadFile(const std::string &filename)
    {
        std::ifstream input(filename.c_str());
        if (!input.is_open())
        {
            std::cout << "Error opening file: " << filename << std::endl;
            std::exit(1);
        }
        std::stringstream ss;
        ss << input.rdbuf();
        return ss.str();
    }

    // Returns the text used in messages for the current token: the value of a
    // number, the text of a string or identifier, "EOF", or the PRINT entry of
    // the token kind.
    std::string printCurrentToken()
    {
        std::stringstream ss;
        switch (Token)
        {
        case valuet:
            ss << Value;
            break;
        case valuert:
            ss << ValueR;
            break;
        case stringt:
            ss << Literal; // Literal already carries its quotes; do not add more
            break;
        case eoft:
            ss << "EOF";
            break;
        case idt:
            ss << Lexeme;
            break;
        default:
            ss << tokenText(Token);
        }
        return ss.str();
    }

    // Returns the text used in messages for the token kind `symbol`
    // (for example when reporting which symbol was expected).
    std::string printToken(int symbol)
    {
        std::stringstream ss;
        switch (symbol)
        {
        case valuet:
            ss << "Integer Value";
            break;
        case valuert:
            ss << "Real Value";
            break;
        case stringt:
            ss << "Literal";
            break;
        case eoft:
            ss << "EOF";
            break;
        case idt:
            ss << "Identifier";
            break;
        default:
            ss << tokenText(symbol);
        }
        return ss.str();
    }
};
#include "LexicalAnalyzer.h"
#include "Parser.h"
// main program
int main()
{
string filename;
cout << "Please enter the name of the file to load: ";
getline(cin,filename);
LexicalAnalyzer lex(filename); // create lexical analyzer
Parser parser(lex); // create parser
parser.parse(); // do the parsing
return 0;
}
#include
#include
// Recursive parser class
class Parser
{
private:
LexicalAnalyzer lex;
// if the current symbol is equal to the symbol needed, return true
// and advance to next symbol else return false
bool accept(int symbol)
{
if(lex.Token==symbol)
{
lex.GetNextToken();
return true;
}
else
return false;
}
// if the current symbol is equal to the expected symbol argument, return true
// and advance to next symbol else generates an error
void expect(int symbol)
{
if(!accept(symbol))
{
cout << "Error: unexpected symbol: " << lex.printCurrentToken() << " in line " << lex.nline << ", expected "<< lex.printToken(symbol) << endl;
exit(1);
}
}
// parse for the TYPE part from the grammar
bool parseType()
{
if(lex.Token==floatt || lex.Token==integert || lex.Token==chart)
{
lex.GetNextToken();
return true;
}
else
return false;
}
// parse for the PARAMTAIL part from the grammar
void parseParamTail()
{
if(accept(commat))
{
if(parseType())
{
expect(idt);
parseParamTail();
}
else
{
cout << "Error: unexpected symbol: " << lex.printCurrentToken() << " in line " << lex.nline << ", unknown type" << endl;
exit(1);
}
}
}
// parse for the PARAMLIST part from the grammar
void parseParamList()
{
if(parseType())
{
expect(idt);
parseParamTail();
}
}
// parse for the STAT_LIST part from the grammar
void parseStatList()
{
//empty
}
// parse for the RET_STAT part from the grammar
void parseRetStat()
{
//empty
}
// parse for the IDTAIL part from the grammar
void parseIdTail()
{
if(accept(commat))
{
expect(idt);
parseIdTail();
}
}
// parse for the IDLIST part from the grammar
void parseIdList()
{
expect(idt);
parseIdTail();
expect(semit);
parseDecl();
}
// parse for the DECL part from the grammar
void parseDecl()
{
if(parseType())
parseIdList();
}
// parse for the COMPOUND part from the grammar
void parseCompound()
{
expect(lbracet);
parseDecl();
parseStatList();
parseRetStat();
expect(rbracet);
}
// parse for the REST part from the grammar
void parseRest()
{
if(accept(lparent))
{
parseParamList();
expect(rparent);
parseCompound();
}
else
{
parseIdTail();
expect(semit);
parseProgram();
}
}
// parse for the PROG part from the grammar
void parseProgram()
{
if(parseType())
{
expect(idt);
parseRest();
parseProgram();
}
}
public:
// constructor for the class saves the lexical analyzer for using it in the parsing
Parser(LexicalAnalyzer Lexa) : lex(Lexa)
{
}
// parse is the main function used to start parsing the program in the lexer
void parse()
{
lex.GetNextToken();
parseProgram();
if(lex.Token==eoft)
cout << "Program parsing ended successfully." << endl;
else
cout << "Error: unexpected symbol: " << lex.printCurrentToken() << " in line " << lex.nline << ", expected end of file" << endl;
}
};