Recursive descent parsing
From CodeCodex
Contents
Implementations[edit]
C[edit]
typedef enum {ident, number, lparen, rparen, times, slash, plus, minus, eql, neq, lss, leq, gtr, geq, callsym, beginsym, semicolon, endsym, ifsym, whilesym, becomes, thensym, dosym, constsym, comma, varsym, procsym, period, oddsym} Symbol; Symbol sym; void getsym(void); void error(const char msg[]); void expression(void); int accept(Symbol s) { if (sym == s) { getsym(); return 1; } return 0; } int expect(Symbol s) { if (accept(s)) return 1; error("expect: unexpected symbol"); return 0; } void factor(void) { if (accept(ident)) { ; } else if (accept(number)) { ; } else if (accept(lparen)) { expression(); expect(rparen); } else { error("factor: syntax error"); getsym(); } } void term(void) { factor(); while (sym == times || sym == slash) { getsym(); factor(); } } void expression(void) { if (sym == plus || sym == minus) getsym(); term(); while (sym == plus || sym == minus) { getsym(); term(); } } void condition(void) { if (accept(oddsym)) { expression(); } else { expression(); if (sym == eql || sym == neq || sym == lss || sym == leq || sym == gtr || sym == geq) { getsym(); expression(); } else { error("condition:valid operator"); error("condition: invalid operator"); getsym(); } } } void statement(void) { if (accept(ident)) { expect(becomes); expression(); } else if (accept(callsym)) { expect(ident); } else if (accept(beginsym)) { do { statement(); } while (accept(semicolon)); expect(endsym); } else if (accept(ifsym)) { condition(); expect(thensym); statement(); } else if (accept(whilesym)) { condition(); expect(dosym); statement(); } } void block(void) { if (accept(constsym)) { do { expect(ident); expect(eql); expect(number); } while (accept(comma)); expect(semicolon); } if (accept(varsym)) { do { expect(ident); } while (accept(comma)); expect(semicolon); } while (accept(procsym)) { expect(ident); expect(semicolon); block(); expect(semicolon); } statement(); } void program(void) { getsym(); block(); expect(period); }
Java[edit]
This program illustrates a simple recursive descent parser which reads simple integer expressions in +, -, *, /, and ( ... ) and calculates the answer. The program is broken down as follows:
- A recursive descent parser.
- The main driver; reads from the console (not a file).
- The scanner.
- A class to define the allowable tokens.
Original Source:[1]
Parser.java[edit]
import java.io.*; /* This program illustrates recursive descent parsing using a pure procedural approach. The grammar: statement = { expression ";" } "." expression = term { ( "+" | "-" ) term } term = factor { ( "*" | "/" ) factor } factor = number | "(" expression ")" */ public class Parser { private Scanner scanner; public Parser(Scanner scanner) { this.scanner = scanner; } // Parser public void run ( ) { scanner.getToken( ); statement( ); } // run private void statement ( ) { // statement = { expression ";" } "." while(scanner.token != Token.period) { int value = expression( ); System.out.println("=> " + value); scanner.getToken( ); // flush ";" } // while } // statement private int expression ( ) { // expression = term { ( "+" | "-" ) term } int left = term( ); while (scanner.token == Token.plusop || scanner.token == Token.minusop) { int saveToken = scanner.token; scanner.getToken( ); switch (saveToken) { case Token.plusop: left += term( ); break; case Token.minusop: left -= term( ); break; } // switch } // while return left; } // expression private int term ( ) { // term = factor { ( "*" | "/" ) factor } int left = factor( ); while (scanner.token == Token.timesop || scanner.token == Token.divideop) { int saveToken = scanner.token; scanner.getToken( ); switch (saveToken) { case Token.timesop: left *= factor( ); break; case Token.divideop: left /= factor( ); break; } // switch } // while return left; } // term private int factor ( ) { // factor = number | "(" expression ")" int value = 0; switch (scanner.token) { case Token.number: value = scanner.number( ); scanner.getToken( ); // flush number break; case Token.lparen: scanner.getToken( ); value = expression( ); if (scanner.token != Token.rparen) scanner.error("Missing ')'"); scanner.getToken( ); // flush ")" break; default: scanner.error("Expecting number or ("); break; } // switch return value; } // factor } // class Parser
Scanner.java[edit]
import java.io.*; public class Scanner { private char ch = ' '; private char ident = ' '; private int intValue = 0; private Buffer buffer; public int token; public Scanner (DataInputStream in) { buffer = new Buffer(in); token = Token.semicolon; } // Scanner public int getToken ( ) { while (Character.isWhitespace(ch)) ch = buffer.get( ); if (Character.isLetter(ch)) { ident = Character.toLowerCase(ch); ch = buffer.get( ); token = Token.letter; } else if (Character.isDigit(ch)) { intValue = getNumber( ); token = Token.number; } else { switch (ch) { case ';' : ch = buffer.get( ); token = Token.semicolon; break; case '.' : ch = buffer.get( ); token = Token.period; break; case '+' : ch = buffer.get( ); token = Token.plusop; break; case '-' : ch = buffer.get( ); token = Token.minusop; break; case '*' : ch = buffer.get( ); token = Token.timesop; break; case '/' : ch = buffer.get( ); token = Token.divideop; break; case '=' : ch = buffer.get( ); token = Token.assignop; break; case '(' : ch = buffer.get( ); token = Token.lparen; break; case ')' : ch = buffer.get( ); token = Token.rparen; break; default : error ("Illegal character " + ch ); break; } // switch } // if return token; } // getToken public int number ( ) { return intValue; } // number public char letter ( ) { return ident; } // letter public void match (int which) { token = getToken( ); if (token != which) { error("Invalid token " + Token.toString(token) + "-- expecting " + Token.toString(which)); System.exit(1); } // if } // match public void error (String msg) { System.err.println(msg); System.exit(1); } // error private int getNumber ( ) { int rslt = 0; do { rslt = rslt * 10 + Character.digit(ch, 10); ch = buffer.get( ); } while (Character.isDigit(ch)); return rslt; } // getNumber } // Scanner class Buffer { private String line = ""; private int column = 0; private int lineNo = 0; private DataInputStream in; public Buffer (DataInputStream in) { this.in = in; } // Buffer public char get ( ) { column++; if (column >= line.length()) { try { line = in.readLine( ); } catch (Exception e) { System.err.println("Invalid read operation"); System.exit(1); } // try if (line == null) System.exit(0); column = 0; lineNo++; System.out.println(line); line = line + "\n"; } // if column return line.charAt(column); } // get } // class Buffer
Token.java[edit]
class Token { public static final int semicolon = 0; public static final int period = 1; public static final int plusop = 2; public static final int minusop = 3; public static final int timesop = 4; public static final int divideop = 5; public static final int assignop = 6; public static final int lparen = 7; public static final int rparen = 8; public static final int letter = 9; public static final int number = 10; private static String[] spelling = { ";", ".", "+", "-", "*", "/", "=", "(", ")", "letter", "number"}; public static String toString (int i) { if (i < 0 || i > number) return ""; return spelling[i]; } // toString } // Token
Test.java[edit]
import java.io.*; public class Test { public static void main(String args[]) { // command args ignored Parser parser = new Parser(new Scanner( new DataInputStream(System.in))); parser.run( ); System.out.println("done"); } // main } // class Test
OCaml[edit]
Recursive-descent parsers can be written in OCaml using the camlp4 stream-parsing syntax extension:
let rec factor = parser | [< ''('; e = expr; '')' >] -> e | [< 'c >] -> int_of_string (String.make 1 c) and term = parser | [< e1 = factor; e = term_aux e1 >] -> e and term_aux e1 = parser | [< ''*'; e2 = term >] -> e1 * e2 | [< ''/'; e2 = term >] -> e1 / e2 | [< >] -> e1 and expr = parser | [< e1 = term; e = expr_aux e1 >] -> e and expr_aux e1 = parser | [< ''+'; e2 = expr >] -> e1 + e2 | [< ''-'; e2 = expr >] -> e1 - e2 | [< >] -> e1 let rec statement = parser | [< e = expr; '';'; ''.'; stream >] -> Printf.printf "= %d\n%!" e; statement stream let rec read() = match input_char stdin with | ' ' | '\t' | '\n' -> [< read() >] | c -> [< 'c; read() >] let () = statement (read())
For example:
$ ocamlc -g -dtypes -pp camlp4o parser.ml -o parser $ ./parser 1+2+3;. = 6 1+2*3;. = 7 1+2*3+4;. = 11