Recursive descent parsing

From CodeCodex

Implementations[edit]

C[edit]

typedef enum {ident, number, lparen, rparen, times, slash, plus,
    minus, eql, neq, lss, leq, gtr, geq, callsym, beginsym, semicolon,
    endsym, ifsym, whilesym, becomes, thensym, dosym, constsym, comma,
    varsym, procsym, period, oddsym} Symbol;

Symbol sym;
void getsym(void);
void error(const char msg[]);
void expression(void);

int accept(Symbol s) {
    if (sym == s) {
        getsym();
        return 1;
    }
    return 0;
}

int expect(Symbol s) {
    if (accept(s))
        return 1;
    error("expect: unexpected symbol");
    return 0;
}

void factor(void) {
    if (accept(ident)) {
        ;
    } else if (accept(number)) {
        ;
    } else if (accept(lparen)) {
        expression();
        expect(rparen);
    } else {
        error("factor: syntax error");
        getsym();
    }
}

void term(void) {
    factor();
    while (sym == times || sym == slash) {
        getsym();
        factor();
    }
}

void expression(void) {
    if (sym == plus || sym == minus)
        getsym();
    term();
    while (sym == plus || sym == minus) {
        getsym();
        term();
    }
}

void condition(void) {
    if (accept(oddsym)) {
        expression();
    } else {
        expression();
        if (sym == eql || sym == neq || sym == lss ||
            sym == leq || sym == gtr || sym == geq) {
            getsym();
            expression();
        } else {
            error("condition:valid operator");
            error("condition: invalid operator");
            getsym();
        }
    }
}

void statement(void) {
    if (accept(ident)) {
        expect(becomes);
        expression();
    } else if (accept(callsym)) {
        expect(ident);
    } else if (accept(beginsym)) {
        do {
            statement();
        } while (accept(semicolon));
        expect(endsym);
    } else if (accept(ifsym)) {
        condition();
        expect(thensym);
        statement();
    } else if (accept(whilesym)) {
        condition();
        expect(dosym);
        statement();
    }
}

void block(void) {
    if (accept(constsym)) {
        do {
            expect(ident);
            expect(eql);
            expect(number);
        } while (accept(comma));
        expect(semicolon);
    }
    if (accept(varsym)) {
        do {
            expect(ident);
        } while (accept(comma));
        expect(semicolon);
    }
    while (accept(procsym)) {
        expect(ident);
        expect(semicolon);
        block();
        expect(semicolon);
    }
    statement();
}

void program(void) {
    getsym();
    block();
    expect(period);
}

Java[edit]

This program illustrates a simple recursive descent parser which reads simple integer expressions in +, -, *, /, and ( ... ) and calculates the answer. The program is broken down as follows:

  • A recursive descent parser.
  • The main driver; reads from the console (not a file).
  • The scanner.
  • A class to define the allowable tokens.

Original Source:[1]

Parser.java[edit]

import java.io.*;


/* This program illustrates recursive descent parsing using a
   pure procedural approach.

   The grammar:

   statement = { expression  ";" } "."  
   expression = term { ( "+" | "-" ) term }
   term      = factor { ( "*" | "/" ) factor }
   factor    = number | "(" expression ")"
    
*/

public class Parser {

    private Scanner scanner;


public Parser(Scanner scanner) {
    this.scanner = scanner;
} // Parser


public void run ( ) {
    scanner.getToken( );
    statement( );
} // run


private void statement ( ) {
    //   statement = { expression  ";" } "."  
    while(scanner.token != Token.period) {
	int value = expression( );
	System.out.println("=> " + value);
	scanner.getToken( );  // flush ";"
    } // while
} // statement


private int expression ( ) {
    //    expression = term { ( "+" | "-" ) term }
    int left = term( );
    while (scanner.token == Token.plusop || 
	   scanner.token == Token.minusop) {
	int saveToken = scanner.token;
	scanner.getToken( );
	switch (saveToken) {
	    case Token.plusop:
		left += term( );
		break;
	    case Token.minusop:
		left -= term( );
		break;
	} // switch
    } // while
    return left;
} // expression


private int term ( ) {
    //    term = factor { ( "*" | "/" ) factor }
    int left = factor( );
    while (scanner.token == Token.timesop || 
	   scanner.token == Token.divideop) {
	int saveToken = scanner.token;
	scanner.getToken( );
	switch (saveToken) {
	    case Token.timesop:
		left *= factor( );
		break;
	    case Token.divideop:
		left /= factor( );
		break;
	} // switch
    } // while
    return left;
} // term


private int factor ( ) {
    //    factor    = number | "(" expression ")"
    int value = 0;
    switch (scanner.token) {
	case Token.number:
	    value = scanner.number( );
	    scanner.getToken( );  // flush number
	    break;
	case Token.lparen:
	    scanner.getToken( );
	    value = expression( );
	    if (scanner.token != Token.rparen)
		scanner.error("Missing ')'");
	    scanner.getToken( );  // flush ")"
	    break;
	default:
	    scanner.error("Expecting number or (");
	    break;
    } // switch
    return value;
} // factor

} // class Parser

Scanner.java[edit]

import java.io.*;
public class Scanner {
private char ch = ' ';
private char ident = ' ';
private int intValue = 0;
private Buffer buffer;
public int token;

public Scanner (DataInputStream in) {
    buffer = new Buffer(in);
    token = Token.semicolon;
} // Scanner


public int getToken ( ) {
    while (Character.isWhitespace(ch))
	ch = buffer.get( );
    if (Character.isLetter(ch)) {
	ident = Character.toLowerCase(ch);
	ch = buffer.get( );
	token = Token.letter;
    }
    else if (Character.isDigit(ch)) {
	intValue = getNumber( );
	token = Token.number;
    } 
    else {
	switch (ch) {
	    case ';' : ch = buffer.get( );
		token = Token.semicolon;
		break;

	    case '.' : ch = buffer.get( );
		token = Token.period;
		break;

	    case '+' : ch = buffer.get( );
		token = Token.plusop;
		break;

	    case '-' : ch = buffer.get( );
		token = Token.minusop;
		break;

	    case '*' : ch = buffer.get( );
		token = Token.timesop;
		break;

	    case '/' : ch = buffer.get( );
		token = Token.divideop;
		break;

	    case '=' : ch = buffer.get( );
		token = Token.assignop;
		break;

	    case '(' : ch = buffer.get( );
		token = Token.lparen;
		break;

	    case ')' : ch = buffer.get( );
		token = Token.rparen;
		break;

	    default : error ("Illegal character " + ch );
		break;
	} // switch
    } // if
    return token;
} // getToken


public int number ( ) {
    return intValue;
} // number


public char letter ( ) {
    return ident;
} // letter


public void match (int which) {
    token = getToken( );
    if (token != which) {
	error("Invalid token " + Token.toString(token) +
	      "-- expecting " + Token.toString(which));
	System.exit(1);
    } // if
} // match


public void error (String msg) {
    System.err.println(msg);
    System.exit(1);
} // error


private int getNumber ( ) {
    int rslt = 0;
    do {
	rslt = rslt * 10 + Character.digit(ch, 10);
	ch = buffer.get( );
    } while (Character.isDigit(ch));
    return rslt;
} // getNumber

} // Scanner

class Buffer {
private String line = "";
private int column = 0;
private int lineNo = 0;
private DataInputStream  in;

public Buffer (DataInputStream in) {
    this.in = in;
} // Buffer


public char get ( ) {
    column++;
    if (column >= line.length()) {
	try {
	    line = in.readLine( );
	} catch (Exception e) {
	    System.err.println("Invalid read operation");
	    System.exit(1);
	} // try
	if (line == null) 
	    System.exit(0);
	column = 0;
	lineNo++;
	System.out.println(line);
	line = line + "\n";
    } // if column
    return line.charAt(column);
} // get

} // class Buffer

Token.java[edit]

class Token {

public static final int semicolon = 0;
public static final int period    = 1;
public static final int plusop    = 2;
public static final int minusop   = 3;
public static final int timesop   = 4;
public static final int divideop  = 5;
public static final int assignop  = 6;
public static final int lparen    = 7;
public static final int rparen    = 8;
public static final int letter    = 9;
public static final int number    = 10;

private static String[] spelling = {
    ";", ".", "+", "-", "*", "/", "=", "(", ")",
    "letter", "number"};

public static String toString (int i) {
    if (i < 0 || i > number)
	return "";
    return spelling[i];
} // toString

} // Token

Test.java[edit]

import java.io.*;

public class Test {

public static void main(String args[]) {
    // command args ignored
    Parser parser = new Parser(new Scanner(
	new DataInputStream(System.in)));
    parser.run( );
    System.out.println("done");
} // main
    
} // class Test

OCaml[edit]

Recursive-descent parsers can be written in OCaml using the camlp4 stream-parsing syntax extension:

let rec factor = parser
  | [< ''('; e = expr; '')' >] -> e
  | [< 'c >] -> int_of_string (String.make 1 c)

and term = parser
  | [< e1 = factor; e = term_aux e1 >] -> e
and term_aux e1 = parser
  | [< ''*'; e2 = term >] -> e1 * e2
  | [< ''/'; e2 = term >] -> e1 / e2
  | [< >] -> e1

and expr = parser
  | [< e1 = term; e = expr_aux e1 >] -> e
and expr_aux e1 = parser
  | [< ''+'; e2 = expr >] -> e1 + e2
  | [< ''-'; e2 = expr >] -> e1 - e2
  | [< >] -> e1

let rec statement = parser
  | [< e = expr; '';'; ''.'; stream >] ->
      Printf.printf "= %d\n%!" e;
      statement stream

let rec read() = match input_char stdin with
  | ' ' | '\t' | '\n' -> [< read() >]
  | c -> [< 'c; read() >]

let () = statement (read())

For example:

$ ocamlc -g -dtypes -pp camlp4o parser.ml -o parser
$ ./parser
1+2+3;.
= 6
1+2*3;.
= 7
1+2*3+4;.
= 11