Parsing and Syntax Analysis

Introduction

Parsing is the second phase of compilation. The parser takes tokens from the lexer and builds a structured representation (usually an Abstract Syntax Tree) according to the language's grammar. This reading covers context-free grammars, parsing algorithms, and parser implementation.

Learning Objectives

By the end of this reading, you will be able to:

  • Write context-free grammars for programming languages
  • Understand the difference between LL and LR parsing
  • Implement a recursive descent parser
  • Build Abstract Syntax Trees
  • Handle operator precedence and associativity

1. Context-Free Grammars

Formal Definition

A CFG consists of:

  • Terminals: Token types from the lexer
  • Non-terminals: Syntactic categories
  • Productions: Rules of the form A → α
  • Start symbol: The top-level non-terminal
// Simple expression grammar
expr    → term (('+' | '-') term)*
term    → factor (('*' | '/') factor)*
factor  → NUMBER | '(' expr ')' | IDENTIFIER

BNF Notation

<program>    ::= <statement>*
<statement>  ::= <if_stmt> | <while_stmt> | <assign_stmt> | <expr_stmt>
<if_stmt>    ::= 'if' <expr> ':' <block> ('else' ':' <block>)?
<while_stmt> ::= 'while' <expr> ':' <block>
<assign_stmt> ::= IDENTIFIER '=' <expr>
<expr_stmt>  ::= <expr>
<block>      ::= INDENT <statement>+ DEDENT

Grammar in Python

from dataclasses import dataclass
from typing import List, Optional, Union

# Grammar representation
@dataclass
class Production:
    """A single grammar production A → α (empty rhs means an ε-production)."""
    lhs: str  # Left-hand side (non-terminal)
    rhs: List[str]  # Right-hand side (terminals and non-terminals)

class Grammar:
    """A context-free grammar built up one production at a time.

    Symbol classification heuristic used by add_production:
      * ALL-CAPS names (e.g. 'PLUS') and single punctuation characters
        are terminals (token types from the lexer);
      * quoted literals (e.g. "'+'") are terminals;
      * everything else is a non-terminal.
    """

    def __init__(self):
        self.productions: List[Production] = []
        self.start_symbol: Optional[str] = None  # set explicitly by the caller
        self.terminals: set = set()
        self.non_terminals: set = set()

    def add_production(self, lhs: str, *rhs: str):
        """Add production lhs → rhs; call with no rhs for an ε-production."""
        self.productions.append(Production(lhs, list(rhs)))
        self.non_terminals.add(lhs)
        for symbol in rhs:
            if symbol.isupper() or symbol in '()+-*/=<>':
                self.terminals.add(symbol)
            elif symbol.startswith("'"):
                # Quoted literal tokens are terminals too.  (The previous
                # version dropped them from both sets entirely.)
                self.terminals.add(symbol)
            else:
                self.non_terminals.add(symbol)

# Example grammar: right-recursive expression grammar (left recursion
# removed via the *_rest helper non-terminals; empty tuples are ε-rules).
g = Grammar()
g.start_symbol = 'expr'
_rules = [
    ('expr', 'term', 'expr_rest'),
    ('expr_rest', 'PLUS', 'term', 'expr_rest'),
    ('expr_rest', 'MINUS', 'term', 'expr_rest'),
    ('expr_rest',),  # epsilon production
    ('term', 'factor', 'term_rest'),
    ('term_rest', 'STAR', 'factor', 'term_rest'),
    ('term_rest', 'SLASH', 'factor', 'term_rest'),
    ('term_rest',),  # epsilon
    ('factor', 'NUMBER'),
    ('factor', 'LPAREN', 'expr', 'RPAREN'),
]
for _lhs, *_rhs in _rules:
    g.add_production(_lhs, *_rhs)

2. Abstract Syntax Trees

AST Node Definitions

from dataclasses import dataclass
from typing import List, Optional, Any

# Base class for all AST nodes
@dataclass
class AST:
    pass

# Expressions

@dataclass
class NumberLiteral(AST):
    value: float  # numeric value (lexer converts the lexeme)

@dataclass
class StringLiteral(AST):
    value: str  # string contents

@dataclass
class BoolLiteral(AST):
    value: bool  # True / False

@dataclass
class Identifier(AST):
    name: str  # name as written in the source

@dataclass
class BinaryOp(AST):
    op: str  # operator lexeme, e.g. '+', '==', '<'
    left: AST
    right: AST

@dataclass
class UnaryOp(AST):
    op: str  # operator lexeme, e.g. '-'
    operand: AST

@dataclass
class Call(AST):
    callee: AST  # expression being called (often an Identifier)
    arguments: List[AST]  # argument expressions, in order

@dataclass
class Index(AST):
    obj: AST  # expression being indexed
    index: AST  # subscript expression

# Statements

@dataclass
class ExpressionStmt(AST):
    expr: AST  # expression evaluated as a statement

@dataclass
class Assignment(AST):
    target: Identifier  # variable being assigned
    value: AST

@dataclass
class IfStmt(AST):
    condition: AST
    then_branch: List[AST]
    else_branch: Optional[List[AST]]  # None when there is no else clause

@dataclass
class WhileStmt(AST):
    condition: AST
    body: List[AST]

@dataclass
class ForStmt(AST):
    var: Identifier  # loop variable
    iterable: AST
    body: List[AST]

@dataclass
class FunctionDef(AST):
    name: str
    params: List[str]  # parameter names
    body: List[AST]

@dataclass
class Return(AST):
    value: Optional[AST]  # None for a bare `return`

@dataclass
class Program(AST):
    statements: List[AST]  # top-level statements, in source order

AST Visualization

def print_ast(node: AST, indent: int = 0):
    """Pretty print an AST as an indented tree.

    Args:
        node: Root of the (sub)tree to print.
        indent: Current nesting depth; each level adds two spaces.

    Node types without a dedicated branch fall through to a generic
    one-line repr at the bottom.
    """
    prefix = "  " * indent

    if isinstance(node, NumberLiteral):
        print(f"{prefix}Number({node.value})")
    elif isinstance(node, StringLiteral):
        print(f"{prefix}String({node.value!r})")
    elif isinstance(node, BoolLiteral):
        # Previously fell through to the generic fallback.
        print(f"{prefix}Bool({node.value})")
    elif isinstance(node, Identifier):
        print(f"{prefix}Identifier({node.name})")
    elif isinstance(node, BinaryOp):
        print(f"{prefix}BinaryOp({node.op})")
        print_ast(node.left, indent + 1)
        print_ast(node.right, indent + 1)
    elif isinstance(node, UnaryOp):
        print(f"{prefix}UnaryOp({node.op})")
        print_ast(node.operand, indent + 1)
    elif isinstance(node, Call):
        print(f"{prefix}Call")
        print(f"{prefix}  callee:")
        print_ast(node.callee, indent + 2)
        print(f"{prefix}  arguments:")
        for arg in node.arguments:
            print_ast(arg, indent + 2)
    elif isinstance(node, ExpressionStmt):
        print(f"{prefix}ExpressionStmt")
        print_ast(node.expr, indent + 1)
    elif isinstance(node, Assignment):
        print(f"{prefix}Assignment")
        print(f"{prefix}  target: {node.target.name}")
        print(f"{prefix}  value:")
        print_ast(node.value, indent + 2)
    elif isinstance(node, IfStmt):
        print(f"{prefix}If")
        print(f"{prefix}  condition:")
        print_ast(node.condition, indent + 2)
        print(f"{prefix}  then:")
        for stmt in node.then_branch:
            print_ast(stmt, indent + 2)
        # `is not None` so an explicit-but-empty else branch still prints.
        if node.else_branch is not None:
            print(f"{prefix}  else:")
            for stmt in node.else_branch:
                print_ast(stmt, indent + 2)
    elif isinstance(node, WhileStmt):
        print(f"{prefix}While")
        print(f"{prefix}  condition:")
        print_ast(node.condition, indent + 2)
        print(f"{prefix}  body:")
        for stmt in node.body:
            print_ast(stmt, indent + 2)
    elif isinstance(node, Return):
        print(f"{prefix}Return")
        if node.value is not None:
            print_ast(node.value, indent + 1)
    elif isinstance(node, FunctionDef):
        print(f"{prefix}FunctionDef({node.name})")
        print(f"{prefix}  params: {node.params}")
        print(f"{prefix}  body:")
        for stmt in node.body:
            print_ast(stmt, indent + 2)
    elif isinstance(node, Program):
        print(f"{prefix}Program")
        for stmt in node.statements:
            print_ast(stmt, indent + 1)
    else:
        # Fallback for node types not handled above (e.g. ForStmt, Index).
        print(f"{prefix}{type(node).__name__}: {node}")

3. Recursive Descent Parser

Parser Structure

class Parser:
    """Recursive descent parser.

    Each grammar rule maps to one method; alternatives within a rule are
    chosen by inspecting the current token (one token of lookahead, plus
    peek(1) where a second token is needed).  parse() is the entry point
    and returns a Program node.

    NOTE: block() is deliberately simplified for this reading — it parses
    a single statement rather than tracking INDENT/DEDENT tokens.
    """

    def __init__(self, tokens: List[Token]):
        self.tokens = tokens
        self.pos = 0  # index of the current, not-yet-consumed token

    def current(self) -> Token:
        """Get current token (keeps returning the last token — EOF — at the end)."""
        if self.pos < len(self.tokens):
            return self.tokens[self.pos]
        return self.tokens[-1]  # EOF

    def peek(self, offset: int = 0) -> Token:
        """Look ahead `offset` tokens from the current position."""
        pos = self.pos + offset
        if pos < len(self.tokens):
            return self.tokens[pos]
        return self.tokens[-1]

    def advance(self) -> Token:
        """Consume current token and return it."""
        token = self.current()
        if self.pos < len(self.tokens):
            self.pos += 1
        return token

    def match(self, *types: TokenType) -> bool:
        """Check whether the current token is any of `types` (does NOT consume)."""
        return self.current().type in types

    def expect(self, token_type: TokenType) -> Token:
        """Consume a token of the expected type or raise SyntaxError."""
        if self.current().type == token_type:
            return self.advance()
        raise SyntaxError(
            f"Expected {token_type.name}, got {self.current().type.name} "
            f"at line {self.current().line}"
        )

    def check(self, token_type: TokenType) -> bool:
        """Check current token type without consuming."""
        return self.current().type == token_type

    # ===== Grammar Rules =====

    def parse(self) -> Program:
        """program → statement* EOF"""
        statements = []
        while not self.check(TokenType.EOF):
            if self.check(TokenType.NEWLINE):
                # Blank lines between statements carry no meaning.
                self.advance()
                continue
            statements.append(self.statement())
        return Program(statements)

    def statement(self) -> AST:
        """statement → if_stmt | while_stmt | func_def | return_stmt | assignment | expr_stmt"""
        if self.match(TokenType.IF):
            return self.if_statement()
        elif self.match(TokenType.WHILE):
            return self.while_statement()
        elif self.match(TokenType.DEF):
            return self.function_def()
        elif self.match(TokenType.RETURN):
            return self.return_statement()
        elif self.match(TokenType.IDENTIFIER) and self.peek(1).type == TokenType.ASSIGN:
            # Second token of lookahead distinguishes `x = ...` from the
            # bare expression statement `x`.
            return self.assignment()
        else:
            return self.expression_statement()

    def if_statement(self) -> IfStmt:
        """if_stmt → 'if' expr ':' block ('else' ':' block)?"""
        self.expect(TokenType.IF)
        condition = self.expression()
        self.expect(TokenType.COLON)
        self.expect(TokenType.NEWLINE)

        then_branch = self.block()

        else_branch = None
        if self.match(TokenType.ELSE):
            self.advance()
            self.expect(TokenType.COLON)
            self.expect(TokenType.NEWLINE)
            else_branch = self.block()

        return IfStmt(condition, then_branch, else_branch)

    def while_statement(self) -> WhileStmt:
        """while_stmt → 'while' expr ':' block"""
        self.expect(TokenType.WHILE)
        condition = self.expression()
        self.expect(TokenType.COLON)
        self.expect(TokenType.NEWLINE)
        body = self.block()
        return WhileStmt(condition, body)

    def function_def(self) -> FunctionDef:
        """func_def → 'def' IDENTIFIER '(' params? ')' ':' block"""
        self.expect(TokenType.DEF)
        name = self.expect(TokenType.IDENTIFIER).value
        self.expect(TokenType.LPAREN)

        # params → IDENTIFIER (',' IDENTIFIER)*
        params = []
        if not self.check(TokenType.RPAREN):
            params.append(self.expect(TokenType.IDENTIFIER).value)
            while self.match(TokenType.COMMA):
                self.advance()
                params.append(self.expect(TokenType.IDENTIFIER).value)

        self.expect(TokenType.RPAREN)
        self.expect(TokenType.COLON)
        self.expect(TokenType.NEWLINE)
        body = self.block()

        return FunctionDef(name, params, body)

    def return_statement(self) -> Return:
        """return_stmt → 'return' expr?"""
        self.expect(TokenType.RETURN)
        value = None
        # A bare `return` is followed immediately by NEWLINE (or EOF).
        if not self.check(TokenType.NEWLINE) and not self.check(TokenType.EOF):
            value = self.expression()
        return Return(value)

    def assignment(self) -> Assignment:
        """assignment → IDENTIFIER '=' expr"""
        name = self.expect(TokenType.IDENTIFIER)
        self.expect(TokenType.ASSIGN)
        value = self.expression()
        return Assignment(Identifier(name.value), value)

    def expression_statement(self) -> ExpressionStmt:
        """expr_stmt → expr"""
        expr = self.expression()
        return ExpressionStmt(expr)

    def block(self) -> List[AST]:
        """block → INDENT statement+ DEDENT (simplified: a single statement).

        A real implementation would consume INDENT/DEDENT tokens from the
        lexer; to keep this reading focused, the block ends after one
        statement (or immediately at `else` / EOF).
        """
        statements = []
        while not self.check(TokenType.EOF):
            if self.check(TokenType.ELSE):
                break  # `else` belongs to the enclosing if-statement
            if self.check(TokenType.NEWLINE):
                self.advance()
                continue
            statements.append(self.statement())
            # Simplification: a block is exactly one statement.
            break
        return statements

    # ===== Expressions =====
    # One method per precedence level, loosest binding first; each level
    # parses the next-tighter level as its operands, and the while-loops
    # fold repeated operators into left-associative trees.

    def expression(self) -> AST:
        """expression → comparison"""
        return self.comparison()

    def comparison(self) -> AST:
        """comparison → addition (('<' | '>' | '<=' | '>=' | '==' | '!=') addition)*"""
        left = self.addition()

        while self.match(TokenType.LT, TokenType.GT, TokenType.LEQ,
                        TokenType.GEQ, TokenType.EQ, TokenType.NEQ):
            op = self.advance().value
            right = self.addition()
            left = BinaryOp(op, left, right)

        return left

    def addition(self) -> AST:
        """addition → multiplication (('+' | '-') multiplication)*"""
        left = self.multiplication()

        while self.match(TokenType.PLUS, TokenType.MINUS):
            op = self.advance().value
            right = self.multiplication()
            left = BinaryOp(op, left, right)

        return left

    def multiplication(self) -> AST:
        """multiplication → unary (('*' | '/') unary)*"""
        left = self.unary()

        while self.match(TokenType.STAR, TokenType.SLASH):
            op = self.advance().value
            right = self.unary()
            left = BinaryOp(op, left, right)

        return left

    def unary(self) -> AST:
        """unary → '-' unary | call

        Only unary minus is implemented; a '!' operator would be handled
        the same way with an extra TokenType in the match().
        """
        if self.match(TokenType.MINUS):
            op = self.advance().value
            operand = self.unary()  # right-recursive: --x parses as -(-x)
            return UnaryOp(op, operand)

        return self.call()

    def call(self) -> AST:
        """call → primary ('(' arguments? ')')*"""
        expr = self.primary()

        # Loop so chained calls like f(1)(2) fold left-to-right.
        while True:
            if self.match(TokenType.LPAREN):
                self.advance()
                args = []
                if not self.check(TokenType.RPAREN):
                    args.append(self.expression())
                    while self.match(TokenType.COMMA):
                        self.advance()
                        args.append(self.expression())
                self.expect(TokenType.RPAREN)
                expr = Call(expr, args)
            else:
                break

        return expr

    def primary(self) -> AST:
        """primary → NUMBER | STRING | TRUE | FALSE | IDENTIFIER | '(' expr ')'"""
        if self.match(TokenType.INTEGER):
            return NumberLiteral(self.advance().value)
        elif self.match(TokenType.FLOAT):
            return NumberLiteral(self.advance().value)
        elif self.match(TokenType.STRING):
            return StringLiteral(self.advance().value)
        elif self.match(TokenType.TRUE):
            self.advance()
            return BoolLiteral(True)
        elif self.match(TokenType.FALSE):
            self.advance()
            return BoolLiteral(False)
        elif self.match(TokenType.IDENTIFIER):
            return Identifier(self.advance().value)
        elif self.match(TokenType.LPAREN):
            self.advance()
            expr = self.expression()
            self.expect(TokenType.RPAREN)
            return expr
        else:
            raise SyntaxError(
                f"Unexpected token {self.current().type.name} "
                f"at line {self.current().line}"
            )

# Example usage: lex and parse a one-line assignment, then dump the tree.
source = "x = 1 + 2 * 3"
lexer = Lexer(source)
# Strip NEWLINE tokens before handing the stream to the parser.
tokens = list(filter(lambda tok: tok.type != TokenType.NEWLINE, lexer.tokenize()))
parser = Parser(tokens)
ast = parser.parse()
print_ast(ast)

4. Operator Precedence

Precedence Climbing

class PrecedenceParser:
    """Expression parser using precedence climbing.

    A single loop replaces one grammar method per precedence tier: each
    recursive parse_expression call only accepts operators of at least
    `min_precedence`, which is how precedence and associativity are
    enforced.
    """

    # Operator precedence (higher = binds tighter)
    PRECEDENCE = {
        '==': 1, '!=': 1,
        '<': 2, '>': 2, '<=': 2, '>=': 2,
        '+': 3, '-': 3,
        '*': 4, '/': 4,
    }

    # Associativity
    RIGHT_ASSOC = set()  # Add operators like ** for right associativity

    def __init__(self, tokens):
        self.tokens = tokens
        self.pos = 0

    def current(self):
        """Current token, or None once input is exhausted."""
        return self.tokens[self.pos] if self.pos < len(self.tokens) else None

    def advance(self):
        """Consume and return the current token (None at end of input)."""
        token = self.current()
        self.pos += 1
        return token

    def parse_expression(self, min_precedence=0):
        """Parse an expression, consuming operators of precedence >= min_precedence."""
        left = self.parse_atom()

        while True:
            token = self.current()
            if token is None or token.type == TokenType.EOF:
                break

            op = token.value
            if op not in self.PRECEDENCE:
                break  # not a binary operator we know about

            prec = self.PRECEDENCE[op]
            if prec < min_precedence:
                break  # this operator belongs to an enclosing call

            self.advance()  # Consume operator

            # Right associativity: recurse with the same precedence.
            # Left associativity: recurse one higher so an equal-precedence
            # operator terminates the inner call.
            next_min = prec if op in self.RIGHT_ASSOC else prec + 1
            right = self.parse_expression(next_min)

            left = BinaryOp(op, left, right)

        return left

    def parse_atom(self):
        """Parse atomic expression: number, identifier, or parenthesized expr."""
        token = self.current()
        if token is None:
            # Previously crashed with AttributeError on token.type.
            raise SyntaxError("Unexpected end of input")

        if token.type == TokenType.INTEGER:
            self.advance()
            return NumberLiteral(token.value)
        elif token.type == TokenType.IDENTIFIER:
            self.advance()
            return Identifier(token.value)
        elif token.type == TokenType.LPAREN:
            self.advance()
            expr = self.parse_expression()
            closing = self.current()
            if closing is None or closing.type != TokenType.RPAREN:
                # Previously an unconditional advance() that silently
                # swallowed whatever token happened to follow.
                raise SyntaxError("Expected ')' to close parenthesized expression")
            self.advance()
            return expr

        raise SyntaxError(f"Unexpected token: {token}")

Pratt Parser

class PrattParser:
    """Top-down operator precedence parser (Pratt parser).

    Parsing behaviour is supplied from outside as "parselets": prefix
    parselets handle tokens that can begin an expression, infix parselets
    handle operators and carry a binding precedence.
    """

    def __init__(self, tokens):
        self.tokens = tokens
        self.pos = 0

        # Prefix parselets (for literals, identifiers, unary operators)
        self.prefix_parselets = {}

        # Infix parselets (for binary operators), stored per token type
        # as (precedence, parselet) pairs.
        self.infix_parselets = {}

    def register_prefix(self, token_type, parselet):
        """Register a parselet for a token type that starts an expression."""
        self.prefix_parselets[token_type] = parselet

    def register_infix(self, token_type, precedence, parselet):
        """Register an operator parselet together with its binding precedence."""
        self.infix_parselets[token_type] = (precedence, parselet)

    def current(self):
        """Current token, or None once input is exhausted."""
        try:
            return self.tokens[self.pos]
        except IndexError:
            return None

    def advance(self):
        """Consume and return the current token (None at end of input)."""
        consumed = self.current()
        self.pos += 1
        return consumed

    def get_precedence(self):
        """Binding precedence of the current token; 0 if it is not an infix operator."""
        token = self.current()
        if not token:
            return 0
        entry = self.infix_parselets.get(token.type)
        return entry[0] if entry is not None else 0

    def parse_expression(self, precedence=0):
        """Parse an expression, consuming operators that bind tighter than `precedence`."""
        token = self.advance()

        prefix = self.prefix_parselets.get(token.type)
        if prefix is None:
            raise SyntaxError(f"Cannot parse {token}")

        left = prefix(self, token)

        while precedence < self.get_precedence():
            token = self.advance()
            _, infix = self.infix_parselets[token.type]
            left = infix(self, left, token)

        return left

# Setup Pratt parser
def setup_pratt_parser(tokens):
    """Configure a PrattParser with arithmetic prefix/infix parselets."""
    parser = PrattParser(tokens)

    # Prefix parselets
    def number(p, t):
        return NumberLiteral(t.value)

    def identifier(p, t):
        return Identifier(t.value)

    def grouping(p, t):
        inner = p.parse_expression()
        p.advance()  # consume RPAREN
        return inner

    def negate(p, t):
        return UnaryOp('-', p.parse_expression(5))  # High precedence

    parser.register_prefix(TokenType.INTEGER, number)
    parser.register_prefix(TokenType.IDENTIFIER, identifier)
    parser.register_prefix(TokenType.LPAREN, grouping)
    parser.register_prefix(TokenType.MINUS, negate)

    # Infix parselets (binary operators)
    def binary_op(p, left, token):
        prec, _ = p.infix_parselets[token.type]
        right = p.parse_expression(prec)
        return BinaryOp(token.value, left, right)

    for op_type, op_prec in ((TokenType.PLUS, 1), (TokenType.MINUS, 1),
                             (TokenType.STAR, 2), (TokenType.SLASH, 2)):
        parser.register_infix(op_type, op_prec, binary_op)

    return parser

5. Error Handling and Recovery

class ParserError(Exception):
    """Parse error that remembers the offending token for later reporting."""

    def __init__(self, message, token):
        super().__init__(f"{message} at line {token.line}:{token.column}")
        self.message = message
        self.token = token

class RecoveringParser(Parser):
    """Parser that records syntax errors and keeps going instead of aborting."""

    def __init__(self, tokens):
        super().__init__(tokens)
        self.errors = []

    def error(self, message):
        """Record a ParserError at the current token and return it."""
        err = ParserError(message, self.current())
        self.errors.append(err)
        return err

    def synchronize(self):
        """Skip tokens until a likely statement boundary.

        Stops just after a NEWLINE or just before a statement-starting
        keyword, so parsing can resume on the next statement.
        """
        self.advance()

        boundary_keywords = (
            TokenType.IF, TokenType.WHILE, TokenType.FOR,
            TokenType.DEF, TokenType.RETURN, TokenType.CLASS
        )
        while not self.check(TokenType.EOF):
            if self.peek(-1).type == TokenType.NEWLINE:
                return
            if self.current().type in boundary_keywords:
                return
            self.advance()

    def statement(self):
        """Parse one statement; on SyntaxError, record it and resynchronize."""
        try:
            return super().statement()
        except SyntaxError as exc:
            self.error(str(exc))
            self.synchronize()
            return None

    def parse(self):
        """Parse the whole program, collecting every error encountered."""
        statements = []
        while not self.check(TokenType.EOF):
            if self.check(TokenType.NEWLINE):
                self.advance()
                continue
            stmt = self.statement()
            if stmt:
                statements.append(stmt)

        if self.errors:
            print(f"Parsing completed with {len(self.errors)} error(s):")
            for error in self.errors:
                print(f"  {error}")

        return Program(statements)

Exercises

Basic

  1. Write a grammar for a simple calculator with +, -, *, /, and parentheses.

  2. Extend the parser to handle:

    • Array literals: [1, 2, 3]
    • Dictionary literals: {"key": value}
  3. Add support for `for` loops with the syntax `for x in iterable:`.

Intermediate

  1. Implement a parser for JSON that builds an AST.

  2. Add error recovery that can continue parsing after errors.

  3. Implement operator precedence parsing (Pratt parser) for a full expression grammar.

Advanced

  1. Write an LR(0) parser generator.

  2. Implement a parser for a language with significant whitespace (like Python).

  3. Add support for operator overloading and custom operators.


Summary

  • CFGs describe language syntax with productions
  • ASTs represent program structure for further processing
  • Recursive descent is intuitive and maps directly to grammar
  • Precedence climbing handles operator precedence cleanly
  • Pratt parsing provides flexibility for complex expressions
  • Error recovery allows reporting multiple errors per parse

Next Reading

Semantic Analysis and Type Checking →