Parsing and Syntax Analysis

Introduction

Parsing is the second phase of compilation. The parser takes tokens from the lexer and builds a structured representation (usually an Abstract Syntax Tree) according to the language's grammar. This reading covers context-free grammars, parsing algorithms, and parser implementation.

Learning Objectives

By the end of this reading, you will be able to:

  • Write context-free grammars for programming languages
  • Understand the difference between LL and LR parsing
  • Implement a recursive descent parser
  • Build Abstract Syntax Trees
  • Handle operator precedence and associativity

1. Context-Free Grammars

Formal Definition

A CFG consists of:

  • Terminals: Token types from the lexer
  • Non-terminals: Syntactic categories
  • Productions: Rules of the form A → α
  • Start symbol: The top-level non-terminal
// Simple expression grammar
expr    → term (('+' | '-') term)*
term    → factor (('*' | '/') factor)*
factor  → NUMBER | '(' expr ')' | IDENTIFIER

BNF Notation

<program>    ::= <statement>*
<statement>  ::= <if_stmt> | <while_stmt> | <assign_stmt> | <expr_stmt>
<if_stmt>    ::= 'if' <expr> ':' <block> ('else' ':' <block>)?
<while_stmt> ::= 'while' <expr> ':' <block>
<assign_stmt> ::= IDENTIFIER '=' <expr>
<expr_stmt>  ::= <expr>
<block>      ::= INDENT <statement>+ DEDENT

Grammar in Python

from dataclasses import dataclass
from typing import List, Optional, Union

# Grammar representation
@dataclass
class Production:
    """A single grammar production A → α (empty rhs means an ε-production)."""
    lhs: str  # Left-hand side (non-terminal)
    rhs: List[str]  # Right-hand side (terminals and non-terminals)

class Grammar:
    """A context-free grammar built up one production at a time.

    Symbol classification heuristic used by add_production:
      * ALL-CAPS names (e.g. 'PLUS') and single punctuation characters
        are terminals (token types from the lexer);
      * quoted literals (e.g. "'+'") are terminals;
      * everything else is a non-terminal.
    """

    def __init__(self):
        self.productions: List[Production] = []
        self.start_symbol: Optional[str] = None  # set explicitly by the caller
        self.terminals: set = set()
        self.non_terminals: set = set()

    def add_production(self, lhs: str, *rhs: str):
        """Add production lhs → rhs; call with no rhs for an ε-production."""
        self.productions.append(Production(lhs, list(rhs)))
        self.non_terminals.add(lhs)
        for symbol in rhs:
            if symbol.isupper() or symbol in '()+-*/=<>':
                self.terminals.add(symbol)
            elif symbol.startswith("'"):
                # Quoted literal tokens are terminals too.  (The previous
                # version dropped them from both sets entirely.)
                self.terminals.add(symbol)
            else:
                self.non_terminals.add(symbol)

# Example grammar: right-recursive expression grammar (left recursion
# removed via the *_rest helper non-terminals; empty tuples are ε-rules).
g = Grammar()
g.start_symbol = 'expr'
_rules = [
    ('expr', 'term', 'expr_rest'),
    ('expr_rest', 'PLUS', 'term', 'expr_rest'),
    ('expr_rest', 'MINUS', 'term', 'expr_rest'),
    ('expr_rest',),  # epsilon production
    ('term', 'factor', 'term_rest'),
    ('term_rest', 'STAR', 'factor', 'term_rest'),
    ('term_rest', 'SLASH', 'factor', 'term_rest'),
    ('term_rest',),  # epsilon
    ('factor', 'NUMBER'),
    ('factor', 'LPAREN', 'expr', 'RPAREN'),
]
for _lhs, *_rhs in _rules:
    g.add_production(_lhs, *_rhs)

2. Abstract Syntax Trees

AST Node Definitions

from dataclasses import dataclass
from typing import List, Optional, Any

# Base class for all AST nodes
@dataclass
class AST:
    pass

# Expressions

@dataclass
class NumberLiteral(AST):
    value: float  # numeric value (lexer converts the lexeme)

@dataclass
class StringLiteral(AST):
    value: str  # string contents

@dataclass
class BoolLiteral(AST):
    value: bool  # True / False

@dataclass
class Identifier(AST):
    name: str  # name as written in the source

@dataclass
class BinaryOp(AST):
    op: str  # operator lexeme, e.g. '+', '==', '<'
    left: AST
    right: AST

@dataclass
class UnaryOp(AST):
    op: str  # operator lexeme, e.g. '-'
    operand: AST

@dataclass
class Call(AST):
    callee: AST  # expression being called (often an Identifier)
    arguments: List[AST]  # argument expressions, in order

@dataclass
class Index(AST):
    obj: AST  # expression being indexed
    index: AST  # subscript expression

# Statements

@dataclass
class ExpressionStmt(AST):
    expr: AST  # expression evaluated as a statement

@dataclass
class Assignment(AST):
    target: Identifier  # variable being assigned
    value: AST

@dataclass
class IfStmt(AST):
    condition: AST
    then_branch: List[AST]
    else_branch: Optional[List[AST]]  # None when there is no else clause

@dataclass
class WhileStmt(AST):
    condition: AST
    body: List[AST]

@dataclass
class ForStmt(AST):
    var: Identifier  # loop variable
    iterable: AST
    body: List[AST]

@dataclass
class FunctionDef(AST):
    name: str
    params: List[str]  # parameter names
    body: List[AST]

@dataclass
class Return(AST):
    value: Optional[AST]  # None for a bare `return`

@dataclass
class Program(AST):
    statements: List[AST]  # top-level statements, in source order

AST Visualization

def print_ast(node: AST, indent: int = 0):
    """Pretty print an AST as an indented tree.

    Args:
        node: Root of the (sub)tree to print.
        indent: Current nesting depth; each level adds two spaces.

    Node types without a dedicated branch fall through to a generic
    one-line repr at the bottom.
    """
    prefix = "  " * indent

    if isinstance(node, NumberLiteral):
        print(f"{prefix}Number({node.value})")
    elif isinstance(node, StringLiteral):
        print(f"{prefix}String({node.value!r})")
    elif isinstance(node, BoolLiteral):
        # Previously fell through to the generic fallback.
        print(f"{prefix}Bool({node.value})")
    elif isinstance(node, Identifier):
        print(f"{prefix}Identifier({node.name})")
    elif isinstance(node, BinaryOp):
        print(f"{prefix}BinaryOp({node.op})")
        print_ast(node.left, indent + 1)
        print_ast(node.right, indent + 1)
    elif isinstance(node, UnaryOp):
        print(f"{prefix}UnaryOp({node.op})")
        print_ast(node.operand, indent + 1)
    elif isinstance(node, Call):
        print(f"{prefix}Call")
        print(f"{prefix}  callee:")
        print_ast(node.callee, indent + 2)
        print(f"{prefix}  arguments:")
        for arg in node.arguments:
            print_ast(arg, indent + 2)
    elif isinstance(node, ExpressionStmt):
        print(f"{prefix}ExpressionStmt")
        print_ast(node.expr, indent + 1)
    elif isinstance(node, Assignment):
        print(f"{prefix}Assignment")
        print(f"{prefix}  target: {node.target.name}")
        print(f"{prefix}  value:")
        print_ast(node.value, indent + 2)
    elif isinstance(node, IfStmt):
        print(f"{prefix}If")
        print(f"{prefix}  condition:")
        print_ast(node.condition, indent + 2)
        print(f"{prefix}  then:")
        for stmt in node.then_branch:
            print_ast(stmt, indent + 2)
        # `is not None` so an explicit-but-empty else branch still prints.
        if node.else_branch is not None:
            print(f"{prefix}  else:")
            for stmt in node.else_branch:
                print_ast(stmt, indent + 2)
    elif isinstance(node, WhileStmt):
        print(f"{prefix}While")
        print(f"{prefix}  condition:")
        print_ast(node.condition, indent + 2)
        print(f"{prefix}  body:")
        for stmt in node.body:
            print_ast(stmt, indent + 2)
    elif isinstance(node, Return):
        print(f"{prefix}Return")
        if node.value is not None:
            print_ast(node.value, indent + 1)
    elif isinstance(node, FunctionDef):
        print(f"{prefix}FunctionDef({node.name})")
        print(f"{prefix}  params: {node.params}")
        print(f"{prefix}  body:")
        for stmt in node.body:
            print_ast(stmt, indent + 2)
    elif isinstance(node, Program):
        print(f"{prefix}Program")
        for stmt in node.statements:
            print_ast(stmt, indent + 1)
    else:
        # Fallback for node types not handled above (e.g. ForStmt, Index).
        print(f"{prefix}{type(node).__name__}: {node}")

3. Recursive Descent Parser

Parser Structure

class Parser:
    """Recursive descent parser.

    Each grammar rule maps to one method; alternatives within a rule are
    chosen by inspecting the current token (one token of lookahead, plus
    peek(1) where a second token is needed).  parse() is the entry point
    and returns a Program node.

    NOTE: block() is deliberately simplified for this reading — it parses
    a single statement rather than tracking INDENT/DEDENT tokens.
    """

    def __init__(self, tokens: List[Token]):
        self.tokens = tokens
        self.pos = 0  # index of the current, not-yet-consumed token

    def current(self) -> Token:
        """Get current token (keeps returning the last token — EOF — at the end)."""
        if self.pos < len(self.tokens):
            return self.tokens[self.pos]
        return self.tokens[-1]  # EOF

    def peek(self, offset: int = 0) -> Token:
        """Look ahead `offset` tokens from the current position."""
        pos = self.pos + offset
        if pos < len(self.tokens):
            return self.tokens[pos]
        return self.tokens[-1]

    def advance(self) -> Token:
        """Consume current token and return it."""
        token = self.current()
        if self.pos < len(self.tokens):
            self.pos += 1
        return token

    def match(self, *types: TokenType) -> bool:
        """Check whether the current token is any of `types` (does NOT consume)."""
        return self.current().type in types

    def expect(self, token_type: TokenType) -> Token:
        """Consume a token of the expected type or raise SyntaxError."""
        if self.current().type == token_type:
            return self.advance()
        raise SyntaxError(
            f"Expected {token_type.name}, got {self.current().type.name} "
            f"at line {self.current().line}"
        )

    def check(self, token_type: TokenType) -> bool:
        """Check current token type without consuming."""
        return self.current().type == token_type

    # ===== Grammar Rules =====

    def parse(self) -> Program:
        """program → statement* EOF"""
        statements = []
        while not self.check(TokenType.EOF):
            if self.check(TokenType.NEWLINE):
                # Blank lines between statements carry no meaning.
                self.advance()
                continue
            statements.append(self.statement())
        return Program(statements)

    def statement(self) -> AST:
        """statement → if_stmt | while_stmt | func_def | return_stmt | assignment | expr_stmt"""
        if self.match(TokenType.IF):
            return self.if_statement()
        elif self.match(TokenType.WHILE):
            return self.while_statement()
        elif self.match(TokenType.DEF):
            return self.function_def()
        elif self.match(TokenType.RETURN):
            return self.return_statement()
        elif self.match(TokenType.IDENTIFIER) and self.peek(1).type == TokenType.ASSIGN:
            # Second token of lookahead distinguishes `x = ...` from the
            # bare expression statement `x`.
            return self.assignment()
        else:
            return self.expression_statement()

    def if_statement(self) -> IfStmt:
        """if_stmt → 'if' expr ':' block ('else' ':' block)?"""
        self.expect(TokenType.IF)
        condition = self.expression()
        self.expect(TokenType.COLON)
        self.expect(TokenType.NEWLINE)

        then_branch = self.block()

        else_branch = None
        if self.match(TokenType.ELSE):
            self.advance()
            self.expect(TokenType.COLON)
            self.expect(TokenType.NEWLINE)
            else_branch = self.block()

        return IfStmt(condition, then_branch, else_branch)

    def while_statement(self) -> WhileStmt:
        """while_stmt → 'while' expr ':' block"""
        self.expect(TokenType.WHILE)
        condition = self.expression()
        self.expect(TokenType.COLON)
        self.expect(TokenType.NEWLINE)
        body = self.block()
        return WhileStmt(condition, body)

    def function_def(self) -> FunctionDef:
        """func_def → 'def' IDENTIFIER '(' params? ')' ':' block"""
        self.expect(TokenType.DEF)
        name = self.expect(TokenType.IDENTIFIER).value
        self.expect(TokenType.LPAREN)

        # params → IDENTIFIER (',' IDENTIFIER)*
        params = []
        if not self.check(TokenType.RPAREN):
            params.append(self.expect(TokenType.IDENTIFIER).value)
            while self.match(TokenType.COMMA):
                self.advance()
                params.append(self.expect(TokenType.IDENTIFIER).value)

        self.expect(TokenType.RPAREN)
        self.expect(TokenType.COLON)
        self.expect(TokenType.NEWLINE)
        body = self.block()

        return FunctionDef(name, params, body)

    def return_statement(self) -> Return:
        """return_stmt → 'return' expr?"""
        self.expect(TokenType.RETURN)
        value = None
        # A bare `return` is followed immediately by NEWLINE (or EOF).
        if not self.check(TokenType.NEWLINE) and not self.check(TokenType.EOF):
            value = self.expression()
        return Return(value)

    def assignment(self) -> Assignment:
        """assignment → IDENTIFIER '=' expr"""
        name = self.expect(TokenType.IDENTIFIER)
        self.expect(TokenType.ASSIGN)
        value = self.expression()
        return Assignment(Identifier(name.value), value)

    def expression_statement(self) -> ExpressionStmt:
        """expr_stmt → expr"""
        expr = self.expression()
        return ExpressionStmt(expr)

    def block(self) -> List[AST]:
        """block → INDENT statement+ DEDENT (simplified: a single statement).

        A real implementation would consume INDENT/DEDENT tokens from the
        lexer; to keep this reading focused, the block ends after one
        statement (or immediately at `else` / EOF).
        """
        statements = []
        while not self.check(TokenType.EOF):
            if self.check(TokenType.ELSE):
                break  # `else` belongs to the enclosing if-statement
            if self.check(TokenType.NEWLINE):
                self.advance()
                continue
            statements.append(self.statement())
            # Simplification: a block is exactly one statement.
            break
        return statements

    # ===== Expressions =====
    # One method per precedence level, loosest binding first; each level
    # parses the next-tighter level as its operands, and the while-loops
    # fold repeated operators into left-associative trees.

    def expression(self) -> AST:
        """expression → comparison"""
        return self.comparison()

    def comparison(self) -> AST:
        """comparison → addition (('<' | '>' | '<=' | '>=' | '==' | '!=') addition)*"""
        left = self.addition()

        while self.match(TokenType.LT, TokenType.GT, TokenType.LEQ,
                        TokenType.GEQ, TokenType.EQ, TokenType.NEQ):
            op = self.advance().value
            right = self.addition()
            left = BinaryOp(op, left, right)

        return left

    def addition(self) -> AST:
        """addition → multiplication (('+' | '-') multiplication)*"""
        left = self.multiplication()

        while self.match(TokenType.PLUS, TokenType.MINUS):
            op = self.advance().value
            right = self.multiplication()
            left = BinaryOp(op, left, right)

        return left

    def multiplication(self) -> AST:
        """multiplication → unary (('*' | '/') unary)*"""
        left = self.unary()

        while self.match(TokenType.STAR, TokenType.SLASH):
            op = self.advance().value
            right = self.unary()
            left = BinaryOp(op, left, right)

        return left

    def unary(self) -> AST:
        """unary → '-' unary | call

        Only unary minus is implemented; a '!' operator would be handled
        the same way with an extra TokenType in the match().
        """
        if self.match(TokenType.MINUS):
            op = self.advance().value
            operand = self.unary()  # right-recursive: --x parses as -(-x)
            return UnaryOp(op, operand)

        return self.call()

    def call(self) -> AST:
        """call → primary ('(' arguments? ')')*"""
        expr = self.primary()

        # Loop so chained calls like f(1)(2) fold left-to-right.
        while True:
            if self.match(TokenType.LPAREN):
                self.advance()
                args = []
                if not self.check(TokenType.RPAREN):
                    args.append(self.expression())
                    while self.match(TokenType.COMMA):
                        self.advance()
                        args.append(self.expression())
                self.expect(TokenType.RPAREN)
                expr = Call(expr, args)
            else:
                break

        return expr

    def primary(self) -> AST:
        """primary → NUMBER | STRING | TRUE | FALSE | IDENTIFIER | '(' expr ')'"""
        if self.match(TokenType.INTEGER):
            return NumberLiteral(self.advance().value)
        elif self.match(TokenType.FLOAT):
            return NumberLiteral(self.advance().value)
        elif self.match(TokenType.STRING):
            return StringLiteral(self.advance().value)
        elif self.match(TokenType.TRUE):
            self.advance()
            return BoolLiteral(True)
        elif self.match(TokenType.FALSE):
            self.advance()
            return BoolLiteral(False)
        elif self.match(TokenType.IDENTIFIER):
            return Identifier(self.advance().value)
        elif self.match(TokenType.LPAREN):
            self.advance()
            expr = self.expression()
            self.expect(TokenType.RPAREN)
            return expr
        else:
            raise SyntaxError(
                f"Unexpected token {self.current().type.name} "
                f"at line {self.current().line}"
            )

# Example usage: lex and parse a one-line assignment, then dump the tree.
source = "x = 1 + 2 * 3"
lexer = Lexer(source)
# Strip NEWLINE tokens before handing the stream to the parser.
tokens = list(filter(lambda tok: tok.type != TokenType.NEWLINE, lexer.tokenize()))
parser = Parser(tokens)
ast = parser.parse()
print_ast(ast)

4. Operator Precedence

Precedence Climbing

class PrecedenceParser:
    """Expression parser using precedence climbing.

    A single loop replaces one grammar method per precedence tier: each
    recursive parse_expression call only accepts operators of at least
    `min_precedence`, which is how precedence and associativity are
    enforced.
    """

    # Operator precedence (higher = binds tighter)
    PRECEDENCE = {
        '==': 1, '!=': 1,
        '<': 2, '>': 2, '<=': 2, '>=': 2,
        '+': 3, '-': 3,
        '*': 4, '/': 4,
    }

    # Associativity
    RIGHT_ASSOC = set()  # Add operators like ** for right associativity

    def __init__(self, tokens):
        self.tokens = tokens
        self.pos = 0

    def current(self):
        """Current token, or None once input is exhausted."""
        return self.tokens[self.pos] if self.pos < len(self.tokens) else None

    def advance(self):
        """Consume and return the current token (None at end of input)."""
        token = self.current()
        self.pos += 1
        return token

    def parse_expression(self, min_precedence=0):
        """Parse an expression, consuming operators of precedence >= min_precedence."""
        left = self.parse_atom()

        while True:
            token = self.current()
            if token is None or token.type == TokenType.EOF:
                break

            op = token.value
            if op not in self.PRECEDENCE:
                break  # not a binary operator we know about

            prec = self.PRECEDENCE[op]
            if prec < min_precedence:
                break  # this operator belongs to an enclosing call

            self.advance()  # Consume operator

            # Right associativity: recurse with the same precedence.
            # Left associativity: recurse one higher so an equal-precedence
            # operator terminates the inner call.
            next_min = prec if op in self.RIGHT_ASSOC else prec + 1
            right = self.parse_expression(next_min)

            left = BinaryOp(op, left, right)

        return left

    def parse_atom(self):
        """Parse atomic expression: number, identifier, or parenthesized expr."""
        token = self.current()
        if token is None:
            # Previously crashed with AttributeError on token.type.
            raise SyntaxError("Unexpected end of input")

        if token.type == TokenType.INTEGER:
            self.advance()
            return NumberLiteral(token.value)
        elif token.type == TokenType.IDENTIFIER:
            self.advance()
            return Identifier(token.value)
        elif token.type == TokenType.LPAREN:
            self.advance()
            expr = self.parse_expression()
            closing = self.current()
            if closing is None or closing.type != TokenType.RPAREN:
                # Previously an unconditional advance() that silently
                # swallowed whatever token happened to follow.
                raise SyntaxError("Expected ')' to close parenthesized expression")
            self.advance()
            return expr

        raise SyntaxError(f"Unexpected token: {token}")

Pratt Parser

class PrattParser:
    """Top-down operator precedence parser (Pratt parser).

    Parsing behaviour is supplied from outside as "parselets": prefix
    parselets handle tokens that can begin an expression, infix parselets
    handle operators and carry a binding precedence.
    """

    def __init__(self, tokens):
        self.tokens = tokens
        self.pos = 0

        # Prefix parselets (for literals, identifiers, unary operators)
        self.prefix_parselets = {}

        # Infix parselets (for binary operators), stored per token type
        # as (precedence, parselet) pairs.
        self.infix_parselets = {}

    def register_prefix(self, token_type, parselet):
        """Register a parselet for a token type that starts an expression."""
        self.prefix_parselets[token_type] = parselet

    def register_infix(self, token_type, precedence, parselet):
        """Register an operator parselet together with its binding precedence."""
        self.infix_parselets[token_type] = (precedence, parselet)

    def current(self):
        """Current token, or None once input is exhausted."""
        try:
            return self.tokens[self.pos]
        except IndexError:
            return None

    def advance(self):
        """Consume and return the current token (None at end of input)."""
        consumed = self.current()
        self.pos += 1
        return consumed

    def get_precedence(self):
        """Binding precedence of the current token; 0 if it is not an infix operator."""
        token = self.current()
        if not token:
            return 0
        entry = self.infix_parselets.get(token.type)
        return entry[0] if entry is not None else 0

    def parse_expression(self, precedence=0):
        """Parse an expression, consuming operators that bind tighter than `precedence`."""
        token = self.advance()

        prefix = self.prefix_parselets.get(token.type)
        if prefix is None:
            raise SyntaxError(f"Cannot parse {token}")

        left = prefix(self, token)

        while precedence < self.get_precedence():
            token = self.advance()
            _, infix = self.infix_parselets[token.type]
            left = infix(self, left, token)

        return left

# Setup Pratt parser
def setup_pratt_parser(tokens):
    """Configure a PrattParser with arithmetic prefix/infix parselets."""
    parser = PrattParser(tokens)

    # Prefix parselets
    def number(p, t):
        return NumberLiteral(t.value)

    def identifier(p, t):
        return Identifier(t.value)

    def grouping(p, t):
        inner = p.parse_expression()
        p.advance()  # consume RPAREN
        return inner

    def negate(p, t):
        return UnaryOp('-', p.parse_expression(5))  # High precedence

    parser.register_prefix(TokenType.INTEGER, number)
    parser.register_prefix(TokenType.IDENTIFIER, identifier)
    parser.register_prefix(TokenType.LPAREN, grouping)
    parser.register_prefix(TokenType.MINUS, negate)

    # Infix parselets (binary operators)
    def binary_op(p, left, token):
        prec, _ = p.infix_parselets[token.type]
        right = p.parse_expression(prec)
        return BinaryOp(token.value, left, right)

    for op_type, op_prec in ((TokenType.PLUS, 1), (TokenType.MINUS, 1),
                             (TokenType.STAR, 2), (TokenType.SLASH, 2)):
        parser.register_infix(op_type, op_prec, binary_op)

    return parser

5. Error Handling and Recovery

class ParserError(Exception):
    """Parse error that remembers the offending token for later reporting."""

    def __init__(self, message, token):
        super().__init__(f"{message} at line {token.line}:{token.column}")
        self.message = message
        self.token = token

class RecoveringParser(Parser):
    """Parser that records syntax errors and keeps going instead of aborting."""

    def __init__(self, tokens):
        super().__init__(tokens)
        self.errors = []

    def error(self, message):
        """Record a ParserError at the current token and return it."""
        err = ParserError(message, self.current())
        self.errors.append(err)
        return err

    def synchronize(self):
        """Skip tokens until a likely statement boundary.

        Stops just after a NEWLINE or just before a statement-starting
        keyword, so parsing can resume on the next statement.
        """
        self.advance()

        boundary_keywords = (
            TokenType.IF, TokenType.WHILE, TokenType.FOR,
            TokenType.DEF, TokenType.RETURN, TokenType.CLASS
        )
        while not self.check(TokenType.EOF):
            if self.peek(-1).type == TokenType.NEWLINE:
                return
            if self.current().type in boundary_keywords:
                return
            self.advance()

    def statement(self):
        """Parse one statement; on SyntaxError, record it and resynchronize."""
        try:
            return super().statement()
        except SyntaxError as exc:
            self.error(str(exc))
            self.synchronize()
            return None

    def parse(self):
        """Parse the whole program, collecting every error encountered."""
        statements = []
        while not self.check(TokenType.EOF):
            if self.check(TokenType.NEWLINE):
                self.advance()
                continue
            stmt = self.statement()
            if stmt:
                statements.append(stmt)

        if self.errors:
            print(f"Parsing completed with {len(self.errors)} error(s):")
            for error in self.errors:
                print(f"  {error}")

        return Program(statements)

Exercises

Basic

  1. Write a grammar for a simple calculator with +, -, *, /, and parentheses.

  2. Extend the parser to handle:

    • Array literals: [1, 2, 3]
    • Dictionary literals: {"key": value}
  3. Add support for `for` loops with the syntax `for x in iterable:`.

Intermediate

  1. Implement a parser for JSON that builds an AST.

  2. Add error recovery that can continue parsing after errors.

  3. Implement operator precedence parsing (Pratt parser) for a full expression grammar.

Advanced

  1. Write an LR(0) parser generator.

  2. Implement a parser for a language with significant whitespace (like Python).

  3. Add support for operator overloading and custom operators.


Summary

  • CFGs describe language syntax with productions
  • ASTs represent program structure for further processing
  • Recursive descent is intuitive and maps directly to grammar
  • Precedence climbing handles operator precedence cleanly
  • Pratt parsing provides flexibility for complex expressions
  • Error recovery allows reporting multiple errors per parse

Next Reading

Semantic Analysis and Type Checking →