Parsing and Syntax Analysis
Introduction
Parsing is the second phase of compilation. The parser takes tokens from the lexer and builds a structured representation (usually an Abstract Syntax Tree) according to the language's grammar. This reading covers context-free grammars, parsing algorithms, and parser implementation.
Learning Objectives
By the end of this reading, you will be able to:
- Write context-free grammars for programming languages
- Understand the difference between LL and LR parsing
- Implement a recursive descent parser
- Build Abstract Syntax Trees
- Handle operator precedence and associativity
1. Context-Free Grammars
Formal Definition
A CFG consists of:
- Terminals: Token types from the lexer
- Non-terminals: Syntactic categories
- Productions: Rules of the form A → α
- Start symbol: The top-level non-terminal
// Simple expression grammar
expr → term (('+' | '-') term)*
term → factor (('*' | '/') factor)*
factor → NUMBER | '(' expr ')' | IDENTIFIER
BNF Notation
<program> ::= <statement>*
<statement> ::= <if_stmt> | <while_stmt> | <assign_stmt> | <expr_stmt>
<if_stmt> ::= 'if' <expr> ':' <block> ('else' ':' <block>)?
<while_stmt> ::= 'while' <expr> ':' <block>
<assign_stmt> ::= IDENTIFIER '=' <expr>
<expr_stmt> ::= <expr>
<block> ::= INDENT <statement>+ DEDENT
Grammar in Python
from dataclasses import dataclass
from typing import List, Optional, Union
# Grammar representation
@dataclass
class Production:
    """A single grammar production of the form ``lhs -> rhs``.

    An empty ``rhs`` represents an epsilon (empty) production.
    """

    lhs: str        # left-hand side: the non-terminal being defined
    rhs: List[str]  # right-hand side: sequence of terminals and non-terminals
class Grammar:
    """A context-free grammar: a list of productions plus the terminal and
    non-terminal symbol sets, classified heuristically as productions are added.
    """

    def __init__(self):
        self.productions: List[Production] = []
        self.start_symbol: Optional[str] = None  # set by the caller
        self.terminals: set = set()
        self.non_terminals: set = set()

    def add_production(self, lhs: str, *rhs: str):
        """Add the production ``lhs -> rhs`` and classify its symbols.

        Calling with no ``rhs`` symbols adds an epsilon production.
        Classification heuristic: ALL-CAPS token names, single operator
        characters, and quoted literals (e.g. "'('") are terminals;
        everything else is a non-terminal.
        """
        self.productions.append(Production(lhs, list(rhs)))
        self.non_terminals.add(lhs)
        for symbol in rhs:
            # Fix: quoted literal symbols were previously dropped entirely
            # (neither terminal nor non-terminal); they are terminals.
            if symbol.isupper() or symbol in '()+-*/=<>' or symbol.startswith("'"):
                self.terminals.add(symbol)
            else:
                self.non_terminals.add(symbol)
# Example grammar: the classic expression grammar rewritten without left
# recursion (using *_rest non-terminals), which makes it suitable for LL
# (top-down) parsing. NUMBER/PLUS/MINUS/STAR/SLASH/LPAREN/RPAREN are tokens.
g = Grammar()
g.start_symbol = 'expr'
# expr -> term expr_rest
g.add_production('expr', 'term', 'expr_rest')
# expr_rest -> PLUS term expr_rest | MINUS term expr_rest | epsilon
g.add_production('expr_rest', 'PLUS', 'term', 'expr_rest')
g.add_production('expr_rest', 'MINUS', 'term', 'expr_rest')
g.add_production('expr_rest') # epsilon production
# term -> factor term_rest
g.add_production('term', 'factor', 'term_rest')
# term_rest -> STAR factor term_rest | SLASH factor term_rest | epsilon
g.add_production('term_rest', 'STAR', 'factor', 'term_rest')
g.add_production('term_rest', 'SLASH', 'factor', 'term_rest')
g.add_production('term_rest') # epsilon
# factor -> NUMBER | LPAREN expr RPAREN
g.add_production('factor', 'NUMBER')
g.add_production('factor', 'LPAREN', 'expr', 'RPAREN')
2. Abstract Syntax Trees
AST Node Definitions
from dataclasses import dataclass
from typing import List, Optional, Any
# ---- AST node definitions -------------------------------------------------


@dataclass
class AST:
    """Common base class for every abstract syntax tree node."""
    pass


# ---- Expression nodes -----------------------------------------------------

@dataclass
class NumberLiteral(AST):
    """A numeric literal, e.g. ``42`` or ``3.14``."""
    value: float


@dataclass
class StringLiteral(AST):
    """A string literal, e.g. ``"hello"``."""
    value: str


@dataclass
class BoolLiteral(AST):
    """A boolean literal: ``True`` or ``False``."""
    value: bool


@dataclass
class Identifier(AST):
    """A reference to a name, e.g. a variable."""
    name: str


@dataclass
class BinaryOp(AST):
    """A binary operation ``left op right``, e.g. ``a + b``."""
    op: str
    left: AST
    right: AST


@dataclass
class UnaryOp(AST):
    """A prefix unary operation ``op operand``, e.g. ``-x``."""
    op: str
    operand: AST


@dataclass
class Call(AST):
    """A function call ``callee(arguments...)``."""
    callee: AST
    arguments: List[AST]


@dataclass
class Index(AST):
    """A subscript expression ``obj[index]``."""
    obj: AST
    index: AST


# ---- Statement nodes ------------------------------------------------------

@dataclass
class ExpressionStmt(AST):
    """An expression used as a statement (its value is discarded)."""
    expr: AST


@dataclass
class Assignment(AST):
    """An assignment ``target = value``."""
    target: Identifier
    value: AST


@dataclass
class IfStmt(AST):
    """A conditional with an optional else branch."""
    condition: AST
    then_branch: List[AST]
    else_branch: Optional[List[AST]]


@dataclass
class WhileStmt(AST):
    """A ``while condition:`` loop."""
    condition: AST
    body: List[AST]


@dataclass
class ForStmt(AST):
    """A ``for var in iterable:`` loop."""
    var: Identifier
    iterable: AST
    body: List[AST]


@dataclass
class FunctionDef(AST):
    """A function definition with positional parameters only."""
    name: str
    params: List[str]
    body: List[AST]


@dataclass
class Return(AST):
    """A ``return`` statement; ``value`` is None for a bare return."""
    value: Optional[AST]


@dataclass
class Program(AST):
    """The root node: the whole program as a statement list."""
    statements: List[AST]
AST Visualization
def print_ast(node: AST, indent: int = 0):
    """Pretty print an AST as an indented tree.

    Recursively walks the tree, indenting one level per nesting depth.
    Node types without a dedicated branch fall through to a generic
    "TypeName: repr" line.
    """
    prefix = " " * indent
    if isinstance(node, NumberLiteral):
        print(f"{prefix}Number({node.value})")
    elif isinstance(node, StringLiteral):
        print(f"{prefix}String({node.value!r})")
    elif isinstance(node, BoolLiteral):
        # Fix: previously fell through to the generic branch.
        print(f"{prefix}Bool({node.value})")
    elif isinstance(node, Identifier):
        print(f"{prefix}Identifier({node.name})")
    elif isinstance(node, BinaryOp):
        print(f"{prefix}BinaryOp({node.op})")
        print_ast(node.left, indent + 1)
        print_ast(node.right, indent + 1)
    elif isinstance(node, UnaryOp):
        print(f"{prefix}UnaryOp({node.op})")
        print_ast(node.operand, indent + 1)
    elif isinstance(node, Call):
        print(f"{prefix}Call")
        print(f"{prefix} callee:")
        print_ast(node.callee, indent + 2)
        print(f"{prefix} arguments:")
        for arg in node.arguments:
            print_ast(arg, indent + 2)
    elif isinstance(node, Assignment):
        print(f"{prefix}Assignment")
        print(f"{prefix} target: {node.target.name}")
        print(f"{prefix} value:")
        print_ast(node.value, indent + 2)
    elif isinstance(node, IfStmt):
        print(f"{prefix}If")
        print(f"{prefix} condition:")
        print_ast(node.condition, indent + 2)
        print(f"{prefix} then:")
        for stmt in node.then_branch:
            print_ast(stmt, indent + 2)
        if node.else_branch:
            print(f"{prefix} else:")
            for stmt in node.else_branch:
                print_ast(stmt, indent + 2)
    elif isinstance(node, WhileStmt):
        # Fix: WhileStmt is defined above but had no dedicated printer.
        print(f"{prefix}While")
        print(f"{prefix} condition:")
        print_ast(node.condition, indent + 2)
        print(f"{prefix} body:")
        for stmt in node.body:
            print_ast(stmt, indent + 2)
    elif isinstance(node, Return):
        # Fix: Return is defined above but had no dedicated printer.
        print(f"{prefix}Return")
        if node.value is not None:
            print_ast(node.value, indent + 1)
    elif isinstance(node, FunctionDef):
        print(f"{prefix}FunctionDef({node.name})")
        print(f"{prefix} params: {node.params}")
        print(f"{prefix} body:")
        for stmt in node.body:
            print_ast(stmt, indent + 2)
    elif isinstance(node, Program):
        print(f"{prefix}Program")
        for stmt in node.statements:
            print_ast(stmt, indent + 1)
    else:
        print(f"{prefix}{type(node).__name__}: {node}")
3. Recursive Descent Parser
Parser Structure
class Parser:
    """Recursive descent parser: one method per grammar rule.

    Expects the token list to end with an EOF token (the last token is
    returned once input is exhausted).
    """

    def __init__(self, tokens: List[Token]):
        self.tokens = tokens
        self.pos = 0

    # ===== Token stream helpers =====

    def current(self) -> Token:
        """Get current token (the final EOF token once input is exhausted)."""
        if self.pos < len(self.tokens):
            return self.tokens[self.pos]
        return self.tokens[-1]  # EOF

    def peek(self, offset: int = 0) -> Token:
        """Look ahead by ``offset`` tokens without consuming anything."""
        pos = self.pos + offset
        if pos < len(self.tokens):
            return self.tokens[pos]
        return self.tokens[-1]

    def advance(self) -> Token:
        """Consume current token and return it (stays put at end of input)."""
        token = self.current()
        if self.pos < len(self.tokens):
            self.pos += 1
        return token

    def match(self, *types: TokenType) -> bool:
        """Check if current token matches any of the given types."""
        return self.current().type in types

    def expect(self, token_type: TokenType) -> Token:
        """Consume token of expected type or raise SyntaxError."""
        if self.current().type == token_type:
            return self.advance()
        raise SyntaxError(
            f"Expected {token_type.name}, got {self.current().type.name} "
            f"at line {self.current().line}"
        )

    def check(self, token_type: TokenType) -> bool:
        """Check current token type without consuming."""
        return self.current().type == token_type

    # ===== Grammar Rules =====

    def parse(self) -> Program:
        """program → statement* EOF"""
        statements = []
        while not self.check(TokenType.EOF):
            # Blank lines between statements are skipped.
            if self.check(TokenType.NEWLINE):
                self.advance()
                continue
            statements.append(self.statement())
        return Program(statements)

    def statement(self) -> AST:
        """statement → if_stmt | while_stmt | func_def | assignment | expr_stmt"""
        if self.match(TokenType.IF):
            return self.if_statement()
        elif self.match(TokenType.WHILE):
            return self.while_statement()
        elif self.match(TokenType.DEF):
            return self.function_def()
        elif self.match(TokenType.RETURN):
            return self.return_statement()
        elif self.match(TokenType.IDENTIFIER) and self.peek(1).type == TokenType.ASSIGN:
            # Two-token lookahead distinguishes "x = ..." from an expression
            # statement that merely starts with an identifier.
            return self.assignment()
        else:
            return self.expression_statement()

    def if_statement(self) -> IfStmt:
        """if_stmt → 'if' expr ':' block ('else' ':' block)?"""
        self.expect(TokenType.IF)
        condition = self.expression()
        self.expect(TokenType.COLON)
        self.expect(TokenType.NEWLINE)
        then_branch = self.block()
        else_branch = None
        if self.match(TokenType.ELSE):
            self.advance()
            self.expect(TokenType.COLON)
            self.expect(TokenType.NEWLINE)
            else_branch = self.block()
        return IfStmt(condition, then_branch, else_branch)

    def while_statement(self) -> WhileStmt:
        """while_stmt → 'while' expr ':' block"""
        self.expect(TokenType.WHILE)
        condition = self.expression()
        self.expect(TokenType.COLON)
        self.expect(TokenType.NEWLINE)
        body = self.block()
        return WhileStmt(condition, body)

    def function_def(self) -> FunctionDef:
        """func_def → 'def' IDENTIFIER '(' params? ')' ':' block"""
        self.expect(TokenType.DEF)
        name = self.expect(TokenType.IDENTIFIER).value
        self.expect(TokenType.LPAREN)
        params = []
        if not self.check(TokenType.RPAREN):
            params.append(self.expect(TokenType.IDENTIFIER).value)
            while self.match(TokenType.COMMA):
                self.advance()
                params.append(self.expect(TokenType.IDENTIFIER).value)
        self.expect(TokenType.RPAREN)
        self.expect(TokenType.COLON)
        self.expect(TokenType.NEWLINE)
        body = self.block()
        return FunctionDef(name, params, body)

    def return_statement(self) -> Return:
        """return_stmt → 'return' expr?"""
        self.expect(TokenType.RETURN)
        value = None
        # A bare "return" is followed immediately by NEWLINE or EOF.
        if not self.check(TokenType.NEWLINE) and not self.check(TokenType.EOF):
            value = self.expression()
        return Return(value)

    def assignment(self) -> Assignment:
        """assignment → IDENTIFIER '=' expr"""
        name = self.expect(TokenType.IDENTIFIER)
        self.expect(TokenType.ASSIGN)
        value = self.expression()
        return Assignment(Identifier(name.value), value)

    def expression_statement(self) -> ExpressionStmt:
        """expr_stmt → expr"""
        expr = self.expression()
        return ExpressionStmt(expr)

    def block(self) -> List[AST]:
        """block → INDENT statement+ DEDENT (simplified).

        The lexer used here emits no INDENT/DEDENT tokens, so the real
        block extent cannot be detected. This simplified version skips
        newlines, stops before an 'else', and otherwise consumes exactly
        one statement per block.
        """
        statements = []
        while not self.check(TokenType.EOF):
            if self.check(TokenType.ELSE):
                break
            if self.check(TokenType.NEWLINE):
                self.advance()
                continue
            statements.append(self.statement())
            # Simplified: one statement per block.
            break
        return statements

    # ===== Expressions =====

    def expression(self) -> AST:
        """expression → comparison"""
        return self.comparison()

    def comparison(self) -> AST:
        """comparison → addition (('<' | '>' | '<=' | '>=' | '==' | '!=') addition)*"""
        left = self.addition()
        while self.match(TokenType.LT, TokenType.GT, TokenType.LEQ,
                         TokenType.GEQ, TokenType.EQ, TokenType.NEQ):
            op = self.advance().value
            right = self.addition()
            # Left-associative: fold each new operand into the left subtree.
            left = BinaryOp(op, left, right)
        return left

    def addition(self) -> AST:
        """addition → multiplication (('+' | '-') multiplication)*"""
        left = self.multiplication()
        while self.match(TokenType.PLUS, TokenType.MINUS):
            op = self.advance().value
            right = self.multiplication()
            left = BinaryOp(op, left, right)
        return left

    def multiplication(self) -> AST:
        """multiplication → unary (('*' | '/') unary)*"""
        left = self.unary()
        while self.match(TokenType.STAR, TokenType.SLASH):
            op = self.advance().value
            right = self.unary()
            left = BinaryOp(op, left, right)
        return left

    def unary(self) -> AST:
        """unary → '-' unary | call

        (Docstring fixed: the original claimed '!' was accepted, but only
        unary minus is implemented.)
        """
        if self.match(TokenType.MINUS):
            op = self.advance().value
            operand = self.unary()  # right-recursive: supports --x
            return UnaryOp(op, operand)
        return self.call()

    def call(self) -> AST:
        """call → primary ('(' arguments? ')')*

        Loops so chained calls like f(1)(2) parse as nested Call nodes.
        """
        expr = self.primary()
        while True:
            if self.match(TokenType.LPAREN):
                self.advance()
                args = []
                if not self.check(TokenType.RPAREN):
                    args.append(self.expression())
                    while self.match(TokenType.COMMA):
                        self.advance()
                        args.append(self.expression())
                self.expect(TokenType.RPAREN)
                expr = Call(expr, args)
            else:
                break
        return expr

    def primary(self) -> AST:
        """primary → NUMBER | STRING | TRUE | FALSE | IDENTIFIER | '(' expr ')'"""
        if self.match(TokenType.INTEGER):
            return NumberLiteral(self.advance().value)
        elif self.match(TokenType.FLOAT):
            return NumberLiteral(self.advance().value)
        elif self.match(TokenType.STRING):
            return StringLiteral(self.advance().value)
        elif self.match(TokenType.TRUE):
            self.advance()
            return BoolLiteral(True)
        elif self.match(TokenType.FALSE):
            self.advance()
            return BoolLiteral(False)
        elif self.match(TokenType.IDENTIFIER):
            return Identifier(self.advance().value)
        elif self.match(TokenType.LPAREN):
            self.advance()
            expr = self.expression()
            self.expect(TokenType.RPAREN)
            return expr
        else:
            raise SyntaxError(
                f"Unexpected token {self.current().type.name} "
                f"at line {self.current().line}"
            )
# Example usage: tokenize and parse a one-line program, then dump the AST.
source = "x = 1 + 2 * 3"
lexer = Lexer(source)
# The simplified example is single-line, so NEWLINE tokens are dropped.
tokens = [t for t in lexer.tokenize() if t.type != TokenType.NEWLINE]
parser = Parser(tokens)
tree = parser.parse()  # renamed from `ast` to avoid shadowing the stdlib module
print_ast(tree)
4. Operator Precedence
Precedence Climbing
class PrecedenceParser:
    """Expression parser using precedence climbing.

    A single recursive routine handles all binary operators by consulting
    a precedence table, instead of one method per precedence level.
    """

    # Operator precedence (higher = binds tighter)
    PRECEDENCE = {
        '==': 1, '!=': 1,
        '<': 2, '>': 2, '<=': 2, '>=': 2,
        '+': 3, '-': 3,
        '*': 4, '/': 4,
    }
    # Operators that associate to the right (e.g. '**' would go here).
    RIGHT_ASSOC = set()

    def __init__(self, tokens):
        self.tokens = tokens
        self.pos = 0

    def current(self):
        """Current token, or None once input is exhausted."""
        return self.tokens[self.pos] if self.pos < len(self.tokens) else None

    def advance(self):
        """Consume and return the current token (None past end of input)."""
        token = self.current()
        self.pos += 1
        return token

    def parse_expression(self, min_precedence=0):
        """Parse an expression whose operators all bind at least
        ``min_precedence`` tightly; returns the resulting AST.
        """
        left = self.parse_atom()
        while True:
            token = self.current()
            if token is None or token.type == TokenType.EOF:
                break
            op = token.value
            if op not in self.PRECEDENCE:
                break
            prec = self.PRECEDENCE[op]
            if prec < min_precedence:
                break
            self.advance()  # consume the operator
            # Left associativity: require strictly higher precedence on the
            # right; right associativity: allow the same precedence again.
            next_min = prec if op in self.RIGHT_ASSOC else prec + 1
            right = self.parse_expression(next_min)
            left = BinaryOp(op, left, right)
        return left

    def parse_atom(self):
        """Parse an atomic expression: number, identifier, or '(' expr ')'.

        Raises:
            SyntaxError: on end of input or an unexpected token.
        """
        token = self.current()
        if token is None:
            # Fix: the original crashed with AttributeError on truncated input.
            raise SyntaxError("Unexpected end of input")
        if token.type == TokenType.INTEGER:
            self.advance()
            return NumberLiteral(token.value)
        elif token.type == TokenType.IDENTIFIER:
            self.advance()
            return Identifier(token.value)
        elif token.type == TokenType.LPAREN:
            self.advance()
            expr = self.parse_expression()
            closing = self.advance()
            # Fix: the original consumed the next token blindly; verify it
            # really is the closing parenthesis.
            if closing is None or closing.type != TokenType.RPAREN:
                raise SyntaxError("Expected ')'")
            return expr
        raise SyntaxError(f"Unexpected token: {token}")
Pratt Parser
class PrattParser:
    """Top-down operator precedence parser (Pratt parser).

    Parsing behavior is configured entirely through registered "parselets":
    prefix parselets handle tokens that can start an expression, infix
    parselets handle binary/postfix operators with an associated precedence.
    """

    def __init__(self, tokens):
        self.tokens = tokens
        self.pos = 0
        # token type -> fn(parser, token) -> AST
        self.prefix_parselets = {}
        # token type -> (precedence, fn(parser, left, token) -> AST)
        self.infix_parselets = {}

    def register_prefix(self, token_type, parselet):
        """Register a parselet for a token that can begin an expression."""
        self.prefix_parselets[token_type] = parselet

    def register_infix(self, token_type, precedence, parselet):
        """Register a parselet (with precedence) for an infix operator token."""
        self.infix_parselets[token_type] = (precedence, parselet)

    def current(self):
        """Current token, or None once input is exhausted."""
        return self.tokens[self.pos] if self.pos < len(self.tokens) else None

    def advance(self):
        """Consume and return the current token (None past end of input)."""
        token = self.current()
        self.pos += 1
        return token

    def get_precedence(self):
        """Precedence of the current token, or 0 if it is not an infix operator."""
        token = self.current()
        if token and token.type in self.infix_parselets:
            return self.infix_parselets[token.type][0]
        return 0

    def parse_expression(self, precedence=0):
        """Parse an expression, consuming infix operators that bind more
        tightly than ``precedence``.

        Raises:
            SyntaxError: on end of input or a token with no prefix parselet.
        """
        token = self.advance()
        if token is None:
            # Fix: the original crashed with AttributeError on empty or
            # truncated input.
            raise SyntaxError("Unexpected end of input")
        if token.type not in self.prefix_parselets:
            raise SyntaxError(f"Cannot parse {token}")
        left = self.prefix_parselets[token.type](self, token)
        while precedence < self.get_precedence():
            token = self.advance()
            _, parselet = self.infix_parselets[token.type]
            left = parselet(self, left, token)
        return left
# Setup Pratt parser
def setup_pratt_parser(tokens):
parser = PrattParser(tokens)
# Prefix parselets
parser.register_prefix(TokenType.INTEGER,
lambda p, t: NumberLiteral(t.value))
parser.register_prefix(TokenType.IDENTIFIER,
lambda p, t: Identifier(t.value))
parser.register_prefix(TokenType.LPAREN,
lambda p, t: (expr := p.parse_expression(), p.advance(), expr)[0])
parser.register_prefix(TokenType.MINUS,
lambda p, t: UnaryOp('-', p.parse_expression(5))) # High precedence
# Infix parselets (binary operators)
def binary_op(parser, left, token):
prec, _ = parser.infix_parselets[token.type]
right = parser.parse_expression(prec)
return BinaryOp(token.value, left, right)
parser.register_infix(TokenType.PLUS, 1, binary_op)
parser.register_infix(TokenType.MINUS, 1, binary_op)
parser.register_infix(TokenType.STAR, 2, binary_op)
parser.register_infix(TokenType.SLASH, 2, binary_op)
return parser
5. Error Handling and Recovery
class ParserError(Exception):
    """A parse error that keeps the offending token for location reporting."""

    def __init__(self, message, token):
        # The formatted message includes the token's source location.
        super().__init__(f"{message} at line {token.line}:{token.column}")
        self.message = message
        self.token = token
class RecoveringParser(Parser):
    """Parser that records syntax errors and resynchronizes instead of
    stopping at the first failure, so one run reports every error.
    """

    def __init__(self, tokens):
        super().__init__(tokens)
        self.errors = []

    def error(self, message):
        """Record a ParserError at the current token and return it (not raised)."""
        err = ParserError(message, self.current())
        self.errors.append(err)
        return err

    def synchronize(self):
        """Skip tokens until a likely statement boundary: just past a
        newline, or at a keyword that can start a new statement.
        """
        self.advance()
        while not self.check(TokenType.EOF):
            if self.peek(-1).type == TokenType.NEWLINE:
                return
            if self.current().type in (
                TokenType.IF, TokenType.WHILE, TokenType.FOR,
                TokenType.DEF, TokenType.RETURN, TokenType.CLASS,
            ):
                return
            self.advance()

    def statement(self):
        """Parse one statement; on SyntaxError record it, resync, return None."""
        try:
            return super().statement()
        except SyntaxError as exc:
            self.error(str(exc))
            self.synchronize()
            return None

    def parse(self):
        """Parse the whole program, collecting all errors along the way."""
        statements = []
        while not self.check(TokenType.EOF):
            if self.check(TokenType.NEWLINE):
                self.advance()
                continue
            stmt = self.statement()
            if stmt:  # statement() yields None for statements that failed
                statements.append(stmt)
        if self.errors:
            print(f"Parsing completed with {len(self.errors)} error(s):")
            for error in self.errors:
                print(f" {error}")
        return Program(statements)
Exercises
Basic
Write a grammar for a simple calculator with +, -, *, /, and parentheses.
Extend the parser to handle:
- Array literals: [1, 2, 3]
- Dictionary literals: {"key": value}
Add support for `for` loops with the syntax `for x in iterable:`.
Intermediate
Implement a parser for JSON that builds an AST.
Add error recovery that can continue parsing after errors.
Implement operator precedence parsing (Pratt parser) for a full expression grammar.
Advanced
Write an LR(0) parser generator.
Implement a parser for a language with significant whitespace (like Python).
Add support for operator overloading and custom operators.
Summary
- CFGs describe language syntax with productions
- ASTs represent program structure for further processing
- Recursive descent is intuitive and maps directly to grammar
- Precedence climbing handles operator precedence cleanly
- Pratt parsing provides flexibility for complex expressions
- Error recovery allows reporting multiple errors per parse