Lexical Analysis (Scanning)
Introduction
Lexical analysis is the first phase of compilation. The lexer (or scanner) reads source code as a stream of characters and groups them into meaningful units called tokens. This reading covers regular expressions, finite automata, and lexer implementation.
Learning Objectives
By the end of this reading, you will be able to:
- Understand the role of lexical analysis in compilation
- Define token patterns using regular expressions
- Convert regular expressions to finite automata
- Implement a lexer from scratch
- Use lexer generator tools
1. Tokens and Lexemes
Definitions
- Lexeme: The actual character sequence in source code
- Token: A category + optional attribute value
- Pattern: A rule describing lexemes that match a token
Source: if (count >= 10)
Lexemes: if ( count >= 10 )
Tokens: IF LPAREN ID GEQ NUM RPAREN
Common Token Categories
from enum import Enum, auto
class TokenType(Enum):
    """Token categories produced by the lexer.

    Members are grouped by role (keywords, literals, operators, delimiters,
    special).  Values come from auto(), so only the names matter to callers;
    note that reordering members would change the underlying values.
    """
    # Keywords
    IF = auto()
    ELSE = auto()
    WHILE = auto()
    FOR = auto()
    RETURN = auto()
    DEF = auto()
    CLASS = auto()
    # Literals
    INTEGER = auto()
    FLOAT = auto()
    STRING = auto()
    TRUE = auto()
    FALSE = auto()
    # Identifiers
    IDENTIFIER = auto()
    # Operators
    PLUS = auto()
    MINUS = auto()
    STAR = auto()
    SLASH = auto()
    ASSIGN = auto()
    EQ = auto()
    NEQ = auto()
    LT = auto()
    GT = auto()
    LEQ = auto()
    GEQ = auto()
    # Delimiters
    LPAREN = auto()
    RPAREN = auto()
    LBRACE = auto()
    RBRACE = auto()
    LBRACKET = auto()
    RBRACKET = auto()
    COMMA = auto()
    SEMICOLON = auto()
    COLON = auto()
    DOT = auto()
    # Special: NEWLINE is significant in this language; ERROR marks bad input.
    NEWLINE = auto()
    EOF = auto()
    ERROR = auto()
class Token:
    """One lexical token: a category, its attribute value, and where it began."""

    def __init__(self, type: TokenType, value, line: int, column: int):
        self.line = line
        self.column = column
        self.type = type
        self.value = value

    def __repr__(self):
        where = f"{self.line}:{self.column}"
        return f"Token({self.type.name}, {self.value!r}, {where})"
2. Regular Expressions
Pattern Notation
| Pattern | Meaning |
|---|---|
| a | Literal character 'a' |
| ab | Concatenation: 'a' followed by 'b' |
| a\|b | Alternation: 'a' or 'b' |
| a* | Kleene star: zero or more 'a' |
| a+ | One or more 'a' |
| a? | Zero or one 'a' |
| [abc] | Character class: 'a', 'b', or 'c' |
| [a-z] | Range: any lowercase letter |
| [^a] | Negation: any character except 'a' |
| . | Any character |
| \d | Digit: [0-9] |
| \w | Word character: [a-zA-Z0-9_] |
| \s | Whitespace character |
Token Patterns
# Token patterns as regular expressions
# Token patterns as regular expressions.
TOKEN_PATTERNS = {
    # Keywords (must check before identifiers).  Each keyword pattern ends
    # with a (?![a-zA-Z0-9_]) lookahead so that e.g. 'if' does not match the
    # prefix of an identifier such as 'iffy' — consistent with the rule set
    # used by the regex-based lexer below.
    'IF': r'if(?![a-zA-Z0-9_])',
    'ELSE': r'else(?![a-zA-Z0-9_])',
    'WHILE': r'while(?![a-zA-Z0-9_])',
    'FOR': r'for(?![a-zA-Z0-9_])',
    'RETURN': r'return(?![a-zA-Z0-9_])',
    'DEF': r'def(?![a-zA-Z0-9_])',
    'CLASS': r'class(?![a-zA-Z0-9_])',
    'TRUE': r'true(?![a-zA-Z0-9_])',
    'FALSE': r'false(?![a-zA-Z0-9_])',
    # Literals (FLOAT before INTEGER so '3.14' isn't split at the dot).
    'FLOAT': r'\d+\.\d+',
    'INTEGER': r'\d+',
    'STRING': r'"[^"]*"|\'[^\']*\'',
    # Identifiers
    'IDENTIFIER': r'[a-zA-Z_][a-zA-Z0-9_]*',
    # Operators (longer first!)
    'EQ': r'==',
    'NEQ': r'!=',
    'LEQ': r'<=',
    'GEQ': r'>=',
    'LT': r'<',
    'GT': r'>',
    'ASSIGN': r'=',
    'PLUS': r'\+',
    'MINUS': r'-',
    'STAR': r'\*',
    'SLASH': r'/',
    # Delimiters
    'LPAREN': r'\(',
    'RPAREN': r'\)',
    'LBRACE': r'\{',
    'RBRACE': r'\}',
    'LBRACKET': r'\[',
    'RBRACKET': r'\]',
    'COMMA': r',',
    'SEMICOLON': r';',
    'COLON': r':',
    'DOT': r'\.',
    # Whitespace (usually skipped)
    'WHITESPACE': r'[ \t]+',
    'NEWLINE': r'\n',
    'COMMENT': r'#[^\n]*',
}
3. Finite Automata
DFA (Deterministic Finite Automaton)
A DFA is defined by:
- States Q
- Alphabet Σ
- Transition function δ: Q × Σ → Q
- Start state q₀
- Accept states F
class DFA:
    """Deterministic finite automaton stored as an explicit transition table."""

    def __init__(self):
        self.states = set()
        self.alphabet = set()
        self.transitions = {}  # maps (state, char) -> next state
        self.start_state = None
        self.accept_states = set()

    def add_transition(self, from_state, char, to_state):
        """Register one edge; both endpoints and the symbol are recorded."""
        self.states.update((from_state, to_state))
        self.alphabet.add(char)
        self.transitions[(from_state, char)] = to_state

    def accepts(self, string):
        """Return True iff consuming the whole string lands in an accept state."""
        current = self.start_state
        for symbol in string:
            edge = (current, symbol)
            if edge not in self.transitions:
                # No transition defined: the DFA rejects immediately.
                return False
            current = self.transitions[edge]
        return current in self.accept_states

    def simulate(self, string):
        """Maximal-munch run: return (accepted, final_state, chars_consumed).

        Tracks the most recent accepting position so the longest matching
        prefix wins, as a lexer requires.
        """
        current = self.start_state
        best = (False, None, 0)
        for index, symbol in enumerate(string):
            edge = (current, symbol)
            if edge not in self.transitions:
                break
            current = self.transitions[edge]
            if current in self.accept_states:
                best = (True, current, index + 1)
        return best
# Example: DFA for identifier [a-zA-Z_][a-zA-Z0-9_]*
def make_identifier_dfa():
    """Build a DFA recognizing identifiers: [a-zA-Z_][a-zA-Z0-9_]*.

    Returns a two-state DFA: state 0 is the start, state 1 accepts.
    """
    # Stdlib constants instead of hand-typed alphabet strings: identical
    # character sets, without the risk of a silently missing letter.
    from string import ascii_letters, digits

    dfa = DFA()
    dfa.start_state = 0
    dfa.accept_states = {1}
    # First character: letter or underscore.
    for c in ascii_letters + '_':
        dfa.add_transition(0, c, 1)
    # Subsequent characters: letter, digit, or underscore (self-loop on 1).
    for c in ascii_letters + digits + '_':
        dfa.add_transition(1, c, 1)
    return dfa
id_dfa = make_identifier_dfa()
# Expected: True, True, False, True — digits may not start an identifier.
for sample in ("hello", "hello123", "123hello", "_private"):
    print(id_dfa.accepts(sample))
NFA (Nondeterministic Finite Automaton)
NFAs allow:
- Multiple transitions for same (state, char)
- ε-transitions (move without consuming input)
class NFA:
    """Nondeterministic finite automaton with epsilon (empty-string) moves."""

    def __init__(self):
        self.states = set()
        self.alphabet = set()
        self.transitions = {}          # (state, char) -> set of successor states
        self.epsilon_transitions = {}  # state -> set of states reachable on epsilon
        self.start_state = None
        self.accept_states = set()

    def add_transition(self, from_state, char, to_state):
        """Add one labeled edge; multiple targets per (state, char) are allowed."""
        self.states.update((from_state, to_state))
        if char is not None:
            self.alphabet.add(char)
        self.transitions.setdefault((from_state, char), set()).add(to_state)

    def add_epsilon(self, from_state, to_state):
        """Add an edge that consumes no input."""
        self.states.update((from_state, to_state))
        self.epsilon_transitions.setdefault(from_state, set()).add(to_state)

    def epsilon_closure(self, states):
        """Return every state reachable from `states` via epsilon edges alone."""
        closure = set(states)
        pending = list(states)
        while pending:
            current = pending.pop()
            for target in self.epsilon_transitions.get(current, ()):
                if target not in closure:
                    closure.add(target)
                    pending.append(target)
        return closure

    def accepts(self, string):
        """Return True iff some path through the NFA consumes the whole string."""
        active = self.epsilon_closure({self.start_state})
        for symbol in string:
            stepped = set()
            for state in active:
                stepped |= self.transitions.get((state, symbol), set())
            active = self.epsilon_closure(stepped)
        # Accept when any active state is accepting.
        return bool(active & self.accept_states)
Thompson's Construction
Convert regex to NFA:
class RegexToNFA:
    """Build NFAs for the regex operators using Thompson's construction.

    Each operator yields an NFA with one start state; fresh state ids come
    from a shared counter so merged machines never collide.
    """

    def __init__(self):
        self.state_counter = 0

    def new_state(self):
        """Hand out the next unused state id."""
        self.state_counter += 1
        return self.state_counter - 1

    def literal(self, char):
        """NFA accepting exactly the single character `char`."""
        nfa = NFA()
        entry = self.new_state()
        exit_ = self.new_state()
        nfa.start_state = entry
        nfa.accept_states = {exit_}
        nfa.add_transition(entry, char, exit_)
        return nfa

    def concatenate(self, nfa1, nfa2):
        """Sequence: nfa1 followed by nfa2 (mutates and returns nfa1)."""
        # Epsilon-link every accept of the first machine into the second.
        for final in nfa1.accept_states:
            nfa1.add_epsilon(final, nfa2.start_state)
        # Fold nfa2's machinery into nfa1; its accepts become the result's.
        nfa1.states |= nfa2.states
        nfa1.transitions.update(nfa2.transitions)
        nfa1.epsilon_transitions.update(nfa2.epsilon_transitions)
        nfa1.accept_states = nfa2.accept_states
        return nfa1

    def alternate(self, nfa1, nfa2):
        """Union: a fresh start epsilon-branches into both operands."""
        combined = NFA()
        entry = self.new_state()
        exit_ = self.new_state()
        combined.start_state = entry
        combined.accept_states = {exit_}
        # Fan out into each operand, and fan its accepts back into exit_.
        combined.add_epsilon(entry, nfa1.start_state)
        combined.add_epsilon(entry, nfa2.start_state)
        for final in nfa1.accept_states:
            combined.add_epsilon(final, exit_)
        for final in nfa2.accept_states:
            combined.add_epsilon(final, exit_)
        # Absorb both operands' states and edges.
        for part in (nfa1, nfa2):
            combined.states |= part.states
            combined.transitions.update(part.transitions)
            combined.epsilon_transitions.update(part.epsilon_transitions)
        return combined

    def kleene_star(self, nfa1):
        """Zero or more repetitions of nfa1."""
        wrapper = NFA()
        entry = self.new_state()
        exit_ = self.new_state()
        wrapper.start_state = entry
        wrapper.accept_states = {exit_}
        wrapper.add_epsilon(entry, nfa1.start_state)  # take the body once...
        wrapper.add_epsilon(entry, exit_)             # ...or match empty string
        for final in nfa1.accept_states:
            wrapper.add_epsilon(final, nfa1.start_state)  # loop for repetition
            wrapper.add_epsilon(final, exit_)             # or stop here
        wrapper.states |= nfa1.states
        wrapper.transitions.update(nfa1.transitions)
        wrapper.epsilon_transitions.update(nfa1.epsilon_transitions)
        return wrapper
4. Implementing a Lexer
Simple Hand-Written Lexer
class Lexer:
    """Hand-written lexer for a simple language.

    Walks the source one character at a time, tracking line and column so
    every emitted Token carries its starting position.  Newlines are
    significant and produce NEWLINE tokens; spaces/tabs and '#' comments
    are skipped.
    """

    def __init__(self, source: str):
        self.source = source
        self.pos = 0      # index of the next unread character
        self.line = 1     # 1-based line of the next unread character
        self.column = 1   # 1-based column of the next unread character

    def peek(self, offset: int = 0) -> str:
        """Return the character at pos+offset, or NUL ('\\0') past end of input."""
        pos = self.pos + offset
        if pos < len(self.source):
            return self.source[pos]
        return '\0'

    def advance(self) -> str:
        """Consume and return the current character, updating line/column."""
        char = self.peek()
        self.pos += 1
        if char == '\n':
            self.line += 1
            self.column = 1
        else:
            self.column += 1
        return char

    def skip_whitespace(self):
        """Skip spaces and tabs (newlines are significant tokens)."""
        while self.peek() in ' \t':
            self.advance()

    def skip_comment(self):
        """Skip a '#' comment up to, but not including, the newline."""
        if self.peek() == '#':
            while self.peek() not in '\n\0':
                self.advance()

    def read_string(self, quote):
        """Read a string literal delimited by `quote`, decoding escapes.

        Returns an ERROR token if input ends before the closing quote.
        """
        # BUG FIX: record the position of the opening quote BEFORE consuming
        # it; previously the position was captured after advance(), so the
        # token pointed one column past the start of the literal.
        start_line = self.line
        start_col = self.column
        self.advance()  # Opening quote
        result = []
        while self.peek() != quote and self.peek() != '\0':
            if self.peek() == '\\':
                self.advance()
                escape = self.advance()
                if escape == 'n':
                    result.append('\n')
                elif escape == 't':
                    result.append('\t')
                elif escape == '\\':
                    result.append('\\')
                elif escape == quote:
                    result.append(quote)
                else:
                    # Unknown escape: keep the escaped character verbatim.
                    result.append(escape)
            else:
                result.append(self.advance())
        if self.peek() == '\0':
            return Token(TokenType.ERROR, "Unterminated string", start_line, start_col)
        self.advance()  # Closing quote
        return Token(TokenType.STRING, ''.join(result), start_line, start_col)

    def read_number(self):
        """Read an INTEGER or FLOAT literal; the token value is int/float."""
        start_line = self.line
        start_col = self.column
        result = []
        while self.peek().isdigit():
            result.append(self.advance())
        # A '.' counts as part of the number only when followed by a digit;
        # a bare trailing '.' is left for the DOT token.
        if self.peek() == '.' and self.peek(1).isdigit():
            result.append(self.advance())  # The '.'
            while self.peek().isdigit():
                result.append(self.advance())
            return Token(TokenType.FLOAT, float(''.join(result)), start_line, start_col)
        return Token(TokenType.INTEGER, int(''.join(result)), start_line, start_col)

    def read_identifier(self):
        """Read an identifier, reclassifying reserved words as keyword tokens."""
        start_line = self.line
        start_col = self.column
        result = []
        while self.peek().isalnum() or self.peek() == '_':
            result.append(self.advance())
        lexeme = ''.join(result)
        # Keyword table: exact lexeme match wins over IDENTIFIER.
        keywords = {
            'if': TokenType.IF,
            'else': TokenType.ELSE,
            'while': TokenType.WHILE,
            'for': TokenType.FOR,
            'return': TokenType.RETURN,
            'def': TokenType.DEF,
            'class': TokenType.CLASS,
            'true': TokenType.TRUE,
            'false': TokenType.FALSE,
        }
        token_type = keywords.get(lexeme, TokenType.IDENTIFIER)
        return Token(token_type, lexeme, start_line, start_col)

    def next_token(self):
        """Return the next token, or an EOF/ERROR token as appropriate."""
        self.skip_whitespace()
        self.skip_comment()
        if self.pos >= len(self.source):
            return Token(TokenType.EOF, None, self.line, self.column)
        start_line = self.line
        start_col = self.column
        char = self.peek()
        # String literal
        if char in '"\'':
            return self.read_string(char)
        # Numeric literal
        if char.isdigit():
            return self.read_number()
        # Identifier or keyword
        if char.isalpha() or char == '_':
            return self.read_identifier()
        # Two-character operators must be tried before their one-char prefixes.
        two_char = self.source[self.pos:self.pos+2]
        two_char_ops = {
            '==': TokenType.EQ,
            '!=': TokenType.NEQ,
            '<=': TokenType.LEQ,
            '>=': TokenType.GEQ,
        }
        if two_char in two_char_ops:
            self.advance()
            self.advance()
            return Token(two_char_ops[two_char], two_char, start_line, start_col)
        # Single-character tokens
        single_char = {
            '+': TokenType.PLUS,
            '-': TokenType.MINUS,
            '*': TokenType.STAR,
            '/': TokenType.SLASH,
            '=': TokenType.ASSIGN,
            '<': TokenType.LT,
            '>': TokenType.GT,
            '(': TokenType.LPAREN,
            ')': TokenType.RPAREN,
            '{': TokenType.LBRACE,
            '}': TokenType.RBRACE,
            '[': TokenType.LBRACKET,
            ']': TokenType.RBRACKET,
            ',': TokenType.COMMA,
            ';': TokenType.SEMICOLON,
            ':': TokenType.COLON,
            '.': TokenType.DOT,
            '\n': TokenType.NEWLINE,
        }
        if char in single_char:
            self.advance()
            return Token(single_char[char], char, start_line, start_col)
        # Unknown character: consume it so the lexer always makes progress.
        self.advance()
        return Token(TokenType.ERROR, f"Unexpected character: {char}", start_line, start_col)

    def tokenize(self):
        """Return all tokens as a list, ending with the EOF token."""
        tokens = []
        while True:
            token = self.next_token()
            tokens.append(token)
            if token.type == TokenType.EOF:
                break
        return tokens
# Example usage: lex a small program and dump its tokens.
source = '''
def factorial(n):
if n <= 1:
return 1
return n * factorial(n - 1)
result = factorial(5)
'''
lexer = Lexer(source)
tokens = lexer.tokenize()
for token in tokens:
    # Suppress layout/terminator tokens to keep the dump readable.
    if token.type not in (TokenType.NEWLINE, TokenType.EOF):
        print(token)
5. Regex-Based Lexer
import re
class RegexLexer:
    """Table-driven lexer: tries each compiled pattern in order at the cursor."""

    def __init__(self, rules):
        """Compile the rule table.

        rules: ordered list of (token_type, pattern) pairs; the first pattern
        that matches wins.  A token_type of None marks text to discard
        (whitespace, comments).
        """
        self.rules = [(kind, re.compile(regex)) for kind, regex in rules]

    def tokenize(self, source):
        """Split `source` into Tokens, appending EOF at the end.

        Characters no rule matches become one-character ERROR tokens.
        """
        tokens = []
        pos = 0
        line = 1
        line_start = 0  # index of the first character of the current line
        while pos < len(source):
            for kind, pattern in self.rules:
                m = pattern.match(source, pos)
                if m is None:
                    continue
                text = m.group(0)
                if kind is not None:  # None means skip this text
                    tokens.append(Token(kind, text, line, pos - line_start + 1))
                # Keep line/column bookkeeping in sync with the consumed text.
                if '\n' in text:
                    line += text.count('\n')
                    line_start = pos + text.rfind('\n') + 1
                pos = m.end()
                break
            else:
                # No rule matched: report the offending character and move on.
                tokens.append(Token(TokenType.ERROR, source[pos], line, pos - line_start + 1))
                pos += 1
        tokens.append(Token(TokenType.EOF, None, line, pos - line_start + 1))
        return tokens
# Define rules (order matters!)
# Define rules (order matters!)
rules = [
    # Skip whitespace and comments
    (None, r'[ \t]+'),
    (None, r'#[^\n]*'),
    # Keywords (before identifier!)  Each carries a lookahead so a keyword
    # never matches the prefix of a longer identifier.
    (TokenType.IF, r'if(?![a-zA-Z0-9_])'),
    (TokenType.ELSE, r'else(?![a-zA-Z0-9_])'),
    (TokenType.WHILE, r'while(?![a-zA-Z0-9_])'),
    (TokenType.FOR, r'for(?![a-zA-Z0-9_])'),
    (TokenType.RETURN, r'return(?![a-zA-Z0-9_])'),
    (TokenType.DEF, r'def(?![a-zA-Z0-9_])'),
    # BUG FIX: CLASS was missing, so 'class' lexed as IDENTIFIER even though
    # TokenType, TOKEN_PATTERNS, and the hand-written lexer all treat it as
    # a keyword.
    (TokenType.CLASS, r'class(?![a-zA-Z0-9_])'),
    (TokenType.TRUE, r'true(?![a-zA-Z0-9_])'),
    (TokenType.FALSE, r'false(?![a-zA-Z0-9_])'),
    # Literals (FLOAT before INTEGER so '3.14' is not split at the dot)
    (TokenType.FLOAT, r'\d+\.\d+'),
    (TokenType.INTEGER, r'\d+'),
    (TokenType.STRING, r'"[^"]*"|\'[^\']*\''),
    # Identifier
    (TokenType.IDENTIFIER, r'[a-zA-Z_][a-zA-Z0-9_]*'),
    # Two-char operators (before their single-char prefixes)
    (TokenType.EQ, r'=='),
    (TokenType.NEQ, r'!='),
    (TokenType.LEQ, r'<='),
    (TokenType.GEQ, r'>='),
    # Single-char operators
    (TokenType.ASSIGN, r'='),
    (TokenType.PLUS, r'\+'),
    (TokenType.MINUS, r'-'),
    (TokenType.STAR, r'\*'),
    (TokenType.SLASH, r'/'),
    (TokenType.LT, r'<'),
    (TokenType.GT, r'>'),
    # Delimiters
    (TokenType.LPAREN, r'\('),
    (TokenType.RPAREN, r'\)'),
    (TokenType.LBRACE, r'\{'),
    (TokenType.RBRACE, r'\}'),
    (TokenType.LBRACKET, r'\['),
    (TokenType.RBRACKET, r'\]'),
    (TokenType.COMMA, r','),
    (TokenType.SEMICOLON, r';'),
    (TokenType.COLON, r':'),
    (TokenType.DOT, r'\.'),
    (TokenType.NEWLINE, r'\n'),
]
# Demonstrate the regex-based lexer on a one-line expression.
lexer = RegexLexer(rules)
tokens = lexer.tokenize("x = 42 + y")
for tok in tokens:
    print(tok)
6. Error Handling
class LexerError(Exception):
    """Raised (or collected) when the lexer encounters malformed input."""

    def __init__(self, message, line, column):
        super().__init__(f"Lexer error at {line}:{column}: {message}")
        self.message = message
        self.line = line
        self.column = column
class RobustLexer(Lexer):
    """Lexer variant that collects errors and resynchronizes instead of dying."""

    def __init__(self, source):
        super().__init__(source)
        self.errors = []  # accumulated LexerError instances

    def report_error(self, message):
        """Record an error at the current position and return an ERROR token."""
        self.errors.append(LexerError(message, self.line, self.column))
        return Token(TokenType.ERROR, message, self.line, self.column)

    def synchronize(self):
        """Skip forward to a character that plausibly starts a new token."""
        while self.peek() not in ' \t\n\0;{}()':
            self.advance()

    def tokenize(self):
        """Lex the whole source, recovering after errors; prints a summary."""
        tokens = []
        done = False
        while not done:
            try:
                token = self.next_token()
            except Exception as exc:
                # Unexpected failure: record it and skip to a safe boundary.
                self.errors.append(LexerError(str(exc), self.line, self.column))
                self.synchronize()
                continue
            tokens.append(token)
            if token.type == TokenType.ERROR:
                self.synchronize()
            done = token.type == TokenType.EOF
        if self.errors:
            print(f"Lexer found {len(self.errors)} error(s):")
            for error in self.errors:
                print(f"  {error}")
        return tokens
Exercises
Basic
Write regular expressions for:
- C-style multi-line comments: /* ... */
- Hexadecimal numbers: 0x1A3F
- Email addresses (simplified)
Extend the lexer to handle Python-style indentation (INDENT/DEDENT tokens).
Build a DFA that recognizes floating-point numbers.
Intermediate
Implement the subset construction algorithm to convert NFA to DFA.
Write a lexer for a simple JSON subset (objects, arrays, strings, numbers, true/false/null).
Add line/column tracking and produce helpful error messages.
Advanced
Implement a lexer generator that takes token specifications and produces a lexer.
Handle Unicode identifiers (letters from any language).
Implement the Aho-Corasick algorithm for multi-pattern matching.
Summary
- Lexical analysis breaks source code into tokens
- Tokens have a type (category) and value (lexeme)
- Regular expressions define token patterns
- DFAs efficiently recognize tokens
- NFAs are easier to construct, can be converted to DFAs
- Hand-written lexers give full control
- Regex-based lexers are easier to maintain