Lexical Analysis (Scanning)
Lexical analysis is the first phase of compilation. The lexer (or scanner) reads source code as a stream of characters and groups them into meaningful units called tokens. ## 1. Tokens and Lexemes
Definitions
- Lexeme: The actual character sequence in source code
- Token: A category + optional attribute value
- Pattern: A rule describing lexemes that match a token
Source: if (count >= 10)
Lexemes: if ( count >= 10 )
Tokens: IF LPAREN ID GEQ NUM RPAREN
Common Token Categories
from enum import Enum, auto
class TokenType(Enum):
# Keywords
IF = auto()
ELSE = auto()
WHILE = auto()
FOR = auto()
RETURN = auto()
DEF = auto()
CLASS = auto()
# Literals
INTEGER = auto()
FLOAT = auto()
STRING = auto()
TRUE = auto()
FALSE = auto()
# Identifiers
IDENTIFIER = auto()
# Operators
PLUS = auto()
MINUS = auto()
STAR = auto()
SLASH = auto()
ASSIGN = auto()
EQ = auto()
NEQ = auto()
LT = auto()
GT = auto()
LEQ = auto()
GEQ = auto()
# Delimiters
LPAREN = auto()
RPAREN = auto()
LBRACE = auto()
RBRACE = auto()
LBRACKET = auto()
RBRACKET = auto()
COMMA = auto()
SEMICOLON = auto()
COLON = auto()
DOT = auto()
# Special
NEWLINE = auto()
EOF = auto()
ERROR = auto()
class Token:
def __init__(self, type: TokenType, value, line: int, column: int):
self.type = type
self.value = value
self.line = line
self.column = column
def __repr__(self):
return f"Token({self.type.name}, {self.value!r}, {self.line}:{self.column})"
2. Regular Expressions
Pattern Notation
| Pattern | Meaning |
|---|---|
a | Literal character 'a' |
ab | Concatenation: 'a' followed by 'b' |
a|b | Alternation: 'a' or 'b' |
a* | Kleene star: zero or more 'a' |
a+ | One or more 'a' |
a? | Zero or one 'a' |
[abc] | Character class: 'a', 'b', or 'c' |
[a-z] | Range: any lowercase letter |
[^a] | Negation: any character except 'a' |
. | Any character |
\d | Digit: [0-9] |
\w | Word: [a-zA-Z0-9_] |
\s | Whitespace |
Token Patterns
# Token patterns as regular expressions
TOKEN_PATTERNS = {
# Keywords (must check before identifiers)
'IF': r'if',
'ELSE': r'else',
'WHILE': r'while',
'FOR': r'for',
'RETURN': r'return',
'DEF': r'def',
'CLASS': r'class',
'TRUE': r'true',
'FALSE': r'false',
# Literals
'FLOAT': r'\d+\.\d+',
'INTEGER': r'\d+',
'STRING': r'"[^"]*"|\'[^\']*\'',
# Identifiers
'IDENTIFIER': r'[a-zA-Z_][a-zA-Z0-9_]*',
# Operators (longer first!)
'EQ': r'==',
'NEQ': r'!=',
'LEQ': r'<=',
'GEQ': r'>=',
'LT': r'<',
'GT': r'>',
'ASSIGN': r'=',
'PLUS': r'\+',
'MINUS': r'-',
'STAR': r'\*',
'SLASH': r'/',
# Delimiters
'LPAREN': r'\(',
'RPAREN': r'\)',
'LBRACE': r'\{',
'RBRACE': r'\}',
'LBRACKET': r'\[',
'RBRACKET': r'\]',
'COMMA': r',',
'SEMICOLON': r';',
'COLON': r':',
'DOT': r'\.',
# Whitespace (usually skipped)
'WHITESPACE': r'[ \t]+',
'NEWLINE': r'\n',
'COMMENT': r'#[^\n]*',
}
3. Finite Automata
DFA (Deterministic Finite Automaton)
A DFA is defined by:
- States Q
- Alphabet Σ
- Transition function δ: Q × Σ → Q
- Start state q₀
- Accept states F
class DFA:
"""Deterministic Finite Automaton"""
def __init__(self):
self.states = set()
self.alphabet = set()
self.transitions = {} # (state, char) -> next_state
self.start_state = None
self.accept_states = set()
def add_transition(self, from_state, char, to_state):
self.states.add(from_state)
self.states.add(to_state)
self.alphabet.add(char)
self.transitions[(from_state, char)] = to_state
def accepts(self, string):
"""Check if DFA accepts the string"""
state = self.start_state
for char in string:
key = (state, char)
if key not in self.transitions:
return False
state = self.transitions[key]
return state in self.accept_states
def simulate(self, string):
"""Run DFA on string, return (accepted, final_state, chars_consumed)"""
state = self.start_state
last_accept = -1
last_accept_state = None
for i, char in enumerate(string):
key = (state, char)
if key not in self.transitions:
break
state = self.transitions[key]
if state in self.accept_states:
last_accept = i
last_accept_state = state
if last_accept >= 0:
return True, last_accept_state, last_accept + 1
return False, None, 0
# Example: DFA for identifier [a-zA-Z_][a-zA-Z0-9_]*
def make_identifier_dfa():
dfa = DFA()
dfa.start_state = 0
dfa.accept_states = {1}
# First character: letter or underscore
for c in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_':
dfa.add_transition(0, c, 1)
# Subsequent: letter, digit, or underscore
for c in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_':
dfa.add_transition(1, c, 1)
return dfa
id_dfa = make_identifier_dfa()
print(id_dfa.accepts("hello")) # True
print(id_dfa.accepts("hello123")) # True
print(id_dfa.accepts("123hello")) # False
print(id_dfa.accepts("_private")) # True
NFA (Nondeterministic Finite Automaton)
NFAs allow:
- Multiple transitions for same (state, char)
- ε-transitions (move without consuming input)
class NFA:
"""Nondeterministic Finite Automaton"""
def __init__(self):
self.states = set()
self.alphabet = set()
self.transitions = {} # (state, char) -> set of states
self.epsilon_transitions = {} # state -> set of states
self.start_state = None
self.accept_states = set()
def add_transition(self, from_state, char, to_state):
self.states.add(from_state)
self.states.add(to_state)
if char is not None:
self.alphabet.add(char)
key = (from_state, char)
if key not in self.transitions:
self.transitions[key] = set()
self.transitions[key].add(to_state)
def add_epsilon(self, from_state, to_state):
"""Add epsilon transition"""
self.states.add(from_state)
self.states.add(to_state)
if from_state not in self.epsilon_transitions:
self.epsilon_transitions[from_state] = set()
self.epsilon_transitions[from_state].add(to_state)
def epsilon_closure(self, states):
"""Find all states reachable via epsilon transitions"""
closure = set(states)
stack = list(states)
while stack:
state = stack.pop()
for next_state in self.epsilon_transitions.get(state, []):
if next_state not in closure:
closure.add(next_state)
stack.append(next_state)
return closure
def accepts(self, string):
"""Check if NFA accepts the string"""
current = self.epsilon_closure({self.start_state})
for char in string:
next_states = set()
for state in current:
next_states.update(self.transitions.get((state, char), set()))
current = self.epsilon_closure(next_states)
return bool(current & self.accept_states)
Thompson's Construction
Convert regex to NFA:
class RegexToNFA:
"""Convert regular expression to NFA using Thompson's construction"""
def __init__(self):
self.state_counter = 0
def new_state(self):
state = self.state_counter
self.state_counter += 1
return state
def literal(self, char):
"""NFA for single character"""
nfa = NFA()
start = self.new_state()
accept = self.new_state()
nfa.start_state = start
nfa.accept_states = {accept}
nfa.add_transition(start, char, accept)
return nfa
def concatenate(self, nfa1, nfa2):
"""Concatenate two NFAs"""
# Connect nfa1's accept states to nfa2's start via epsilon
for accept in nfa1.accept_states:
nfa1.add_epsilon(accept, nfa2.start_state)
# Merge
nfa1.states.update(nfa2.states)
nfa1.transitions.update(nfa2.transitions)
nfa1.epsilon_transitions.update(nfa2.epsilon_transitions)
nfa1.accept_states = nfa2.accept_states
return nfa1
def alternate(self, nfa1, nfa2):
"""Alternation (union) of two NFAs"""
nfa = NFA()
start = self.new_state()
accept = self.new_state()
nfa.start_state = start
nfa.accept_states = {accept}
# Epsilon from new start to both NFAs
nfa.add_epsilon(start, nfa1.start_state)
nfa.add_epsilon(start, nfa2.start_state)
# Epsilon from both accepts to new accept
for s in nfa1.accept_states:
nfa.add_epsilon(s, accept)
for s in nfa2.accept_states:
nfa.add_epsilon(s, accept)
# Merge
nfa.states.update(nfa1.states)
nfa.states.update(nfa2.states)
nfa.transitions.update(nfa1.transitions)
nfa.transitions.update(nfa2.transitions)
nfa.epsilon_transitions.update(nfa1.epsilon_transitions)
nfa.epsilon_transitions.update(nfa2.epsilon_transitions)
return nfa
def kleene_star(self, nfa1):
"""Kleene star of NFA"""
nfa = NFA()
start = self.new_state()
accept = self.new_state()
nfa.start_state = start
nfa.accept_states = {accept}
# Epsilon from new start to old start and to accept (empty string)
nfa.add_epsilon(start, nfa1.start_state)
nfa.add_epsilon(start, accept)
# Epsilon from old accepts to old start (repeat) and new accept
for s in nfa1.accept_states:
nfa.add_epsilon(s, nfa1.start_state)
nfa.add_epsilon(s, accept)
# Merge
nfa.states.update(nfa1.states)
nfa.transitions.update(nfa1.transitions)
nfa.epsilon_transitions.update(nfa1.epsilon_transitions)
return nfa
4. Implementing a Lexer
Simple Hand-Written Lexer
class Lexer:
"""Hand-written lexer for a simple language"""
def __init__(self, source: str):
self.source = source
self.pos = 0
self.line = 1
self.column = 1
def peek(self, offset=0):
"""Look at character at current position + offset"""
pos = self.pos + offset
if pos < len(self.source):
return self.source[pos]
return '\0'
def advance(self):
"""Move to next character"""
char = self.peek()
self.pos += 1
if char == '\n':
self.line += 1
self.column = 1
else:
self.column += 1
return char
def skip_whitespace(self):
"""Skip spaces and tabs"""
while self.peek() in ' \t':
self.advance()
def skip_comment(self):
"""Skip # comments"""
if self.peek() == '#':
while self.peek() not in '\n\0':
self.advance()
def read_string(self, quote):
"""Read string literal"""
self.advance() # Opening quote
start_line = self.line
start_col = self.column
result = []
while self.peek() != quote and self.peek() != '\0':
if self.peek() == '\\':
self.advance()
escape = self.advance()
if escape == 'n':
result.append('\n')
elif escape == 't':
result.append('\t')
elif escape == '\\':
result.append('\\')
elif escape == quote:
result.append(quote)
else:
result.append(escape)
else:
result.append(self.advance())
if self.peek() == '\0':
return Token(TokenType.ERROR, "Unterminated string", start_line, start_col)
self.advance() # Closing quote
return Token(TokenType.STRING, ''.join(result), start_line, start_col)
def read_number(self):
"""Read integer or float literal"""
start_line = self.line
start_col = self.column
result = []
while self.peek().isdigit():
result.append(self.advance())
# Check for float
if self.peek() == '.' and self.peek(1).isdigit():
result.append(self.advance()) # The '.'
while self.peek().isdigit():
result.append(self.advance())
return Token(TokenType.FLOAT, float(''.join(result)), start_line, start_col)
return Token(TokenType.INTEGER, int(''.join(result)), start_line, start_col)
def read_identifier(self):
"""Read identifier or keyword"""
start_line = self.line
start_col = self.column
result = []
while self.peek().isalnum() or self.peek() == '_':
result.append(self.advance())
lexeme = ''.join(result)
# Check for keywords
keywords = {
'if': TokenType.IF,
'else': TokenType.ELSE,
'while': TokenType.WHILE,
'for': TokenType.FOR,
'return': TokenType.RETURN,
'def': TokenType.DEF,
'class': TokenType.CLASS,
'true': TokenType.TRUE,
'false': TokenType.FALSE,
}
token_type = keywords.get(lexeme, TokenType.IDENTIFIER)
return Token(token_type, lexeme, start_line, start_col)
def next_token(self):
"""Get next token from source"""
self.skip_whitespace()
self.skip_comment()
if self.pos >= len(self.source):
return Token(TokenType.EOF, None, self.line, self.column)
start_line = self.line
start_col = self.column
char = self.peek()
# String
if char in '"\'':
return self.read_string(char)
# Number
if char.isdigit():
return self.read_number()
# Identifier/keyword
if char.isalpha() or char == '_':
return self.read_identifier()
# Two-character operators
two_char = self.source[self.pos:self.pos+2]
two_char_ops = {
'==': TokenType.EQ,
'!=': TokenType.NEQ,
'<=': TokenType.LEQ,
'>=': TokenType.GEQ,
}
if two_char in two_char_ops:
self.advance()
self.advance()
return Token(two_char_ops[two_char], two_char, start_line, start_col)
# Single-character tokens
single_char = {
'+': TokenType.PLUS,
'-': TokenType.MINUS,
'*': TokenType.STAR,
'/': TokenType.SLASH,
'=': TokenType.ASSIGN,
'<': TokenType.LT,
'>': TokenType.GT,
'(': TokenType.LPAREN,
')': TokenType.RPAREN,
'{': TokenType.LBRACE,
'}': TokenType.RBRACE,
'[': TokenType.LBRACKET,
']': TokenType.RBRACKET,
',': TokenType.COMMA,
';': TokenType.SEMICOLON,
':': TokenType.COLON,
'.': TokenType.DOT,
'\n': TokenType.NEWLINE,
}
if char in single_char:
self.advance()
return Token(single_char[char], char, start_line, start_col)
# Unknown character
self.advance()
return Token(TokenType.ERROR, f"Unexpected character: {char}", start_line, start_col)
def tokenize(self):
"""Return all tokens as a list"""
tokens = []
while True:
token = self.next_token()
tokens.append(token)
if token.type == TokenType.EOF:
break
return tokens
# Example usage
source = '''
def factorial(n):
if n <= 1:
return 1
return n * factorial(n - 1)
result = factorial(5)
'''
lexer = Lexer(source)
tokens = lexer.tokenize()
for token in tokens:
if token.type not in (TokenType.NEWLINE, TokenType.EOF):
print(token)
5. Regex-Based Lexer
import re
class RegexLexer:
"""Lexer using regular expressions"""
def __init__(self, rules):
"""
rules: list of (token_type, pattern) tuples
Order matters - first match wins
"""
self.rules = [(t, re.compile(p)) for t, p in rules]
def tokenize(self, source):
tokens = []
pos = 0
line = 1
line_start = 0
while pos < len(source):
# Try each rule
match = None
for token_type, pattern in self.rules:
match = pattern.match(source, pos)
if match:
value = match.group(0)
col = pos - line_start + 1
if token_type is not None: # None means skip
tokens.append(Token(token_type, value, line, col))
# Update position and line tracking
newlines = value.count('\n')
if newlines:
line += newlines
line_start = pos + value.rfind('\n') + 1
pos = match.end()
break
if not match:
# No rule matched
col = pos - line_start + 1
tokens.append(Token(TokenType.ERROR, source[pos], line, col))
pos += 1
tokens.append(Token(TokenType.EOF, None, line, pos - line_start + 1))
return tokens
# Define rules (order matters!)
rules = [
# Skip whitespace and comments
(None, r'[ \t]+'),
(None, r'#[^\n]*'),
# Keywords (before identifier!)
(TokenType.IF, r'if(?![a-zA-Z0-9_])'),
(TokenType.ELSE, r'else(?![a-zA-Z0-9_])'),
(TokenType.WHILE, r'while(?![a-zA-Z0-9_])'),
(TokenType.FOR, r'for(?![a-zA-Z0-9_])'),
(TokenType.RETURN, r'return(?![a-zA-Z0-9_])'),
(TokenType.DEF, r'def(?![a-zA-Z0-9_])'),
(TokenType.TRUE, r'true(?![a-zA-Z0-9_])'),
(TokenType.FALSE, r'false(?![a-zA-Z0-9_])'),
# Literals
(TokenType.FLOAT, r'\d+\.\d+'),
(TokenType.INTEGER, r'\d+'),
(TokenType.STRING, r'"[^"]*"|\'[^\']*\''),
# Identifier
(TokenType.IDENTIFIER, r'[a-zA-Z_][a-zA-Z0-9_]*'),
# Two-char operators
(TokenType.EQ, r'=='),
(TokenType.NEQ, r'!='),
(TokenType.LEQ, r'<='),
(TokenType.GEQ, r'>='),
# Single-char operators
(TokenType.ASSIGN, r'='),
(TokenType.PLUS, r'\+'),
(TokenType.MINUS, r'-'),
(TokenType.STAR, r'\*'),
(TokenType.SLASH, r'/'),
(TokenType.LT, r'<'),
(TokenType.GT, r'>'),
# Delimiters
(TokenType.LPAREN, r'\('),
(TokenType.RPAREN, r'\)'),
(TokenType.LBRACE, r'\{'),
(TokenType.RBRACE, r'\}'),
(TokenType.LBRACKET, r'\['),
(TokenType.RBRACKET, r'\]'),
(TokenType.COMMA, r','),
(TokenType.SEMICOLON, r';'),
(TokenType.COLON, r':'),
(TokenType.DOT, r'\.'),
(TokenType.NEWLINE, r'\n'),
]
lexer = RegexLexer(rules)
tokens = lexer.tokenize("x = 42 + y")
for t in tokens:
print(t)
6. Error Handling
class LexerError(Exception):
def __init__(self, message, line, column):
self.message = message
self.line = line
self.column = column
super().__init__(f"Lexer error at {line}:{column}: {message}")
class RobustLexer(Lexer):
"""Lexer with better error handling and recovery"""
def __init__(self, source):
super().__init__(source)
self.errors = []
def report_error(self, message):
error = LexerError(message, self.line, self.column)
self.errors.append(error)
return Token(TokenType.ERROR, message, self.line, self.column)
def synchronize(self):
"""Skip to next likely token boundary"""
while self.peek() not in ' \t\n\0;{}()':
self.advance()
def tokenize(self):
tokens = []
while True:
try:
token = self.next_token()
tokens.append(token)
if token.type == TokenType.ERROR:
self.synchronize()
if token.type == TokenType.EOF:
break
except Exception as e:
self.errors.append(LexerError(str(e), self.line, self.column))
self.synchronize()
if self.errors:
print(f"Lexer found {len(self.errors)} error(s):")
for error in self.errors:
print(f" {error}")
return tokens
Exercises
Basic
Write regular expressions for:
- C-style multi-line comments
/* ... */ - Hexadecimal numbers
0x1A3F - Email addresses (simplified)
- C-style multi-line comments
Extend the lexer to handle Python-style indentation (INDENT/DEDENT tokens).
Build a DFA that recognizes floating-point numbers.
Intermediate
Implement the subset construction algorithm to convert NFA to DFA.
Write a lexer for a simple JSON subset (objects, arrays, strings, numbers, true/false/null).
Add line/column tracking and produce helpful error messages.
Advanced
Implement a lexer generator that takes token specifications and produces a lexer.
Handle Unicode identifiers (letters from any language).
Implement the Aho-Corasick algorithm for multi-pattern matching.
Summary
- Lexical analysis breaks source code into tokens
- Tokens have a type (category) and value (lexeme)
- Regular expressions define token patterns
- DFAs efficiently recognize tokens
- NFAs are easier to construct, can be converted to DFAs
- Hand-written lexers give full control
- Regex-based lexers are easier to maintain