Supervised Learning
Introduction
Supervised learning algorithms learn a mapping from inputs to outputs using labeled training data. This reading covers fundamental supervised learning algorithms for both regression and classification tasks.
Learning Objectives
By the end of this reading, you will be able to:
- Implement linear and logistic regression
- Understand and build decision trees
- Apply k-Nearest Neighbors (KNN)
- Implement Support Vector Machines (SVMs)
- Choose appropriate algorithms for different problems
1. Linear Regression
Simple Linear Regression
import numpy as np
from typing import Tuple
class LinearRegression:
    """
    Ordinary least squares linear regression: y = Xw + b.

    Finds the weights and bias that minimize Mean Squared Error over the
    training data.
    """

    def __init__(self):
        self.weights = None  # coefficient vector, shape (n_features,); set by fit()
        self.bias = None     # intercept term; set by fit()

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'LinearRegression':
        """
        Fit by least squares on the bias-augmented design matrix.

        Uses np.linalg.lstsq rather than solving the Normal Equation
        w = (X'X)^(-1) X'y directly, so the fit is well-defined even when
        X'X is singular (duplicated/collinear features, or more features
        than samples); for full-rank inputs the solution is identical.

        Args:
            X: training inputs, shape (n_samples, n_features).
            y: training targets, shape (n_samples,).

        Returns:
            self, to allow method chaining.
        """
        n_samples = X.shape[0]
        # Prepend a column of ones so the intercept is learned jointly.
        X_b = np.column_stack([np.ones(n_samples), X])
        theta, *_ = np.linalg.lstsq(X_b, y, rcond=None)
        self.bias = theta[0]
        self.weights = theta[1:]
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict targets for X using the learned weights and bias."""
        return X @ self.weights + self.bias

    def score(self, X: np.ndarray, y: np.ndarray) -> float:
        """Coefficient of determination R² (1.0 means a perfect fit)."""
        y_pred = self.predict(X)
        ss_res = np.sum((y - y_pred) ** 2)
        ss_tot = np.sum((y - np.mean(y)) ** 2)
        return 1 - (ss_res / ss_tot)
# Example: recover known coefficients (intercept 4, slope 3) from noisy data.
np.random.seed(42)
X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X.flatten() + np.random.randn(100)
model = LinearRegression()
model.fit(X, y)
print(f"Weights: {model.weights}, Bias: {model.bias}")
print(f"R² Score: {model.score(X, y):.4f}")
Gradient Descent
class LinearRegressionGD:
    """Linear regression trained by full-batch gradient descent on MSE."""

    def __init__(self, learning_rate: float = 0.01, n_iterations: int = 1000):
        self.lr = learning_rate
        self.n_iter = n_iterations
        self.weights = None
        self.bias = None
        self.loss_history = []  # MSE recorded once per iteration

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'LinearRegressionGD':
        """Run n_iter gradient steps from a zero initialization."""
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0
        inv_n = 1 / n_samples
        for _ in range(self.n_iter):
            residual = X @ self.weights + self.bias - y
            # Gradient of MSE: (1/n) X'(Xw + b - y) for w, mean residual for b.
            self.weights = self.weights - self.lr * (inv_n * (X.T @ residual))
            self.bias = self.bias - self.lr * (inv_n * residual.sum())
            # Loss is measured on the pre-update predictions.
            self.loss_history.append(np.mean(residual ** 2))
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict with the learned linear model."""
        return X @ self.weights + self.bias
class SGDRegressor:
    """Linear regression trained with mini-batch stochastic gradient descent."""

    def __init__(self, learning_rate: float = 0.01,
                 n_iterations: int = 100, batch_size: int = 32):
        self.lr = learning_rate
        self.n_iter = n_iterations
        self.batch_size = batch_size
        self.weights = None
        self.bias = None

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'SGDRegressor':
        """Shuffle once per epoch, then step on consecutive mini-batches."""
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0
        for _ in range(self.n_iter):
            order = np.random.permutation(n_samples)
            for start in range(0, n_samples, self.batch_size):
                batch = order[start:start + self.batch_size]
                xb, yb = X[batch], y[batch]
                err = xb @ self.weights + self.bias - yb
                # Gradient step computed on this mini-batch only.
                inv_b = 1 / len(batch)
                self.weights -= self.lr * (inv_b * (xb.T @ err))
                self.bias -= self.lr * (inv_b * err.sum())
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict with the learned linear model."""
        return X @ self.weights + self.bias
2. Logistic Regression
Binary Classification
class LogisticRegression:
    """
    Binary logistic regression trained with gradient descent.

    Models P(y=1|x) = sigmoid(w'x + b) and minimizes cross-entropy loss.
    """

    def __init__(self, learning_rate: float = 0.01,
                 n_iterations: int = 1000):
        self.lr = learning_rate
        self.n_iter = n_iterations
        self.weights = None
        self.bias = None

    @staticmethod
    def sigmoid(z: np.ndarray) -> np.ndarray:
        """Numerically safe logistic function 1 / (1 + e^(-z))."""
        # Clipping keeps np.exp from overflowing on extreme logits.
        return 1 / (1 + np.exp(-np.clip(z, -500, 500)))

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'LogisticRegression':
        """Run gradient descent on the cross-entropy loss from zeros."""
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0
        for _ in range(self.n_iter):
            # For sigmoid + cross-entropy, the gradient w.r.t. the logits
            # reduces to (predicted probability - label).
            error = self.sigmoid(X @ self.weights + self.bias) - y
            self.weights -= self.lr * ((1 / n_samples) * (X.T @ error))
            self.bias -= self.lr * ((1 / n_samples) * np.sum(error))
        return self

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Return P(y=1|x) for each row of X."""
        return self.sigmoid(X @ self.weights + self.bias)

    def predict(self, X: np.ndarray, threshold: float = 0.5) -> np.ndarray:
        """Return hard 0/1 labels by thresholding the probabilities."""
        return (self.predict_proba(X) >= threshold).astype(int)

    def score(self, X: np.ndarray, y: np.ndarray) -> float:
        """Fraction of samples classified correctly (accuracy)."""
        return np.mean(self.predict(X) == y)
# Example: sanity-check accuracy on a synthetic binary dataset.
# NOTE: scikit-learn is used here only to generate the data.
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=1000, n_features=10, random_state=42)
model = LogisticRegression(learning_rate=0.1, n_iterations=1000)
model.fit(X, y)
print(f"Accuracy: {model.score(X, y):.4f}")
Multi-class Classification
class SoftmaxRegression:
    """
    Multinomial (softmax) logistic regression for multi-class problems.

    Models P(y=k|x) = exp(w_k'x) / sum_j exp(w_j'x).
    """

    def __init__(self, learning_rate: float = 0.01,
                 n_iterations: int = 1000):
        self.lr = learning_rate
        self.n_iter = n_iterations
        self.weights = None
        self.bias = None
        self.classes = None  # sorted unique labels, set by fit()

    @staticmethod
    def softmax(z: np.ndarray) -> np.ndarray:
        """Row-wise softmax with max-subtraction for numerical stability."""
        exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'SoftmaxRegression':
        """Gradient descent on the multi-class cross-entropy loss."""
        n_samples, n_features = X.shape
        self.classes = np.unique(y)
        n_classes = len(self.classes)
        # One-hot encode the labels: one column per class.
        y_onehot = np.zeros((n_samples, n_classes))
        for col, label in enumerate(self.classes):
            y_onehot[y == label, col] = 1
        self.weights = np.zeros((n_features, n_classes))
        self.bias = np.zeros(n_classes)
        for _ in range(self.n_iter):
            probs = self.softmax(X @ self.weights + self.bias)
            # Softmax + cross-entropy gradient: probabilities minus one-hot.
            diff = probs - y_onehot
            self.weights -= self.lr * ((1 / n_samples) * (X.T @ diff))
            self.bias -= self.lr * ((1 / n_samples) * np.sum(diff, axis=0))
        return self

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Return an (n_samples, n_classes) matrix of class probabilities."""
        return self.softmax(X @ self.weights + self.bias)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the most probable class label for each sample."""
        return self.classes[np.argmax(self.predict_proba(X), axis=1)]
3. Decision Trees
Classification Tree
from dataclasses import dataclass
from typing import Optional, Union
@dataclass
class TreeNode:
    """
    A single node in a decision tree.

    Internal nodes carry a (feature_idx, threshold) split plus two children
    and leave `value` as None; leaf nodes carry only `value` (majority class
    for classification, mean target for regression).
    """
    feature_idx: Optional[int] = None  # index of the feature to split on
    threshold: Optional[float] = None  # go left when x[feature_idx] <= threshold
    left: Optional['TreeNode'] = None  # subtree for samples <= threshold
    right: Optional['TreeNode'] = None  # subtree for samples > threshold
    value: Optional[Union[int, float]] = None  # leaf prediction; None for internal nodes
class DecisionTreeClassifier:
    """CART-style classification tree that chooses splits by Gini impurity."""

    def __init__(self, max_depth: int = 10, min_samples_split: int = 2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None  # filled in by fit()

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'DecisionTreeClassifier':
        """Build the tree from non-negative integer class labels."""
        self.n_classes = len(np.unique(y))
        self.root = self._grow_tree(X, y, depth=0)
        return self

    def _gini(self, y: np.ndarray) -> float:
        """Gini impurity 1 - sum_k p_k^2 (0 for a pure node, 0 if empty)."""
        if len(y) == 0:
            return 0
        p = np.bincount(y) / len(y)
        return 1 - np.sum(p ** 2)

    def _best_split(self, X: np.ndarray, y: np.ndarray) -> tuple:
        """
        Return the (feature, threshold) pair with the highest Gini gain,
        or (None, None) when no valid split exists. Ties keep the first
        candidate found (features then thresholds, in order).
        """
        n_samples, n_features = X.shape
        parent_gini = self._gini(y)
        best_gain = -1
        best = (None, None)
        for feat in range(n_features):
            for thr in np.unique(X[:, feat]):
                go_left = X[:, feat] <= thr
                n_left = np.sum(go_left)
                n_right = np.sum(~go_left)
                if n_left == 0 or n_right == 0:
                    continue  # degenerate split: everything on one side
                weighted = (n_left * self._gini(y[go_left]) +
                            n_right * self._gini(y[~go_left])) / n_samples
                gain = parent_gini - weighted
                if gain > best_gain:
                    best_gain = gain
                    best = (feat, thr)
        return best

    def _grow_tree(self, X: np.ndarray, y: np.ndarray, depth: int) -> TreeNode:
        """Recursively split until a stopping rule fires, then emit a leaf."""
        # Stop at the depth limit, when too few samples remain, or when pure.
        if (depth >= self.max_depth
                or len(y) < self.min_samples_split
                or len(np.unique(y)) == 1):
            return TreeNode(value=np.bincount(y).argmax())
        feat, thr = self._best_split(X, y)
        if feat is None:
            # No usable split: fall back to a majority-class leaf.
            return TreeNode(value=np.bincount(y).argmax())
        go_left = X[:, feat] <= thr
        return TreeNode(
            feature_idx=feat,
            threshold=thr,
            left=self._grow_tree(X[go_left], y[go_left], depth + 1),
            right=self._grow_tree(X[~go_left], y[~go_left], depth + 1),
        )

    def _predict_sample(self, x: np.ndarray, node: TreeNode) -> int:
        """Walk from `node` down to a leaf and return its class label."""
        while node.value is None:
            node = node.left if x[node.feature_idx] <= node.threshold else node.right
        return node.value

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict a class label for each row of X."""
        return np.array([self._predict_sample(row, self.root) for row in X])

    def score(self, X: np.ndarray, y: np.ndarray) -> float:
        """Classification accuracy on (X, y)."""
        return np.mean(self.predict(X) == y)
Regression Tree
class DecisionTreeRegressor:
    """Regression tree that chooses splits by minimizing weighted child MSE."""

    def __init__(self, max_depth: int = 10, min_samples_split: int = 2):
        self.max_depth = max_depth  # hard limit on recursion depth
        self.min_samples_split = min_samples_split  # min samples to keep splitting
        self.root = None  # filled in by fit()

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'DecisionTreeRegressor':
        """Build the tree from continuous-valued targets."""
        self.root = self._grow_tree(X, y, depth=0)
        return self

    def _mse(self, y: np.ndarray) -> float:
        """Node impurity: variance of the targets around the node mean
        (equivalent to MSE of predicting the mean). Empty nodes return 0."""
        if len(y) == 0:
            return 0
        return np.var(y)

    def _grow_tree(self, X: np.ndarray, y: np.ndarray, depth: int) -> TreeNode:
        """Recursively grow the tree; leaves predict the node's mean target."""
        n_samples = len(y)
        # Stop at the depth limit, when too few samples remain, or when the
        # targets are (near-)constant.
        if (depth >= self.max_depth or
            n_samples < self.min_samples_split or
            np.var(y) < 1e-7):
            return TreeNode(value=np.mean(y))
        feature, threshold = self._best_split(X, y)
        if feature is None:
            # No usable split (e.g. all feature values identical): emit a leaf.
            return TreeNode(value=np.mean(y))
        left_mask = X[:, feature] <= threshold
        right_mask = ~left_mask
        left_child = self._grow_tree(X[left_mask], y[left_mask], depth + 1)
        right_child = self._grow_tree(X[right_mask], y[right_mask], depth + 1)
        return TreeNode(
            feature_idx=feature,
            threshold=threshold,
            left=left_child,
            right=right_child
        )

    def _best_split(self, X: np.ndarray, y: np.ndarray) -> tuple:
        """Exhaustively search every feature and unique threshold for the
        split minimizing size-weighted child MSE.

        Returns (feature_index, threshold), or (None, None) when no valid
        split exists. Strict `<` keeps the first split found among ties."""
        n_samples, n_features = X.shape
        best_mse = float('inf')
        best_feature, best_threshold = None, None
        for feature in range(n_features):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                left_mask = X[:, feature] <= threshold
                right_mask = ~left_mask
                # Skip degenerate splits that put every sample on one side.
                if np.sum(left_mask) == 0 or np.sum(right_mask) == 0:
                    continue
                n_left = np.sum(left_mask)
                n_right = np.sum(right_mask)
                mse_left = self._mse(y[left_mask])
                mse_right = self._mse(y[right_mask])
                weighted_mse = (n_left * mse_left + n_right * mse_right) / n_samples
                if weighted_mse < best_mse:
                    best_mse = weighted_mse
                    best_feature = feature
                    best_threshold = threshold
        return best_feature, best_threshold

    def _predict_sample(self, x: np.ndarray, node: TreeNode) -> float:
        """Route one sample down to a leaf and return the stored mean."""
        if node.value is not None:
            return node.value
        if x[node.feature_idx] <= node.threshold:
            return self._predict_sample(x, node.left)
        return self._predict_sample(x, node.right)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict a continuous value for each row of X."""
        return np.array([self._predict_sample(x, self.root) for x in X])
4. K-Nearest Neighbors
from collections import Counter
class KNeighborsClassifier:
    """k-NN classifier: majority vote among the k closest training points."""

    def __init__(self, n_neighbors: int = 5, metric: str = 'euclidean'):
        self.k = n_neighbors
        self.metric = metric  # 'euclidean' or 'manhattan'

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'KNeighborsClassifier':
        """Memorize the training set (k-NN is a lazy learner)."""
        self.X_train = X
        self.y_train = y
        return self

    def _distance(self, x1: np.ndarray, x2: np.ndarray) -> float:
        """Distance between two points under the configured metric."""
        diff = x1 - x2
        if self.metric == 'euclidean':
            return np.sqrt(np.sum(diff ** 2))
        if self.metric == 'manhattan':
            return np.sum(np.abs(diff))
        raise ValueError(f"Unknown metric: {self.metric}")

    def _predict_sample(self, x: np.ndarray) -> int:
        """Vote among the k nearest stored samples."""
        dists = [self._distance(x, ref) for ref in self.X_train]
        nearest = np.argsort(dists)[:self.k]
        # most_common breaks ties by first occurrence among the neighbors.
        return Counter(self.y_train[nearest]).most_common(1)[0][0]

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict a label for each row of X."""
        return np.array([self._predict_sample(row) for row in X])

    def score(self, X: np.ndarray, y: np.ndarray) -> float:
        """Classification accuracy on (X, y)."""
        return np.mean(self.predict(X) == y)
class KNeighborsRegressor:
    """
    k-Nearest Neighbors regressor.

    Predicts the (optionally inverse-distance-weighted) average target of
    the k closest training samples under Euclidean distance.
    """

    def __init__(self, n_neighbors: int = 5, weights: str = 'uniform'):
        self.k = n_neighbors
        self.weights = weights  # 'uniform' or 'distance'

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'KNeighborsRegressor':
        """Memorize the training set (lazy learning)."""
        self.X_train = X
        self.y_train = y
        return self

    def _predict_sample(self, x: np.ndarray) -> float:
        """
        Average the targets of x's k nearest neighbors.

        Raises:
            ValueError: if self.weights is neither 'uniform' nor 'distance'
                (previously an unknown scheme silently returned None).
        """
        distances = np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))
        k_indices = np.argsort(distances)[:self.k]
        k_distances = distances[k_indices]
        k_values = self.y_train[k_indices]
        if self.weights == 'uniform':
            return np.mean(k_values)
        if self.weights == 'distance':
            # Inverse-distance weighting; the epsilon avoids division by
            # zero when a query coincides with a training point.
            w = 1 / (k_distances + 1e-8)
            return np.sum(w * k_values) / np.sum(w)
        raise ValueError(f"Unknown weights scheme: {self.weights}")

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict a value for each row of X."""
        return np.array([self._predict_sample(x) for x in X])
5. Support Vector Machines
Linear SVM
class LinearSVM:
    """
    Linear soft-margin SVM trained with per-sample (sub)gradient descent.

    Objective: (1/2)||w||^2 + C * sum_i max(0, 1 - y_i (w·x_i + b)).
    """

    def __init__(self, C: float = 1.0, learning_rate: float = 0.001,
                 n_iterations: int = 1000):
        self.C = C
        self.lr = learning_rate
        self.n_iter = n_iterations
        self.weights = None
        self.bias = None

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'LinearSVM':
        """Fit on labels in {0,1} or {-1,+1}; internally mapped to {-1,+1}."""
        n_samples, n_features = X.shape
        signed = np.where(y <= 0, -1, 1)
        self.weights = np.zeros(n_features)
        self.bias = 0
        for _ in range(self.n_iter):
            for idx in range(n_samples):
                sample = X[idx]
                margin_ok = signed[idx] * (sample @ self.weights + self.bias) >= 1
                if margin_ok:
                    # Margin satisfied: only the L2 regularizer contributes,
                    # shrinking w toward zero (bias has no regularizer).
                    self.weights -= self.lr * self.weights
                else:
                    # Margin violated: regularizer plus hinge subgradient.
                    self.weights -= self.lr * (self.weights - self.C * signed[idx] * sample)
                    self.bias -= self.lr * (-self.C * signed[idx])
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the sign (-1/+1) of the decision function."""
        return np.sign(X @ self.weights + self.bias)

    def score(self, X: np.ndarray, y: np.ndarray) -> float:
        """Accuracy against labels mapped to {-1,+1}."""
        return np.mean(self.predict(X) == np.where(y <= 0, -1, 1))
# Kernel trick concept
"""
For non-linear data, we can use the kernel trick:
1. Map data to higher dimension: φ(x)
2. SVM only needs dot products: φ(x)·φ(x')
3. Kernel function: K(x, x') = φ(x)·φ(x')
Common kernels:
- Linear: K(x,x') = x·x'
- Polynomial: K(x,x') = (γx·x' + r)^d
- RBF: K(x,x') = exp(-γ||x-x'||²)
"""
6. Naive Bayes
class GaussianNaiveBayes:
    """
    Gaussian Naive Bayes classifier.

    Assumes features are conditionally independent and normally distributed
    within each class: P(y|x) ∝ P(y) * ∏ P(x_i|y).

    All posterior arithmetic is done directly in log space, so tiny
    densities cannot underflow to zero (the previous exp-then-log version
    produced -inf log-likelihoods, with RuntimeWarnings, for outliers).
    """

    def __init__(self):
        self.classes = None       # sorted unique class labels
        self.class_priors = None  # P(y=c), shape (n_classes,)
        self.means = None         # per-class feature means, (n_classes, n_features)
        self.variances = None     # per-class feature variances, (n_classes, n_features)

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'GaussianNaiveBayes':
        """Estimate per-class priors, means, and smoothed variances."""
        n_samples, n_features = X.shape
        self.classes = np.unique(y)
        n_classes = len(self.classes)
        self.class_priors = np.zeros(n_classes)
        self.means = np.zeros((n_classes, n_features))
        self.variances = np.zeros((n_classes, n_features))
        for i, c in enumerate(self.classes):
            X_c = X[y == c]
            self.class_priors[i] = len(X_c) / n_samples
            self.means[i] = np.mean(X_c, axis=0)
            # Additive smoothing keeps zero-variance features finite.
            self.variances[i] = np.var(X_c, axis=0) + 1e-9
        return self

    def _log_posteriors(self, x: np.ndarray) -> np.ndarray:
        """
        Unnormalized log P(y=c|x) for every class, using the closed-form
        log-density: log N(x; mu, var) = -(x-mu)^2/(2 var) - 0.5 log(2 pi var).
        Shared by predict() and predict_proba() so the two cannot diverge.
        """
        log_likelihood = np.sum(
            -((x - self.means) ** 2) / (2 * self.variances)
            - 0.5 * np.log(2 * np.pi * self.variances),
            axis=1,
        )
        return np.log(self.class_priors) + log_likelihood

    def _predict_sample(self, x: np.ndarray) -> int:
        """Return the class with the highest posterior for one sample."""
        return self.classes[np.argmax(self._log_posteriors(x))]

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict a class label for each row of X."""
        return np.array([self._predict_sample(x) for x in X])

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """
        Normalized class probabilities, shape (n_samples, n_classes).

        Subtracts the row-wise max before exponentiating so normalization
        is numerically stable.
        """
        log_probs = np.array([self._log_posteriors(x) for x in X])
        log_probs -= np.max(log_probs, axis=1, keepdims=True)
        probs = np.exp(log_probs)
        return probs / np.sum(probs, axis=1, keepdims=True)
7. Model Selection
Choosing the Right Algorithm
"""
ALGORITHM SELECTION GUIDE:
Linear Regression:
- Target: Continuous
- Relationship: Linear
- Interpretability: High
- Speed: Fast
Logistic Regression:
- Target: Binary/Multi-class
- Relationship: Linear decision boundary
- Interpretability: High
- Speed: Fast
Decision Trees:
- Target: Any
- Relationship: Non-linear, axis-aligned splits
- Interpretability: High
- Speed: Fast (can overfit)
K-Nearest Neighbors:
- Target: Any
- Relationship: Any (local patterns)
- Interpretability: Low
- Speed: Slow for prediction (no training)
SVM:
- Target: Binary (primarily)
- Relationship: Linear (kernel for non-linear)
- Interpretability: Medium
- Speed: Slow for large datasets
Naive Bayes:
- Target: Classification
- Assumptions: Feature independence
- Interpretability: High
- Speed: Very fast
"""
def recommend_algorithm(n_samples: int, n_features: int,
                        problem_type: str, needs_interpretability: bool) -> list:
    """
    Suggest candidate algorithms for a problem.

    Args:
        n_samples: number of training samples.
        n_features: number of input features (currently unused).
        problem_type: 'regression' or 'classification'.
        needs_interpretability: when True, interpretable models are moved
            to the front (relative order otherwise preserved).

    Returns:
        A list of algorithm class names, roughly in recommended order.
    """
    picks = []
    if problem_type == 'regression':
        picks.append('LinearRegression')
        if n_samples > 1000:
            picks.append('DecisionTreeRegressor')
        picks.append('KNeighborsRegressor')
    elif problem_type == 'classification':
        picks.extend(['LogisticRegression', 'GaussianNaiveBayes'])
        if n_samples < 10000:
            picks.append('SVM')  # SVM training scales poorly with n_samples
        picks.extend(['DecisionTreeClassifier', 'KNeighborsClassifier'])
    if needs_interpretability:
        interpretable = {'LogisticRegression', 'LinearRegression',
                         'DecisionTreeClassifier', 'DecisionTreeRegressor',
                         'GaussianNaiveBayes'}
        # Stable sort: interpretable models float to the front while the
        # relative order within each group is unchanged.
        picks.sort(key=lambda name: name not in interpretable)
    return picks
Exercises
Basic
Implement linear regression using gradient descent and compare with the normal equation.
Build a binary classifier using logistic regression on the Iris dataset (two classes).
Implement KNN from scratch and test with different values of k.
Intermediate
Create a decision tree and visualize it (print the tree structure).
Compare Naive Bayes with Logistic Regression on a text classification task.
Implement cross-validation to find the best hyperparameters for SVM.
Advanced
Implement a kernel SVM with the RBF kernel.
Build an ensemble of decision trees (Random Forest concept).
Create a multi-class SVM using one-vs-rest strategy.
Summary
- Linear regression minimizes MSE; logistic regression maximizes likelihood
- Decision trees split data recursively using information criteria
- KNN makes predictions based on nearest training examples
- SVMs maximize the margin between classes
- Naive Bayes applies Bayes' theorem with independence assumption
- Algorithm choice depends on data size, complexity, and requirements