Supervised Learning
Supervised learning algorithms learn a mapping from inputs to outputs using labeled training data. ## 1. Linear Regression
Simple Linear Regression
import numpy as np
from typing import Tuple
class LinearRegression:
"""
Linear regression: y = Xw + b
Finds weights that minimize Mean Squared Error.
"""
def __init__(self):
self.weights = None
self.bias = None
def fit(self, X: np.ndarray, y: np.ndarray) -> 'LinearRegression':
"""
Fit using Normal Equation: w = (X'X)^(-1) X'y
"""
# Add bias column
n_samples = X.shape[0]
X_b = np.column_stack([np.ones(n_samples), X])
# Solve normal equation
theta = np.linalg.solve(X_b.T @ X_b, X_b.T @ y)
self.bias = theta[0]
self.weights = theta[1:]
return self
def predict(self, X: np.ndarray) -> np.ndarray:
"""Predict using learned weights"""
return X @ self.weights + self.bias
def score(self, X: np.ndarray, y: np.ndarray) -> float:
"""R² score"""
y_pred = self.predict(X)
ss_res = np.sum((y - y_pred) ** 2)
ss_tot = np.sum((y - np.mean(y)) ** 2)
return 1 - (ss_res / ss_tot)
# Example
np.random.seed(42)
X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X.flatten() + np.random.randn(100)
model = LinearRegression()
model.fit(X, y)
print(f"Weights: {model.weights}, Bias: {model.bias}")
print(f"R² Score: {model.score(X, y):.4f}")
Gradient Descent
class LinearRegressionGD:
"""Linear regression using gradient descent"""
def __init__(self, learning_rate: float = 0.01, n_iterations: int = 1000):
self.lr = learning_rate
self.n_iter = n_iterations
self.weights = None
self.bias = None
self.loss_history = []
def fit(self, X: np.ndarray, y: np.ndarray) -> 'LinearRegressionGD':
n_samples, n_features = X.shape
# Initialize weights
self.weights = np.zeros(n_features)
self.bias = 0
for _ in range(self.n_iter):
# Predictions
y_pred = X @ self.weights + self.bias
# Gradients
dw = (1/n_samples) * (X.T @ (y_pred - y))
db = (1/n_samples) * np.sum(y_pred - y)
# Update weights
self.weights -= self.lr * dw
self.bias -= self.lr * db
# Track loss
loss = np.mean((y_pred - y) ** 2)
self.loss_history.append(loss)
return self
def predict(self, X: np.ndarray) -> np.ndarray:
return X @ self.weights + self.bias
class SGDRegressor:
"""Stochastic Gradient Descent (mini-batch)"""
def __init__(self, learning_rate: float = 0.01,
n_iterations: int = 100, batch_size: int = 32):
self.lr = learning_rate
self.n_iter = n_iterations
self.batch_size = batch_size
self.weights = None
self.bias = None
def fit(self, X: np.ndarray, y: np.ndarray) -> 'SGDRegressor':
n_samples, n_features = X.shape
self.weights = np.zeros(n_features)
self.bias = 0
for epoch in range(self.n_iter):
# Shuffle data
indices = np.random.permutation(n_samples)
for i in range(0, n_samples, self.batch_size):
batch_idx = indices[i:i + self.batch_size]
X_batch = X[batch_idx]
y_batch = y[batch_idx]
# Compute gradients on batch
y_pred = X_batch @ self.weights + self.bias
dw = (1/len(batch_idx)) * (X_batch.T @ (y_pred - y_batch))
db = (1/len(batch_idx)) * np.sum(y_pred - y_batch)
# Update
self.weights -= self.lr * dw
self.bias -= self.lr * db
return self
def predict(self, X: np.ndarray) -> np.ndarray:
return X @ self.weights + self.bias
2. Logistic Regression
Binary Classification
class LogisticRegression:
"""
Logistic regression for binary classification.
P(y=1|x) = sigmoid(w'x + b)
"""
def __init__(self, learning_rate: float = 0.01,
n_iterations: int = 1000):
self.lr = learning_rate
self.n_iter = n_iterations
self.weights = None
self.bias = None
@staticmethod
def sigmoid(z: np.ndarray) -> np.ndarray:
"""Sigmoid activation: 1 / (1 + e^(-z))"""
# Clip to prevent overflow
z = np.clip(z, -500, 500)
return 1 / (1 + np.exp(-z))
def fit(self, X: np.ndarray, y: np.ndarray) -> 'LogisticRegression':
n_samples, n_features = X.shape
self.weights = np.zeros(n_features)
self.bias = 0
for _ in range(self.n_iter):
# Linear model
z = X @ self.weights + self.bias
# Predictions (probabilities)
y_pred = self.sigmoid(z)
# Gradients (from cross-entropy loss)
dw = (1/n_samples) * (X.T @ (y_pred - y))
db = (1/n_samples) * np.sum(y_pred - y)
# Update
self.weights -= self.lr * dw
self.bias -= self.lr * db
return self
def predict_proba(self, X: np.ndarray) -> np.ndarray:
"""Predict probabilities"""
z = X @ self.weights + self.bias
return self.sigmoid(z)
def predict(self, X: np.ndarray, threshold: float = 0.5) -> np.ndarray:
"""Predict class labels"""
return (self.predict_proba(X) >= threshold).astype(int)
def score(self, X: np.ndarray, y: np.ndarray) -> float:
"""Accuracy"""
return np.mean(self.predict(X) == y)
# Example
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=1000, n_features=10, random_state=42)
model = LogisticRegression(learning_rate=0.1, n_iterations=1000)
model.fit(X, y)
print(f"Accuracy: {model.score(X, y):.4f}")
Multi-class Classification
class SoftmaxRegression:
"""
Softmax (multinomial logistic) regression for multi-class.
P(y=k|x) = exp(w_k'x) / sum_j(exp(w_j'x))
"""
def __init__(self, learning_rate: float = 0.01,
n_iterations: int = 1000):
self.lr = learning_rate
self.n_iter = n_iterations
self.weights = None
self.bias = None
self.classes = None
@staticmethod
def softmax(z: np.ndarray) -> np.ndarray:
"""Softmax: convert logits to probabilities"""
exp_z = np.exp(z - np.max(z, axis=1, keepdims=True)) # Stability
return exp_z / np.sum(exp_z, axis=1, keepdims=True)
def fit(self, X: np.ndarray, y: np.ndarray) -> 'SoftmaxRegression':
n_samples, n_features = X.shape
self.classes = np.unique(y)
n_classes = len(self.classes)
# One-hot encode y
y_onehot = np.zeros((n_samples, n_classes))
for i, c in enumerate(self.classes):
y_onehot[y == c, i] = 1
# Initialize weights
self.weights = np.zeros((n_features, n_classes))
self.bias = np.zeros(n_classes)
for _ in range(self.n_iter):
# Linear model
z = X @ self.weights + self.bias
# Softmax probabilities
y_pred = self.softmax(z)
# Gradients
error = y_pred - y_onehot
dw = (1/n_samples) * (X.T @ error)
db = (1/n_samples) * np.sum(error, axis=0)
# Update
self.weights -= self.lr * dw
self.bias -= self.lr * db
return self
def predict_proba(self, X: np.ndarray) -> np.ndarray:
z = X @ self.weights + self.bias
return self.softmax(z)
def predict(self, X: np.ndarray) -> np.ndarray:
proba = self.predict_proba(X)
return self.classes[np.argmax(proba, axis=1)]
3. Decision Trees
Classification Tree
from dataclasses import dataclass
from typing import Optional, Union
@dataclass
class TreeNode:
"""Node in decision tree"""
feature_idx: Optional[int] = None
threshold: Optional[float] = None
left: Optional['TreeNode'] = None
right: Optional['TreeNode'] = None
value: Optional[int] = None # Class label for leaf
class DecisionTreeClassifier:
"""Classification tree using Gini impurity"""
def __init__(self, max_depth: int = 10, min_samples_split: int = 2):
self.max_depth = max_depth
self.min_samples_split = min_samples_split
self.root = None
def fit(self, X: np.ndarray, y: np.ndarray) -> 'DecisionTreeClassifier':
self.n_classes = len(np.unique(y))
self.root = self._grow_tree(X, y, depth=0)
return self
def _gini(self, y: np.ndarray) -> float:
"""Gini impurity: 1 - sum(p_k^2)"""
if len(y) == 0:
return 0
proportions = np.bincount(y) / len(y)
return 1 - np.sum(proportions ** 2)
def _best_split(self, X: np.ndarray, y: np.ndarray) -> tuple:
"""Find best split by minimizing weighted Gini"""
n_samples, n_features = X.shape
best_gain = -1
best_feature, best_threshold = None, None
current_gini = self._gini(y)
for feature in range(n_features):
thresholds = np.unique(X[:, feature])
for threshold in thresholds:
left_mask = X[:, feature] <= threshold
right_mask = ~left_mask
if np.sum(left_mask) == 0 or np.sum(right_mask) == 0:
continue
# Weighted Gini of children
n_left = np.sum(left_mask)
n_right = np.sum(right_mask)
gini_left = self._gini(y[left_mask])
gini_right = self._gini(y[right_mask])
weighted_gini = (n_left * gini_left + n_right * gini_right) / n_samples
# Information gain
gain = current_gini - weighted_gini
if gain > best_gain:
best_gain = gain
best_feature = feature
best_threshold = threshold
return best_feature, best_threshold
def _grow_tree(self, X: np.ndarray, y: np.ndarray, depth: int) -> TreeNode:
"""Recursively grow tree"""
n_samples = len(y)
# Stopping conditions
if (depth >= self.max_depth or
n_samples < self.min_samples_split or
len(np.unique(y)) == 1):
# Create leaf node
leaf_value = np.bincount(y).argmax()
return TreeNode(value=leaf_value)
# Find best split
feature, threshold = self._best_split(X, y)
if feature is None:
leaf_value = np.bincount(y).argmax()
return TreeNode(value=leaf_value)
# Split data
left_mask = X[:, feature] <= threshold
right_mask = ~left_mask
# Recursively grow children
left_child = self._grow_tree(X[left_mask], y[left_mask], depth + 1)
right_child = self._grow_tree(X[right_mask], y[right_mask], depth + 1)
return TreeNode(
feature_idx=feature,
threshold=threshold,
left=left_child,
right=right_child
)
def _predict_sample(self, x: np.ndarray, node: TreeNode) -> int:
"""Predict single sample"""
if node.value is not None:
return node.value
if x[node.feature_idx] <= node.threshold:
return self._predict_sample(x, node.left)
return self._predict_sample(x, node.right)
def predict(self, X: np.ndarray) -> np.ndarray:
return np.array([self._predict_sample(x, self.root) for x in X])
def score(self, X: np.ndarray, y: np.ndarray) -> float:
return np.mean(self.predict(X) == y)
Regression Tree
class DecisionTreeRegressor:
"""Regression tree using MSE"""
def __init__(self, max_depth: int = 10, min_samples_split: int = 2):
self.max_depth = max_depth
self.min_samples_split = min_samples_split
self.root = None
def fit(self, X: np.ndarray, y: np.ndarray) -> 'DecisionTreeRegressor':
self.root = self._grow_tree(X, y, depth=0)
return self
def _mse(self, y: np.ndarray) -> float:
"""Mean Squared Error"""
if len(y) == 0:
return 0
return np.var(y)
def _best_split(self, X: np.ndarray, y: np.ndarray) -> tuple:
"""Find split that minimizes MSE"""
n_samples, n_features = X.shape
best_mse = float('inf')
best_feature, best_threshold = None, None
for feature in range(n_features):
thresholds = np.unique(X[:, feature])
for threshold in thresholds:
left_mask = X[:, feature] <= threshold
right_mask = ~left_mask
if np.sum(left_mask) == 0 or np.sum(right_mask) == 0:
continue
n_left = np.sum(left_mask)
n_right = np.sum(right_mask)
mse_left = self._mse(y[left_mask])
mse_right = self._mse(y[right_mask])
weighted_mse = (n_left * mse_left + n_right * mse_right) / n_samples
if weighted_mse < best_mse:
best_mse = weighted_mse
best_feature = feature
best_threshold = threshold
return best_feature, best_threshold
def _grow_tree(self, X: np.ndarray, y: np.ndarray, depth: int) -> TreeNode:
n_samples = len(y)
if (depth >= self.max_depth or
n_samples < self.min_samples_split or
np.var(y) < 1e-7):
return TreeNode(value=np.mean(y))
feature, threshold = self._best_split(X, y)
if feature is None:
return TreeNode(value=np.mean(y))
left_mask = X[:, feature] <= threshold
right_mask = ~left_mask
left_child = self._grow_tree(X[left_mask], y[left_mask], depth + 1)
right_child = self._grow_tree(X[right_mask], y[right_mask], depth + 1)
return TreeNode(
feature_idx=feature,
threshold=threshold,
left=left_child,
right=right_child
)
def _predict_sample(self, x: np.ndarray, node: TreeNode) -> float:
if node.value is not None:
return node.value
if x[node.feature_idx] <= node.threshold:
return self._predict_sample(x, node.left)
return self._predict_sample(x, node.right)
def predict(self, X: np.ndarray) -> np.ndarray:
return np.array([self._predict_sample(x, self.root) for x in X])
4. K-Nearest Neighbors
from collections import Counter
class KNeighborsClassifier:
"""K-Nearest Neighbors classifier"""
def __init__(self, n_neighbors: int = 5, metric: str = 'euclidean'):
self.k = n_neighbors
self.metric = metric
def fit(self, X: np.ndarray, y: np.ndarray) -> 'KNeighborsClassifier':
"""Store training data (lazy learning)"""
self.X_train = X
self.y_train = y
return self
def _distance(self, x1: np.ndarray, x2: np.ndarray) -> float:
"""Compute distance between points"""
if self.metric == 'euclidean':
return np.sqrt(np.sum((x1 - x2) ** 2))
elif self.metric == 'manhattan':
return np.sum(np.abs(x1 - x2))
else:
raise ValueError(f"Unknown metric: {self.metric}")
def _predict_sample(self, x: np.ndarray) -> int:
"""Predict single sample"""
# Compute distances to all training points
distances = [self._distance(x, x_train) for x_train in self.X_train]
# Get k nearest neighbors
k_indices = np.argsort(distances)[:self.k]
k_labels = self.y_train[k_indices]
# Majority vote
return Counter(k_labels).most_common(1)[0][0]
def predict(self, X: np.ndarray) -> np.ndarray:
return np.array([self._predict_sample(x) for x in X])
def score(self, X: np.ndarray, y: np.ndarray) -> float:
return np.mean(self.predict(X) == y)
class KNeighborsRegressor:
"""K-Nearest Neighbors regressor"""
def __init__(self, n_neighbors: int = 5, weights: str = 'uniform'):
self.k = n_neighbors
self.weights = weights
def fit(self, X: np.ndarray, y: np.ndarray) -> 'KNeighborsRegressor':
self.X_train = X
self.y_train = y
return self
def _predict_sample(self, x: np.ndarray) -> float:
distances = np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))
k_indices = np.argsort(distances)[:self.k]
k_distances = distances[k_indices]
k_values = self.y_train[k_indices]
if self.weights == 'uniform':
return np.mean(k_values)
elif self.weights == 'distance':
# Weight by inverse distance
weights = 1 / (k_distances + 1e-8)
return np.sum(weights * k_values) / np.sum(weights)
def predict(self, X: np.ndarray) -> np.ndarray:
return np.array([self._predict_sample(x) for x in X])
5. Support Vector Machines
Linear SVM
class LinearSVM:
"""
Linear Support Vector Machine using gradient descent.
Minimizes: (1/2)||w||² + C * sum(max(0, 1 - y_i(w·x_i + b)))
"""
def __init__(self, C: float = 1.0, learning_rate: float = 0.001,
n_iterations: int = 1000):
self.C = C
self.lr = learning_rate
self.n_iter = n_iterations
self.weights = None
self.bias = None
def fit(self, X: np.ndarray, y: np.ndarray) -> 'LinearSVM':
n_samples, n_features = X.shape
# Convert labels to {-1, 1}
y_ = np.where(y <= 0, -1, 1)
self.weights = np.zeros(n_features)
self.bias = 0
for _ in range(self.n_iter):
for i, x_i in enumerate(X):
# Check if correctly classified with margin
condition = y_[i] * (x_i @ self.weights + self.bias) >= 1
if condition:
# Correctly classified: only regularization gradient
self.weights -= self.lr * self.weights
else:
# Misclassified: regularization + hinge loss gradient
self.weights -= self.lr * (self.weights - self.C * y_[i] * x_i)
self.bias -= self.lr * (-self.C * y_[i])
return self
def predict(self, X: np.ndarray) -> np.ndarray:
linear_output = X @ self.weights + self.bias
return np.sign(linear_output)
def score(self, X: np.ndarray, y: np.ndarray) -> float:
y_ = np.where(y <= 0, -1, 1)
return np.mean(self.predict(X) == y_)
# Kernel trick concept
"""
For non-linear data, we can use the kernel trick:
1. Map data to higher dimension: φ(x)
2. SVM only needs dot products: φ(x)·φ(x')
3. Kernel function: K(x, x') = φ(x)·φ(x')
Common kernels:
- Linear: K(x,x') = x·x'
- Polynomial: K(x,x') = (γx·x' + r)^d
- RBF: K(x,x') = exp(-γ||x-x'||²)
"""
6. Naive Bayes
class GaussianNaiveBayes:
"""
Gaussian Naive Bayes classifier.
Assumes features are independent and normally distributed.
P(y|x) ∝ P(y) * ∏ P(x_i|y)
"""
def __init__(self):
self.classes = None
self.class_priors = None
self.means = None
self.variances = None
def fit(self, X: np.ndarray, y: np.ndarray) -> 'GaussianNaiveBayes':
n_samples, n_features = X.shape
self.classes = np.unique(y)
n_classes = len(self.classes)
# Calculate statistics for each class
self.class_priors = np.zeros(n_classes)
self.means = np.zeros((n_classes, n_features))
self.variances = np.zeros((n_classes, n_features))
for i, c in enumerate(self.classes):
X_c = X[y == c]
self.class_priors[i] = len(X_c) / n_samples
self.means[i] = np.mean(X_c, axis=0)
self.variances[i] = np.var(X_c, axis=0) + 1e-9 # Smoothing
return self
def _gaussian_pdf(self, x: np.ndarray, mean: np.ndarray,
var: np.ndarray) -> np.ndarray:
"""Gaussian probability density"""
return np.exp(-((x - mean) ** 2) / (2 * var)) / np.sqrt(2 * np.pi * var)
def _predict_sample(self, x: np.ndarray) -> int:
"""Predict class for single sample"""
posteriors = []
for i, c in enumerate(self.classes):
prior = np.log(self.class_priors[i])
likelihood = np.sum(np.log(
self._gaussian_pdf(x, self.means[i], self.variances[i])
))
posteriors.append(prior + likelihood)
return self.classes[np.argmax(posteriors)]
def predict(self, X: np.ndarray) -> np.ndarray:
return np.array([self._predict_sample(x) for x in X])
def predict_proba(self, X: np.ndarray) -> np.ndarray:
"""Predict class probabilities"""
n_samples = len(X)
n_classes = len(self.classes)
log_probs = np.zeros((n_samples, n_classes))
for i in range(n_samples):
for j, c in enumerate(self.classes):
prior = np.log(self.class_priors[j])
likelihood = np.sum(np.log(
self._gaussian_pdf(X[i], self.means[j], self.variances[j])
))
log_probs[i, j] = prior + likelihood
# Convert to probabilities
log_probs -= np.max(log_probs, axis=1, keepdims=True)
probs = np.exp(log_probs)
return probs / np.sum(probs, axis=1, keepdims=True)
7. Model Selection
Choosing the Right Algorithm
"""
ALGORITHM SELECTION GUIDE:
Linear Regression:
- Target: Continuous
- Relationship: Linear
- Interpretability: High
- Speed: Fast
Logistic Regression:
- Target: Binary/Multi-class
- Relationship: Linear decision boundary
- Interpretability: High
- Speed: Fast
Decision Trees:
- Target: Any
- Relationship: Non-linear, axis-aligned splits
- Interpretability: High
- Speed: Fast (can overfit)
K-Nearest Neighbors:
- Target: Any
- Relationship: Any (local patterns)
- Interpretability: Low
- Speed: Slow for prediction (no training)
SVM:
- Target: Binary (primarily)
- Relationship: Linear (kernel for non-linear)
- Interpretability: Medium
- Speed: Slow for large datasets
Naive Bayes:
- Target: Classification
- Assumptions: Feature independence
- Interpretability: High
- Speed: Very fast
"""
def recommend_algorithm(n_samples: int, n_features: int,
problem_type: str, needs_interpretability: bool) -> list:
"""Recommend algorithms based on problem characteristics"""
recommendations = []
if problem_type == 'regression':
recommendations.append('LinearRegression')
if n_samples > 1000:
recommendations.append('DecisionTreeRegressor')
recommendations.append('KNeighborsRegressor')
elif problem_type == 'classification':
recommendations.append('LogisticRegression')
recommendations.append('GaussianNaiveBayes')
if n_samples < 10000:
recommendations.append('SVM')
recommendations.append('DecisionTreeClassifier')
recommendations.append('KNeighborsClassifier')
# Prioritize based on interpretability
if needs_interpretability:
# Move interpretable models to front
interpretable = ['LogisticRegression', 'LinearRegression',
'DecisionTreeClassifier', 'DecisionTreeRegressor',
'GaussianNaiveBayes']
recommendations = sorted(
recommendations,
key=lambda x: 0 if x in interpretable else 1
)
return recommendations
Exercises
Basic
Implement linear regression using gradient descent and compare with the normal equation.
Build a binary classifier using logistic regression on the Iris dataset (two classes).
Implement KNN from scratch and test with different values of k.
Intermediate
Create a decision tree and visualize it (print the tree structure).
Compare Naive Bayes with Logistic Regression on a text classification task.
Implement cross-validation to find the best hyperparameters for SVM.
Advanced
Implement a kernel SVM with the RBF kernel.
Build an ensemble of decision trees (Random Forest concept).
Create a multi-class SVM using one-vs-rest strategy.
Summary
- Linear regression minimizes MSE; logistic regression maximizes likelihood
- Decision trees split data recursively using information criteria
- KNN makes predictions based on nearest training examples
- SVMs maximize the margin between classes
- Naive Bayes applies Bayes' theorem with independence assumption
- Algorithm choice depends on data size, complexity, and requirements