Supervised Learning

Introduction

Supervised learning algorithms learn a mapping from inputs to outputs using labeled training data. This reading covers fundamental supervised learning algorithms for both regression and classification tasks.

Learning Objectives

By the end of this reading, you will be able to:

  • Implement linear and logistic regression
  • Understand and build decision trees
  • Apply k-Nearest Neighbors (KNN)
  • Implement Support Vector Machines (SVMs)
  • Choose appropriate algorithms for different problems

1. Linear Regression

Simple Linear Regression

import numpy as np
from typing import Tuple

class LinearRegression:
    """
    Ordinary least squares linear regression: y = Xw + b.

    Fits a closed-form least-squares solution. Uses `np.linalg.lstsq`
    rather than solving the normal equation directly, so the fit is
    numerically stable and still works when X'X is singular (e.g.
    duplicated or collinear feature columns, where `solve` would raise).
    """

    def __init__(self):
        self.weights = None  # learned coefficients, shape (n_features,)
        self.bias = None     # learned intercept (scalar)

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'LinearRegression':
        """
        Fit via least squares on the augmented design matrix [1 | X];
        equivalent to the normal equation w = (X'X)^(-1) X'y when X'X
        is invertible, and the minimum-norm solution otherwise.

        Args:
            X: Training inputs, shape (n_samples, n_features).
            y: Targets, shape (n_samples,).

        Returns:
            self, to allow method chaining.
        """
        # Prepend a column of ones so the intercept is learned jointly.
        n_samples = X.shape[0]
        X_b = np.column_stack([np.ones(n_samples), X])

        # lstsq handles rank-deficient X_b gracefully (solve() would fail).
        theta, *_ = np.linalg.lstsq(X_b, y, rcond=None)

        self.bias = theta[0]
        self.weights = theta[1:]
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict targets for X using the learned weights and bias."""
        return X @ self.weights + self.bias

    def score(self, X: np.ndarray, y: np.ndarray) -> float:
        """Coefficient of determination R² (1.0 means a perfect fit)."""
        y_pred = self.predict(X)
        ss_res = np.sum((y - y_pred) ** 2)      # residual sum of squares
        ss_tot = np.sum((y - np.mean(y)) ** 2)  # total sum of squares
        return 1 - (ss_res / ss_tot)

# Example: recover y = 4 + 3x from noisy synthetic data
np.random.seed(42)
X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X.ravel() + np.random.randn(100)

reg = LinearRegression().fit(X, y)
print(f"Weights: {reg.weights}, Bias: {reg.bias}")
print(f"R² Score: {reg.score(X, y):.4f}")

Gradient Descent

class LinearRegressionGD:
    """
    Linear regression fit by full-batch gradient descent on MSE loss.

    Attributes:
        weights: Learned coefficients, shape (n_features,).
        bias: Learned intercept.
        loss_history: MSE at each iteration of the most recent fit().
    """

    def __init__(self, learning_rate: float = 0.01, n_iterations: int = 1000):
        self.lr = learning_rate
        self.n_iter = n_iterations
        self.weights = None
        self.bias = None
        self.loss_history = []

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'LinearRegressionGD':
        """
        Run `n_iterations` full-batch gradient descent steps.

        Args:
            X: Training inputs, shape (n_samples, n_features).
            y: Targets, shape (n_samples,).

        Returns:
            self, to allow method chaining.
        """
        n_samples, n_features = X.shape

        # Start from zero parameters, and reset the loss trace so that
        # repeated fit() calls don't accumulate stale history (bug fix:
        # previously loss_history grew across fits).
        self.weights = np.zeros(n_features)
        self.bias = 0.0
        self.loss_history = []

        for _ in range(self.n_iter):
            y_pred = X @ self.weights + self.bias

            # MSE gradients; the conventional factor of 2 is absorbed
            # into the learning rate.
            dw = (1 / n_samples) * (X.T @ (y_pred - y))
            db = (1 / n_samples) * np.sum(y_pred - y)

            self.weights -= self.lr * dw
            self.bias -= self.lr * db

            # Loss is computed from y_pred, i.e. *before* this update.
            self.loss_history.append(np.mean((y_pred - y) ** 2))

        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict targets using the learned linear model."""
        return X @ self.weights + self.bias

class SGDRegressor:
    """Linear regression trained with mini-batch stochastic gradient descent."""

    def __init__(self, learning_rate: float = 0.01,
                n_iterations: int = 100, batch_size: int = 32):
        self.lr = learning_rate
        self.n_iter = n_iterations
        self.batch_size = batch_size
        self.weights = None
        self.bias = None

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'SGDRegressor':
        """Sweep shuffled mini-batches for `n_iterations` epochs; returns self."""
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        for _ in range(self.n_iter):
            # Fresh random visiting order every epoch.
            order = np.random.permutation(n_samples)

            for start in range(0, n_samples, self.batch_size):
                batch = order[start:start + self.batch_size]
                Xb, yb = X[batch], y[batch]

                # MSE gradient computed on this mini-batch only.
                residual = Xb @ self.weights + self.bias - yb
                grad_w = (Xb.T @ residual) / len(batch)
                grad_b = np.sum(residual) / len(batch)

                self.weights -= self.lr * grad_w
                self.bias -= self.lr * grad_b

        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict targets with the learned linear model."""
        return X @ self.weights + self.bias

2. Logistic Regression

Binary Classification

class LogisticRegression:
    """
    Binary logistic regression trained by gradient descent.

    Models P(y=1|x) = sigmoid(w'x + b) and minimizes cross-entropy.
    """

    def __init__(self, learning_rate: float = 0.01,
                n_iterations: int = 1000):
        self.lr = learning_rate
        self.n_iter = n_iterations
        self.weights = None
        self.bias = None

    @staticmethod
    def sigmoid(z: np.ndarray) -> np.ndarray:
        """Numerically safe logistic function 1 / (1 + e^(-z))."""
        # Clipping |z| avoids overflow in np.exp for extreme logits.
        return 1 / (1 + np.exp(-np.clip(z, -500, 500)))

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'LogisticRegression':
        """Run full-batch gradient descent on 0/1 labels; returns self."""
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        for _ in range(self.n_iter):
            # Forward pass: probability of the positive class.
            probs = self.sigmoid(X @ self.weights + self.bias)

            # Cross-entropy gradient takes the same (pred - target)
            # form as the MSE gradient in linear regression.
            error = probs - y
            self.weights -= self.lr * (X.T @ error) / n_samples
            self.bias -= self.lr * np.sum(error) / n_samples

        return self

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Return P(y=1|x) for each row of X."""
        return self.sigmoid(X @ self.weights + self.bias)

    def predict(self, X: np.ndarray, threshold: float = 0.5) -> np.ndarray:
        """Return hard 0/1 labels by thresholding the probabilities."""
        return (self.predict_proba(X) >= threshold).astype(int)

    def score(self, X: np.ndarray, y: np.ndarray) -> float:
        """Classification accuracy on (X, y)."""
        return np.mean(self.predict(X) == y)

# Example: synthetic binary classification problem
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, n_features=10, random_state=42)

clf = LogisticRegression(learning_rate=0.1, n_iterations=1000).fit(X, y)
print(f"Accuracy: {clf.score(X, y):.4f}")

Multi-class Classification

class SoftmaxRegression:
    """
    Multinomial (softmax) logistic regression for multi-class problems.

    Models P(y=k|x) = exp(w_k'x + b_k) / sum_j exp(w_j'x + b_j) and is
    trained with full-batch gradient descent on the cross-entropy loss.
    """

    def __init__(self, learning_rate: float = 0.01,
                n_iterations: int = 1000):
        self.lr = learning_rate
        self.n_iter = n_iterations
        self.weights = None  # (n_features, n_classes)
        self.bias = None     # (n_classes,)
        self.classes = None  # sorted original class labels

    @staticmethod
    def softmax(z: np.ndarray) -> np.ndarray:
        """Row-wise softmax; subtracting the row max keeps exp() stable."""
        shifted = np.exp(z - np.max(z, axis=1, keepdims=True))
        return shifted / np.sum(shifted, axis=1, keepdims=True)

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'SoftmaxRegression':
        """Fit one weight column per class; returns self."""
        n_samples, n_features = X.shape
        self.classes = np.unique(y)
        n_classes = len(self.classes)

        # One-hot targets via broadcasting: row i has a single 1 in the
        # column corresponding to sample i's class.
        y_onehot = (y[:, None] == self.classes[None, :]).astype(float)

        self.weights = np.zeros((n_features, n_classes))
        self.bias = np.zeros(n_classes)

        for _ in range(self.n_iter):
            probs = self.softmax(X @ self.weights + self.bias)

            # Cross-entropy gradient: averaged (predicted - true)
            # class-probability errors.
            error = probs - y_onehot
            self.weights -= self.lr * (X.T @ error) / n_samples
            self.bias -= self.lr * np.sum(error, axis=0) / n_samples

        return self

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Return the (n_samples, n_classes) probability matrix."""
        return self.softmax(X @ self.weights + self.bias)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the most probable original class label per sample."""
        return self.classes[np.argmax(self.predict_proba(X), axis=1)]

3. Decision Trees

Classification Tree

from dataclasses import dataclass
from typing import Optional, Union

@dataclass
class TreeNode:
    """A single node in a decision tree.

    Internal nodes carry a (feature_idx, threshold) split plus two
    children; leaf nodes carry only `value` and leave the rest None.
    """
    feature_idx: Optional[int] = None   # feature column tested at this node
    threshold: Optional[float] = None   # go left when x[feature_idx] <= threshold
    left: Optional['TreeNode'] = None   # subtree for samples <= threshold
    right: Optional['TreeNode'] = None  # subtree for samples > threshold
    # Leaf prediction: class label (classifier) or mean target (regressor).
    value: Optional[Union[int, float]] = None

class DecisionTreeClassifier:
    """Greedy CART-style classification tree split on Gini impurity."""

    def __init__(self, max_depth: int = 10, min_samples_split: int = 2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'DecisionTreeClassifier':
        """Build the tree; y must hold non-negative integer labels."""
        self.n_classes = len(np.unique(y))
        self.root = self._grow_tree(X, y, depth=0)
        return self

    def _gini(self, y: np.ndarray) -> float:
        """Gini impurity 1 - sum_k p_k^2 (0 for a pure node)."""
        if len(y) == 0:
            return 0
        p = np.bincount(y) / len(y)
        return 1 - np.sum(p ** 2)

    def _best_split(self, X: np.ndarray, y: np.ndarray) -> tuple:
        """Return the (feature, threshold) maximizing Gini gain, or (None, None)."""
        n_samples, n_features = X.shape
        parent_gini = self._gini(y)
        best_feature, best_threshold = None, None
        best_gain = -1  # any valid split beats this sentinel

        for col in range(n_features):
            for cut in np.unique(X[:, col]):
                go_left = X[:, col] <= cut
                n_left = np.sum(go_left)
                n_right = n_samples - n_left

                # A usable split must leave samples on both sides.
                if n_left == 0 or n_right == 0:
                    continue

                child_gini = (n_left * self._gini(y[go_left]) +
                              n_right * self._gini(y[~go_left])) / n_samples
                gain = parent_gini - child_gini

                if gain > best_gain:
                    best_gain = gain
                    best_feature, best_threshold = col, cut

        return best_feature, best_threshold

    def _grow_tree(self, X: np.ndarray, y: np.ndarray, depth: int) -> TreeNode:
        """Recursively split until a stopping rule fires, then emit a leaf."""
        # Stop on depth limit, tiny node, or already-pure labels.
        if (depth >= self.max_depth or
            len(y) < self.min_samples_split or
            len(np.unique(y)) == 1):
            return TreeNode(value=np.bincount(y).argmax())

        col, cut = self._best_split(X, y)
        if col is None:  # no valid split found
            return TreeNode(value=np.bincount(y).argmax())

        go_left = X[:, col] <= cut
        return TreeNode(
            feature_idx=col,
            threshold=cut,
            left=self._grow_tree(X[go_left], y[go_left], depth + 1),
            right=self._grow_tree(X[~go_left], y[~go_left], depth + 1),
        )

    def _predict_sample(self, x: np.ndarray, node: TreeNode) -> int:
        """Walk from `node` down to a leaf following the split tests."""
        while node.value is None:
            node = node.left if x[node.feature_idx] <= node.threshold else node.right
        return node.value

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict a class label for every row of X."""
        return np.array([self._predict_sample(x, self.root) for x in X])

    def score(self, X: np.ndarray, y: np.ndarray) -> float:
        """Mean accuracy on (X, y)."""
        return np.mean(self.predict(X) == y)

Regression Tree

class DecisionTreeRegressor:
    """Greedy CART-style regression tree minimizing within-node variance."""

    def __init__(self, max_depth: int = 10, min_samples_split: int = 2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'DecisionTreeRegressor':
        """Build the tree from (X, y); returns self."""
        self.root = self._grow_tree(X, y, depth=0)
        return self

    def _mse(self, y: np.ndarray) -> float:
        """Node impurity: variance of the targets (MSE around their mean)."""
        return 0 if len(y) == 0 else np.var(y)

    def _best_split(self, X: np.ndarray, y: np.ndarray) -> tuple:
        """Return the (feature, threshold) minimizing weighted child MSE."""
        n_samples, n_features = X.shape
        best_feature, best_threshold = None, None
        lowest_mse = float('inf')

        for col in range(n_features):
            for cut in np.unique(X[:, col]):
                go_left = X[:, col] <= cut
                n_left = np.sum(go_left)
                n_right = n_samples - n_left

                # A usable split must leave samples on both sides.
                if n_left == 0 or n_right == 0:
                    continue

                weighted = (n_left * self._mse(y[go_left]) +
                            n_right * self._mse(y[~go_left])) / n_samples

                if weighted < lowest_mse:
                    lowest_mse = weighted
                    best_feature, best_threshold = col, cut

        return best_feature, best_threshold

    def _grow_tree(self, X: np.ndarray, y: np.ndarray, depth: int) -> TreeNode:
        """Recursively split; leaves predict the mean target of their node."""
        # Stop on depth limit, tiny node, or (near-)constant targets.
        if (depth >= self.max_depth or
            len(y) < self.min_samples_split or
            np.var(y) < 1e-7):
            return TreeNode(value=np.mean(y))

        col, cut = self._best_split(X, y)
        if col is None:  # no usable split
            return TreeNode(value=np.mean(y))

        go_left = X[:, col] <= cut
        return TreeNode(
            feature_idx=col,
            threshold=cut,
            left=self._grow_tree(X[go_left], y[go_left], depth + 1),
            right=self._grow_tree(X[~go_left], y[~go_left], depth + 1),
        )

    def _predict_sample(self, x: np.ndarray, node: TreeNode) -> float:
        """Iteratively descend to a leaf and return its stored mean."""
        while node.value is None:
            node = node.left if x[node.feature_idx] <= node.threshold else node.right
        return node.value

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict a continuous value for every row of X."""
        return np.array([self._predict_sample(x, self.root) for x in X])

4. K-Nearest Neighbors

from collections import Counter

class KNeighborsClassifier:
    """K-Nearest Neighbors classifier (lazy learner, majority vote).

    Distances from a query point to all stored training points are
    computed with a single vectorized NumPy expression rather than a
    per-point Python loop, matching KNeighborsRegressor's approach and
    making prediction substantially faster; results are unchanged.
    """

    def __init__(self, n_neighbors: int = 5, metric: str = 'euclidean'):
        self.k = n_neighbors
        self.metric = metric  # 'euclidean' or 'manhattan'

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'KNeighborsClassifier':
        """Store training data (lazy learning); returns self."""
        self.X_train = X
        self.y_train = y
        return self

    def _distances(self, x: np.ndarray) -> np.ndarray:
        """Vector of distances from x to every training point.

        Raises:
            ValueError: If `metric` is not 'euclidean' or 'manhattan'.
        """
        diff = self.X_train - x  # broadcasts x across all training rows
        if self.metric == 'euclidean':
            return np.sqrt(np.sum(diff ** 2, axis=1))
        elif self.metric == 'manhattan':
            return np.sum(np.abs(diff), axis=1)
        else:
            raise ValueError(f"Unknown metric: {self.metric}")

    def _predict_sample(self, x: np.ndarray) -> int:
        """Majority vote among the k nearest training labels."""
        k_indices = np.argsort(self._distances(x))[:self.k]
        k_labels = self.y_train[k_indices]
        # most_common(1) breaks count ties by first occurrence.
        return Counter(k_labels).most_common(1)[0][0]

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict a label for each row of X."""
        return np.array([self._predict_sample(x) for x in X])

    def score(self, X: np.ndarray, y: np.ndarray) -> float:
        """Mean accuracy on (X, y)."""
        return np.mean(self.predict(X) == y)

class KNeighborsRegressor:
    """K-Nearest Neighbors regressor with uniform or distance weighting."""

    def __init__(self, n_neighbors: int = 5, weights: str = 'uniform'):
        self.k = n_neighbors
        self.weights = weights  # 'uniform' or 'distance'

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'KNeighborsRegressor':
        """Store training data (lazy learning); returns self."""
        self.X_train = X
        self.y_train = y
        return self

    def _predict_sample(self, x: np.ndarray) -> float:
        """Average (possibly distance-weighted) of the k nearest targets.

        Raises:
            ValueError: If `weights` is not 'uniform' or 'distance'.
        """
        distances = np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))
        k_indices = np.argsort(distances)[:self.k]
        k_distances = distances[k_indices]
        k_values = self.y_train[k_indices]

        if self.weights == 'uniform':
            return np.mean(k_values)
        elif self.weights == 'distance':
            # Closer neighbors get larger weights; epsilon avoids 1/0.
            w = 1 / (k_distances + 1e-8)
            return np.sum(w * k_values) / np.sum(w)
        # Bug fix: previously an unknown mode silently returned None.
        raise ValueError(f"Unknown weights: {self.weights}")

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict a target value for each row of X."""
        return np.array([self._predict_sample(x) for x in X])

5. Support Vector Machines

Linear SVM

class LinearSVM:
    """
    Soft-margin linear SVM trained by per-sample (sub)gradient descent.

    Objective: (1/2)||w||² + C * sum(max(0, 1 - y_i(w·x_i + b)))
    """

    def __init__(self, C: float = 1.0, learning_rate: float = 0.001,
                n_iterations: int = 1000):
        self.C = C
        self.lr = learning_rate
        self.n_iter = n_iterations
        self.weights = None
        self.bias = None

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'LinearSVM':
        """Fit on labels given as {0, 1} or {-1, +1}; returns self."""
        n_samples, n_features = X.shape

        # The hinge loss needs labels in {-1, +1}.
        targets = np.where(y <= 0, -1, 1)

        self.weights = np.zeros(n_features)
        self.bias = 0

        for _ in range(self.n_iter):
            for idx in range(n_samples):
                sample = X[idx]
                margin = targets[idx] * (sample @ self.weights + self.bias)

                if margin >= 1:
                    # Margin satisfied: only the L2 regularization term
                    # contributes a gradient (bias is unregularized).
                    self.weights -= self.lr * self.weights
                else:
                    # Margin violated: add the hinge-loss subgradient.
                    self.weights -= self.lr * (self.weights - self.C * targets[idx] * sample)
                    self.bias += self.lr * self.C * targets[idx]

        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the sign (+1/-1) of the decision function."""
        return np.sign(X @ self.weights + self.bias)

    def score(self, X: np.ndarray, y: np.ndarray) -> float:
        """Accuracy against labels converted to {-1, +1}."""
        return np.mean(self.predict(X) == np.where(y <= 0, -1, 1))

# Kernel trick concept
"""
For non-linear data, we can use the kernel trick:
1. Map data to higher dimension: φ(x)
2. SVM only needs dot products: φ(x)·φ(x')
3. Kernel function: K(x, x') = φ(x)·φ(x')

Common kernels:
- Linear: K(x,x') = x·x'
- Polynomial: K(x,x') = (γx·x' + r)^d
- RBF: K(x,x') = exp(-γ||x-x'||²)
"""

6. Naive Bayes

class GaussianNaiveBayes:
    """
    Gaussian Naive Bayes classifier.

    Assumes features are conditionally independent given the class and
    normally distributed, so P(y|x) ∝ P(y) * ∏ P(x_i|y). Posteriors are
    computed in log space to avoid numeric underflow. The shared
    log-posterior math lives in `_joint_log_likelihood`, removing the
    duplication the original had between predict() and predict_proba().
    """

    def __init__(self):
        self.classes = None       # unique labels seen at fit time
        self.class_priors = None  # P(y=c), shape (n_classes,)
        self.means = None         # per-class feature means, (n_classes, n_features)
        self.variances = None     # per-class feature variances, same shape

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'GaussianNaiveBayes':
        """Estimate per-class priors, means, and variances; returns self."""
        n_samples, n_features = X.shape
        self.classes = np.unique(y)
        n_classes = len(self.classes)

        self.class_priors = np.zeros(n_classes)
        self.means = np.zeros((n_classes, n_features))
        self.variances = np.zeros((n_classes, n_features))

        for i, c in enumerate(self.classes):
            X_c = X[y == c]
            self.class_priors[i] = len(X_c) / n_samples
            self.means[i] = np.mean(X_c, axis=0)
            # Additive smoothing keeps variances strictly positive.
            self.variances[i] = np.var(X_c, axis=0) + 1e-9

        return self

    def _gaussian_pdf(self, x: np.ndarray, mean: np.ndarray,
                     var: np.ndarray) -> np.ndarray:
        """Per-feature Gaussian density N(x; mean, var)."""
        return np.exp(-((x - mean) ** 2) / (2 * var)) / np.sqrt(2 * np.pi * var)

    def _joint_log_likelihood(self, x: np.ndarray) -> np.ndarray:
        """Return log P(y=c) + sum_i log P(x_i|y=c) for every class c."""
        return np.array([
            np.log(self.class_priors[i]) +
            np.sum(np.log(self._gaussian_pdf(x, self.means[i], self.variances[i])))
            for i in range(len(self.classes))
        ])

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the maximum-a-posteriori class for each row of X."""
        return np.array([
            self.classes[np.argmax(self._joint_log_likelihood(x))] for x in X
        ])

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Return normalized class probabilities, shape (n_samples, n_classes)."""
        log_probs = np.array([self._joint_log_likelihood(x) for x in X])

        # Softmax over log-posteriors; subtracting the row max is purely
        # for numerical stability and cancels in the normalization.
        log_probs -= np.max(log_probs, axis=1, keepdims=True)
        probs = np.exp(log_probs)
        return probs / np.sum(probs, axis=1, keepdims=True)

7. Model Selection

Choosing the Right Algorithm

"""
ALGORITHM SELECTION GUIDE:

Linear Regression:
- Target: Continuous
- Relationship: Linear
- Interpretability: High
- Speed: Fast

Logistic Regression:
- Target: Binary/Multi-class
- Relationship: Linear decision boundary
- Interpretability: High
- Speed: Fast

Decision Trees:
- Target: Any
- Relationship: Non-linear, axis-aligned splits
- Interpretability: High
- Speed: Fast to train and predict (but prone to overfitting without depth limits or pruning)

K-Nearest Neighbors:
- Target: Any
- Relationship: Any (local patterns)
- Interpretability: Low
- Speed: No real training cost, but slow at prediction time (distances to all stored points)

SVM:
- Target: Binary (primarily)
- Relationship: Linear (kernel for non-linear)
- Interpretability: Medium
- Speed: Slow for large datasets

Naive Bayes:
- Target: Classification
- Assumptions: Feature independence
- Interpretability: High
- Speed: Very fast
"""

def recommend_algorithm(n_samples: int, n_features: int,
                       problem_type: str, needs_interpretability: bool) -> list:
    """Suggest candidate algorithms for a dataset/problem description.

    Args:
        n_samples: Number of training examples.
        n_features: Number of features (currently unused by the rules).
        problem_type: Either 'regression' or 'classification'.
        needs_interpretability: When True, interpretable models are
            moved to the front of the returned list.

    Returns:
        Ordered list of algorithm names (best candidates first).
    """
    picks = []

    if problem_type == 'regression':
        picks.append('LinearRegression')
        # Trees are only suggested once there is a fair amount of data.
        if n_samples > 1000:
            picks.append('DecisionTreeRegressor')
        picks.append('KNeighborsRegressor')
    elif problem_type == 'classification':
        picks += ['LogisticRegression', 'GaussianNaiveBayes']
        # SVM training scales poorly, so skip it for large datasets.
        if n_samples < 10000:
            picks.append('SVM')
        picks += ['DecisionTreeClassifier', 'KNeighborsClassifier']

    if needs_interpretability:
        interpretable = {'LogisticRegression', 'LinearRegression',
                         'DecisionTreeClassifier', 'DecisionTreeRegressor',
                         'GaussianNaiveBayes'}
        # Stable sort: interpretable models first, relative order kept.
        picks.sort(key=lambda name: name not in interpretable)

    return picks

Exercises

Basic

  1. Implement linear regression using gradient descent and compare with the normal equation.

  2. Build a binary classifier using logistic regression on the Iris dataset (two classes).

  3. Implement KNN from scratch and test with different values of k.

Intermediate

  1. Create a decision tree and visualize it (print the tree structure).

  2. Compare Naive Bayes with Logistic Regression on a text classification task.

  3. Implement cross-validation to find the best hyperparameters for SVM.

Advanced

  1. Implement a kernel SVM with the RBF kernel.

  2. Build an ensemble of decision trees (Random Forest concept).

  3. Create a multi-class SVM using one-vs-rest strategy.


Summary

  • Linear regression minimizes MSE; logistic regression maximizes likelihood
  • Decision trees split data recursively using information criteria
  • KNN makes predictions based on nearest training examples
  • SVMs maximize the margin between classes
  • Naive Bayes applies Bayes' theorem with independence assumption
  • Algorithm choice depends on data size, complexity, and requirements

Next Reading

Neural Networks →