Supervised Learning

Supervised learning algorithms learn a mapping from inputs to outputs using labeled training data. ## 1. Linear Regression

Simple Linear Regression

import numpy as np
from typing import Tuple

class LinearRegression:
    """
    Linear regression: y = Xw + b

    Finds weights that minimize Mean Squared Error.
    """

    def __init__(self):
        self.weights = None
        self.bias = None

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'LinearRegression':
        """
        Fit using Normal Equation: w = (X'X)^(-1) X'y
        """
        # Add bias column
        n_samples = X.shape[0]
        X_b = np.column_stack([np.ones(n_samples), X])

        # Solve normal equation
        theta = np.linalg.solve(X_b.T @ X_b, X_b.T @ y)

        self.bias = theta[0]
        self.weights = theta[1:]
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict using learned weights"""
        return X @ self.weights + self.bias

    def score(self, X: np.ndarray, y: np.ndarray) -> float:
        """R² score"""
        y_pred = self.predict(X)
        ss_res = np.sum((y - y_pred) ** 2)
        ss_tot = np.sum((y - np.mean(y)) ** 2)
        return 1 - (ss_res / ss_tot)

# Example
np.random.seed(42)
X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X.flatten() + np.random.randn(100)

model = LinearRegression()
model.fit(X, y)
print(f"Weights: {model.weights}, Bias: {model.bias}")
print(f"R² Score: {model.score(X, y):.4f}")

Gradient Descent

class LinearRegressionGD:
    """Linear regression using gradient descent"""

    def __init__(self, learning_rate: float = 0.01, n_iterations: int = 1000):
        self.lr = learning_rate
        self.n_iter = n_iterations
        self.weights = None
        self.bias = None
        self.loss_history = []

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'LinearRegressionGD':
        n_samples, n_features = X.shape

        # Initialize weights
        self.weights = np.zeros(n_features)
        self.bias = 0

        for _ in range(self.n_iter):
            # Predictions
            y_pred = X @ self.weights + self.bias

            # Gradients
            dw = (1/n_samples) * (X.T @ (y_pred - y))
            db = (1/n_samples) * np.sum(y_pred - y)

            # Update weights
            self.weights -= self.lr * dw
            self.bias -= self.lr * db

            # Track loss
            loss = np.mean((y_pred - y) ** 2)
            self.loss_history.append(loss)

        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        return X @ self.weights + self.bias

class SGDRegressor:
    """Stochastic Gradient Descent (mini-batch)"""

    def __init__(self, learning_rate: float = 0.01,
                n_iterations: int = 100, batch_size: int = 32):
        self.lr = learning_rate
        self.n_iter = n_iterations
        self.batch_size = batch_size
        self.weights = None
        self.bias = None

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'SGDRegressor':
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        for epoch in range(self.n_iter):
            # Shuffle data
            indices = np.random.permutation(n_samples)

            for i in range(0, n_samples, self.batch_size):
                batch_idx = indices[i:i + self.batch_size]
                X_batch = X[batch_idx]
                y_batch = y[batch_idx]

                # Compute gradients on batch
                y_pred = X_batch @ self.weights + self.bias
                dw = (1/len(batch_idx)) * (X_batch.T @ (y_pred - y_batch))
                db = (1/len(batch_idx)) * np.sum(y_pred - y_batch)

                # Update
                self.weights -= self.lr * dw
                self.bias -= self.lr * db

        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        return X @ self.weights + self.bias

2. Logistic Regression

Binary Classification

class LogisticRegression:
    """
    Logistic regression for binary classification.

    P(y=1|x) = sigmoid(w'x + b)
    """

    def __init__(self, learning_rate: float = 0.01,
                n_iterations: int = 1000):
        self.lr = learning_rate
        self.n_iter = n_iterations
        self.weights = None
        self.bias = None

    @staticmethod
    def sigmoid(z: np.ndarray) -> np.ndarray:
        """Sigmoid activation: 1 / (1 + e^(-z))"""
        # Clip to prevent overflow
        z = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-z))

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'LogisticRegression':
        n_samples, n_features = X.shape

        self.weights = np.zeros(n_features)
        self.bias = 0

        for _ in range(self.n_iter):
            # Linear model
            z = X @ self.weights + self.bias

            # Predictions (probabilities)
            y_pred = self.sigmoid(z)

            # Gradients (from cross-entropy loss)
            dw = (1/n_samples) * (X.T @ (y_pred - y))
            db = (1/n_samples) * np.sum(y_pred - y)

            # Update
            self.weights -= self.lr * dw
            self.bias -= self.lr * db

        return self

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Predict probabilities"""
        z = X @ self.weights + self.bias
        return self.sigmoid(z)

    def predict(self, X: np.ndarray, threshold: float = 0.5) -> np.ndarray:
        """Predict class labels"""
        return (self.predict_proba(X) >= threshold).astype(int)

    def score(self, X: np.ndarray, y: np.ndarray) -> float:
        """Accuracy"""
        return np.mean(self.predict(X) == y)

# Example
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, n_features=10, random_state=42)

model = LogisticRegression(learning_rate=0.1, n_iterations=1000)
model.fit(X, y)
print(f"Accuracy: {model.score(X, y):.4f}")

Multi-class Classification

class SoftmaxRegression:
    """
    Softmax (multinomial logistic) regression for multi-class.

    P(y=k|x) = exp(w_k'x) / sum_j(exp(w_j'x))
    """

    def __init__(self, learning_rate: float = 0.01,
                n_iterations: int = 1000):
        self.lr = learning_rate
        self.n_iter = n_iterations
        self.weights = None
        self.bias = None
        self.classes = None

    @staticmethod
    def softmax(z: np.ndarray) -> np.ndarray:
        """Softmax: convert logits to probabilities"""
        exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))  # Stability
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'SoftmaxRegression':
        n_samples, n_features = X.shape
        self.classes = np.unique(y)
        n_classes = len(self.classes)

        # One-hot encode y
        y_onehot = np.zeros((n_samples, n_classes))
        for i, c in enumerate(self.classes):
            y_onehot[y == c, i] = 1

        # Initialize weights
        self.weights = np.zeros((n_features, n_classes))
        self.bias = np.zeros(n_classes)

        for _ in range(self.n_iter):
            # Linear model
            z = X @ self.weights + self.bias

            # Softmax probabilities
            y_pred = self.softmax(z)

            # Gradients
            error = y_pred - y_onehot
            dw = (1/n_samples) * (X.T @ error)
            db = (1/n_samples) * np.sum(error, axis=0)

            # Update
            self.weights -= self.lr * dw
            self.bias -= self.lr * db

        return self

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        z = X @ self.weights + self.bias
        return self.softmax(z)

    def predict(self, X: np.ndarray) -> np.ndarray:
        proba = self.predict_proba(X)
        return self.classes[np.argmax(proba, axis=1)]

3. Decision Trees

Classification Tree

from dataclasses import dataclass
from typing import Optional, Union

@dataclass
class TreeNode:
    """Node in decision tree"""
    feature_idx: Optional[int] = None
    threshold: Optional[float] = None
    left: Optional['TreeNode'] = None
    right: Optional['TreeNode'] = None
    value: Optional[int] = None  # Class label for leaf

class DecisionTreeClassifier:
    """Classification tree using Gini impurity"""

    def __init__(self, max_depth: int = 10, min_samples_split: int = 2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'DecisionTreeClassifier':
        self.n_classes = len(np.unique(y))
        self.root = self._grow_tree(X, y, depth=0)
        return self

    def _gini(self, y: np.ndarray) -> float:
        """Gini impurity: 1 - sum(p_k^2)"""
        if len(y) == 0:
            return 0
        proportions = np.bincount(y) / len(y)
        return 1 - np.sum(proportions ** 2)

    def _best_split(self, X: np.ndarray, y: np.ndarray) -> tuple:
        """Find best split by minimizing weighted Gini"""
        n_samples, n_features = X.shape
        best_gain = -1
        best_feature, best_threshold = None, None

        current_gini = self._gini(y)

        for feature in range(n_features):
            thresholds = np.unique(X[:, feature])

            for threshold in thresholds:
                left_mask = X[:, feature] <= threshold
                right_mask = ~left_mask

                if np.sum(left_mask) == 0 or np.sum(right_mask) == 0:
                    continue

                # Weighted Gini of children
                n_left = np.sum(left_mask)
                n_right = np.sum(right_mask)

                gini_left = self._gini(y[left_mask])
                gini_right = self._gini(y[right_mask])

                weighted_gini = (n_left * gini_left + n_right * gini_right) / n_samples

                # Information gain
                gain = current_gini - weighted_gini

                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold

    def _grow_tree(self, X: np.ndarray, y: np.ndarray, depth: int) -> TreeNode:
        """Recursively grow tree"""
        n_samples = len(y)

        # Stopping conditions
        if (depth >= self.max_depth or
            n_samples < self.min_samples_split or
            len(np.unique(y)) == 1):
            # Create leaf node
            leaf_value = np.bincount(y).argmax()
            return TreeNode(value=leaf_value)

        # Find best split
        feature, threshold = self._best_split(X, y)

        if feature is None:
            leaf_value = np.bincount(y).argmax()
            return TreeNode(value=leaf_value)

        # Split data
        left_mask = X[:, feature] <= threshold
        right_mask = ~left_mask

        # Recursively grow children
        left_child = self._grow_tree(X[left_mask], y[left_mask], depth + 1)
        right_child = self._grow_tree(X[right_mask], y[right_mask], depth + 1)

        return TreeNode(
            feature_idx=feature,
            threshold=threshold,
            left=left_child,
            right=right_child
        )

    def _predict_sample(self, x: np.ndarray, node: TreeNode) -> int:
        """Predict single sample"""
        if node.value is not None:
            return node.value

        if x[node.feature_idx] <= node.threshold:
            return self._predict_sample(x, node.left)
        return self._predict_sample(x, node.right)

    def predict(self, X: np.ndarray) -> np.ndarray:
        return np.array([self._predict_sample(x, self.root) for x in X])

    def score(self, X: np.ndarray, y: np.ndarray) -> float:
        return np.mean(self.predict(X) == y)

Regression Tree

class DecisionTreeRegressor:
    """Regression tree using MSE"""

    def __init__(self, max_depth: int = 10, min_samples_split: int = 2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'DecisionTreeRegressor':
        self.root = self._grow_tree(X, y, depth=0)
        return self

    def _mse(self, y: np.ndarray) -> float:
        """Mean Squared Error"""
        if len(y) == 0:
            return 0
        return np.var(y)

    def _best_split(self, X: np.ndarray, y: np.ndarray) -> tuple:
        """Find split that minimizes MSE"""
        n_samples, n_features = X.shape
        best_mse = float('inf')
        best_feature, best_threshold = None, None

        for feature in range(n_features):
            thresholds = np.unique(X[:, feature])

            for threshold in thresholds:
                left_mask = X[:, feature] <= threshold
                right_mask = ~left_mask

                if np.sum(left_mask) == 0 or np.sum(right_mask) == 0:
                    continue

                n_left = np.sum(left_mask)
                n_right = np.sum(right_mask)

                mse_left = self._mse(y[left_mask])
                mse_right = self._mse(y[right_mask])

                weighted_mse = (n_left * mse_left + n_right * mse_right) / n_samples

                if weighted_mse < best_mse:
                    best_mse = weighted_mse
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold

    def _grow_tree(self, X: np.ndarray, y: np.ndarray, depth: int) -> TreeNode:
        n_samples = len(y)

        if (depth >= self.max_depth or
            n_samples < self.min_samples_split or
            np.var(y) < 1e-7):
            return TreeNode(value=np.mean(y))

        feature, threshold = self._best_split(X, y)

        if feature is None:
            return TreeNode(value=np.mean(y))

        left_mask = X[:, feature] <= threshold
        right_mask = ~left_mask

        left_child = self._grow_tree(X[left_mask], y[left_mask], depth + 1)
        right_child = self._grow_tree(X[right_mask], y[right_mask], depth + 1)

        return TreeNode(
            feature_idx=feature,
            threshold=threshold,
            left=left_child,
            right=right_child
        )

    def _predict_sample(self, x: np.ndarray, node: TreeNode) -> float:
        if node.value is not None:
            return node.value

        if x[node.feature_idx] <= node.threshold:
            return self._predict_sample(x, node.left)
        return self._predict_sample(x, node.right)

    def predict(self, X: np.ndarray) -> np.ndarray:
        return np.array([self._predict_sample(x, self.root) for x in X])

4. K-Nearest Neighbors

from collections import Counter

class KNeighborsClassifier:
    """K-Nearest Neighbors classifier"""

    def __init__(self, n_neighbors: int = 5, metric: str = 'euclidean'):
        self.k = n_neighbors
        self.metric = metric

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'KNeighborsClassifier':
        """Store training data (lazy learning)"""
        self.X_train = X
        self.y_train = y
        return self

    def _distance(self, x1: np.ndarray, x2: np.ndarray) -> float:
        """Compute distance between points"""
        if self.metric == 'euclidean':
            return np.sqrt(np.sum((x1 - x2) ** 2))
        elif self.metric == 'manhattan':
            return np.sum(np.abs(x1 - x2))
        else:
            raise ValueError(f"Unknown metric: {self.metric}")

    def _predict_sample(self, x: np.ndarray) -> int:
        """Predict single sample"""
        # Compute distances to all training points
        distances = [self._distance(x, x_train) for x_train in self.X_train]

        # Get k nearest neighbors
        k_indices = np.argsort(distances)[:self.k]
        k_labels = self.y_train[k_indices]

        # Majority vote
        return Counter(k_labels).most_common(1)[0][0]

    def predict(self, X: np.ndarray) -> np.ndarray:
        return np.array([self._predict_sample(x) for x in X])

    def score(self, X: np.ndarray, y: np.ndarray) -> float:
        return np.mean(self.predict(X) == y)

class KNeighborsRegressor:
    """K-Nearest Neighbors regressor"""

    def __init__(self, n_neighbors: int = 5, weights: str = 'uniform'):
        self.k = n_neighbors
        self.weights = weights

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'KNeighborsRegressor':
        self.X_train = X
        self.y_train = y
        return self

    def _predict_sample(self, x: np.ndarray) -> float:
        distances = np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))
        k_indices = np.argsort(distances)[:self.k]
        k_distances = distances[k_indices]
        k_values = self.y_train[k_indices]

        if self.weights == 'uniform':
            return np.mean(k_values)
        elif self.weights == 'distance':
            # Weight by inverse distance
            weights = 1 / (k_distances + 1e-8)
            return np.sum(weights * k_values) / np.sum(weights)

    def predict(self, X: np.ndarray) -> np.ndarray:
        return np.array([self._predict_sample(x) for x in X])

5. Support Vector Machines

Linear SVM

class LinearSVM:
    """
    Linear Support Vector Machine using gradient descent.

    Minimizes: (1/2)||w||² + C * sum(max(0, 1 - y_i(w·x_i + b)))
    """

    def __init__(self, C: float = 1.0, learning_rate: float = 0.001,
                n_iterations: int = 1000):
        self.C = C
        self.lr = learning_rate
        self.n_iter = n_iterations
        self.weights = None
        self.bias = None

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'LinearSVM':
        n_samples, n_features = X.shape

        # Convert labels to {-1, 1}
        y_ = np.where(y <= 0, -1, 1)

        self.weights = np.zeros(n_features)
        self.bias = 0

        for _ in range(self.n_iter):
            for i, x_i in enumerate(X):
                # Check if correctly classified with margin
                condition = y_[i] * (x_i @ self.weights + self.bias) >= 1

                if condition:
                    # Correctly classified: only regularization gradient
                    self.weights -= self.lr * self.weights
                else:
                    # Misclassified: regularization + hinge loss gradient
                    self.weights -= self.lr * (self.weights - self.C * y_[i] * x_i)
                    self.bias -= self.lr * (-self.C * y_[i])

        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        linear_output = X @ self.weights + self.bias
        return np.sign(linear_output)

    def score(self, X: np.ndarray, y: np.ndarray) -> float:
        y_ = np.where(y <= 0, -1, 1)
        return np.mean(self.predict(X) == y_)

# Kernel trick concept
"""
For non-linear data, we can use the kernel trick:
1. Map data to higher dimension: φ(x)
2. SVM only needs dot products: φ(x)·φ(x')
3. Kernel function: K(x, x') = φ(x)·φ(x')

Common kernels:
- Linear: K(x,x') = x·x'
- Polynomial: K(x,x') = (γx·x' + r)^d
- RBF: K(x,x') = exp(-γ||x-x'||²)
"""

6. Naive Bayes

class GaussianNaiveBayes:
    """
    Gaussian Naive Bayes classifier.

    Assumes features are independent and normally distributed.
    P(y|x) ∝ P(y) * ∏ P(x_i|y)
    """

    def __init__(self):
        self.classes = None
        self.class_priors = None
        self.means = None
        self.variances = None

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'GaussianNaiveBayes':
        n_samples, n_features = X.shape
        self.classes = np.unique(y)
        n_classes = len(self.classes)

        # Calculate statistics for each class
        self.class_priors = np.zeros(n_classes)
        self.means = np.zeros((n_classes, n_features))
        self.variances = np.zeros((n_classes, n_features))

        for i, c in enumerate(self.classes):
            X_c = X[y == c]
            self.class_priors[i] = len(X_c) / n_samples
            self.means[i] = np.mean(X_c, axis=0)
            self.variances[i] = np.var(X_c, axis=0) + 1e-9  # Smoothing

        return self

    def _gaussian_pdf(self, x: np.ndarray, mean: np.ndarray,
                     var: np.ndarray) -> np.ndarray:
        """Gaussian probability density"""
        return np.exp(-((x - mean) ** 2) / (2 * var)) / np.sqrt(2 * np.pi * var)

    def _predict_sample(self, x: np.ndarray) -> int:
        """Predict class for single sample"""
        posteriors = []

        for i, c in enumerate(self.classes):
            prior = np.log(self.class_priors[i])
            likelihood = np.sum(np.log(
                self._gaussian_pdf(x, self.means[i], self.variances[i])
            ))
            posteriors.append(prior + likelihood)

        return self.classes[np.argmax(posteriors)]

    def predict(self, X: np.ndarray) -> np.ndarray:
        return np.array([self._predict_sample(x) for x in X])

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Predict class probabilities"""
        n_samples = len(X)
        n_classes = len(self.classes)
        log_probs = np.zeros((n_samples, n_classes))

        for i in range(n_samples):
            for j, c in enumerate(self.classes):
                prior = np.log(self.class_priors[j])
                likelihood = np.sum(np.log(
                    self._gaussian_pdf(X[i], self.means[j], self.variances[j])
                ))
                log_probs[i, j] = prior + likelihood

        # Convert to probabilities
        log_probs -= np.max(log_probs, axis=1, keepdims=True)
        probs = np.exp(log_probs)
        return probs / np.sum(probs, axis=1, keepdims=True)

7. Model Selection

Choosing the Right Algorithm

"""
ALGORITHM SELECTION GUIDE:

Linear Regression:
- Target: Continuous
- Relationship: Linear
- Interpretability: High
- Speed: Fast

Logistic Regression:
- Target: Binary/Multi-class
- Relationship: Linear decision boundary
- Interpretability: High
- Speed: Fast

Decision Trees:
- Target: Any
- Relationship: Non-linear, axis-aligned splits
- Interpretability: High
- Speed: Fast (can overfit)

K-Nearest Neighbors:
- Target: Any
- Relationship: Any (local patterns)
- Interpretability: Low
- Speed: Slow for prediction (no training)

SVM:
- Target: Binary (primarily)
- Relationship: Linear (kernel for non-linear)
- Interpretability: Medium
- Speed: Slow for large datasets

Naive Bayes:
- Target: Classification
- Assumptions: Feature independence
- Interpretability: High
- Speed: Very fast
"""

def recommend_algorithm(n_samples: int, n_features: int,
                       problem_type: str, needs_interpretability: bool) -> list:
    """Recommend algorithms based on problem characteristics"""
    recommendations = []

    if problem_type == 'regression':
        recommendations.append('LinearRegression')
        if n_samples > 1000:
            recommendations.append('DecisionTreeRegressor')
        recommendations.append('KNeighborsRegressor')

    elif problem_type == 'classification':
        recommendations.append('LogisticRegression')
        recommendations.append('GaussianNaiveBayes')

        if n_samples < 10000:
            recommendations.append('SVM')

        recommendations.append('DecisionTreeClassifier')
        recommendations.append('KNeighborsClassifier')

    # Prioritize based on interpretability
    if needs_interpretability:
        # Move interpretable models to front
        interpretable = ['LogisticRegression', 'LinearRegression',
                        'DecisionTreeClassifier', 'DecisionTreeRegressor',
                        'GaussianNaiveBayes']
        recommendations = sorted(
            recommendations,
            key=lambda x: 0 if x in interpretable else 1
        )

    return recommendations

Exercises

Basic

  1. Implement linear regression using gradient descent and compare with the normal equation.

  2. Build a binary classifier using logistic regression on the Iris dataset (two classes).

  3. Implement KNN from scratch and test with different values of k.

Intermediate

  1. Create a decision tree and visualize it (print the tree structure).

  2. Compare Naive Bayes with Logistic Regression on a text classification task.

  3. Implement cross-validation to find the best hyperparameters for SVM.

Advanced

  1. Implement a kernel SVM with the RBF kernel.

  2. Build an ensemble of decision trees (Random Forest concept).

  3. Create a multi-class SVM using one-vs-rest strategy.


Summary

  • Linear regression minimizes MSE; logistic regression maximizes likelihood
  • Decision trees split data recursively using information criteria
  • KNN makes predictions based on nearest training examples
  • SVMs maximize the margin between classes
  • Naive Bayes applies Bayes' theorem with independence assumption
  • Algorithm choice depends on data size, complexity, and requirements

Next Reading

Neural Networks →