Neural Networks

Neural networks are computing systems inspired by biological neural networks. They learn representations of data through layers of interconnected nodes (neurons).

1. The Perceptron

Single Neuron

import numpy as np

class Perceptron:
    """
    A single artificial neuron used as a binary linear classifier.

    The neuron computes step(w·x + b): it emits 1 when the weighted sum of
    its inputs plus the bias is non-negative, and 0 otherwise.
    """

    def __init__(self, n_features: int, learning_rate: float = 0.01):
        # Weights start as small random values; the bias starts at zero.
        self.lr = learning_rate
        self.weights = np.random.randn(n_features) * 0.01
        self.bias = 0

    def activation(self, z: float) -> int:
        """Heaviside step: 1 for z >= 0, otherwise 0."""
        if z >= 0:
            return 1
        return 0

    def predict(self, x: np.ndarray) -> int:
        """Classify a single sample by thresholding w·x + b."""
        weighted_sum = np.dot(self.weights, x) + self.bias
        return self.activation(weighted_sum)

    def train(self, X: np.ndarray, y: np.ndarray, n_epochs: int = 100):
        """
        Apply the perceptron learning rule for at most n_epochs passes.

        Each misclassified sample nudges the weights and bias by
        lr * (target - prediction). Training stops early, printing the epoch
        number, as soon as a full pass produces no mistakes.
        """
        for epoch in range(n_epochs):
            mistakes = 0
            for sample, target in zip(X, y):
                guess = self.predict(sample)
                if guess == target:
                    # Correctly classified: nothing to adjust.
                    continue

                # Move the decision boundary towards the missed sample.
                step = self.lr * (target - guess)
                self.weights += step * sample
                self.bias += step
                mistakes += 1

            if mistakes == 0:
                print(f"Converged at epoch {epoch}")
                break

# Example: AND gate
# The four possible 2-bit inputs, one per row, with the AND truth table as labels.
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([0, 0, 0, 1])

# AND is linearly separable, so a single perceptron is sufficient.
perceptron = Perceptron(n_features=2, learning_rate=0.1)
perceptron.train(X, y)

# Print the learned output for every input row.
for x_i in X:
    print(f"{x_i} -> {perceptron.predict(x_i)}")

Limitations

"""
Perceptron can only learn linearly separable patterns!

XOR problem (not linearly separable):
X = [[0,0], [0,1], [1,0], [1,1]]
y = [0, 1, 1, 0]

No line can separate 0s from 1s!

Solution: Multi-layer networks (MLP)
"""

2. Multi-Layer Perceptron (MLP)

Architecture

class NeuralNetwork:
    """
    Multi-layer perceptron with configurable layers.

    Architecture: Input -> Hidden(s) -> Output

    Every layer, including the output layer, applies a sigmoid activation.
    """

    def __init__(self, layer_sizes: list):
        """
        layer_sizes: [input_size, hidden1, hidden2, ..., output_size]

        Raises:
            ValueError: if fewer than two sizes are given, because a network
                needs at least an input and an output layer.
        """
        if len(layer_sizes) < 2:
            raise ValueError(
                "layer_sizes must contain at least an input and an output size"
            )

        self.layer_sizes = layer_sizes
        self.n_layers = len(layer_sizes)
        self.weights = []
        self.biases = []

        # Initialize weights (Xavier initialization), one matrix per pair of
        # consecutive layers.
        for n_in, n_out in zip(layer_sizes[:-1], layer_sizes[1:]):
            # Xavier: variance = 2 / (n_in + n_out)
            w = np.random.randn(n_in, n_out) * np.sqrt(2.0 / (n_in + n_out))
            b = np.zeros(n_out)
            self.weights.append(w)
            self.biases.append(b)

    def sigmoid(self, z: np.ndarray) -> np.ndarray:
        """Sigmoid activation; z is clipped so np.exp cannot overflow."""
        return 1 / (1 + np.exp(-np.clip(z, -500, 500)))

    def sigmoid_derivative(self, a: np.ndarray) -> np.ndarray:
        """Derivative of sigmoid expressed via its output a: a * (1 - a)."""
        return a * (1 - a)

    def forward(self, X: np.ndarray) -> list:
        """
        Forward propagation.

        Returns a list of activations, one per layer, for use in backprop:
        element 0 is the input X itself and element -1 is the network output.
        """
        activations = [X]
        a = X

        for w, b in zip(self.weights, self.biases):
            z = a @ w + b
            a = self.sigmoid(z)
            activations.append(a)

        return activations

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return only the final layer's output for X."""
        return self.forward(X)[-1]

Backpropagation

class NeuralNetworkBackprop(NeuralNetwork):
    """Sigmoid MLP trained with full-batch gradient descent via backpropagation."""

    def __init__(self, layer_sizes: list, learning_rate: float = 0.1):
        super().__init__(layer_sizes)
        self.lr = learning_rate

    def backward(self, X: np.ndarray, y: np.ndarray,
                activations: list) -> tuple:
        """
        Backpropagate the output error through every layer.

        Returns (gradients_w, gradients_b): two lists, aligned with
        self.weights and self.biases, holding gradients averaged over the
        samples in X.
        """
        n_samples = len(X)
        grads_w = list(map(np.zeros_like, self.weights))
        grads_b = list(map(np.zeros_like, self.biases))

        # Error signal at the output: (a - y) * sigmoid'(z). The factor
        # (a - y) is the gradient of half the squared error with respect to a.
        output = activations[-1]
        delta = (output - y) * self.sigmoid_derivative(output)

        # Walk from the last weight matrix back to the first.
        for layer in reversed(range(self.n_layers - 1)):
            layer_input = activations[layer]
            grads_w[layer] = layer_input.T @ delta / n_samples
            grads_b[layer] = np.mean(delta, axis=0)

            if layer == 0:
                # Nothing precedes the first weight matrix.
                continue

            # Push the error signal through this layer's weights and the
            # activation of the layer feeding it.
            upstream = delta @ self.weights[layer].T
            delta = upstream * self.sigmoid_derivative(layer_input)

        return grads_w, grads_b

    def train_step(self, X: np.ndarray, y: np.ndarray) -> float:
        """Run one forward/backward pass on (X, y), update the parameters in
        place, and return the mean squared error measured before the update."""
        activations = self.forward(X)
        residual = activations[-1] - y
        loss = np.mean(residual ** 2)

        grads_w, grads_b = self.backward(X, y, activations)

        # Plain gradient-descent step on every layer.
        for idx, (g_w, g_b) in enumerate(zip(grads_w, grads_b)):
            self.weights[idx] -= self.lr * g_w
            self.biases[idx] -= self.lr * g_b

        return loss

    def fit(self, X: np.ndarray, y: np.ndarray,
           n_epochs: int = 1000, verbose: bool = True):
        """Call train_step n_epochs times; when verbose, print the loss on
        every 100th epoch (starting with epoch 0)."""
        for epoch in range(n_epochs):
            loss = self.train_step(X, y)

            if not verbose:
                continue
            if epoch % 100 == 0:
                print(f"Epoch {epoch}, Loss: {loss:.6f}")

# Solve XOR problem!
# Same four inputs as the AND example; the labels are column vectors so they
# line up with the network's (n_samples, 1) output.
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([[0], [1], [1], [0]])

# 2 inputs -> 4 hidden units -> 1 output.
# NOTE(review): no random seed is set, so the weights (and the final result)
# differ from run to run.
nn = NeuralNetworkBackprop([2, 4, 1], learning_rate=1.0)
nn.fit(X, y, n_epochs=5000)

print("\nXOR Results:")
for x_i, y_i in zip(X, y):
    # predict is given a (1, n_features) batch here, so each row is reshaped.
    pred = nn.predict(x_i.reshape(1, -1))
    print(f"{x_i} -> {pred[0, 0]:.4f} (expected {y_i[0]})")

3. Activation Functions

class ActivationFunctions:
    """
    Common activation functions and their derivatives.

    Every function accepts a NumPy array, a Python scalar, or a (nested)
    list; inputs are converted with np.asarray before use.

    Derivatives named after sigmoid/tanh take the activation OUTPUT `a`,
    while the ReLU-family derivatives take the pre-activation `z`.
    """

    @staticmethod
    def sigmoid(z):
        """Range: (0, 1) - Good for output layer (binary classification)"""
        # Clipping keeps np.exp from overflowing for large |z|.
        return 1 / (1 + np.exp(-np.clip(z, -500, 500)))

    @staticmethod
    def sigmoid_derivative(a):
        """Derivative of sigmoid in terms of its output: a * (1 - a)."""
        a = np.asarray(a)
        return a * (1 - a)

    @staticmethod
    def tanh(z):
        """Range: (-1, 1) - Zero-centered, often better than sigmoid"""
        return np.tanh(z)

    @staticmethod
    def tanh_derivative(a):
        """Derivative of tanh in terms of its output: 1 - a^2."""
        a = np.asarray(a)
        return 1 - a ** 2

    @staticmethod
    def relu(z):
        """Range: [0, inf) - Most popular for hidden layers"""
        return np.maximum(0, z)

    @staticmethod
    def relu_derivative(z):
        """1.0 where z > 0, otherwise 0.0 (the value at z == 0 is taken as 0)."""
        # np.asarray lets plain scalars and lists through; a bare Python
        # bool has no .astype and a list does not support `> 0`.
        return (np.asarray(z) > 0).astype(float)

    @staticmethod
    def leaky_relu(z, alpha=0.01):
        """Range: (-inf, inf) - Fixes dying ReLU problem"""
        z = np.asarray(z)
        return np.where(z > 0, z, alpha * z)

    @staticmethod
    def leaky_relu_derivative(z, alpha=0.01):
        """1 where z > 0, otherwise alpha."""
        return np.where(np.asarray(z) > 0, 1, alpha)

    @staticmethod
    def softmax(z):
        """Multi-class output layer - probabilities sum to 1"""
        z = np.asarray(z)
        # Subtracting the per-row maximum leaves the result unchanged but
        # keeps np.exp from overflowing.
        exp_z = np.exp(z - np.max(z, axis=-1, keepdims=True))
        return exp_z / np.sum(exp_z, axis=-1, keepdims=True)

"""
Activation Function Guidelines:

HIDDEN LAYERS:
- ReLU: Default choice, fast, works well
- Leaky ReLU: If experiencing dying neurons
- tanh: Sometimes for RNNs

OUTPUT LAYER:
- Binary classification: Sigmoid
- Multi-class classification: Softmax
- Regression: Linear (no activation)

PROBLEMS:
- Sigmoid/tanh: Vanishing gradient for deep networks
- ReLU: Dying neurons (stuck at 0)
"""

4. Loss Functions

class LossFunctions:
    """
    Common loss functions.

    Inputs may be NumPy arrays or (nested) lists; they are converted to
    float arrays before use.
    """

    @staticmethod
    def mse(y_true, y_pred):
        """Mean Squared Error - Regression (mean over every element)."""
        y_true = np.asarray(y_true, dtype=float)
        y_pred = np.asarray(y_pred, dtype=float)
        return np.mean((y_true - y_pred) ** 2)

    @staticmethod
    def mse_derivative(y_true, y_pred):
        """
        Gradient of mse() with respect to y_pred.

        mse() averages over every element, so the divisor is the total
        element count rather than the number of rows; the two only coincide
        for 1-D input.
        """
        diff = np.asarray(y_pred, dtype=float) - np.asarray(y_true, dtype=float)
        return 2 * diff / diff.size

    @staticmethod
    def binary_crossentropy(y_true, y_pred, epsilon=1e-15):
        """Binary Cross-Entropy - Binary Classification"""
        y_true = np.asarray(y_true, dtype=float)
        # Clip so that log(0) can never be evaluated.
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
        return -np.mean(
            y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred)
        )

    @staticmethod
    def binary_crossentropy_derivative(y_true, y_pred, epsilon=1e-15):
        """
        Element-wise derivative of the per-sample binary cross-entropy with
        respect to y_pred. Unlike mse_derivative, no 1/N averaging factor is
        applied.
        """
        y_true = np.asarray(y_true, dtype=float)
        # Clip so the denominator y_pred * (1 - y_pred) cannot reach zero.
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
        return (y_pred - y_true) / (y_pred * (1 - y_pred))

    @staticmethod
    def categorical_crossentropy(y_true, y_pred, epsilon=1e-15):
        """
        Categorical Cross-Entropy - Multi-class Classification

        Classes are summed over the last axis (the same axis softmax
        normalizes over) and the result is averaged over the remaining axes.
        """
        y_true = np.asarray(y_true, dtype=float)
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
        return -np.mean(np.sum(y_true * np.log(y_pred), axis=-1))

    @staticmethod
    def categorical_crossentropy_derivative(y_true, y_pred):
        """For softmax output, gradient simplifies to y_pred - y_true"""
        return np.asarray(y_pred, dtype=float) - np.asarray(y_true, dtype=float)

5. Optimizers

class Optimizer:
    """Common interface for parameter-update rules."""

    def update(self, weights, biases, grad_w, grad_b):
        """Apply one update step; concrete optimizers must override this."""
        raise NotImplementedError()

class SGD(Optimizer):
    """Plain gradient descent: parameter -= learning_rate * gradient."""

    def __init__(self, learning_rate=0.01):
        self.lr = learning_rate

    def update(self, weights, biases, grad_w, grad_b):
        """Step every layer's weights and biases in place and return both lists."""
        n_layers = len(weights)
        for layer in range(n_layers):
            weights[layer] -= self.lr * grad_w[layer]
            biases[layer] -= self.lr * grad_b[layer]
        return weights, biases

class Momentum(Optimizer):
    """Gradient descent that accumulates a decaying velocity per parameter."""

    def __init__(self, learning_rate=0.01, momentum=0.9):
        self.lr = learning_rate
        self.momentum = momentum
        # Velocity buffers are allocated on the first update, once the
        # parameter shapes are known.
        self.velocity_w = None
        self.velocity_b = None

    def update(self, weights, biases, grad_w, grad_b):
        """Refresh the velocities, add them to the parameters in place, and
        return (weights, biases)."""
        if self.velocity_w is None:
            self.velocity_w = list(map(np.zeros_like, weights))
            self.velocity_b = list(map(np.zeros_like, biases))

        vel_w, vel_b = self.velocity_w, self.velocity_b
        for layer in range(len(weights)):
            # v <- momentum * v - lr * grad
            vel_w[layer] = self.momentum * vel_w[layer] - self.lr * grad_w[layer]
            vel_b[layer] = self.momentum * vel_b[layer] - self.lr * grad_b[layer]

            # parameter <- parameter + v
            weights[layer] += vel_w[layer]
            biases[layer] += vel_b[layer]

        return weights, biases

class Adam(Optimizer):
    """Adam: per-parameter steps scaled by running gradient moments."""

    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.lr = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        # m_* hold the first moment, v_* the second moment; they are
        # allocated on the first update.
        self.m_w = None
        self.v_w = None
        self.m_b = None
        self.v_b = None
        # Number of update steps taken so far.
        self.t = 0

    def update(self, weights, biases, grad_w, grad_b):
        """Advance the moment estimates by one step, update the parameters in
        place, and return (weights, biases)."""
        self.t += 1

        if self.m_w is None:
            self.m_w = [np.zeros_like(w) for w in weights]
            self.v_w = [np.zeros_like(w) for w in weights]
            self.m_b = [np.zeros_like(b) for b in biases]
            self.v_b = [np.zeros_like(b) for b in biases]

        b1, b2 = self.beta1, self.beta2
        # Bias-correction denominators are the same for every layer in a step.
        correction1 = 1 - b1 ** self.t
        correction2 = 1 - b2 ** self.t

        for layer in range(len(weights)):
            g_w = grad_w[layer]
            g_b = grad_b[layer]

            # Exponential moving average of the gradient (first moment).
            self.m_w[layer] = b1 * self.m_w[layer] + (1 - b1) * g_w
            self.m_b[layer] = b1 * self.m_b[layer] + (1 - b1) * g_b

            # Exponential moving average of the squared gradient (second moment).
            self.v_w[layer] = b2 * self.v_w[layer] + (1 - b2) * (g_w ** 2)
            self.v_b[layer] = b2 * self.v_b[layer] + (1 - b2) * (g_b ** 2)

            # Undo the bias towards zero caused by the zero initialisation.
            m_w_hat = self.m_w[layer] / correction1
            v_w_hat = self.v_w[layer] / correction2
            m_b_hat = self.m_b[layer] / correction1
            v_b_hat = self.v_b[layer] / correction2

            weights[layer] -= self.lr * m_w_hat / (np.sqrt(v_w_hat) + self.epsilon)
            biases[layer] -= self.lr * m_b_hat / (np.sqrt(v_b_hat) + self.epsilon)

        return weights, biases

6. Regularization Techniques

class Regularization:
    """Weight penalties used to discourage overfitting."""

    @staticmethod
    def l2_penalty(weights, lambda_reg=0.01):
        """Return the sum over all weight arrays of lambda_reg * sum(w ** 2)."""
        return sum(lambda_reg * np.sum(w ** 2) for w in weights)

    @staticmethod
    def l2_gradient(weights, lambda_reg=0.01):
        """Return a list with the penalty's gradient, 2 * lambda_reg * w, for
        each weight array."""
        gradients = []
        for w in weights:
            gradients.append(2 * lambda_reg * w)
        return gradients

class Dropout:
    """
    Dropout regularization (inverted dropout).

    During training each unit is zeroed with probability `rate` and the
    survivors are scaled by 1 / (1 - rate), so no rescaling is needed at
    inference time.
    """

    def __init__(self, rate=0.5):
        """
        rate: probability of dropping a unit; must satisfy 0 <= rate < 1.

        Raises:
            ValueError: if rate is outside [0, 1). A rate of 1 would divide
                by zero when rescaling the surviving units.
        """
        if not 0 <= rate < 1:
            raise ValueError(f"rate must be in [0, 1), got {rate}")
        self.rate = rate
        # Binary keep-mask from the latest training-mode forward pass; None
        # when the latest forward pass was an inference pass (or none ran).
        self.mask = None

    def forward(self, x, training=True):
        """Apply dropout during training; pass x through unchanged otherwise."""
        if not training:
            # Forget any earlier mask so backward() cannot reuse a mask that
            # does not belong to this pass.
            self.mask = None
            return x

        keep_prob = 1 - self.rate
        self.mask = np.random.binomial(1, keep_prob, size=np.shape(x))
        return x * self.mask / keep_prob  # Scale to maintain expected value

    def backward(self, grad):
        """
        Backpropagate through dropout.

        Uses the mask of the latest forward pass. If that pass ran in
        inference mode (or no pass has run yet) dropout was the identity,
        so the gradient is returned unchanged.
        """
        if self.mask is None:
            return grad
        return grad * self.mask / (1 - self.rate)

class BatchNormalization:
    """Per-feature normalization with a learnable scale (gamma) and shift (beta)."""

    def __init__(self, n_features, momentum=0.9, epsilon=1e-8):
        self.gamma = np.ones(n_features)
        self.beta = np.zeros(n_features)
        self.momentum = momentum
        self.epsilon = epsilon

        # Moving averages of the batch statistics, used when training=False.
        self.running_mean = np.zeros(n_features)
        self.running_var = np.ones(n_features)

        # Placeholder for backward-pass state; nothing in this class fills it.
        self.cache = None

    def forward(self, x, training=True):
        """
        Normalize x along axis 0, then apply gamma and beta.

        In training mode the statistics come from x itself and are blended
        into the running averages; otherwise the running averages are used
        and left untouched.
        """
        if not training:
            centered = x - self.running_mean
            x_norm = centered / np.sqrt(self.running_var + self.epsilon)
            return self.gamma * x_norm + self.beta

        batch_mean = np.mean(x, axis=0)
        batch_var = np.var(x, axis=0)

        # running <- momentum * running + (1 - momentum) * batch statistic
        keep = self.momentum
        self.running_mean = keep * self.running_mean + (1 - keep) * batch_mean
        self.running_var = keep * self.running_var + (1 - keep) * batch_var

        x_norm = (x - batch_mean) / np.sqrt(batch_var + self.epsilon)
        return self.gamma * x_norm + self.beta

7. Complete Neural Network Framework

class DenseLayer:
    """Affine layer (x @ W + b) followed by a named activation."""

    def __init__(self, input_size, output_size, activation='relu'):
        # Weights are drawn from a normal distribution scaled by
        # sqrt(2 / input_size); biases start at zero.
        scale = np.sqrt(2.0 / input_size)
        self.weights = np.random.randn(input_size, output_size) * scale
        self.biases = np.zeros(output_size)
        self.activation = activation

        # Values remembered by forward() for use in backward().
        self.input = None
        self.z = None
        self.a = None

    def activate(self, z):
        """Apply the configured activation; any name other than 'relu',
        'sigmoid', 'tanh' or 'softmax' is treated as linear."""
        kind = self.activation
        if kind == 'relu':
            return np.maximum(0, z)
        if kind == 'sigmoid':
            return 1 / (1 + np.exp(-np.clip(z, -500, 500)))
        if kind == 'tanh':
            return np.tanh(z)
        if kind == 'softmax':
            shifted = np.exp(z - np.max(z, axis=-1, keepdims=True))
            return shifted / np.sum(shifted, axis=-1, keepdims=True)
        return z  # linear

    def activate_derivative(self, a, z):
        """Derivative of the activation given its output a and input z.

        Linear and softmax both yield ones here; backward() handles softmax
        separately.
        """
        kind = self.activation
        if kind == 'relu':
            return (z > 0).astype(float)
        if kind == 'sigmoid':
            return a * (1 - a)
        if kind == 'tanh':
            return 1 - a ** 2
        return np.ones_like(a)

    def forward(self, x):
        """Compute the layer output for x and cache x, z and a."""
        self.input = x
        self.z = x @ self.weights + self.biases
        self.a = self.activate(self.z)
        return self.a

    def backward(self, grad_output):
        """
        Store batch-averaged parameter gradients in grad_weights and
        grad_biases, and return the gradient with respect to the layer input.
        """
        batch = len(self.input)

        if self.activation == 'softmax':
            # Softmax + CE gradient is just (y_pred - y_true), so the
            # incoming gradient is used as-is.
            grad_z = grad_output
        else:
            grad_z = grad_output * self.activate_derivative(self.a, self.z)

        self.grad_weights = self.input.T @ grad_z / batch
        self.grad_biases = np.mean(grad_z, axis=0)

        return grad_z @ self.weights.T

class NeuralNet:
    """
    Neural network with arbitrary layers.

    Layers are any objects exposing forward(x) and backward(grad). Layers
    that have a `weights` attribute are also expected to expose `biases` and,
    after backward(), `grad_weights` and `grad_biases`.
    """

    def __init__(self):
        self.layers = []

    def add(self, layer):
        """Append a layer to the end of the network."""
        self.layers.append(layer)

    def forward(self, x):
        """Feed x through every layer in order and return the final output."""
        for layer in self.layers:
            x = layer.forward(x)
        return x

    def backward(self, grad):
        """Send grad through the layers in reverse order so that each layer
        records its own parameter gradients."""
        for layer in reversed(self.layers):
            grad = layer.backward(grad)

    def train(self, X, y, epochs=1000, lr=0.01, batch_size=32):
        """
        Train with mini-batch gradient descent on categorical cross-entropy.

        The gradient fed to backward() is (y_pred - y), which is only the
        correct loss gradient when the last layer is a softmax and y is
        one-hot encoded with shape (n_samples, n_classes).

        Every 100th epoch the mean per-sample loss of that epoch is printed.

        Returns:
            A list with the mean per-sample cross-entropy of each epoch.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = len(X)
        history = []

        for epoch in range(epochs):
            # Shuffle data
            indices = np.random.permutation(n_samples)
            X_shuffled = X[indices]
            y_shuffled = y[indices]

            total_loss = 0.0
            for start in range(0, n_samples, batch_size):
                X_batch = X_shuffled[start:start + batch_size]
                y_batch = y_shuffled[start:start + batch_size]

                # Forward
                y_pred = self.forward(X_batch)

                # Loss: sum over classes for each sample. Accumulating the
                # per-sample sum (not a per-batch mean) keeps the epoch
                # average exact even when the last batch is smaller.
                per_sample = -np.sum(y_batch * np.log(y_pred + 1e-15), axis=-1)
                total_loss += float(np.sum(per_sample))

                # Backward
                grad = y_pred - y_batch  # Softmax + CE
                self.backward(grad)

                # Update
                for layer in self.layers:
                    if hasattr(layer, 'weights'):
                        layer.weights -= lr * layer.grad_weights
                        layer.biases -= lr * layer.grad_biases

            epoch_loss = total_loss / n_samples if n_samples else 0.0
            history.append(epoch_loss)

            if epoch % 100 == 0:
                print(f"Epoch {epoch}, Loss: {epoch_loss:.4f}")

        return history

    def predict(self, X):
        """Return the network output for X (same as forward)."""
        return self.forward(X)

# Example: MNIST-like network
"""
model = NeuralNet()
model.add(DenseLayer(784, 128, activation='relu'))
model.add(DenseLayer(128, 64, activation='relu'))
model.add(DenseLayer(64, 10, activation='softmax'))

model.train(X_train, y_train, epochs=100, lr=0.1)
predictions = model.predict(X_test)
"""

8. Common Architectures

"""
FEEDFORWARD NETWORKS (MLP):
- Fully connected layers
- Good for: Tabular data, simple tasks
- Architecture: Input -> Dense -> Dense -> Output

CONVOLUTIONAL NEURAL NETWORKS (CNN):
- Convolutional layers extract spatial features
- Good for: Images, spatial data
- Architecture: Conv -> Pool -> Conv -> Pool -> Dense -> Output

RECURRENT NEURAL NETWORKS (RNN):
- Connections loop back to maintain state
- Good for: Sequences, time series, text
- Variants: LSTM, GRU

TRANSFORMERS:
- Self-attention mechanism
- Good for: NLP, sequences, images (ViT)
- Architecture: Multi-head attention + feedforward

AUTOENCODERS:
- Encoder compresses, decoder reconstructs
- Good for: Dimensionality reduction, anomaly detection
- Architecture: Encoder -> Bottleneck -> Decoder

GENERATIVE ADVERSARIAL NETWORKS (GAN):
- Generator creates, discriminator judges
- Good for: Image generation, data augmentation
- Architecture: Generator <-> Discriminator
"""

Exercises

Basic

  1. Implement a perceptron and train it on the AND gate.

  2. Build a 2-layer neural network and solve the XOR problem.

  3. Compare sigmoid, tanh, and ReLU activations on a simple classification task.

Intermediate

  1. Implement mini-batch gradient descent with momentum.

  2. Add L2 regularization to a neural network and observe the effect on overfitting.

  3. Implement dropout and compare training with and without it.

Advanced

  1. Build a neural network framework supporting arbitrary layer configurations.

  2. Implement batch normalization and measure training speedup.

  3. Create a simple autoencoder for dimensionality reduction.


Summary

  • Perceptrons can only solve linearly separable problems
  • Multi-layer networks with backpropagation can learn complex patterns
  • Activation functions introduce non-linearity (ReLU is most common)
  • Regularization (L2, dropout, batch norm) helps reduce overfitting
  • Optimizers like Adam adapt learning rates for faster convergence
  • Different architectures suit different problem types

Module Complete

This completes the Machine Learning Fundamentals module! You've learned the core concepts underlying modern machine learning systems.

← Previous: Supervised Learning | Back to Course Index