Neural Networks
Introduction
Neural networks are computing systems inspired by biological neural networks. They learn representations of data through layers of interconnected nodes (neurons). This reading covers the fundamentals of neural networks from perceptrons to deep learning.
Learning Objectives
By the end of this reading, you will be able to:
- Understand the perceptron and multi-layer networks
- Implement forward and backward propagation
- Apply activation functions appropriately
- Train networks using gradient descent
- Recognize common architectures and their uses
1. The Perceptron
Single Neuron
import numpy as np
class Perceptron:
    """Single-layer perceptron (linear classifier).

    Computes output = activation(w·x + b) with a hard step activation,
    trained with the classic perceptron learning rule.
    """

    def __init__(self, n_features: int, learning_rate: float = 0.01):
        self.lr = learning_rate
        # Small random init breaks symmetry without saturating anything.
        self.weights = np.random.randn(n_features) * 0.01
        self.bias = 0

    def activation(self, z: float) -> int:
        """Heaviside step function: 1 for z >= 0, else 0."""
        return 1 if z >= 0 else 0

    def predict(self, x: np.ndarray) -> int:
        """Forward pass: threshold the affine score w·x + b."""
        return self.activation(np.dot(self.weights, x) + self.bias)

    def train(self, X: np.ndarray, y: np.ndarray, n_epochs: int = 100):
        """Perceptron learning rule: nudge parameters on each mistake.

        Stops early once a full epoch produces no misclassifications
        (only guaranteed to happen for linearly separable data).
        """
        for epoch in range(n_epochs):
            n_mistakes = 0
            for sample, target in zip(X, y):
                prediction = self.predict(sample)
                if prediction == target:
                    continue
                # Error is +1 or -1; move the boundary toward the sample.
                error = target - prediction
                self.weights += self.lr * error * sample
                self.bias += self.lr * error
                n_mistakes += 1
            if n_mistakes == 0:
                print(f"Converged at epoch {epoch}")
                break
# Example: train the perceptron on the AND truth table.
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([0, 0, 0, 1])

perceptron = Perceptron(n_features=2, learning_rate=0.1)
perceptron.train(X, y)

# Print the learned truth table.
for x_i in X:
    print(f"{x_i} -> {perceptron.predict(x_i)}")
Limitations
"""
Perceptron can only learn linearly separable patterns!
XOR problem (not linearly separable):
X = [[0,0], [0,1], [1,0], [1,1]]
y = [0, 1, 1, 0]
No line can separate 0s from 1s!
Solution: Multi-layer networks (MLP)
"""
2. Multi-Layer Perceptron (MLP)
Architecture
class NeuralNetwork:
    """Multi-layer perceptron with a configurable stack of sigmoid layers.

    Architecture: Input -> Hidden(s) -> Output.
    """

    def __init__(self, layer_sizes: list):
        """
        layer_sizes: [input_size, hidden1, hidden2, ..., output_size]
        """
        self.layer_sizes = layer_sizes
        self.n_layers = len(layer_sizes)
        self.weights = []
        self.biases = []
        # Xavier initialization: variance = 2 / (n_in + n_out), which keeps
        # signal magnitudes comparable across layers.
        for n_in, n_out in zip(layer_sizes, layer_sizes[1:]):
            scale = np.sqrt(2.0 / (n_in + n_out))
            self.weights.append(np.random.randn(n_in, n_out) * scale)
            self.biases.append(np.zeros(n_out))

    def sigmoid(self, z: np.ndarray) -> np.ndarray:
        """Sigmoid activation; z is clipped so np.exp cannot overflow."""
        return 1 / (1 + np.exp(-np.clip(z, -500, 500)))

    def sigmoid_derivative(self, a: np.ndarray) -> np.ndarray:
        """Derivative written in terms of the activation: a * (1 - a)."""
        return a * (1 - a)

    def forward(self, X: np.ndarray) -> tuple:
        """Forward propagation.

        Returns the activation at every layer (input included) so the
        backward pass can reuse them.
        """
        activations = [X]
        for w, b in zip(self.weights, self.biases):
            activations.append(self.sigmoid(activations[-1] @ w + b))
        return activations

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return only the final layer's output."""
        return self.forward(X)[-1]
Backpropagation
class NeuralNetworkBackprop(NeuralNetwork):
    """Neural network trained by backpropagation with an MSE loss."""

    def __init__(self, layer_sizes: list, learning_rate: float = 0.1):
        super().__init__(layer_sizes)
        self.lr = learning_rate

    def backward(self, X: np.ndarray, y: np.ndarray,
                 activations: list) -> tuple:
        """Backpropagation: gradients of the loss w.r.t. weights and biases.

        activations: per-layer outputs from forward(), input included.
        Returns (gradients_w, gradients_b), parallel to self.weights/biases.
        """
        n_samples = len(X)
        gradients_w = [np.zeros_like(w) for w in self.weights]
        gradients_b = [np.zeros_like(b) for b in self.biases]
        # Output layer: for MSE, dL/da = (a - y); chain with sigmoid'(z),
        # which is a * (1 - a) when written in terms of the activation.
        delta = (activations[-1] - y) * self.sigmoid_derivative(activations[-1])
        # Walk the layers back to front, reusing delta at every step.
        layer = self.n_layers - 2
        while layer >= 0:
            gradients_w[layer] = activations[layer].T @ delta / n_samples
            gradients_b[layer] = np.mean(delta, axis=0)
            if layer > 0:
                # Push the error back through this layer's weights.
                delta = (delta @ self.weights[layer].T) \
                    * self.sigmoid_derivative(activations[layer])
            layer -= 1
        return gradients_w, gradients_b

    def train_step(self, X: np.ndarray, y: np.ndarray) -> float:
        """One full-batch gradient-descent step; returns the MSE loss."""
        activations = self.forward(X)
        loss = np.mean((activations[-1] - y) ** 2)
        gradients_w, gradients_b = self.backward(X, y, activations)
        # In-place updates keep self.weights/biases pointing at the
        # same arrays.
        for w, b, gw, gb in zip(self.weights, self.biases,
                                gradients_w, gradients_b):
            w -= self.lr * gw
            b -= self.lr * gb
        return loss

    def fit(self, X: np.ndarray, y: np.ndarray,
            n_epochs: int = 1000, verbose: bool = True):
        """Run n_epochs full-batch steps, logging loss every 100 epochs."""
        for epoch in range(n_epochs):
            loss = self.train_step(X, y)
            if verbose and epoch % 100 == 0:
                print(f"Epoch {epoch}, Loss: {loss:.6f}")
# Solve XOR problem! A single perceptron cannot, but one hidden layer can.
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([[0], [1], [1], [0]])

nn = NeuralNetworkBackprop([2, 4, 1], learning_rate=1.0)
nn.fit(X, y, n_epochs=5000)

print("\nXOR Results:")
for x_i, y_i in zip(X, y):
    # predict expects a batch, so reshape the single sample to (1, 2).
    pred = nn.predict(x_i.reshape(1, -1))
    print(f"{x_i} -> {pred[0, 0]:.4f} (expected {y_i[0]})")
3. Activation Functions
class ActivationFunctions:
    """Common activation functions and their derivatives.

    Each *_derivative takes whichever quantity makes it cheapest: the
    activation output a (sigmoid, tanh) or the pre-activation z (ReLU
    family).
    """

    @staticmethod
    def sigmoid(z):
        """Range (0, 1); good for binary-classification output layers."""
        return 1 / (1 + np.exp(-np.clip(z, -500, 500)))

    @staticmethod
    def sigmoid_derivative(a):
        """Derivative in terms of the activation a = sigmoid(z)."""
        return a * (1 - a)

    @staticmethod
    def tanh(z):
        """Range (-1, 1); zero-centered, often preferable to sigmoid."""
        return np.tanh(z)

    @staticmethod
    def tanh_derivative(a):
        """Derivative in terms of the activation a = tanh(z)."""
        return 1 - a ** 2

    @staticmethod
    def relu(z):
        """Range [0, inf); the usual default for hidden layers."""
        return np.maximum(0, z)

    @staticmethod
    def relu_derivative(z):
        """1 where z > 0, else 0 (w.r.t. the pre-activation z)."""
        return (z > 0).astype(float)

    @staticmethod
    def leaky_relu(z, alpha=0.01):
        """Range (-inf, inf); the small negative slope fixes dying ReLU."""
        return np.where(z > 0, z, alpha * z)

    @staticmethod
    def leaky_relu_derivative(z, alpha=0.01):
        """1 where z > 0, else alpha."""
        return np.where(z > 0, 1, alpha)

    @staticmethod
    def softmax(z):
        """Multi-class output layer; rows are probabilities summing to 1.

        The row max is subtracted before exponentiating for numerical
        stability (does not change the result).
        """
        shifted = np.exp(z - np.max(z, axis=-1, keepdims=True))
        return shifted / np.sum(shifted, axis=-1, keepdims=True)
"""
Activation Function Guidelines:
HIDDEN LAYERS:
- ReLU: Default choice, fast, works well
- Leaky ReLU: If experiencing dying neurons
- tanh: Sometimes for RNNs
OUTPUT LAYER:
- Binary classification: Sigmoid
- Multi-class classification: Softmax
- Regression: Linear (no activation)
PROBLEMS:
- Sigmoid/tanh: Vanishing gradient for deep networks
- ReLU: Dying neurons (stuck at 0)
"""
4. Loss Functions
class LossFunctions:
    """Common loss functions and their gradients with respect to y_pred."""

    @staticmethod
    def mse(y_true, y_pred):
        """Mean Squared Error - Regression. Averages over ALL elements."""
        return np.mean((y_true - y_pred) ** 2)

    @staticmethod
    def mse_derivative(y_true, y_pred):
        """Gradient of mse() with respect to y_pred.

        BUGFIX: divide by the total element count (np.size) rather than
        len(y_true). mse() averages over every element, so for 2-D
        multi-output targets len() (the first-dimension length) made the
        gradient too large by a factor of n_outputs. For 1-D targets the
        two are identical, so existing single-output callers see no change.
        """
        return 2 * (y_pred - y_true) / np.size(y_true)

    @staticmethod
    def binary_crossentropy(y_true, y_pred, epsilon=1e-15):
        """Binary Cross-Entropy - Binary Classification.

        Predictions are clipped away from 0 and 1 so log() stays finite.
        """
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
        return -np.mean(
            y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred)
        )

    @staticmethod
    def binary_crossentropy_derivative(y_true, y_pred, epsilon=1e-15):
        """Element-wise gradient of the per-sample BCE (no 1/N factor)."""
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
        return (y_pred - y_true) / (y_pred * (1 - y_pred))

    @staticmethod
    def categorical_crossentropy(y_true, y_pred, epsilon=1e-15):
        """Categorical Cross-Entropy - Multi-class Classification.

        Expects one-hot y_true; sums over classes, averages over samples.
        """
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
        return -np.mean(np.sum(y_true * np.log(y_pred), axis=1))

    @staticmethod
    def categorical_crossentropy_derivative(y_true, y_pred):
        """For softmax output, the combined gradient is y_pred - y_true."""
        return y_pred - y_true
5. Optimizers
class Optimizer:
    """Abstract base for optimizers; subclasses must implement update()."""

    def update(self, weights, biases, grad_w, grad_b):
        """Apply one gradient step to the parameter lists; must override."""
        raise NotImplementedError
class SGD(Optimizer):
    """Plain stochastic gradient descent: param -= lr * grad."""

    def __init__(self, learning_rate=0.01):
        self.lr = learning_rate

    def update(self, weights, biases, grad_w, grad_b):
        """Step every (weight, bias) pair in place; returns both lists."""
        for w, b, gw, gb in zip(weights, biases, grad_w, grad_b):
            w -= self.lr * gw
            b -= self.lr * gb
        return weights, biases
class Momentum(Optimizer):
    """SGD with momentum.

    Velocity accumulates an exponentially decaying sum of past gradients,
    damping oscillation and accelerating progress along shallow valleys.
    """

    def __init__(self, learning_rate=0.01, momentum=0.9):
        self.lr = learning_rate
        self.momentum = momentum
        # Velocities are allocated lazily on the first update() call,
        # once the parameter shapes are known.
        self.velocity_w = None
        self.velocity_b = None

    def update(self, weights, biases, grad_w, grad_b):
        """v = momentum * v - lr * grad; param += v. Updates in place."""
        if self.velocity_w is None:
            self.velocity_w = [np.zeros_like(w) for w in weights]
            self.velocity_b = [np.zeros_like(b) for b in biases]
        for i, (gw, gb) in enumerate(zip(grad_w, grad_b)):
            self.velocity_w[i] = self.momentum * self.velocity_w[i] - self.lr * gw
            self.velocity_b[i] = self.momentum * self.velocity_b[i] - self.lr * gb
            weights[i] += self.velocity_w[i]
            biases[i] += self.velocity_b[i]
        return weights, biases
class Adam(Optimizer):
    """Adaptive Moment Estimation.

    Keeps per-parameter running means of the gradient (first moment) and
    its square (second moment), bias-corrected for the zero init, and
    scales each step by the inverse root of the second moment.
    """

    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.lr = learning_rate
        self.beta1 = beta1      # decay rate for the first moment
        self.beta2 = beta2      # decay rate for the second moment
        self.epsilon = epsilon  # keeps the denominator nonzero
        self.m_w = None
        self.v_w = None
        self.m_b = None
        self.v_b = None
        self.t = 0              # step counter, drives bias correction

    def update(self, weights, biases, grad_w, grad_b):
        """One Adam step over every parameter tensor (in place)."""
        self.t += 1
        if self.m_w is None:
            # Lazy allocation once parameter shapes are known.
            self.m_w = [np.zeros_like(w) for w in weights]
            self.v_w = [np.zeros_like(w) for w in weights]
            self.m_b = [np.zeros_like(b) for b in biases]
            self.v_b = [np.zeros_like(b) for b in biases]
        # The correction denominators are identical for every tensor.
        corr1 = 1 - self.beta1 ** self.t
        corr2 = 1 - self.beta2 ** self.t
        for i, (gw, gb) in enumerate(zip(grad_w, grad_b)):
            # Exponential moving averages of gradient and squared gradient.
            self.m_w[i] = self.beta1 * self.m_w[i] + (1 - self.beta1) * gw
            self.m_b[i] = self.beta1 * self.m_b[i] + (1 - self.beta1) * gb
            self.v_w[i] = self.beta2 * self.v_w[i] + (1 - self.beta2) * (gw ** 2)
            self.v_b[i] = self.beta2 * self.v_b[i] + (1 - self.beta2) * (gb ** 2)
            # Bias-corrected estimates compensate the zero initialization.
            m_w_hat = self.m_w[i] / corr1
            v_w_hat = self.v_w[i] / corr2
            m_b_hat = self.m_b[i] / corr1
            v_b_hat = self.v_b[i] / corr2
            weights[i] -= self.lr * m_w_hat / (np.sqrt(v_w_hat) + self.epsilon)
            biases[i] -= self.lr * m_b_hat / (np.sqrt(v_b_hat) + self.epsilon)
        return weights, biases
6. Regularization Techniques
class Regularization:
    """L2 (weight-decay) penalty and its gradient."""

    @staticmethod
    def l2_penalty(weights, lambda_reg=0.01):
        """Total penalty: lambda * sum of squared entries, over all tensors."""
        return sum(lambda_reg * np.sum(w ** 2) for w in weights)

    @staticmethod
    def l2_gradient(weights, lambda_reg=0.01):
        """Per-tensor gradient of the penalty: 2 * lambda * w."""
        return [2 * lambda_reg * w for w in weights]
class Dropout:
    """Inverted dropout.

    During training, randomly zeroes activations with probability `rate`
    and rescales the survivors by 1/(1 - rate) so the expected activation
    is unchanged; at inference the input passes through untouched.
    """

    def __init__(self, rate=0.5):
        self.rate = rate   # probability of dropping a unit
        self.mask = None   # keep-mask saved for the backward pass

    def forward(self, x, training=True):
        """Apply dropout at train time; identity at inference."""
        if not training:
            return x
        keep_prob = 1 - self.rate
        self.mask = np.random.binomial(1, keep_prob, size=x.shape)
        return x * self.mask / keep_prob

    def backward(self, grad):
        """Gradients flow only through the units that were kept."""
        return grad * self.mask / (1 - self.rate)
class BatchNormalization:
    """Batch normalization layer.

    Standardizes each feature over the batch, then applies a learnable
    affine transform gamma * x_norm + beta. At inference it uses running
    statistics accumulated during training.
    """

    def __init__(self, n_features, momentum=0.9, epsilon=1e-8):
        self.gamma = np.ones(n_features)   # learnable scale
        self.beta = np.zeros(n_features)   # learnable shift
        self.momentum = momentum
        self.epsilon = epsilon
        # Exponential running statistics for inference.
        self.running_mean = np.zeros(n_features)
        self.running_var = np.ones(n_features)
        # Cache for backward pass
        self.cache = None

    def forward(self, x, training=True):
        """Normalize with batch stats (training) or running stats (eval)."""
        if not training:
            x_norm = (x - self.running_mean) / np.sqrt(self.running_var + self.epsilon)
            return self.gamma * x_norm + self.beta
        batch_mean = np.mean(x, axis=0)
        batch_var = np.var(x, axis=0)
        # Blend this batch's statistics into the running estimates.
        self.running_mean = self.momentum * self.running_mean + (1 - self.momentum) * batch_mean
        self.running_var = self.momentum * self.running_var + (1 - self.momentum) * batch_var
        x_norm = (x - batch_mean) / np.sqrt(batch_var + self.epsilon)
        return self.gamma * x_norm + self.beta
7. Complete Neural Network Framework
class DenseLayer:
    """Fully connected layer: a = activation(x @ W + b).

    Caches the input, pre-activation z, and activation a on forward()
    so backward() can compute gradients.
    """

    def __init__(self, input_size, output_size, activation='relu'):
        # He-style init: variance 2 / fan_in suits ReLU layers.
        self.weights = np.random.randn(input_size, output_size) * np.sqrt(2.0 / input_size)
        self.biases = np.zeros(output_size)
        self.activation = activation
        self.input = None
        self.z = None
        self.a = None

    def activate(self, z):
        """Apply this layer's configured activation to z."""
        if self.activation == 'relu':
            return np.maximum(0, z)
        if self.activation == 'sigmoid':
            return 1 / (1 + np.exp(-np.clip(z, -500, 500)))
        if self.activation == 'tanh':
            return np.tanh(z)
        if self.activation == 'softmax':
            shifted = np.exp(z - np.max(z, axis=-1, keepdims=True))
            return shifted / np.sum(shifted, axis=-1, keepdims=True)
        # Anything else is treated as linear (identity).
        return z

    def activate_derivative(self, a, z):
        """Derivative of the activation (softmax is handled in backward)."""
        if self.activation == 'relu':
            return (z > 0).astype(float)
        if self.activation == 'sigmoid':
            return a * (1 - a)
        if self.activation == 'tanh':
            return 1 - a ** 2
        # linear; softmax never reaches here
        return np.ones_like(a)

    def forward(self, x):
        """Compute and cache the layer output for a batch x."""
        self.input = x
        self.z = x @ self.weights + self.biases
        self.a = self.activate(self.z)
        return self.a

    def backward(self, grad_output):
        """Given dL/da, store dL/dW and dL/db and return dL/dx."""
        n_samples = len(self.input)
        if self.activation == 'softmax':
            # The combined softmax + cross-entropy gradient is passed in
            # directly as (y_pred - y_true), so no activation term here.
            grad_z = grad_output
        else:
            grad_z = grad_output * self.activate_derivative(self.a, self.z)
        self.grad_weights = self.input.T @ grad_z / n_samples
        self.grad_biases = np.mean(grad_z, axis=0)
        return grad_z @ self.weights.T
class NeuralNet:
    """Sequential network built from layers exposing forward()/backward().

    train() assumes a softmax output with cross-entropy loss, so the
    initial backward gradient is simply y_pred - y_true.
    """

    def __init__(self):
        self.layers = []

    def add(self, layer):
        """Append a layer to the stack."""
        self.layers.append(layer)

    def forward(self, x):
        """Run x through every layer in order."""
        for layer in self.layers:
            x = layer.forward(x)
        return x

    def backward(self, grad):
        """Propagate the loss gradient back through the stack."""
        for layer in reversed(self.layers):
            grad = layer.backward(grad)

    def train(self, X, y, epochs=1000, lr=0.01, batch_size=32):
        """Mini-batch SGD training; prints the loss every 100 epochs."""
        n_samples = len(X)
        for epoch in range(epochs):
            # Fresh shuffle each epoch so batches differ between passes.
            order = np.random.permutation(n_samples)
            X_shuffled, y_shuffled = X[order], y[order]
            total_loss = 0
            for start in range(0, n_samples, batch_size):
                X_batch = X_shuffled[start:start + batch_size]
                y_batch = y_shuffled[start:start + batch_size]
                y_pred = self.forward(X_batch)
                # Cross-entropy; epsilon keeps the log finite.
                total_loss += -np.mean(y_batch * np.log(y_pred + 1e-15))
                # Combined softmax + CE gradient.
                self.backward(y_pred - y_batch)
                # SGD step on every layer that carries parameters.
                for layer in self.layers:
                    if hasattr(layer, 'weights'):
                        layer.weights -= lr * layer.grad_weights
                        layer.biases -= lr * layer.grad_biases
            if epoch % 100 == 0:
                print(f"Epoch {epoch}, Loss: {total_loss:.4f}")

    def predict(self, X):
        """Forward pass only (no training state changed)."""
        return self.forward(X)
# Example: MNIST-like network
"""
model = NeuralNet()
model.add(DenseLayer(784, 128, activation='relu'))
model.add(DenseLayer(128, 64, activation='relu'))
model.add(DenseLayer(64, 10, activation='softmax'))
model.train(X_train, y_train, epochs=100, lr=0.1)
predictions = model.predict(X_test)
"""
8. Common Architectures
"""
FEEDFORWARD NETWORKS (MLP):
- Fully connected layers
- Good for: Tabular data, simple tasks
- Architecture: Input -> Dense -> Dense -> Output
CONVOLUTIONAL NEURAL NETWORKS (CNN):
- Convolutional layers extract spatial features
- Good for: Images, spatial data
- Architecture: Conv -> Pool -> Conv -> Pool -> Dense -> Output
RECURRENT NEURAL NETWORKS (RNN):
- Connections loop back to maintain state
- Good for: Sequences, time series, text
- Variants: LSTM, GRU
TRANSFORMERS:
- Self-attention mechanism
- Good for: NLP, sequences, images (ViT)
- Architecture: Multi-head attention + feedforward
AUTOENCODERS:
- Encoder compresses, decoder reconstructs
- Good for: Dimensionality reduction, anomaly detection
- Architecture: Encoder -> Bottleneck -> Decoder
GENERATIVE ADVERSARIAL NETWORKS (GAN):
- Generator creates, discriminator judges
- Good for: Image generation, data augmentation
- Architecture: Generator <-> Discriminator
"""
Exercises
Basic
Implement a perceptron and train it on the AND gate.
Build a 2-layer neural network and solve the XOR problem.
Compare sigmoid, tanh, and ReLU activations on a simple classification task.
Intermediate
Implement mini-batch gradient descent with momentum.
Add L2 regularization to a neural network and observe the effect on overfitting.
Implement dropout and compare training with and without it.
Advanced
Build a neural network framework supporting arbitrary layer configurations.
Implement batch normalization and measure training speedup.
Create a simple autoencoder for dimensionality reduction.
Summary
- Perceptrons can only solve linearly separable problems
- Multi-layer networks with backpropagation can learn complex patterns
- Activation functions introduce non-linearity (ReLU is most common)
- Regularization (L2, dropout, batch norm) prevents overfitting
- Optimizers like Adam adapt learning rates for faster convergence
- Different architectures suit different problem types
Module Complete
This completes the Machine Learning Fundamentals module! You've learned the core concepts underlying modern machine learning systems.