Neural Networks
Neural networks are computing systems inspired by biological neural networks. They learn representations of data through layers of interconnected nodes (neurons).

## 1. The Perceptron
Single Neuron
import numpy as np
class Perceptron:
    """Single-layer perceptron (linear binary classifier).

    The model computes ``output = step(w · x + b)`` where ``step`` returns 1
    for non-negative input and 0 otherwise.
    """

    def __init__(self, n_features: int, learning_rate: float = 0.01):
        """Create a perceptron with small random weights and a zero bias.

        Args:
            n_features: Number of input features (length of the weight vector).
            learning_rate: Step size applied to every weight/bias correction.
        """
        self.lr = learning_rate
        self.weights = np.random.randn(n_features) * 0.01
        # Start as a float so the attribute keeps one type across updates.
        self.bias = 0.0

    def activation(self, z: float) -> int:
        """Step function: 1 when ``z >= 0``, otherwise 0."""
        return 1 if z >= 0 else 0

    def predict(self, x: np.ndarray) -> int:
        """Forward pass for a single sample ``x``; returns the class 0 or 1."""
        z = np.dot(self.weights, x) + self.bias
        return self.activation(z)

    def train(self, X: np.ndarray, y: np.ndarray, n_epochs: int = 100):
        """Run the perceptron learning algorithm.

        Each epoch visits the samples in order and, for every misclassified
        sample, moves the weights and bias by ``lr * (y - y_pred)``. Training
        stops early (printing the epoch) after a full pass without mistakes.

        Args:
            X: Samples, one per row.
            y: Target labels (0 or 1), one per sample.
            n_epochs: Maximum number of passes over the data.

        Raises:
            ValueError: If ``X`` and ``y`` do not have the same length.
        """
        # zip() would silently drop the surplus samples/labels otherwise.
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )

        for epoch in range(n_epochs):
            errors = 0
            for x_i, y_i in zip(X, y):
                y_pred = self.predict(x_i)
                if y_pred != y_i:
                    # error is +1 or -1 for 0/1 labels; it sets the direction.
                    error = y_i - y_pred
                    self.weights += self.lr * error * x_i
                    self.bias += self.lr * error
                    errors += 1
            if errors == 0:
                print(f"Converged at epoch {epoch}")
                break
# Example: AND gate.
# The four input rows enumerate every combination of two binary inputs;
# only (1, 1) maps to 1, which is linearly separable.
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([0, 0, 0, 1])

perceptron = Perceptron(n_features=2, learning_rate=0.1)
perceptron.train(X, y)

# Show the learned truth table, one input row per line.
for x_i in X:
    print(f"{x_i} -> {perceptron.predict(x_i)}")
Limitations
"""
Perceptron can only learn linearly separable patterns!
XOR problem (not linearly separable):
X = [[0,0], [0,1], [1,0], [1,1]]
y = [0, 1, 1, 0]
No line can separate 0s from 1s!
Solution: Multi-layer networks (MLP)
"""
2. Multi-Layer Perceptron (MLP)
Architecture
class NeuralNetwork:
    """Multi-layer perceptron with configurable layers.

    Architecture: Input -> Hidden(s) -> Output. Every layer, including the
    output layer, uses a sigmoid activation. This class implements only the
    forward pass.
    """

    def __init__(self, layer_sizes: list):
        """Create the weight matrices and bias vectors.

        Args:
            layer_sizes: [input_size, hidden1, hidden2, ..., output_size].

        Raises:
            ValueError: If fewer than two sizes are given, since at least an
                input and an output size are needed to form one layer.
        """
        if len(layer_sizes) < 2:
            raise ValueError(
                "layer_sizes needs at least an input and an output size, "
                f"got {len(layer_sizes)} entries"
            )

        self.layer_sizes = layer_sizes
        self.n_layers = len(layer_sizes)
        self.weights = []
        self.biases = []

        # Xavier initialization: variance = 2 / (n_in + n_out).
        for n_in, n_out in zip(layer_sizes[:-1], layer_sizes[1:]):
            scale = np.sqrt(2.0 / (n_in + n_out))
            self.weights.append(np.random.randn(n_in, n_out) * scale)
            self.biases.append(np.zeros(n_out))

    def sigmoid(self, z: np.ndarray) -> np.ndarray:
        """Sigmoid activation; the input is clipped to [-500, 500] first."""
        return 1 / (1 + np.exp(-np.clip(z, -500, 500)))

    def sigmoid_derivative(self, a: np.ndarray) -> np.ndarray:
        """Derivative of sigmoid expressed via its output: a * (1 - a)."""
        return a * (1 - a)

    def forward(self, X: np.ndarray) -> list:
        """Forward propagation.

        Returns:
            A list of activations, starting with ``X`` itself followed by the
            output of every layer, as needed for backpropagation.
        """
        activations = [X]
        a = X
        for w, b in zip(self.weights, self.biases):
            a = self.sigmoid(a @ w + b)
            activations.append(a)
        return activations

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the activation of the final layer for ``X``."""
        return self.forward(X)[-1]
Backpropagation
class NeuralNetworkBackprop(NeuralNetwork):
    """Neural network with backpropagation training (full-batch gradient descent)."""

    def __init__(self, layer_sizes: list, learning_rate: float = 0.1):
        """Build the network and store the gradient-descent step size."""
        super().__init__(layer_sizes)
        self.lr = learning_rate

    @staticmethod
    def _align_targets(y: np.ndarray, output: np.ndarray) -> np.ndarray:
        """Turn a 1-D target vector into a column when the output is (n, 1).

        Without this, ``output - y`` would broadcast an (n, 1) output against
        an (n,) target into an (n, n) matrix and silently corrupt both the
        loss and the gradients.
        """
        y = np.asarray(y)
        if (
            y.ndim == 1
            and output.ndim == 2
            and output.shape[1] == 1
            and y.shape[0] == output.shape[0]
        ):
            return y.reshape(-1, 1)
        return y

    def backward(self, X: np.ndarray, y: np.ndarray,
                 activations: list) -> tuple:
        """Backpropagation.

        Compute gradients of the loss with respect to weights and biases.

        Args:
            X: Input batch; only its length (number of samples) is used here.
            y: Targets for the batch.
            activations: The list returned by ``forward`` for ``X``.

        Returns:
            ``(gradients_w, gradients_b)``: two lists with one entry per layer.
        """
        y = self._align_targets(y, activations[-1])
        n_samples = len(X)
        gradients_w = [None] * len(self.weights)
        gradients_b = [None] * len(self.biases)

        # Output layer error.
        # delta = dL/da * da/dz = (a - y) * sigmoid'(z). Using (a - y) is the
        # gradient of 0.5 * squared error; relative to the MSE reported by
        # train_step this only differs by a constant factor.
        delta = (activations[-1] - y) * self.sigmoid_derivative(activations[-1])

        # Walk from the last layer back to the first.
        for i in range(self.n_layers - 2, -1, -1):
            gradients_w[i] = activations[i].T @ delta / n_samples
            gradients_b[i] = np.mean(delta, axis=0)
            if i > 0:
                # Propagate the error to the previous layer.
                delta = (delta @ self.weights[i].T) * \
                    self.sigmoid_derivative(activations[i])

        return gradients_w, gradients_b

    def train_step(self, X: np.ndarray, y: np.ndarray) -> float:
        """Run one forward/backward pass and update all parameters in place.

        Returns:
            The mean squared error measured before the update.
        """
        activations = self.forward(X)
        y = self._align_targets(y, activations[-1])
        loss = np.mean((activations[-1] - y) ** 2)

        gradients_w, gradients_b = self.backward(X, y, activations)

        for i, (grad_w, grad_b) in enumerate(zip(gradients_w, gradients_b)):
            self.weights[i] -= self.lr * grad_w
            self.biases[i] -= self.lr * grad_b

        return loss

    def fit(self, X: np.ndarray, y: np.ndarray,
            n_epochs: int = 1000, verbose: bool = True):
        """Train the network for ``n_epochs`` full-batch steps.

        When ``verbose`` is true the loss is printed every 100 epochs.
        """
        for epoch in range(n_epochs):
            loss = self.train_step(X, y)
            if verbose and epoch % 100 == 0:
                print(f"Epoch {epoch}, Loss: {loss:.6f}")
# Solve XOR problem!
# Targets are a column vector so they line up with the single output unit.
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([[0], [1], [1], [0]])

nn = NeuralNetworkBackprop([2, 4, 1], learning_rate=1.0)
nn.fit(X, y, n_epochs=5000)

print("\nXOR Results:")
for x_i, y_i in zip(X, y):
    # predict() is fed a batch, so present the sample as a single row.
    pred = nn.predict(x_i.reshape(1, -1))
    print(f"{x_i} -> {pred[0, 0]:.4f} (expected {y_i[0]})")
3. Activation Functions
class ActivationFunctions:
    """Common activation functions and their derivatives.

    Every function accepts NumPy arrays as well as plain Python scalars and
    (nested) lists; inputs are converted with ``np.asarray`` first.
    The ``*_derivative`` functions for sigmoid and tanh take the activation
    output ``a``; those for ReLU and leaky ReLU take the pre-activation ``z``.
    """

    @staticmethod
    def sigmoid(z):
        """Range: (0, 1) - Good for output layer (binary classification)."""
        z = np.asarray(z)
        # Clip so np.exp cannot overflow for very negative inputs.
        return 1 / (1 + np.exp(-np.clip(z, -500, 500)))

    @staticmethod
    def sigmoid_derivative(a):
        """Sigmoid derivative in terms of the sigmoid output: a * (1 - a)."""
        a = np.asarray(a)
        return a * (1 - a)

    @staticmethod
    def tanh(z):
        """Range: (-1, 1) - Zero-centered, often better than sigmoid."""
        return np.tanh(z)

    @staticmethod
    def tanh_derivative(a):
        """Tanh derivative in terms of the tanh output: 1 - a^2."""
        a = np.asarray(a)
        return 1 - a ** 2

    @staticmethod
    def relu(z):
        """Range: [0, inf) - Most popular for hidden layers."""
        return np.maximum(0, z)

    @staticmethod
    def relu_derivative(z):
        """1.0 where ``z > 0``, otherwise 0.0."""
        return (np.asarray(z) > 0).astype(float)

    @staticmethod
    def leaky_relu(z, alpha=0.01):
        """Range: (-inf, inf) - Fixes dying ReLU problem."""
        z = np.asarray(z)
        return np.where(z > 0, z, alpha * z)

    @staticmethod
    def leaky_relu_derivative(z, alpha=0.01):
        """1 where ``z > 0``, otherwise ``alpha``."""
        return np.where(np.asarray(z) > 0, 1, alpha)

    @staticmethod
    def softmax(z):
        """Multi-class output layer - probabilities sum to 1 along the last axis."""
        z = np.asarray(z)
        # Subtracting the row maximum leaves the result unchanged but keeps
        # np.exp from overflowing.
        exp_z = np.exp(z - np.max(z, axis=-1, keepdims=True))
        return exp_z / np.sum(exp_z, axis=-1, keepdims=True)
"""
Activation Function Guidelines:
HIDDEN LAYERS:
- ReLU: Default choice, fast, works well
- Leaky ReLU: If experiencing dying neurons
- tanh: Sometimes for RNNs
OUTPUT LAYER:
- Binary classification: Sigmoid
- Multi-class classification: Softmax
- Regression: Linear (no activation)
PROBLEMS:
- Sigmoid/tanh: Vanishing gradient for deep networks
- ReLU: Dying neurons (stuck at 0)
"""
4. Loss Functions
class LossFunctions:
    """Common loss functions and their derivatives with respect to ``y_pred``.

    Inputs may be NumPy arrays or array-likes (lists, scalars).
    """

    @staticmethod
    def mse(y_true, y_pred):
        """Mean Squared Error - Regression. Averages over every element."""
        y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
        return np.mean((y_true - y_pred) ** 2)

    @staticmethod
    def mse_derivative(y_true, y_pred):
        """Gradient of ``mse`` with respect to ``y_pred``.

        ``mse`` averages over all elements, so the gradient is normalised by
        the total element count rather than by the number of rows; for 1-D
        inputs both are the same.
        """
        y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
        return 2 * (y_pred - y_true) / y_true.size

    @staticmethod
    def binary_crossentropy(y_true, y_pred, epsilon=1e-15):
        """Binary Cross-Entropy - Binary Classification.

        Predictions are clipped to [epsilon, 1 - epsilon] to avoid log(0).
        """
        y_true = np.asarray(y_true)
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
        return -np.mean(
            y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred)
        )

    @staticmethod
    def binary_crossentropy_derivative(y_true, y_pred, epsilon=1e-15):
        """Element-wise derivative of binary cross-entropy.

        Note: unlike ``mse_derivative`` this is not divided by the number of
        elements; it is the derivative of each sample's own loss term.
        """
        y_true = np.asarray(y_true)
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
        return (y_pred - y_true) / (y_pred * (1 - y_pred))

    @staticmethod
    def categorical_crossentropy(y_true, y_pred, epsilon=1e-15):
        """Categorical Cross-Entropy - Multi-class Classification.

        Sums over the last (class) axis and averages the result over samples.
        """
        y_true = np.asarray(y_true)
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
        return -np.mean(np.sum(y_true * np.log(y_pred), axis=-1))

    @staticmethod
    def categorical_crossentropy_derivative(y_true, y_pred):
        """For softmax output, gradient simplifies to y_pred - y_true."""
        return np.asarray(y_pred) - np.asarray(y_true)
5. Optimizers
class Optimizer:
    """Base optimizer class.

    Subclasses implement ``update``, which modifies the given parameter lists
    in place and returns them.
    """

    def update(self, weights, biases, grad_w, grad_b):
        """Apply one optimisation step; must be overridden."""
        raise NotImplementedError

    @staticmethod
    def _check_aligned(weights, biases, grad_w, grad_b):
        """Raise ValueError unless all four per-layer lists have equal length."""
        lengths = (len(weights), len(biases), len(grad_w), len(grad_b))
        if len(set(lengths)) != 1:
            raise ValueError(
                "weights, biases, grad_w and grad_b must have the same "
                f"number of entries, got {lengths}"
            )


class SGD(Optimizer):
    """Stochastic Gradient Descent: ``param -= lr * grad``."""

    def __init__(self, learning_rate=0.01):
        self.lr = learning_rate

    def update(self, weights, biases, grad_w, grad_b):
        """Step every weight and bias against its gradient, in place.

        Returns:
            The same ``(weights, biases)`` lists that were passed in.

        Raises:
            ValueError: If the four lists differ in length.
        """
        self._check_aligned(weights, biases, grad_w, grad_b)
        for i, (g_w, g_b) in enumerate(zip(grad_w, grad_b)):
            weights[i] -= self.lr * g_w
            biases[i] -= self.lr * g_b
        return weights, biases


class Momentum(Optimizer):
    """SGD with Momentum.

    Keeps one velocity per parameter:
    ``v = momentum * v - lr * grad`` followed by ``param += v``.
    """

    def __init__(self, learning_rate=0.01, momentum=0.9):
        self.lr = learning_rate
        self.momentum = momentum
        # Velocities are created lazily on the first update, once the
        # parameter shapes are known.
        self.velocity_w = None
        self.velocity_b = None

    def update(self, weights, biases, grad_w, grad_b):
        """Update the velocities and move the parameters by them, in place.

        Returns:
            The same ``(weights, biases)`` lists that were passed in.

        Raises:
            ValueError: If the four lists differ in length.
        """
        self._check_aligned(weights, biases, grad_w, grad_b)
        if self.velocity_w is None:
            self.velocity_w = [np.zeros_like(w) for w in weights]
            self.velocity_b = [np.zeros_like(b) for b in biases]

        for i, (g_w, g_b) in enumerate(zip(grad_w, grad_b)):
            self.velocity_w[i] = self.momentum * self.velocity_w[i] - self.lr * g_w
            self.velocity_b[i] = self.momentum * self.velocity_b[i] - self.lr * g_b
            weights[i] += self.velocity_w[i]
            biases[i] += self.velocity_b[i]
        return weights, biases


class Adam(Optimizer):
    """Adaptive Moment Estimation.

    Tracks exponentially decayed averages of the gradient (first moment) and
    the squared gradient (second moment), each with bias correction.
    """

    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.lr = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.m_w = None  # First moment (weights)
        self.v_w = None  # Second moment (weights)
        self.m_b = None  # First moment (biases)
        self.v_b = None  # Second moment (biases)
        self.t = 0  # Number of update() calls so far

    def _apply(self, params, grads, first, second):
        """Run the Adam update for one parameter list (weights or biases)."""
        # The correction terms depend only on the step count, so compute once.
        correction1 = 1 - self.beta1 ** self.t
        correction2 = 1 - self.beta2 ** self.t
        for i, grad in enumerate(grads):
            # Biased moment estimates.
            first[i] = self.beta1 * first[i] + (1 - self.beta1) * grad
            second[i] = self.beta2 * second[i] + (1 - self.beta2) * (grad ** 2)
            # Bias-corrected estimates.
            m_hat = first[i] / correction1
            v_hat = second[i] / correction2
            params[i] -= self.lr * m_hat / (np.sqrt(v_hat) + self.epsilon)

    def update(self, weights, biases, grad_w, grad_b):
        """Apply one Adam step to all weights and biases, in place.

        Returns:
            The same ``(weights, biases)`` lists that were passed in.

        Raises:
            ValueError: If the four lists differ in length.
        """
        self._check_aligned(weights, biases, grad_w, grad_b)
        self.t += 1

        if self.m_w is None:
            self.m_w = [np.zeros_like(w) for w in weights]
            self.v_w = [np.zeros_like(w) for w in weights]
            self.m_b = [np.zeros_like(b) for b in biases]
            self.v_b = [np.zeros_like(b) for b in biases]

        self._apply(weights, grad_w, self.m_w, self.v_w)
        self._apply(biases, grad_b, self.m_b, self.v_b)
        return weights, biases
6. Regularization Techniques
class Regularization:
    """Helpers for L2 weight regularization, used to limit overfitting."""

    @staticmethod
    def l2_penalty(weights, lambda_reg=0.01):
        """Return the L2 penalty: ``lambda_reg * sum(w ** 2)`` summed over all arrays.

        An empty ``weights`` list yields 0.
        """
        return sum(lambda_reg * np.sum(w ** 2) for w in weights)

    @staticmethod
    def l2_gradient(weights, lambda_reg=0.01):
        """Return the gradient of the L2 penalty, one array per weight array."""
        factor = 2 * lambda_reg
        return [factor * w for w in weights]
class Dropout:
    """Dropout regularization (inverted dropout).

    During training each element is zeroed with probability ``rate`` and the
    survivors are scaled by ``1 / (1 - rate)`` so the expected value is kept.
    """

    def __init__(self, rate=0.5):
        """
        Args:
            rate: Probability of dropping an element; must be in [0, 1).

        Raises:
            ValueError: If ``rate`` is outside [0, 1). A rate of 1 would drop
                everything and divide by zero when rescaling.
        """
        if not 0 <= rate < 1:
            raise ValueError(f"rate must be in [0, 1), got {rate}")
        self.rate = rate
        self.mask = None

    def forward(self, x, training=True):
        """Apply dropout during training; pass ``x`` through unchanged otherwise."""
        if not training:
            # Forget any earlier mask so backward() matches this identity pass.
            self.mask = None
            return x
        keep_prob = 1 - self.rate
        self.mask = np.random.binomial(1, keep_prob, size=x.shape)
        return x * self.mask / keep_prob  # Scale to maintain expected value

    def backward(self, grad):
        """Backpropagate through dropout.

        Uses the mask from the most recent training-mode ``forward``. If the
        last forward pass was in inference mode (or none has happened yet),
        the layer acted as the identity and ``grad`` is returned unchanged.
        """
        if self.mask is None:
            return grad
        return grad * self.mask / (1 - self.rate)
class BatchNormalization:
    """Batch normalization layer (forward pass only).

    Normalizes each feature over the batch axis, then applies the scale
    ``gamma`` and shift ``beta``. Running statistics collected in training
    mode are used for normalization in inference mode.
    """

    def __init__(self, n_features, momentum=0.9, epsilon=1e-8):
        # Scale and shift, initialised to the identity transform.
        self.gamma = np.ones(n_features)
        self.beta = np.zeros(n_features)

        self.momentum = momentum
        self.epsilon = epsilon

        # Running statistics, used when forward() is called with training=False.
        self.running_mean = np.zeros(n_features)
        self.running_var = np.ones(n_features)

        # Reserved for a backward pass; forward() does not write to it.
        self.cache = None

    def forward(self, x, training=True):
        """Normalize ``x`` and return ``gamma * x_norm + beta``.

        In training mode the batch mean/variance (over axis 0) are used and
        blended into the running statistics; otherwise the running statistics
        are used and left untouched.
        """
        if not training:
            scaled = (x - self.running_mean) / np.sqrt(self.running_var + self.epsilon)
            return self.gamma * scaled + self.beta

        batch_mean = np.mean(x, axis=0)
        batch_var = np.var(x, axis=0)

        # Exponential moving average: keep `momentum` of the old value.
        new_share = 1 - self.momentum
        self.running_mean = self.momentum * self.running_mean + new_share * batch_mean
        self.running_var = self.momentum * self.running_var + new_share * batch_var

        scaled = (x - batch_mean) / np.sqrt(batch_var + self.epsilon)
        return self.gamma * scaled + self.beta
7. Complete Neural Network Framework
class DenseLayer:
    """Fully connected layer: ``a = activation(x @ weights + biases)``.

    Supported activation names are 'relu', 'sigmoid', 'tanh' and 'softmax';
    any other value is treated as linear (identity).
    """

    def __init__(self, input_size, output_size, activation='relu'):
        """Create the layer with random weights and zero biases.

        Weights are drawn from a standard normal and scaled by
        ``sqrt(2 / input_size)``.
        """
        self.weights = np.random.randn(input_size, output_size) * np.sqrt(2.0 / input_size)
        self.biases = np.zeros(output_size)
        self.activation = activation

        # Cache for backprop, filled by forward().
        self.input = None
        self.z = None
        self.a = None

        # Gradients, filled by backward(). Declared here so the attributes
        # always exist, even before the first backward pass.
        self.grad_weights = None
        self.grad_biases = None

    def activate(self, z):
        """Apply the configured activation to the pre-activation ``z``."""
        if self.activation == 'relu':
            return np.maximum(0, z)
        if self.activation == 'sigmoid':
            return 1 / (1 + np.exp(-np.clip(z, -500, 500)))
        if self.activation == 'tanh':
            return np.tanh(z)
        if self.activation == 'softmax':
            # Shift by the row maximum for numerical stability.
            exp_z = np.exp(z - np.max(z, axis=-1, keepdims=True))
            return exp_z / np.sum(exp_z, axis=-1, keepdims=True)
        return z  # linear

    def activate_derivative(self, a, z):
        """Element-wise derivative of the activation.

        Args:
            a: Activation output (used by sigmoid and tanh).
            z: Pre-activation input (used by relu).

        Returns ones for linear and for softmax, whose gradient is handled
        separately in ``backward``.
        """
        if self.activation == 'relu':
            return (z > 0).astype(float)
        if self.activation == 'sigmoid':
            return a * (1 - a)
        if self.activation == 'tanh':
            return 1 - a ** 2
        return np.ones_like(a)

    def forward(self, x):
        """Compute the layer output for batch ``x`` and cache the intermediates."""
        self.input = x
        self.z = x @ self.weights + self.biases
        self.a = self.activate(self.z)
        return self.a

    def backward(self, grad_output):
        """Backpropagate ``grad_output`` through the layer.

        Stores the batch-averaged parameter gradients in ``grad_weights`` and
        ``grad_biases`` and returns the gradient with respect to the input.

        Raises:
            RuntimeError: If called before ``forward``.
        """
        if self.input is None:
            raise RuntimeError("backward() called before forward()")

        n_samples = len(self.input)

        if self.activation != 'softmax':
            grad_z = grad_output * self.activate_derivative(self.a, self.z)
        else:
            # Softmax + CE gradient is just (y_pred - y_true), so the incoming
            # gradient is taken to already be with respect to z.
            grad_z = grad_output

        self.grad_weights = self.input.T @ grad_z / n_samples
        self.grad_biases = np.mean(grad_z, axis=0)

        # Gradient for the previous layer.
        return grad_z @ self.weights.T
class NeuralNet:
    """Neural network built from an arbitrary sequence of layers.

    Layers must provide ``forward(x)`` and ``backward(grad)``. Layers that
    have a ``weights`` attribute are also expected to expose ``biases``,
    ``grad_weights`` and ``grad_biases`` after a backward pass.
    """

    def __init__(self):
        self.layers = []

    def add(self, layer):
        """Append ``layer`` to the end of the network."""
        self.layers.append(layer)

    def forward(self, x):
        """Feed ``x`` through every layer in order and return the final output."""
        for layer in self.layers:
            x = layer.forward(x)
        return x

    def backward(self, grad):
        """Propagate ``grad`` through the layers in reverse order."""
        for layer in reversed(self.layers):
            grad = layer.backward(grad)

    def train(self, X, y, epochs=1000, lr=0.01, batch_size=32):
        """Train with mini-batch gradient descent on a cross-entropy loss.

        The gradient fed into the last layer is ``y_pred - y``, i.e. the
        combined softmax + cross-entropy gradient.

        Args:
            X: Training samples, one per row.
            y: Targets with one row per sample (one-hot for softmax output).
            epochs: Number of passes over the data.
            lr: Learning rate for the plain gradient-descent update.
            batch_size: Number of samples per mini-batch.

        Returns:
            A list with the mean per-sample cross-entropy of each epoch. The
            same figure is printed every 100 epochs. It sums over classes and
            averages over samples, so it does not depend on the batch size
            or on the number of classes.
        """
        n_samples = len(X)
        history = []

        for epoch in range(epochs):
            # Shuffle data so batches differ between epochs.
            indices = np.random.permutation(n_samples)
            X_shuffled = X[indices]
            y_shuffled = y[indices]
            total_loss = 0.0

            for start in range(0, n_samples, batch_size):
                X_batch = X_shuffled[start:start + batch_size]
                y_batch = y_shuffled[start:start + batch_size]

                y_pred = self.forward(X_batch)

                # Per-sample cross-entropy; 1e-15 guards against log(0).
                sample_loss = -np.sum(y_batch * np.log(y_pred + 1e-15), axis=-1)
                total_loss += np.sum(sample_loss)

                self.backward(y_pred - y_batch)  # Softmax + CE

                for layer in self.layers:
                    if hasattr(layer, 'weights'):
                        layer.weights -= lr * layer.grad_weights
                        layer.biases -= lr * layer.grad_biases

            # max() avoids dividing by zero when there is no data.
            epoch_loss = total_loss / max(n_samples, 1)
            history.append(epoch_loss)
            if epoch % 100 == 0:
                print(f"Epoch {epoch}, Loss: {epoch_loss:.4f}")

        return history

    def predict(self, X):
        """Return the network output for ``X`` (same as ``forward``)."""
        return self.forward(X)
# Example: MNIST-like network
"""
model = NeuralNet()
model.add(DenseLayer(784, 128, activation='relu'))
model.add(DenseLayer(128, 64, activation='relu'))
model.add(DenseLayer(64, 10, activation='softmax'))
model.train(X_train, y_train, epochs=100, lr=0.1)
predictions = model.predict(X_test)
"""
8. Common Architectures
"""
FEEDFORWARD NETWORKS (MLP):
- Fully connected layers
- Good for: Tabular data, simple tasks
- Architecture: Input -> Dense -> Dense -> Output
CONVOLUTIONAL NEURAL NETWORKS (CNN):
- Convolutional layers extract spatial features
- Good for: Images, spatial data
- Architecture: Conv -> Pool -> Conv -> Pool -> Dense -> Output
RECURRENT NEURAL NETWORKS (RNN):
- Connections loop back to maintain state
- Good for: Sequences, time series, text
- Variants: LSTM, GRU
TRANSFORMERS:
- Self-attention mechanism
- Good for: NLP, sequences, images (ViT)
- Architecture: Multi-head attention + feedforward
AUTOENCODERS:
- Encoder compresses, decoder reconstructs
- Good for: Dimensionality reduction, anomaly detection
- Architecture: Encoder -> Bottleneck -> Decoder
GENERATIVE ADVERSARIAL NETWORKS (GAN):
- Generator creates, discriminator judges
- Good for: Image generation, data augmentation
- Architecture: Generator <-> Discriminator
"""
Exercises
Basic
Implement a perceptron and train it on the AND gate.
Build a 2-layer neural network and solve the XOR problem.
Compare sigmoid, tanh, and ReLU activations on a simple classification task.
Intermediate
Implement mini-batch gradient descent with momentum.
Add L2 regularization to a neural network and observe the effect on overfitting.
Implement dropout and compare training with and without it.
Advanced
Build a neural network framework supporting arbitrary layer configurations.
Implement batch normalization and measure training speedup.
Create a simple autoencoder for dimensionality reduction.
Summary
- Perceptrons can only solve linearly separable problems
- Multi-layer networks with backpropagation can learn complex patterns
- Activation functions introduce non-linearity (ReLU is most common)
- Regularization (L2, dropout, batch norm) helps prevent overfitting
- Optimizers like Adam adapt learning rates for faster convergence
- Different architectures suit different problem types
Module Complete
This completes the Machine Learning Fundamentals module! You've learned the core concepts underlying modern machine learning systems.