Coding Notebook¶
Questions and solutions for coding problems.
Low-Level Neural Network Implementation¶
- Backpropagation
    - Implement forward and backward propagation for a simple neural network with one hidden layer.
    - Derive and code gradient updates for parameters (weights and biases) using manual differentiation.
- Activation Functions
    - Write code for activation functions like ReLU, Sigmoid, Tanh, and their derivatives (a minimal sketch follows this list).
    - Compare how different activation functions affect the gradient flow during backpropagation.
- Optimization
    - Implement optimizers like SGD, Momentum, RMSProp, or Adam from scratch.
    - Demonstrate the difference between learning rate schedules and their impact on convergence.
- Loss Functions
    - Implement and compute loss functions such as Mean Squared Error (MSE), Cross-Entropy Loss, or Hinge Loss.
- Matrix Computations
    - Write functions to perform matrix multiplications efficiently (without using libraries like NumPy).
    - Demonstrate how batch processing can improve computational efficiency.
- NN Architectures
    - Code a simple feedforward network from scratch.
    - Extend the network to include more advanced layers like dropout or batch normalization.
- Debugging NN Models
    - Identify and fix issues in a neural network implementation, such as exploding gradients, vanishing gradients, or incorrect weight initialization.
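The activation-function item above has no dedicated cell later in this notebook, so here is a minimal NumPy sketch (an illustrative addition, not original notebook code) of ReLU, Sigmoid, Tanh and their derivatives; the small comparison at the end hints at why saturating activations shrink gradients during backpropagation.

import numpy as np

def relu(x):
    return np.maximum(0, x)

def relu_grad(x):
    return (x > 0).astype(x.dtype)

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def sigmoid_grad(x):
    s = sigmoid(x)
    return s * (1 - s)  # peaks at 0.25, so stacked sigmoid layers shrink gradients

def tanh(x):
    return np.tanh(x)

def tanh_grad(x):
    return 1 - np.tanh(x) ** 2  # peaks at 1.0 at x = 0, but still saturates for large |x|

if __name__ == "__main__":
    x = np.linspace(-4, 4, 9)
    # ReLU passes gradients through unchanged for positive inputs, while
    # sigmoid/tanh gradients vanish as |x| grows.
    print("relu'   :", relu_grad(x))
    print("sigmoid':", np.round(sigmoid_grad(x), 3))
    print("tanh'   :", np.round(tanh_grad(x), 3))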
Based on Resume:
- Perceptron implementation
- Attention (a minimal sketch follows this list)
    - Flash Attention
- Transformer implementation
- Trainer code
- Position encodings
    - Rotary Position Embedding (RoPE)
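Attention and Flash Attention are listed above but not solved later in this notebook, so here is a hedged NumPy sketch of plain single-head scaled dot-product attention; it is not Flash Attention, which additionally tiles the computation to avoid materializing the full score matrix. The shapes and the toy example are illustrative assumptions.

import numpy as np

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)  # subtract max for numerical stability
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def scaled_dot_product_attention(Q, K, V, mask=None):
    """
    Q: [n_q, d_k], K: [n_k, d_k], V: [n_k, d_v]  ->  output: [n_q, d_v]
    """
    d_k = Q.shape[-1]
    scores = Q @ K.T / np.sqrt(d_k)            # [n_q, n_k]
    if mask is not None:
        scores = np.where(mask, scores, -1e9)  # positions where mask is False are suppressed
    weights = softmax(scores, axis=-1)         # each query's weights sum to 1
    return weights @ V

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    Q = rng.normal(size=(3, 4))
    K = rng.normal(size=(5, 4))
    V = rng.normal(size=(5, 2))
    print(scaled_dot_product_attention(Q, K, V).shape)  # (3, 2)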
Perceptron Implementation¶
In [ ]:
# Perceptron: two-class classification with labels 0 or 1
import numpy as np

class Perceptron:
    def __init__(self, learning_rate=0.01, n_iter=1000):
        """
        Parameters:
        - learning_rate: Learning rate for weight updates.
        - n_iter: Number of iterations over the training dataset.
        """
        self.learning_rate = learning_rate
        self.n_iter = n_iter
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """
        Fit the Perceptron to the training data.
        Parameters:
        - X: Training data (numpy array of shape [n_samples, n_features]).
        - y: Target values (numpy array of shape [n_samples], values in {0, 1}).
        """
        # Initialize weights and bias
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0
        # Convert labels to {-1, 1}
        y_ = np.where(y > 0, 1, -1)
        # Training loop
        for _ in range(self.n_iter):
            for idx, x_i in enumerate(X):
                # Calculate the linear output
                linear_output = np.dot(x_i, self.weights) + self.bias
                # Update weights and bias if the sample is misclassified
                # (label and linear output disagree in sign, or the output is exactly 0)
                if y_[idx] * linear_output <= 0:
                    self.weights += self.learning_rate * y_[idx] * x_i
                    self.bias += self.learning_rate * y_[idx]

    def predict(self, X):
        """
        Predict class labels for samples in X.
        Parameters:
        - X: Input data (numpy array of shape [n_samples, n_features]).
        Returns:
        - Predicted class labels (numpy array of shape [n_samples], values in {0, 1}).
        """
        linear_output = np.dot(X, self.weights) + self.bias
        return np.where(linear_output > 0, 1, 0)

# Example Usage
if __name__ == "__main__":
    # Sample dataset (AND logic gate)
    X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
    y = np.array([0, 0, 0, 1])
    # Create and train the Perceptron
    perceptron = Perceptron(learning_rate=0.1, n_iter=10)
    perceptron.fit(X, y)
    # Test the Perceptron
    predictions = perceptron.predict(X)
    print("Predictions:", predictions)
    print("Weights:", perceptron.weights)
    print("Bias:", perceptron.bias)
Predictions: [0 0 0 1]
Weights: [0.2 0.1]
Bias: -0.20000000000000004
In [ ]:
# Perceptron-style linear unit trained for regression (delta rule)
import numpy as np

class Perceptron:
    def __init__(self, learning_rate=0.01, n_iter=1000):
        """
        Parameters:
        - learning_rate: Learning rate for weight updates.
        - n_iter: Number of iterations over the training dataset.
        """
        self.learning_rate = learning_rate
        self.n_iter = n_iter
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """
        Fit the linear unit to the training data (regression via per-sample
        gradient descent on the squared error, i.e. the delta rule).
        Parameters:
        - X: Training data (numpy array of shape [n_samples, n_features]).
        - y: Target values (numpy array of shape [n_samples]).
        """
        # Initialize weights and bias
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0
        # Training loop
        for _ in range(self.n_iter):
            for idx, x_i in enumerate(X):
                # Calculate the linear output
                linear_output = np.dot(x_i, self.weights) + self.bias
                # Calculate the error (difference between actual and predicted)
                error = y[idx] - linear_output
                # Update weights and bias
                self.weights += self.learning_rate * error * x_i
                self.bias += self.learning_rate * error

    def predict(self, X):
        """
        Predict values for samples in X.
        Parameters:
        - X: Input data (numpy array of shape [n_samples, n_features]).
        Returns:
        - Predicted values (numpy array of shape [n_samples]).
        """
        linear_output = np.dot(X, self.weights) + self.bias
        return linear_output

# Example Usage
if __name__ == "__main__":
    # Sample dataset (Regression example)
    X = np.array([[1, 1], [2, 2], [3, 3], [4, 4]])  # Features
    y = np.array([2.2, 4.1, 6.0, 8.1])              # Targets
    # Create and train the Perceptron
    perceptron = Perceptron(learning_rate=0.01, n_iter=1000)
    perceptron.fit(X, y)
    # Test the Perceptron
    predictions = perceptron.predict(X)
    print("Predictions:", predictions)
    print("Weights:", perceptron.weights)
    print("Bias:", perceptron.bias)
Predictions: [2.15480981 4.1215872 6.08836459 8.05514197]
Weights: [0.98338869 0.98338869]
Bias: 0.18803242439762807
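The loss-function item in the outline lists cross-entropy alongside MSE, but the cells above only use squared error. Below is a hedged NumPy sketch of binary cross-entropy next to MSE; the predicted probabilities in the example are made up for illustration.

import numpy as np

def binary_cross_entropy(y_true, y_prob, eps=1e-12):
    # Clip probabilities so log() never sees exactly 0 or 1
    y_prob = np.clip(y_prob, eps, 1 - eps)
    return -np.mean(y_true * np.log(y_prob) + (1 - y_true) * np.log(1 - y_prob))

def mse(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2)

if __name__ == "__main__":
    y_true = np.array([0, 0, 0, 1])
    y_prob = np.array([0.1, 0.2, 0.2, 0.9])  # hypothetical predicted probabilities
    print("BCE:", binary_cross_entropy(y_true, y_prob))
    print("MSE:", mse(y_true, y_prob))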
Optimizer¶
In [ ]:
# Optimizer: SGD with optional momentum and weight decay
import numpy as np

class Optimizer:
    def __init__(self, parameters, lr=0.01, momentum=0.9, weight_decay=0.0):
        """
        Initializes the optimizer.
        Args:
            parameters (iterable): Parameters (weights) to optimize.
            lr (float): Learning rate.
            momentum (float): Momentum factor (for SGD with momentum).
            weight_decay (float): L2 regularization factor.
        """
        self.parameters = list(parameters)
        self.lr = lr
        self.momentum = momentum
        self.weight_decay = weight_decay
        self.velocities = {param: 0 for param in self.parameters}

    def step(self):
        """
        Updates the parameters based on their gradients and the learning rate.
        """
        for param in self.parameters:
            grad = param.grad  # 'grad' is an attribute of the parameter
            if grad is None:
                continue  # Skip if gradient is None
            # Apply weight decay (L2 regularization)
            if self.weight_decay > 0:
                grad = grad + self.weight_decay * param.data
            # Update velocity (if momentum is used)
            if self.momentum > 0:
                self.velocities[param] = self.momentum * self.velocities[param] + grad
                update = self.lr * self.velocities[param]
            else:
                update = self.lr * grad
            # Apply the update to the parameter's data in place
            param.data -= update

    def zero_grad(self):
        """
        Resets the gradients of the parameters to zero.
        """
        for param in self.parameters:
            param.grad = np.zeros_like(param.data)

# Example of a simple parameter class for demonstration
class Parameter:
    def __init__(self, data):
        self.data = data
        self.grad = np.zeros_like(data)  # Gradient, filled in during the backward pass

    def __repr__(self):
        return f"Parameter(data={self.data})"

# Training code:
# Generate some synthetic data (targets as a column vector to match the model output)
X = np.array([[1, 2], [2, 3], [3, 4], [4, 5]], dtype=float)
y = np.array([[5], [7], [9], [11]], dtype=float)

# Define a simple linear model
class LinearModel:
    def __init__(self, input_dim):
        self.weights = Parameter(np.zeros((input_dim, 1)))  # Initialize weights
        self.bias = Parameter(np.zeros(1))                  # Initialize bias

    def forward(self, X):
        return np.dot(X, self.weights.data) + self.bias.data

    def compute_loss(self, predictions, targets):
        # Mean squared error loss
        return np.mean((predictions - targets) ** 2)

# Create model and optimizer
model = LinearModel(input_dim=2)
optimizer = Optimizer(parameters=[model.weights, model.bias], lr=0.01)

# Training loop
epochs = 1000
for epoch in range(epochs):
    # Forward pass
    predictions = model.forward(X)  # shape [n_samples, 1]
    # Compute loss
    loss = model.compute_loss(predictions, y)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss}")
    # Compute MSE gradients by hand: dL/dW = 2/n * X^T (pred - y), dL/db = 2 * mean(pred - y)
    error = predictions - y
    model.weights.grad = 2 * np.dot(X.T, error) / X.shape[0]
    model.bias.grad = 2 * error.mean(axis=0)
    # Step optimizer
    optimizer.step()
    # Reset gradients
    optimizer.zero_grad()
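The Optimization item in the outline also asks for Adam; here is a minimal, hedged Adam sketch (an illustrative addition, not original notebook code) that works with the same Parameter objects used in the cell above, i.e. anything exposing .data and .grad NumPy arrays. The hyperparameter defaults follow the commonly used Adam settings.

import numpy as np

class AdamOptimizer:
    def __init__(self, parameters, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
        self.parameters = list(parameters)
        self.lr = lr
        self.beta1, self.beta2, self.eps = beta1, beta2, eps
        self.m = [np.zeros_like(p.data) for p in self.parameters]  # first-moment estimates
        self.v = [np.zeros_like(p.data) for p in self.parameters]  # second-moment estimates
        self.t = 0  # time step

    def step(self):
        self.t += 1
        for i, p in enumerate(self.parameters):
            g = p.grad
            if g is None:
                continue
            # Exponential moving averages of the gradient and its elementwise square
            self.m[i] = self.beta1 * self.m[i] + (1 - self.beta1) * g
            self.v[i] = self.beta2 * self.v[i] + (1 - self.beta2) * (g * g)
            # Bias correction for the zero-initialized moments
            m_hat = self.m[i] / (1 - self.beta1 ** self.t)
            v_hat = self.v[i] / (1 - self.beta2 ** self.t)
            p.data -= self.lr * m_hat / (np.sqrt(v_hat) + self.eps)

    def zero_grad(self):
        for p in self.parameters:
            p.grad = np.zeros_like(p.data)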
In [ ]:
## FFN: 1 hidden layer
import numpy as np

class Parameter:
    def __init__(self, data):
        self.data = data
        self.grad = np.zeros_like(data)  # Initialize gradient

    def __repr__(self):
        return f"Parameter(data={self.data})"

class FeedForwardNeuralNet:
    def __init__(self, input_dim, hidden_dim, output_dim):
        # Initialize weights and biases for input to hidden layer
        self.weights1 = Parameter(np.random.randn(input_dim, hidden_dim) * 0.01)
        self.bias1 = Parameter(np.zeros((1, hidden_dim)))
        # Initialize weights and biases for hidden to output layer
        self.weights2 = Parameter(np.random.randn(hidden_dim, output_dim) * 0.01)
        self.bias2 = Parameter(np.zeros((1, output_dim)))

    def forward(self, X):
        # Forward pass: input to hidden layer
        self.z1 = np.dot(X, self.weights1.data) + self.bias1.data
        self.a1 = np.maximum(0, self.z1)  # ReLU activation
        # Forward pass: hidden to output layer
        self.z2 = np.dot(self.a1, self.weights2.data) + self.bias2.data
        return self.z2  # No activation for the output layer

    def compute_loss(self, predictions, targets):
        # Mean squared error loss
        return np.mean((predictions - targets) ** 2)

    def backward(self, X, y):
        # Gradient of the MSE loss w.r.t. the output layer pre-activation
        d_loss = 2 * (self.z2 - y) / y.shape[0]
        self.weights2.grad = np.dot(self.a1.T, d_loss)
        self.bias2.grad = np.sum(d_loss, axis=0, keepdims=True)
        # Gradient for hidden layer
        d_hidden = d_loss.dot(self.weights2.data.T) * (self.z1 > 0)  # ReLU derivative
        self.weights1.grad = np.dot(X.T, d_hidden)
        self.bias1.grad = np.sum(d_hidden, axis=0, keepdims=True)

# Example of using the class
# Generate synthetic data for training
X = np.array([[1, 2], [2, 3], [3, 4], [4, 5]], dtype=float)
y = np.array([[5], [7], [9], [11]], dtype=float)

# Define a simple learning rate scheduler (e.g., halving the LR every 100 epochs)
def lr_scheduler(current_lr, epoch, step_size=100, gamma=0.5):
    if epoch % step_size == 0 and epoch > 0:
        return current_lr * gamma
    return current_lr

# Create the model and optimizer (Optimizer is defined in the previous cell)
input_dim = X.shape[1]
hidden_dim = 4  # Number of neurons in the hidden layer
output_dim = y.shape[1]
model = FeedForwardNeuralNet(input_dim, hidden_dim, output_dim)
optimizer = Optimizer(
    parameters=[model.weights1, model.bias1, model.weights2, model.bias2],
    lr=0.01,
    momentum=0.9,
)

# Training loop
epochs = 1000
for epoch in range(epochs):
    # Apply the step-decay learning rate schedule
    optimizer.lr = lr_scheduler(optimizer.lr, epoch)
    # Forward pass
    predictions = model.forward(X)
    # Compute loss
    loss = model.compute_loss(predictions, y)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss:.4f}")
    # Backward pass to compute gradients
    model.backward(X, y)
    # Clip gradients to [-1, 1] to guard against exploding gradients
    for p in optimizer.parameters:
        np.clip(p.grad, -1, 1, out=p.grad)
    # Step optimizer
    optimizer.step()
    # Reset gradients
    optimizer.zero_grad()
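The NN Architectures item in the outline also mentions dropout; below is a minimal, hedged sketch of an inverted-dropout layer that could sit between the hidden activation and the output layer of the network above. The keep-probability and the class name are illustrative choices, not something the notebook defines elsewhere.

import numpy as np

class Dropout:
    """Inverted dropout: activations are rescaled at train time so inference needs no change."""
    def __init__(self, p_drop=0.5):
        self.p_drop = p_drop
        self.mask = None

    def forward(self, x, training=True):
        if not training or self.p_drop == 0.0:
            return x
        keep = 1.0 - self.p_drop
        # Sample a binary mask and rescale the surviving activations by 1/keep
        self.mask = (np.random.rand(*x.shape) < keep) / keep
        return x * self.mask

    def backward(self, d_out):
        # Gradients flow only through the units that were kept in the forward pass
        return d_out * self.mask

if __name__ == "__main__":
    np.random.seed(0)
    h = np.ones((2, 4))
    drop = Dropout(p_drop=0.5)
    print(drop.forward(h, training=True))   # roughly half the entries zeroed, the rest scaled to 2.0
    print(drop.forward(h, training=False))  # unchanged at inference time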
In [ ]:
# Dot product without NumPy
def dot_product(arr1, arr2):
    # Check if the arrays have the same length
    if len(arr1) != len(arr2):
        raise ValueError("Arrays must be of the same length")
    # Compute the dot product using a generator expression and sum()
    return sum(x * y for x, y in zip(arr1, arr2))

# Example usage:
array1 = [1, 2, 3]
array2 = [4, 5, 6]
result = dot_product(array1, array2)
print("Dot product:", result)