Coding Notebook¶
Questions and solutions for coding problems.
Low-Level Neural Network Implementation¶
- Backpropagation
    - Implement forward and backward propagation for a simple neural network with one hidden layer.
    - Derive and code gradient updates for parameters (weights and biases) using manual differentiation.
- Activation Functions
    - Write code for activation functions like ReLU, Sigmoid, Tanh, and their derivatives (a minimal sketch follows this list).
    - Compare how different activation functions affect the gradient flow during backpropagation.
- Optimization
    - Implement optimizers like SGD, Momentum, RMSProp, or Adam from scratch.
    - Demonstrate the difference between learning rate schedules and their impact on convergence.
- Loss Functions
    - Implement and compute loss functions such as Mean Squared Error (MSE), Cross-Entropy Loss, or Hinge Loss.
- Matrix Computations
    - Write functions to perform matrix multiplications efficiently (without using libraries like NumPy).
    - Demonstrate how batch processing can improve computational efficiency.
- NN Architectures
    - Code a simple feedforward network from scratch.
    - Extend the network to include more advanced layers like dropout or batch normalization.
- Debugging NN Models
    - Identify and fix issues in a neural network implementation, such as exploding gradients, vanishing gradients, or incorrect weight initialization.
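The activation-function item above has no dedicated cell later in this notebook, so here is a minimal NumPy sketch (an illustrative addition, not original notebook code) of ReLU, Sigmoid, Tanh and their derivatives; the small comparison at the end hints at why saturating activations shrink gradients during backpropagation.

import numpy as np

def relu(x):
    return np.maximum(0, x)

def relu_grad(x):
    return (x > 0).astype(x.dtype)

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def sigmoid_grad(x):
    s = sigmoid(x)
    return s * (1 - s)  # peaks at 0.25, so stacked sigmoid layers shrink gradients

def tanh(x):
    return np.tanh(x)

def tanh_grad(x):
    return 1 - np.tanh(x) ** 2  # peaks at 1.0 at x = 0, but still saturates for large |x|

if __name__ == "__main__":
    x = np.linspace(-4, 4, 9)
    # ReLU passes gradients through unchanged for positive inputs, while
    # sigmoid/tanh gradients vanish as |x| grows.
    print("relu'   :", relu_grad(x))
    print("sigmoid':", np.round(sigmoid_grad(x), 3))
    print("tanh'   :", np.round(tanh_grad(x), 3))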
Based on Resume:
- Perceptron implementation
- Attention (a minimal sketch follows this list)
    - Flash Attention
- Transformer implementation
- Trainer code
- Position encodings
    - Rotary Position Embedding (RoPE)
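Attention and Flash Attention are listed above but not solved later in this notebook, so here is a hedged NumPy sketch of plain single-head scaled dot-product attention; it is not Flash Attention, which additionally tiles the computation to avoid materializing the full score matrix. The shapes and the toy example are illustrative assumptions.

import numpy as np

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)  # subtract max for numerical stability
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def scaled_dot_product_attention(Q, K, V, mask=None):
    """
    Q: [n_q, d_k], K: [n_k, d_k], V: [n_k, d_v]  ->  output: [n_q, d_v]
    """
    d_k = Q.shape[-1]
    scores = Q @ K.T / np.sqrt(d_k)            # [n_q, n_k]
    if mask is not None:
        scores = np.where(mask, scores, -1e9)  # positions where mask is False are suppressed
    weights = softmax(scores, axis=-1)         # each query's weights sum to 1
    return weights @ V

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    Q = rng.normal(size=(3, 4))
    K = rng.normal(size=(5, 4))
    V = rng.normal(size=(5, 2))
    print(scaled_dot_product_attention(Q, K, V).shape)  # (3, 2)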
Perceptron Implementation¶
In [ ]:
# Perceptron: two-class classification with labels 0 or 1
import numpy as np

class Perceptron:
    def __init__(self, learning_rate=0.01, n_iter=1000):
        """
        Parameters:
        - learning_rate: Learning rate for weight updates.
        - n_iter: Number of iterations over the training dataset.
        """
        self.learning_rate = learning_rate
        self.n_iter = n_iter
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """
        Fit the Perceptron to the training data.
        Parameters:
        - X: Training data (numpy array of shape [n_samples, n_features]).
        - y: Target values (numpy array of shape [n_samples], values in {0, 1}).
        """
        # Initialize weights and bias
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0
        # Convert labels to {-1, 1}
        y_ = np.where(y > 0, 1, -1)
        # Training loop
        for _ in range(self.n_iter):
            for idx, x_i in enumerate(X):
                # Calculate the linear output
                linear_output = np.dot(x_i, self.weights) + self.bias
                # Update weights and bias if the sample is misclassified
                # (label and linear output disagree in sign, or the output is exactly 0)
                if y_[idx] * linear_output <= 0:
                    self.weights += self.learning_rate * y_[idx] * x_i
                    self.bias += self.learning_rate * y_[idx]

    def predict(self, X):
        """
        Predict class labels for samples in X.
        Parameters:
        - X: Input data (numpy array of shape [n_samples, n_features]).
        Returns:
        - Predicted class labels (numpy array of shape [n_samples], values in {0, 1}).
        """
        linear_output = np.dot(X, self.weights) + self.bias
        return np.where(linear_output > 0, 1, 0)

# Example Usage
if __name__ == "__main__":
    # Sample dataset (AND logic gate)
    X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
    y = np.array([0, 0, 0, 1])
    # Create and train the Perceptron
    perceptron = Perceptron(learning_rate=0.1, n_iter=10)
    perceptron.fit(X, y)
    # Test the Perceptron
    predictions = perceptron.predict(X)
    print("Predictions:", predictions)
    print("Weights:", perceptron.weights)
    print("Bias:", perceptron.bias)
Predictions: [0 0 0 1]
Weights: [0.2 0.1]
Bias: -0.20000000000000004
In [ ]:
# Perceptron-style linear unit trained for regression (delta rule)
import numpy as np

class Perceptron:
    def __init__(self, learning_rate=0.01, n_iter=1000):
        """
        Parameters:
        - learning_rate: Learning rate for weight updates.
        - n_iter: Number of iterations over the training dataset.
        """
        self.learning_rate = learning_rate
        self.n_iter = n_iter
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """
        Fit the linear unit to the training data (regression via per-sample
        gradient descent on the squared error, i.e. the delta rule).
        Parameters:
        - X: Training data (numpy array of shape [n_samples, n_features]).
        - y: Target values (numpy array of shape [n_samples]).
        """
        # Initialize weights and bias
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0
        # Training loop
        for _ in range(self.n_iter):
            for idx, x_i in enumerate(X):
                # Calculate the linear output
                linear_output = np.dot(x_i, self.weights) + self.bias
                # Calculate the error (difference between actual and predicted)
                error = y[idx] - linear_output
                # Update weights and bias
                self.weights += self.learning_rate * error * x_i
                self.bias += self.learning_rate * error

    def predict(self, X):
        """
        Predict values for samples in X.
        Parameters:
        - X: Input data (numpy array of shape [n_samples, n_features]).
        Returns:
        - Predicted values (numpy array of shape [n_samples]).
        """
        linear_output = np.dot(X, self.weights) + self.bias
        return linear_output

# Example Usage
if __name__ == "__main__":
    # Sample dataset (Regression example)
    X = np.array([[1, 1], [2, 2], [3, 3], [4, 4]])  # Features
    y = np.array([2.2, 4.1, 6.0, 8.1])              # Targets
    # Create and train the Perceptron
    perceptron = Perceptron(learning_rate=0.01, n_iter=1000)
    perceptron.fit(X, y)
    # Test the Perceptron
    predictions = perceptron.predict(X)
    print("Predictions:", predictions)
    print("Weights:", perceptron.weights)
    print("Bias:", perceptron.bias)
Predictions: [2.15480981 4.1215872 6.08836459 8.05514197]
Weights: [0.98338869 0.98338869]
Bias: 0.18803242439762807
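The loss-function item in the outline lists cross-entropy alongside MSE, but the cells above only use squared error. Below is a hedged NumPy sketch of binary cross-entropy next to MSE; the predicted probabilities in the example are made up for illustration.

import numpy as np

def binary_cross_entropy(y_true, y_prob, eps=1e-12):
    # Clip probabilities so log() never sees exactly 0 or 1
    y_prob = np.clip(y_prob, eps, 1 - eps)
    return -np.mean(y_true * np.log(y_prob) + (1 - y_true) * np.log(1 - y_prob))

def mse(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2)

if __name__ == "__main__":
    y_true = np.array([0, 0, 0, 1])
    y_prob = np.array([0.1, 0.2, 0.2, 0.9])  # hypothetical predicted probabilities
    print("BCE:", binary_cross_entropy(y_true, y_prob))
    print("MSE:", mse(y_true, y_prob))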
Optimizer¶
In [ ]:
# Optimizer: SGD with optional momentum and weight decay
import numpy as np

class Optimizer:
    def __init__(self, parameters, lr=0.01, momentum=0.9, weight_decay=0.0):
        """
        Initializes the optimizer.
        Args:
            parameters (iterable): Parameters (weights) to optimize.
            lr (float): Learning rate.
            momentum (float): Momentum factor (for SGD with momentum).
            weight_decay (float): L2 regularization factor.
        """
        self.parameters = list(parameters)
        self.lr = lr
        self.momentum = momentum
        self.weight_decay = weight_decay
        self.velocities = {param: 0 for param in self.parameters}

    def step(self):
        """
        Updates the parameters based on their gradients and the learning rate.
        """
        for param in self.parameters:
            grad = param.grad  # 'grad' is an attribute of the parameter
            if grad is None:
                continue  # Skip if gradient is None
            # Apply weight decay (L2 regularization)
            if self.weight_decay > 0:
                grad = grad + self.weight_decay * param.data
            # Update velocity (if momentum is used)
            if self.momentum > 0:
                self.velocities[param] = self.momentum * self.velocities[param] + grad
                update = self.lr * self.velocities[param]
            else:
                update = self.lr * grad
            # Apply the update to the parameter's data in place
            param.data -= update

    def zero_grad(self):
        """
        Resets the gradients of the parameters to zero.
        """
        for param in self.parameters:
            param.grad = np.zeros_like(param.data)

# Example of a simple parameter class for demonstration
class Parameter:
    def __init__(self, data):
        self.data = data
        self.grad = np.zeros_like(data)  # Gradient, filled in during the backward pass

    def __repr__(self):
        return f"Parameter(data={self.data})"

# Training code:
# Generate some synthetic data (targets as a column vector to match the model output)
X = np.array([[1, 2], [2, 3], [3, 4], [4, 5]], dtype=float)
y = np.array([[5], [7], [9], [11]], dtype=float)

# Define a simple linear model
class LinearModel:
    def __init__(self, input_dim):
        self.weights = Parameter(np.zeros((input_dim, 1)))  # Initialize weights
        self.bias = Parameter(np.zeros(1))                  # Initialize bias

    def forward(self, X):
        return np.dot(X, self.weights.data) + self.bias.data

    def compute_loss(self, predictions, targets):
        # Mean squared error loss
        return np.mean((predictions - targets) ** 2)

# Create model and optimizer
model = LinearModel(input_dim=2)
optimizer = Optimizer(parameters=[model.weights, model.bias], lr=0.01)

# Training loop
epochs = 1000
for epoch in range(epochs):
    # Forward pass
    predictions = model.forward(X)  # shape [n_samples, 1]
    # Compute loss
    loss = model.compute_loss(predictions, y)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss}")
    # Compute MSE gradients by hand: dL/dW = 2/n * X^T (pred - y), dL/db = 2 * mean(pred - y)
    error = predictions - y
    model.weights.grad = 2 * np.dot(X.T, error) / X.shape[0]
    model.bias.grad = 2 * error.mean(axis=0)
    # Step optimizer
    optimizer.step()
    # Reset gradients
    optimizer.zero_grad()
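The Optimization item in the outline also asks for Adam; here is a minimal, hedged Adam sketch (an illustrative addition, not original notebook code) that works with the same Parameter objects used in the cell above, i.e. anything exposing .data and .grad NumPy arrays. The hyperparameter defaults follow the commonly used Adam settings.

import numpy as np

class AdamOptimizer:
    def __init__(self, parameters, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
        self.parameters = list(parameters)
        self.lr = lr
        self.beta1, self.beta2, self.eps = beta1, beta2, eps
        self.m = [np.zeros_like(p.data) for p in self.parameters]  # first-moment estimates
        self.v = [np.zeros_like(p.data) for p in self.parameters]  # second-moment estimates
        self.t = 0  # time step

    def step(self):
        self.t += 1
        for i, p in enumerate(self.parameters):
            g = p.grad
            if g is None:
                continue
            # Exponential moving averages of the gradient and its elementwise square
            self.m[i] = self.beta1 * self.m[i] + (1 - self.beta1) * g
            self.v[i] = self.beta2 * self.v[i] + (1 - self.beta2) * (g * g)
            # Bias correction for the zero-initialized moments
            m_hat = self.m[i] / (1 - self.beta1 ** self.t)
            v_hat = self.v[i] / (1 - self.beta2 ** self.t)
            p.data -= self.lr * m_hat / (np.sqrt(v_hat) + self.eps)

    def zero_grad(self):
        for p in self.parameters:
            p.grad = np.zeros_like(p.data)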
In [ ]:
## FFN: 1 hidden layer
import numpy as np

class Parameter:
    def __init__(self, data):
        self.data = data
        self.grad = np.zeros_like(data)  # Initialize gradient

    def __repr__(self):
        return f"Parameter(data={self.data})"

class FeedForwardNeuralNet:
    def __init__(self, input_dim, hidden_dim, output_dim):
        # Initialize weights and biases for input to hidden layer
        self.weights1 = Parameter(np.random.randn(input_dim, hidden_dim) * 0.01)
        self.bias1 = Parameter(np.zeros((1, hidden_dim)))
        # Initialize weights and biases for hidden to output layer
        self.weights2 = Parameter(np.random.randn(hidden_dim, output_dim) * 0.01)
        self.bias2 = Parameter(np.zeros((1, output_dim)))

    def forward(self, X):
        # Forward pass: input to hidden layer
        self.z1 = np.dot(X, self.weights1.data) + self.bias1.data
        self.a1 = np.maximum(0, self.z1)  # ReLU activation
        # Forward pass: hidden to output layer
        self.z2 = np.dot(self.a1, self.weights2.data) + self.bias2.data
        return self.z2  # No activation for the output layer

    def compute_loss(self, predictions, targets):
        # Mean squared error loss
        return np.mean((predictions - targets) ** 2)

    def backward(self, X, y):
        # Gradient of the MSE loss w.r.t. the output layer pre-activation
        d_loss = 2 * (self.z2 - y) / y.shape[0]
        self.weights2.grad = np.dot(self.a1.T, d_loss)
        self.bias2.grad = np.sum(d_loss, axis=0, keepdims=True)
        # Gradient for hidden layer
        d_hidden = d_loss.dot(self.weights2.data.T) * (self.z1 > 0)  # ReLU derivative
        self.weights1.grad = np.dot(X.T, d_hidden)
        self.bias1.grad = np.sum(d_hidden, axis=0, keepdims=True)

# Example of using the class
# Generate synthetic data for training
X = np.array([[1, 2], [2, 3], [3, 4], [4, 5]], dtype=float)
y = np.array([[5], [7], [9], [11]], dtype=float)

# Define a simple learning rate scheduler (e.g., halving the LR every 100 epochs)
def lr_scheduler(current_lr, epoch, step_size=100, gamma=0.5):
    if epoch % step_size == 0 and epoch > 0:
        return current_lr * gamma
    return current_lr

# Create the model and optimizer (Optimizer is defined in the previous cell)
input_dim = X.shape[1]
hidden_dim = 4  # Number of neurons in the hidden layer
output_dim = y.shape[1]
model = FeedForwardNeuralNet(input_dim, hidden_dim, output_dim)
optimizer = Optimizer(
    parameters=[model.weights1, model.bias1, model.weights2, model.bias2],
    lr=0.01,
    momentum=0.9,
)

# Training loop
epochs = 1000
for epoch in range(epochs):
    # Apply the step-decay learning rate schedule
    optimizer.lr = lr_scheduler(optimizer.lr, epoch)
    # Forward pass
    predictions = model.forward(X)
    # Compute loss
    loss = model.compute_loss(predictions, y)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss:.4f}")
    # Backward pass to compute gradients
    model.backward(X, y)
    # Clip gradients to [-1, 1] to guard against exploding gradients
    for p in optimizer.parameters:
        np.clip(p.grad, -1, 1, out=p.grad)
    # Step optimizer
    optimizer.step()
    # Reset gradients
    optimizer.zero_grad()
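The NN Architectures item in the outline also mentions dropout; below is a minimal, hedged sketch of an inverted-dropout layer that could sit between the hidden activation and the output layer of the network above. The keep-probability and the class name are illustrative choices, not something the notebook defines elsewhere.

import numpy as np

class Dropout:
    """Inverted dropout: activations are rescaled at train time so inference needs no change."""
    def __init__(self, p_drop=0.5):
        self.p_drop = p_drop
        self.mask = None

    def forward(self, x, training=True):
        if not training or self.p_drop == 0.0:
            return x
        keep = 1.0 - self.p_drop
        # Sample a binary mask and rescale the surviving activations by 1/keep
        self.mask = (np.random.rand(*x.shape) < keep) / keep
        return x * self.mask

    def backward(self, d_out):
        # Gradients flow only through the units that were kept in the forward pass
        return d_out * self.mask

if __name__ == "__main__":
    np.random.seed(0)
    h = np.ones((2, 4))
    drop = Dropout(p_drop=0.5)
    print(drop.forward(h, training=True))   # roughly half the entries zeroed, the rest scaled to 2.0
    print(drop.forward(h, training=False))  # unchanged at inference time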
In [ ]:
# Dot product without NumPy
def dot_product(arr1, arr2):
    # Check if the arrays have the same length
    if len(arr1) != len(arr2):
        raise ValueError("Arrays must be of the same length")
    # Compute the dot product using a generator expression and sum()
    return sum(x * y for x, y in zip(arr1, arr2))

# Example usage:
array1 = [1, 2, 3]
array2 = [4, 5, 6]
result = dot_product(array1, array2)
print("Dot product:", result)