What are Neural Networks?

Neural networks are computing systems inspired by the biological neural networks in our brains. They consist of interconnected nodes (neurons) organized in layers that can learn patterns from data.

From image recognition to language translation, neural networks power many of today's most impressive AI applications.

The Perceptron: Simplest Neural Unit

A perceptron multiplies its inputs by weights, sums them together with a bias, and passes the result through an activation function:

import numpy as np

class Perceptron:
    def __init__(self, n_inputs, learning_rate=0.01):
        self.weights = np.random.randn(n_inputs) * 0.01  # small random initial weights
        self.bias = 0
        self.lr = learning_rate

    def activation(self, x):
        """Step function"""
        return 1 if x >= 0 else 0

    def predict(self, X):
        linear_output = np.dot(X, self.weights) + self.bias
        return self.activation(linear_output)

    def train(self, X, y, epochs=100):
        for _ in range(epochs):
            for xi, yi in zip(X, y):
                prediction = self.predict(xi)
                error = yi - prediction
                # Update weights
                self.weights += self.lr * error * xi
                self.bias += self.lr * error

# Example: AND gate
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([0, 0, 0, 1])

perceptron = Perceptron(n_inputs=2)
perceptron.train(X, y, epochs=100)  # plenty of passes for the learning rule to converge

for xi in X:
    print(f"{xi} -> {perceptron.predict(xi)}")

Activation Functions

Activation functions introduce the non-linearity that lets networks learn complex patterns; without them, any stack of linear layers collapses into a single linear transformation:

import numpy as np
import matplotlib.pyplot as plt

def sigmoid(x):
    """Maps to (0, 1) - good for probabilities"""
    return 1 / (1 + np.exp(-x))

def tanh(x):
    """Maps to (-1, 1) - zero-centered"""
    return np.tanh(x)

def relu(x):
    """ReLU - most popular for hidden layers"""
    return np.maximum(0, x)

def leaky_relu(x, alpha=0.01):
    """Prevents dead neurons"""
    return np.where(x > 0, x, alpha * x)

def softmax(x):
    """For multi-class classification output"""
    exp_x = np.exp(x - np.max(x))
    return exp_x / exp_x.sum()

# Derivatives for backpropagation
def sigmoid_derivative(x):
    s = sigmoid(x)
    return s * (1 - s)

def relu_derivative(x):
    return np.where(x > 0, 1, 0)

# Visualize
x = np.linspace(-5, 5, 100)
fig, axes = plt.subplots(2, 2, figsize=(10, 8))

axes[0, 0].plot(x, sigmoid(x))
axes[0, 0].set_title('Sigmoid')

axes[0, 1].plot(x, tanh(x))
axes[0, 1].set_title('Tanh')

axes[1, 0].plot(x, relu(x))
axes[1, 0].set_title('ReLU')

axes[1, 1].plot(x, leaky_relu(x))
axes[1, 1].set_title('Leaky ReLU')

plt.tight_layout()
plt.show()

Multi-Layer Perceptron (MLP)

A multi-layer perceptron stacks several weight layers with ReLU activations and a softmax output; the implementation below handles forward propagation, backpropagation, and mini-batch training:

class NeuralNetwork:
    def __init__(self, layer_sizes):
        """
        layer_sizes: list of layer sizes [input, hidden..., output]
        e.g., [784, 128, 64, 10] for MNIST
        """
        self.layers = []
        self.biases = []

        # Initialize weights and biases
        for i in range(len(layer_sizes) - 1):
            # He (Kaiming) initialization, scaled for ReLU layers
            w = np.random.randn(layer_sizes[i], layer_sizes[i+1]) * np.sqrt(2 / layer_sizes[i])
            b = np.zeros((1, layer_sizes[i+1]))
            self.layers.append(w)
            self.biases.append(b)

    def relu(self, x):
        return np.maximum(0, x)

    def relu_derivative(self, x):
        return (x > 0).astype(float)

    def softmax(self, x):
        exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

    def forward(self, X):
        """Forward pass through the network"""
        self.activations = [X]
        self.z_values = []

        for i, (w, b) in enumerate(zip(self.layers, self.biases)):
            z = np.dot(self.activations[-1], w) + b
            self.z_values.append(z)

            if i == len(self.layers) - 1:  # Output layer
                a = self.softmax(z)
            else:  # Hidden layers
                a = self.relu(z)

            self.activations.append(a)

        return self.activations[-1]

    def backward(self, y_true, learning_rate=0.01):
        """Backpropagation"""
        m = y_true.shape[0]
        gradients_w = []
        gradients_b = []

        # Output-layer error: the gradient of softmax + cross-entropy is (y_pred - y_true)
        delta = self.activations[-1] - y_true

        # Backpropagate through layers
        for i in reversed(range(len(self.layers))):
            grad_w = np.dot(self.activations[i].T, delta) / m
            grad_b = np.sum(delta, axis=0, keepdims=True) / m

            gradients_w.insert(0, grad_w)
            gradients_b.insert(0, grad_b)

            if i > 0:
                delta = np.dot(delta, self.layers[i].T) * self.relu_derivative(self.z_values[i-1])

        # Update weights and biases
        for i in range(len(self.layers)):
            self.layers[i] -= learning_rate * gradients_w[i]
            self.biases[i] -= learning_rate * gradients_b[i]

    def train(self, X, y, epochs, learning_rate=0.01, batch_size=32):
        """Train the network"""
        n_samples = X.shape[0]

        for epoch in range(epochs):
            # Shuffle data
            indices = np.random.permutation(n_samples)
            X_shuffled = X[indices]
            y_shuffled = y[indices]

            total_loss = 0
            for i in range(0, n_samples, batch_size):
                X_batch = X_shuffled[i:i+batch_size]
                y_batch = y_shuffled[i:i+batch_size]

                # Forward pass
                output = self.forward(X_batch)

                # Calculate loss (cross-entropy)
                loss = -np.mean(np.sum(y_batch * np.log(output + 1e-8), axis=1))
                total_loss += loss

                # Backward pass
                self.backward(y_batch, learning_rate)

            if epoch % 10 == 0:
                n_batches = int(np.ceil(n_samples / batch_size))
                print(f"Epoch {epoch}, Loss: {total_loss / n_batches:.4f}")

Loss Functions

The loss function measures how far the network's predictions are from the targets; its gradient is what backpropagation pushes back through the layers:

# Mean Squared Error (Regression)
def mse_loss(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2)

def mse_gradient(y_true, y_pred):
    return 2 * (y_pred - y_true) / len(y_true)

# Binary Cross-Entropy (Binary Classification)
def binary_cross_entropy(y_true, y_pred):
    epsilon = 1e-15
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
    return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

# Categorical Cross-Entropy (Multi-class Classification)
def categorical_cross_entropy(y_true, y_pred):
    epsilon = 1e-15
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
    return -np.mean(np.sum(y_true * np.log(y_pred), axis=1))
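
As a small illustration using the categorical_cross_entropy function above (the numbers are made up for this example), cross-entropy penalizes confident wrong predictions far more than confident correct ones:

# Two 3-class examples with one-hot targets
y_true = np.array([[1, 0, 0],
                   [0, 1, 0]])
y_confident_right = np.array([[0.9, 0.05, 0.05],
                              [0.1, 0.8,  0.1 ]])
y_confident_wrong = np.array([[0.1, 0.8, 0.1],
                              [0.7, 0.2, 0.1]])

print(categorical_cross_entropy(y_true, y_confident_right))  # ~0.16
print(categorical_cross_entropy(y_true, y_confident_wrong))  # ~1.96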

Gradient Descent Optimizers

Optimizers determine how the computed gradients are turned into weight updates:

# Stochastic Gradient Descent
def sgd_update(params, grads, learning_rate):
    return params - learning_rate * grads

# SGD with Momentum
class SGDMomentum:
    def __init__(self, learning_rate=0.01, momentum=0.9):
        self.lr = learning_rate
        self.momentum = momentum
        self.velocity = None

    def update(self, params, grads):
        if self.velocity is None:
            self.velocity = np.zeros_like(params)

        self.velocity = self.momentum * self.velocity - self.lr * grads
        return params + self.velocity

# Adam Optimizer
class Adam:
    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.lr = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.m = None  # First moment
        self.v = None  # Second moment
        self.t = 0

    def update(self, params, grads):
        if self.m is None:
            self.m = np.zeros_like(params)
            self.v = np.zeros_like(params)

        self.t += 1

        self.m = self.beta1 * self.m + (1 - self.beta1) * grads
        self.v = self.beta2 * self.v + (1 - self.beta2) * grads**2

        # Bias correction
        m_corrected = self.m / (1 - self.beta1**self.t)
        v_corrected = self.v / (1 - self.beta2**self.t)

        return params - self.lr * m_corrected / (np.sqrt(v_corrected) + self.epsilon)
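
As a rough sanity check (a sketch, not part of the original code), the Adam class above can minimize a simple quadratic whose gradient is known in closed form:

# Minimize f(w) = sum((w - 3)^2) with the Adam optimizer defined above
w = np.zeros(5)                  # start away from the minimum at w = 3
adam = Adam(learning_rate=0.1)

for step in range(200):
    grads = 2 * (w - 3.0)        # analytic gradient of the quadratic
    w = adam.update(w, grads)

print(w)                         # each entry should end up close to 3.0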

Regularization Techniques

Regularization fights overfitting by constraining or perturbing the network during training:

# L2 Regularization (Weight Decay)
def l2_regularization(weights, lambda_reg):
    return lambda_reg * np.sum(weights ** 2)

# During training, add to loss:
# total_loss = data_loss + l2_regularization(weights, 0.01)

# Dropout
class Dropout:
    def __init__(self, drop_rate=0.5):
        self.drop_rate = drop_rate
        self.mask = None

    def forward(self, x, training=True):
        if training:
            self.mask = np.random.binomial(1, 1 - self.drop_rate, size=x.shape)
            return x * self.mask / (1 - self.drop_rate)  # Inverted dropout
        return x

    def backward(self, grad):
        return grad * self.mask / (1 - self.drop_rate)

# Batch Normalization
class BatchNorm:
    def __init__(self, num_features, epsilon=1e-5, momentum=0.1):
        self.gamma = np.ones(num_features)
        self.beta = np.zeros(num_features)
        self.epsilon = epsilon
        self.momentum = momentum
        self.running_mean = np.zeros(num_features)
        self.running_var = np.ones(num_features)

    def forward(self, x, training=True):
        if training:
            mean = np.mean(x, axis=0)
            var = np.var(x, axis=0)

            self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * mean
            self.running_var = (1 - self.momentum) * self.running_var + self.momentum * var
        else:
            mean = self.running_mean
            var = self.running_var

        x_norm = (x - mean) / np.sqrt(var + self.epsilon)
        return self.gamma * x_norm + self.beta
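
A short sketch on made-up data shows how the Dropout and BatchNorm layers above behave differently at training time and at inference time:

x = np.random.randn(8, 4)                        # a batch of 8 samples with 4 features

dropout = Dropout(drop_rate=0.5)
train_out = dropout.forward(x, training=True)    # ~half the entries zeroed, the rest scaled up
test_out = dropout.forward(x, training=False)    # inputs pass through unchanged at inference

bn = BatchNorm(num_features=4)
normed = bn.forward(x, training=True)
print(normed.mean(axis=0), normed.std(axis=0))   # per-feature mean ~0, std ~1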

Building with PyTorch

In practice, frameworks such as PyTorch compute the gradients automatically; the same kind of network takes only a few lines:

import torch
import torch.nn as nn
import torch.optim as optim

class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_sizes, num_classes):
        super().__init__()
        layers = []
        prev_size = input_size

        for hidden_size in hidden_sizes:
            layers.extend([
                nn.Linear(prev_size, hidden_size),
                nn.BatchNorm1d(hidden_size),
                nn.ReLU(),
                nn.Dropout(0.3)
            ])
            prev_size = hidden_size

        layers.append(nn.Linear(prev_size, num_classes))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)

# Create model
model = NeuralNet(784, [256, 128, 64], 10)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop (epochs, train_loader, and test_loader are assumed to be defined elsewhere,
# e.g. DataLoaders over an MNIST-style dataset)
for epoch in range(epochs):
    model.train()
    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for batch_x, batch_y in test_loader:
            outputs = model(batch_x)
            _, predicted = torch.max(outputs, 1)
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()

    print(f'Epoch {epoch+1}, Accuracy: {100 * correct / total:.2f}%')

Key Concepts Summary

  • Forward Propagation: Input flows through layers to produce output
  • Backpropagation: Gradients flow backward to update weights
  • Activation Functions: Add non-linearity (ReLU most common)
  • Loss Functions: Measure prediction error
  • Optimizers: Update weights (Adam recommended)
  • Regularization: Prevent overfitting (Dropout, L2, BatchNorm)

Master Deep Learning

Our Data Science program covers neural networks from fundamentals to advanced architectures. Build real deep learning projects with expert guidance.
