Case Study: Automatic Differentiation for Neural Network Training
This case study demonstrates Aprender's autograd engine for computing gradients and training neural networks.
Overview
The autograd module provides:
- Tensor: Gradient-tracking tensor type
- Computation Graph: Tape-based recording of operations
- Backward Pass: Automatic gradient computation via chain rule
- No-Grad Context: Disable tracking for inference
Basic Gradient Computation
use aprender::autograd::{Tensor, no_grad, clear_graph};
fn main() {
    // Leaf tensors explicitly opted into gradient tracking.
    let x = Tensor::from_slice(&[1.0, 2.0, 3.0]).requires_grad();
    let w = Tensor::from_slice(&[0.5, 0.5, 0.5]).requires_grad();

    // Forward pass, recorded on the tape: y = sum(x * w).
    let y = x.mul(&w).sum();

    // Reverse pass: propagates d y / d(leaf) back through the tape.
    y.backward();

    // For an element-wise product summed to a scalar:
    //   d y / d x = w   and   d y / d w = x
    println!("x.grad = {:?}", x.grad()); // [0.5, 0.5, 0.5]
    println!("w.grad = {:?}", w.grad()); // [1.0, 2.0, 3.0]

    // Discard the recorded tape before starting another forward pass.
    clear_graph();
}
Tensor Operations
Element-wise Operations
use aprender::autograd::Tensor;
// Element-wise demos: each result pairs the i-th entries of the operands.
// Bracketed comments show the expected values.
let a = Tensor::from_slice(&[1.0, 2.0, 3.0]).requires_grad();
let b = Tensor::from_slice(&[4.0, 5.0, 6.0]).requires_grad();
// Arithmetic (binary, element-wise)
let c = a.add(&b); // [5, 7, 9]
let d = a.sub(&b); // [-3, -3, -3]
let e = a.mul(&b); // [4, 10, 18]
let f = a.div(&b); // [0.25, 0.4, 0.5]
// Unary (applied independently to each element)
let g = a.neg(); // [-1, -2, -3]
let h = a.exp(); // [e¹, e², e³]
let i = a.log(); // [0, ln(2), ln(3)] — natural logarithm
let j = a.sqrt(); // [1, √2, √3]
let k = a.pow(2.0); // [1, 4, 9] — element-wise power with a scalar exponent
Reduction Operations
use aprender::autograd::Tensor;
// Reductions over a 2x2 tensor; data is row-major, so x = [[1, 2], [3, 4]].
let x = Tensor::new(&[1.0, 2.0, 3.0, 4.0], &[2, 2]).requires_grad();
let sum_all = x.sum(); // 10.0 — collapses everything to a scalar
let mean_all = x.mean(); // 2.5
let sum_axis0 = x.sum_axis(0); // [4.0, 6.0] — column sums (axis 0 collapsed)
let sum_axis1 = x.sum_axis(1); // [3.0, 7.0] — row sums (axis 1 collapsed)
Matrix Operations
use aprender::autograd::Tensor;
// 2x2 operands, row-major: a = [[1, 2], [3, 4]], b = [[5, 6], [7, 8]].
let a = Tensor::new(&[1.0, 2.0, 3.0, 4.0], &[2, 2]).requires_grad();
let b = Tensor::new(&[5.0, 6.0, 7.0, 8.0], &[2, 2]).requires_grad();
// Matrix multiplication: a @ b = [[19, 22], [43, 50]]
let c = a.matmul(&b);
// Transpose: [[1, 3], [2, 4]]
let at = a.transpose();
// View/reshape to a flat vector [1, 2, 3, 4]; element count must match
let flat = a.view(&[4]);
Activation Functions
use aprender::autograd::Tensor;
// Activation demos on [-1, 0, 1]; bracketed values are rounded to 2 decimals.
let x = Tensor::from_slice(&[-1.0, 0.0, 1.0]).requires_grad();
let relu_out = x.relu(); // [0, 0, 1] — max(0, x)
let sigmoid_out = x.sigmoid(); // [0.27, 0.5, 0.73]
let tanh_out = x.tanh(); // [-0.76, 0, 0.76]
let gelu_out = x.gelu(); // [-0.16, 0, 0.84]
let leaky_relu = x.leaky_relu(0.01); // [-0.01, 0, 1] — slope 0.01 for x < 0
// Softmax (normalizes to probability distribution; entries sum to 1)
let logits = Tensor::from_slice(&[1.0, 2.0, 3.0]).requires_grad();
let probs = logits.softmax(); // [0.09, 0.24, 0.67]
Training Loop Example
use aprender::autograd::{Tensor, clear_graph, no_grad};
// Fits y = w*x + b to synthetic data with hand-rolled gradient descent.
fn train_linear_regression() {
// Model parameters: scalar tensors, zero-initialized and tracked.
let mut w = Tensor::from_slice(&[0.0]).requires_grad();
let mut b = Tensor::from_slice(&[0.0]).requires_grad();
// Training data: y = 2x + 1 (inputs are NOT tracked — only parameters are)
let x_train = Tensor::from_slice(&[1.0, 2.0, 3.0, 4.0]);
let y_train = Tensor::from_slice(&[3.0, 5.0, 7.0, 9.0]);
let learning_rate = 0.01;
let epochs = 100;
for epoch in 0..epochs {
// Forward pass: y_pred = x * w + b
let y_pred = x_train.mul(&w).add(&b);
// Loss: MSE = mean((y_pred - y)^2)
let diff = y_pred.sub(&y_train);
let loss = diff.mul(&diff).mean();
// Backward pass: populates w.grad() and b.grad()
loss.backward();
// Gradient descent update (no_grad so the updates themselves are not
// recorded on the tape)
no_grad(|| {
let w_grad = w.grad().unwrap();
let b_grad = b.grad().unwrap();
// w = w - lr * grad
w = w.sub(&w_grad.mul(&Tensor::from_slice(&[learning_rate])));
b = b.sub(&b_grad.mul(&Tensor::from_slice(&[learning_rate])));
// Re-enable gradient tracking: sub() produced fresh tensors, which
// presumably do not inherit the requires_grad flag — hence the reset.
w = w.requires_grad();
b = b.requires_grad();
});
// Clear graph for next iteration; prevents unbounded tape growth
clear_graph();
if epoch % 10 == 0 {
// NOTE(review): loss.item() is read AFTER clear_graph(); assumes the
// tensor's value survives clearing the tape — confirm with the API.
println!("Epoch {}: loss = {:.4}", epoch, loss.item());
}
}
println!("Learned: w = {:.2}, b = {:.2}", w.item(), b.item());
// Expected: w ≈ 2.0, b ≈ 1.0
}
Neural Network Layer
use aprender::autograd::Tensor;
// Fully connected layer: y = x @ W + b.
struct Linear {
    weight: Tensor, // shape [in_features, out_features], gradient-tracked
    bias: Tensor,   // shape [out_features], zero-initialized, gradient-tracked
}

impl Linear {
    /// Creates a layer with Xavier/Glorot-uniform weight initialization.
    ///
    /// Glorot & Bengio (2010) sample weights from U(-limit, limit) with
    /// limit = sqrt(6 / (fan_in + fan_out)). The previous code drew from
    /// [-scale/2, scale/2] with scale = sqrt(2 / (fan_in + fan_out)),
    /// which is a much narrower range than the scheme it claims to use.
    fn new(in_features: usize, out_features: usize) -> Self {
        let limit = (6.0 / (in_features + out_features) as f32).sqrt();
        let weight_data: Vec<f32> = (0..in_features * out_features)
            // rand::random::<f32>() is uniform in [0, 1); map to [-limit, limit).
            .map(|_| rand::random::<f32>() * 2.0 * limit - limit)
            .collect();
        // Biases start at zero — the conventional default for linear layers.
        let bias_data = vec![0.0; out_features];
        Self {
            weight: Tensor::new(&weight_data, &[in_features, out_features]).requires_grad(),
            bias: Tensor::new(&bias_data, &[out_features]).requires_grad(),
        }
    }

    /// Affine forward pass: y = x @ W + b.
    fn forward(&self, x: &Tensor) -> Tensor {
        x.matmul(&self.weight).add(&self.bias)
    }

    /// Borrows all trainable tensors (for optimizer updates / grad clearing).
    fn parameters(&self) -> Vec<&Tensor> {
        vec![&self.weight, &self.bias]
    }
}
Multi-Layer Perceptron
use aprender::autograd::Tensor;
// Three-layer perceptron: input -> hidden -> hidden -> output.
struct MLP {
    fc1: Linear,
    fc2: Linear,
    fc3: Linear,
}

impl MLP {
    /// Builds the three layers; hidden layers share one width.
    fn new(input_dim: usize, hidden_dim: usize, output_dim: usize) -> Self {
        let fc1 = Linear::new(input_dim, hidden_dim);
        let fc2 = Linear::new(hidden_dim, hidden_dim);
        let fc3 = Linear::new(hidden_dim, output_dim);
        Self { fc1, fc2, fc3 }
    }

    /// ReLU after each hidden layer; the final layer stays linear.
    fn forward(&self, x: &Tensor) -> Tensor {
        let a1 = self.fc1.forward(x).relu();
        let a2 = self.fc2.forward(&a1).relu();
        self.fc3.forward(&a2)
    }

    /// Flattened view of every layer's trainable tensors, in layer order.
    fn parameters(&self) -> Vec<&Tensor> {
        [&self.fc1, &self.fc2, &self.fc3]
            .into_iter()
            .flat_map(|layer| layer.parameters())
            .collect()
    }
}
Gradient Checking
Verify autograd correctness with numerical gradients:
use aprender::autograd::{Tensor, clear_graph};
/// Central-difference estimate of the gradient of a scalar-valued `f` at `x`:
/// grad_i ≈ (f(x + eps·e_i) - f(x - eps·e_i)) / (2·eps).
fn numerical_gradient(f: impl Fn(&Tensor) -> Tensor, x: &Tensor, eps: f32) -> Vec<f32> {
    let base = x.data().to_vec();
    (0..x.len())
        .map(|i| {
            // Perturb only coordinate i, once up and once down.
            let mut bumped = base.clone();
            bumped[i] = base[i] + eps;
            let up = f(&Tensor::from_slice(&bumped)).item();
            bumped[i] = base[i] - eps;
            let down = f(&Tensor::from_slice(&bumped)).item();
            (up - down) / (2.0 * eps)
        })
        .collect()
}
/// Cross-checks the tape's gradients against finite differences.
fn test_gradient() {
    let x = Tensor::from_slice(&[1.0, 2.0, 3.0]).requires_grad();
    // f(x) = sum(x^2), so the analytic gradient is 2*x_i per coordinate.
    let f = |t: &Tensor| t.pow(2.0).sum();

    // Gradient from the autograd tape.
    f(&x).backward();
    let autograd_grad = x.grad().unwrap();

    // Gradient from central differences.
    let numerical_grad = numerical_gradient(f, &x, 1e-5);
    println!("Autograd: {:?}", autograd_grad.data());
    println!("Numerical: {:?}", numerical_grad);

    // Both should report approximately [2, 4, 6].
    for (ag, ng) in autograd_grad.data().iter().zip(numerical_grad.iter()) {
        assert!((ag - ng).abs() < 1e-4, "Gradient mismatch!");
    }
    clear_graph();
}
No-Grad for Inference
use aprender::autograd::{Tensor, no_grad, is_grad_enabled};
/// Runs the model with gradient tracking disabled — no tape is recorded,
/// which saves the memory a training-mode forward pass would consume.
fn inference(model: &MLP, input: &Tensor) -> Tensor {
    no_grad(|| {
        // Tracking really is off inside this scope.
        assert!(!is_grad_enabled());
        model.forward(input)
    })
}
/// Mean per-sample MSE of `model` over `val_data`, computed without
/// gradient tracking.
///
/// Returns 0.0 for an empty validation set; the original divided by
/// `val_data.len()` unconditionally and returned NaN (0.0 / 0) when empty.
fn validate(model: &MLP, val_data: &[(Tensor, Tensor)]) -> f32 {
    if val_data.is_empty() {
        return 0.0;
    }
    let mut total_loss = 0.0;
    no_grad(|| {
        for (x, y) in val_data {
            // Per-sample squared-error loss between prediction and target.
            let pred = model.forward(x);
            let loss = pred.sub(y).pow(2.0).mean();
            total_loss += loss.item();
        }
    });
    total_loss / val_data.len() as f32
}
Broadcasting
use aprender::autograd::Tensor;
// Broadcasting demo: x is [[1, 2], [3, 4]] (row-major), bias has shape [2].
let x = Tensor::new(&[1.0, 2.0, 3.0, 4.0], &[2, 2]).requires_grad();
let bias = Tensor::from_slice(&[10.0, 20.0]).requires_grad();
// Bias is broadcast across rows: added to every row of x
let y = x.add_broadcast(&bias);
// [[11, 22], [13, 24]]
y.sum().backward();
// The gradient is summed across the broadcast dimension: each bias entry
// contributed to both rows, so its gradient is 2.
println!("bias.grad = {:?}", bias.grad()); // [2.0, 2.0]
Memory Management
use aprender::autograd::{Tensor, clear_graph, clear_grad};
// Sketch of a batched training loop; compute_loss() and update_params()
// are assumed to be defined elsewhere.
fn training_loop() {
let mut model = MLP::new(10, 64, 2);
for batch in 0..1000 {
// Forward + backward: builds the tape, then fills parameter gradients
let loss = compute_loss(&model);
loss.backward();
// In-place parameter update with a fixed learning rate of 0.01
update_params(&mut model, 0.01);
// IMPORTANT: Clear graph after each iteration — otherwise the tape
// keeps growing and its memory is never reclaimed
clear_graph();
// Optionally clear individual gradients so stale values cannot leak
// into the next batch's update
for param in model.parameters() {
clear_grad(param.id());
}
}
}
Running Examples
# Basic autograd demo
cargo run --example autograd_basics
# Train a simple model
cargo run --example autograd_training
# Gradient checking
cargo run --example gradient_check
References
- Baydin et al. (2018). "Automatic differentiation in machine learning: a survey." JMLR.
- Rumelhart et al. (1986). "Learning representations by back-propagating errors." Nature.
- Griewank & Walther (2008). "Evaluating Derivatives: Principles and Techniques of Algorithmic Differentiation" (2nd ed.). SIAM.