Chapter 13: Profiling and Optimization

Contract: apr-book-ch13

Run: cargo run -p aprender-core --example ch13_profiling

#![allow(clippy::disallowed_methods)]
//! Chapter 13: Profiling and Optimization
//!
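//! Demonstrates the roofline model, the memory-bandwidth ceiling, FFN gate+up
//! fusion, and batched-prefill speedup contracts.
//! Citation: Williams, Waterman, and Patterson, "Roofline: An Insightful Visual
//! Performance Model for Multicore Architectures," CACM 52(4), 2009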
//! Contract: contracts/apr-book-ch13-v1.yaml

/// Runs the Chapter 13 contract checks.
///
/// Each section derives a figure from hard-coded constants, prints it, and
/// then asserts the bound it must satisfy. Sections, in order:
///
/// 1. Roofline operational intensity for a 7B-parameter Q4 model.
/// 2. Token throughput ceiling implied by DDR5 memory bandwidth.
/// 3. Dispatch count with and without FFN gate+up fusion.
/// 4. Speedup of batched over serial prefill.
///
/// # Panics
///
/// Panics if any contract assertion fails.
fn main() {
    // --- Roofline: operational intensity = FLOPs / bytes moved ---
    let param_count = 7e9_f64;
    // Matmul cost is taken as 2 FLOPs per parameter.
    let token_flops = 2.0 * param_count;
    // Q4 is approximated as half a byte per parameter.
    let token_bytes = param_count / 2.0;
    let intensity = token_flops / token_bytes;
    println!("Roofline analysis (Williams et al., 2009):");
    println!("  7B Q4K operational intensity: {intensity:.1} FLOPs/byte");
    println!("  < 10 -> memory-bound (typical for decode)");
    println!("  > 50 -> compute-bound (typical for batched prefill)");
    assert!(intensity > 1.0, "OI must be positive");

    // --- Memory bandwidth ceiling: bytes/s divided by bytes/token ---
    let mem_bandwidth: f64 = 50e9; // ~50 GB/s DDR5
    let bandwidth_gb = mem_bandwidth / 1e9;
    let tok_per_sec_ceiling = mem_bandwidth / token_bytes;
    println!("\nMemory bandwidth ceiling:");
    println!("  DDR5 bandwidth: {bandwidth_gb:.0} GB/s");
    println!("  Theoretical max: {tok_per_sec_ceiling:.0} tok/s (7B Q4K)");
    assert!(tok_per_sec_ceiling > 10.0, "Must exceed 10 tok/s theoretical");

    // --- FFN gate+up fusion contract (PMAT-FFN-FUSION) ---
    let layer_count: usize = 28;
    // Fused: gate and up share one dispatch per layer.
    let fused_count = layer_count;
    // Unfused: gate and up are dispatched separately, two per layer.
    let unfused_count = 2 * layer_count;
    let fused_fraction = fused_count as f64 / unfused_count as f64;
    let reduction_pct = (1.0 - fused_fraction) * 100.0;
    println!("\nFFN gate+up fusion:");
    println!("  Unfused: {unfused_count} rayon dispatches");
    println!("  Fused:   {fused_count} rayon dispatches");
    println!("  Reduction: {reduction_pct:.0}%");
    assert_eq!(fused_count * 2, unfused_count, "Fusion halves dispatches");

    // --- Batched prefill speedup (hard-coded timings in milliseconds) ---
    let (serial_ms, batched_ms) = (2570.0_f64, 314.0_f64);
    let speedup = serial_ms / batched_ms;
    println!("\nBatched prefill (91-token prompt):");
    println!("  Serial:  {serial_ms:.0} ms");
    println!("  Batched: {batched_ms:.0} ms");
    println!("  Speedup: {speedup:.1}x");
    assert!(speedup > 5.0, "Batched prefill must achieve >5x speedup");

    println!("\nChapter 13 contracts: PASSED");
}