Benchmark: aprender-serve vs Candle

Contract: apr-book-ch21

Run: cargo run -p aprender-core --example ch21_vs_candle

#![allow(clippy::disallowed_methods)]
//! Chapter 21: Benchmark — aprender-serve vs Candle
//!
//! Data source: paiml/candle-vs-apr (RTX 4090, Qwen2.5-Coder-1.5B Q4_K_M)
//! Contract: contracts/apr-book-ch21-v1.yaml

use aprender::format::validated_tensors::TensorStats;

/// Prints the Chapter 21 benchmark summary (aprender-serve vs Candle) and
/// enforces the chapter's contract checks.
///
/// Every figure below is a recorded constant, not a live measurement; the
/// comments in the body name where the numbers were taken from.
///
/// # Panics
///
/// Panics if any contract assertion fails:
/// - single-request speedup over Candle is not greater than 1.0,
/// - throughput scaling from c=1 to c=32 is not greater than 5.0,
/// - mean throughput over the sample runs is not greater than 270 tok/s.
fn main() {
    // Benchmark results from paiml/candle-vs-apr/performance.md (v3)
    // Hardware: RTX 4090, Model: Qwen2.5-Coder-1.5B Q4_K_M GGUF
    let aprender_serve_tps = 273.8_f64;
    let candle_tps = 227.4_f64;
    let speedup = aprender_serve_tps / candle_tps;

    println!("=== aprender-serve vs Candle (RTX 4090) ===");
    println!("Model: Qwen2.5-Coder-1.5B Q4_K_M");
    println!();
    println!("Single-request decode (c=1):");
    println!("  aprender-serve: {aprender_serve_tps:.1} tok/s");
    println!("  Candle:         {candle_tps:.1} tok/s");
    println!("  Speedup:        {speedup:.2}x");
    assert!(speedup > 1.0, "aprender-serve must be faster than Candle");

    // Scaling under concurrency (Candle has no server — N/A)
    let c32_tps = 1776.5_f64;
    let scaling = c32_tps / aprender_serve_tps;
    println!();
    println!("Scaling (aprender-serve only — Candle has no server):");
    println!("  c=1:  {aprender_serve_tps:.1} tok/s");
    println!("  c=32: {c32_tps:.1} tok/s");
    println!("  Scaling: {scaling:.1}x");
    assert!(scaling > 5.0, "Must scale >5x from c=1 to c=32");

    // Memory: Candle wins on RSS (no HTTP server overhead).
    // Reported for information only; no contract assertion covers memory.
    let candle_rss_mb = 449_u64;
    let aprender_rss_mb = 3082_u64;
    println!();
    println!("Memory (Peak RSS):");
    println!("  Candle:         {candle_rss_mb} MB (CLI only, no server)");
    println!("  aprender-serve: {aprender_rss_mb} MB (HTTP server + KV cache)");

    // Use TensorStats to demonstrate aprender API usage.
    // A fixed-size array is enough here: the samples are only borrowed as a
    // slice, so a heap-allocated `Vec` would buy nothing.
    let tps_samples = [273.8_f32, 271.2, 275.1, 273.0, 274.5];
    let stats = TensorStats::compute(&tps_samples);
    // Derive the run count from the data so the label cannot drift from it.
    let run_count = tps_samples.len();
    println!();
    println!("Throughput stability ({run_count} runs):");
    println!("  mean: {:.1} tok/s, min: {:.1}, max: {:.1}", stats.mean, stats.min, stats.max);
    assert!(stats.mean > 270.0, "Mean throughput must exceed 270 tok/s");

    println!();
    println!("Repo: https://github.com/paiml/candle-vs-apr");
    println!("Chapter 21 contracts: PASSED");
}