Chapter 09: Inference with aprender-serve
Contract:
apr-book-ch09
Run: cargo run -p aprender-core --example ch09_inference
#![allow(clippy::disallowed_methods)]
//! Chapter 9: Inference with aprender-serve
//!
//! Demonstrates quantization math and inference architecture.
//! Citation: Dettmers et al., "LLM.int8()," arXiv:2208.07339
//! Contract: contracts/apr-book-ch09-v1.yaml
fn main() {
// Quantization bit-width contract
// Q4K: 4.5 bits/weight effective (Dettmers et al., arXiv:2210.17323)
let params_7b: f64 = 7e9;
let bytes_f16 = params_7b * 2.0;
let bytes_q4k = params_7b * 4.5 / 8.0;
let compression = bytes_f16 / bytes_q4k;
println!("7B model size:");
println!(" F16: {:.1} GB", bytes_f16 / 1e9);
println!(" Q4K: {:.1} GB", bytes_q4k / 1e9);
println!(" Compression: {compression:.1}x");
assert!(compression > 3.0, "Q4K must compress >3x vs F16");
// Fused dequant+matmul contract
// (Dao et al., FlashAttention-2, arXiv:2307.08691)
println!("\nFused dequant+matmul: dequantize inline during GEMV");
println!(" Avoids materializing full F32 weight matrix");
println!(" Memory bandwidth: read Q4K, compute F32, write F32");
// PagedAttention KV cache (Kwon et al., arXiv:2309.06180)
let page_size = 16_usize; // tokens per page
let ctx_len = 4096_usize;
let pages_needed = (ctx_len + page_size - 1) / page_size;
println!("\nPagedAttention:");
println!(" Page size: {page_size} tokens");
println!(" Pages for {ctx_len} context: {pages_needed}");
assert_eq!(pages_needed, 256, "Page count contract");
// Performance targets (from apr oracle)
println!("\nPerformance targets:");
println!(" 1B Q4K: 100+ tok/s CPU, 500+ tok/s GPU");
println!(" 7B Q4K: 30+ tok/s CPU, 150+ tok/s GPU");
// Architecture contract
println!("\nContract: aprender-serve handles ALL inference");
println!("Contract: aprender-core is for TRAINING ONLY");
println!("Chapter 9 contracts: PASSED");
}