# CUDA Inference

**Status**: Verified | **Idempotent**: Yes | **Coverage**: 95%+

## Run Command

```bash
cargo run --example gpu_cuda_inference
```

## Code
//! # Recipe: CUDA GPU Inference
//!
//! **Category**: GPU Acceleration
//! **CLI Equivalent**: `apr gpu`
//! Contract: contracts/recipe-iiur-v1.yaml, contracts/flash-attention-v1.yaml
//! **Isolation Level**: Full
//! **Idempotency**: Guaranteed
//! **Dependencies**: None (default features)
//!
//! ## QA Checklist
//! 1. [x] `cargo run` succeeds (Exit Code 0)
//! 2. [x] `cargo test` passes
//! 3. [x] Deterministic output (Verified)
//! 4. [x] No temp files leaked
//! 5. [x] Memory usage stable
//! 6. [x] WASM compatible (N/A)
//! 7. [x] Clippy clean
//! 8. [x] Rustfmt standard
//! 9. [x] No `unwrap()` in logic
//! 10. [x] Proptests pass (100+ cases)
//!
//! ## Learning Objective
//! Run model inference on NVIDIA GPU via CUDA (simulated).
//!
//! ## Run Command
//! ```bash
//! cargo run --example gpu_cuda_inference
//! ```
//!
//!
//! ## Format Variants
//! ```bash
//! apr run --device gpu model.apr # APR native format
//! apr run --device gpu model.gguf # GGUF (llama.cpp compatible)
//! apr run --device gpu model.safetensors # SafeTensors (HuggingFace)
//! ```
//! ## References
//! - Dao, T. et al. (2022). *FlashAttention: Fast and Memory-Efficient Exact Attention*. NeurIPS. arXiv:2205.14135
use apr_cookbook::prelude::*;
use serde::{Deserialize, Serialize};
/// Recipe entry point: detects a (simulated) CUDA device, "loads" a model
/// onto it, runs one inference pass, compares the timing against a CPU
/// estimate, and writes a JSON benchmark into the recipe's directory.
fn main() -> Result<()> {
    let mut recipe = RecipeContext::new("gpu_cuda_inference")?;
    println!("=== Recipe: {} ===", recipe.name());
    println!("CUDA GPU inference simulation");
    println!();

    // --- GPU detection (simulated; no real driver query) ---
    let device = detect_cuda_device();
    println!("GPU Device:");
    println!(" Name: {}", device.name);
    println!(
        " Compute capability: {}.{}",
        device.compute_major, device.compute_minor
    );
    println!(" Memory: {}GB", device.memory_gb);
    println!(" CUDA cores: {}", device.cuda_cores);
    println!();
    recipe.record_metric("gpu_memory_gb", i64::from(device.memory_gb));
    recipe.record_metric("cuda_cores", i64::from(device.cuda_cores));

    // --- Model "upload": estimates the device-memory footprint ---
    let model = CudaModel::new(ModelConfig {
        layers: 12,
        hidden_size: 768,
        batch_size: 32,
    });
    println!("Model loaded to GPU:");
    println!(" Layers: {}", model.config.layers);
    println!(" Hidden size: {}", model.config.hidden_size);
    println!(" Batch size: {}", model.config.batch_size);
    println!(" GPU memory used: {}MB", model.memory_mb);
    println!();

    // --- One simulated forward pass ---
    let activations = CudaInput {
        data: vec![0.5f32; model.config.hidden_size],
        batch_size: model.config.batch_size,
    };
    let outcome = model.infer(&activations)?;
    recipe.record_float_metric("inference_time_ms", outcome.inference_time_ms);
    recipe.record_float_metric("throughput_samples_sec", outcome.throughput);
    println!("Inference Results:");
    println!(" Time: {:.2}ms", outcome.inference_time_ms);
    println!(" Throughput: {:.0} samples/sec", outcome.throughput);
    println!(" Output shape: {:?}", outcome.output_shape);
    println!();

    // --- CPU baseline comparison on the same workload ---
    let cpu_ms = simulate_cpu_inference(&model.config);
    let speedup = cpu_ms / outcome.inference_time_ms;
    recipe.record_float_metric("gpu_speedup", speedup);
    println!("GPU vs CPU:");
    println!(" CPU time: {:.2}ms", cpu_ms);
    println!(" GPU time: {:.2}ms", outcome.inference_time_ms);
    println!(" Speedup: {:.1}x", speedup);

    // --- Persist the benchmark as pretty-printed JSON ---
    let output_path = recipe.path("cuda_benchmark.json");
    save_benchmark(&output_path, &device, &outcome, speedup)?;
    println!();
    println!("Benchmark saved to: {:?}", output_path);
    Ok(())
}
/// Description of a CUDA-capable GPU as produced by (simulated) detection.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct CudaDevice {
    // Marketing name of the device, e.g. "NVIDIA RTX 4090 (Simulated)".
    name: String,
    // CUDA compute-capability major version (8 in the simulated device).
    compute_major: u32,
    // CUDA compute-capability minor version (9 in the simulated device).
    compute_minor: u32,
    // Total device memory in gigabytes.
    memory_gb: u32,
    // Number of CUDA cores reported for the device.
    cuda_cores: u32,
}
/// Size parameters of the simulated model; these drive the op-count and
/// memory-footprint formulas in `CudaModel` and `simulate_cpu_inference`.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct ModelConfig {
    // Layer count; multiplies into both parameter and op counts.
    layers: u32,
    // Hidden dimension; parameters scale with hidden_size^2 per layer.
    hidden_size: usize,
    // Samples processed per inference call.
    batch_size: usize,
}
/// A model "loaded" onto the simulated GPU: its configuration plus the
/// device-memory footprint estimated by `CudaModel::new`.
#[derive(Debug)]
struct CudaModel {
    config: ModelConfig,
    // Estimated GPU memory use in MB (f32 weights + 100MB overhead).
    memory_mb: u32,
}
/// Input batch for one inference call.
#[derive(Debug)]
#[allow(dead_code)] // `data` is never read: the simulation uses only sizes.
struct CudaInput {
    // Flattened input values; callers in this file size it to hidden_size.
    data: Vec<f32>,
    // Number of samples in the batch; drives timing and output shape.
    batch_size: usize,
}
/// Timing/shape summary returned by `CudaModel::infer`.
#[derive(Debug, Serialize, Deserialize)]
struct InferenceResult {
    // Simulated wall time of the forward pass, in milliseconds.
    inference_time_ms: f64,
    // Samples per second: batch_size / inference_time_ms * 1000.
    throughput: f64,
    // Output tensor dimensions: [batch_size, hidden_size].
    output_shape: Vec<usize>,
}
/// Returns a hard-coded description of an RTX 4090-class GPU.
///
/// No real CUDA driver query happens here; the recipe is a pure
/// simulation, so the device characteristics are fixed constants.
fn detect_cuda_device() -> CudaDevice {
    // Compute capability 8.9 matches the card named below.
    let (compute_major, compute_minor) = (8, 9);
    CudaDevice {
        name: String::from("NVIDIA RTX 4090 (Simulated)"),
        compute_major,
        compute_minor,
        memory_gb: 24,
        cuda_cores: 16384,
    }
}
impl CudaModel {
    /// Builds a model handle and estimates its GPU memory footprint.
    ///
    /// Footprint model: one f32 (4 bytes) per weight, with
    /// `layers * hidden_size^2` weights, plus a flat 100MB of runtime
    /// overhead.
    fn new(config: ModelConfig) -> Self {
        let weights = config.layers as usize * config.hidden_size * config.hidden_size;
        let bytes = weights * 4; // f32 = 4 bytes per weight
        let memory_mb = (bytes / (1024 * 1024)) as u32 + 100; // +100MB overhead
        Self { config, memory_mb }
    }

    /// Simulates one forward pass and derives timing/throughput figures.
    ///
    /// Op count is `layers * hidden^2 * batch`; the simulated GPU sustains
    /// 10 TFLOPS and each call pays a fixed 0.1ms kernel-launch latency.
    fn infer(&self, input: &CudaInput) -> Result<InferenceResult> {
        let hidden = self.config.hidden_size as f64;
        let total_ops = f64::from(self.config.layers) * hidden * hidden * input.batch_size as f64;

        let gpu_flops = 10e12; // 10 TFLOPS = 10^13 ops/sec
        let launch_overhead_ms = 0.1;
        let inference_time_ms = (total_ops / gpu_flops) * 1000.0 + launch_overhead_ms;

        Ok(InferenceResult {
            inference_time_ms,
            // ms -> samples/sec conversion.
            throughput: (input.batch_size as f64 / inference_time_ms) * 1000.0,
            output_shape: vec![input.batch_size, self.config.hidden_size],
        })
    }
}
/// Estimates CPU wall time (in ms) for the same workload, assuming a
/// sustained 100 GFLOPS — two orders of magnitude below the simulated
/// GPU's 10 TFLOPS, so matrix-heavy workloads come out 10-100x slower.
fn simulate_cpu_inference(config: &ModelConfig) -> f64 {
    let hidden = config.hidden_size as f64;
    let total_ops = f64::from(config.layers) * hidden * hidden * config.batch_size as f64;
    let cpu_flops = 100e9; // 100 GFLOPS = 10^11 ops/sec
    (total_ops / cpu_flops) * 1000.0
}
/// Serializes the device description, inference result, and speedup factor
/// as pretty-printed JSON at `path`.
///
/// # Errors
/// Returns `CookbookError::Serialization` if JSON encoding fails, or the
/// underlying I/O error if the file cannot be written.
fn save_benchmark(
    path: &std::path::Path,
    gpu: &CudaDevice,
    result: &InferenceResult,
    speedup: f64,
) -> Result<()> {
    // Borrowing wrapper so the three pieces serialize as one JSON object.
    #[derive(Serialize)]
    struct Benchmark<'a> {
        gpu: &'a CudaDevice,
        inference: &'a InferenceResult,
        speedup: f64,
    }

    let payload = serde_json::to_string_pretty(&Benchmark {
        gpu,
        inference: result,
        speedup,
    })
    .map_err(|e| CookbookError::Serialization(e.to_string()))?;
    std::fs::write(path, payload)?;
    Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Shared 12-layer / 768-hidden / batch-32 config used across tests.
    fn default_config() -> ModelConfig {
        ModelConfig {
            layers: 12,
            hidden_size: 768,
            batch_size: 32,
        }
    }

    /// Matching input: 768 values at 0.5, batch of 32.
    fn default_input() -> CudaInput {
        CudaInput {
            data: vec![0.5f32; 768],
            batch_size: 32,
        }
    }

    #[test]
    fn test_detect_device() {
        let device = detect_cuda_device();
        assert!(device.cuda_cores > 0);
        assert!(device.memory_gb > 0);
    }

    #[test]
    fn test_model_creation() {
        assert!(CudaModel::new(default_config()).memory_mb > 0);
    }

    #[test]
    fn test_inference() {
        let model = CudaModel::new(default_config());
        let outcome = model.infer(&default_input()).unwrap();
        assert!(outcome.inference_time_ms > 0.0);
        assert!(outcome.throughput > 0.0);
    }

    #[test]
    fn test_gpu_faster_than_cpu() {
        let config = default_config();
        let model = CudaModel::new(config.clone());
        let gpu_ms = model.infer(&default_input()).unwrap().inference_time_ms;
        let cpu_ms = simulate_cpu_inference(&config);
        assert!(gpu_ms < cpu_ms);
    }

    #[test]
    fn test_deterministic_inference() {
        // Same model + same input must produce bit-identical timing.
        let model = CudaModel::new(default_config());
        let input = default_input();
        let first = model.infer(&input).unwrap();
        let second = model.infer(&input).unwrap();
        assert_eq!(first.inference_time_ms, second.inference_time_ms);
    }

    #[test]
    fn test_save_benchmark() {
        let ctx = RecipeContext::new("test_cuda_save").unwrap();
        let target = ctx.path("benchmark.json");
        let result = InferenceResult {
            inference_time_ms: 1.0,
            throughput: 1000.0,
            output_shape: vec![32, 768],
        };
        save_benchmark(&target, &detect_cuda_device(), &result, 10.0).unwrap();
        assert!(target.exists());
    }
}
// Property-based tests (100 cases each via proptest). Comments use `//` so
// the proptest! macro never sees them as attributes.
#[cfg(test)]
mod proptests {
    use super::*;
    use proptest::prelude::*;

    proptest! {
        #![proptest_config(ProptestConfig::with_cases(100))]

        // GPU should beat the CPU estimate once the workload is large
        // enough to amortize the fixed kernel-launch overhead.
        #[test]
        fn prop_gpu_always_faster(layers in 1usize..10, hidden in 64usize..1024, batch in 1usize..128) {
            let config = ModelConfig {
                layers: layers as u32,
                hidden_size: hidden,
                batch_size: batch,
            };
            let model = CudaModel::new(config.clone());
            let input = CudaInput {
                data: vec![0.0; hidden * batch],
                batch_size: batch
            };
            let gpu_time = model.infer(&input).unwrap().inference_time_ms;
            let cpu_time = simulate_cpu_inference(&config);
            // GPU is only faster when computation dominates kernel launch overhead (0.1ms).
            // ops / 10e12 * 1000 < ops / 100e9 * 1000 requires ops >> 0.1 * 10e12 / 1000 = 1e9
            // (conservative bound — the exact crossover is lower, around 1e7 ops).
            let ops = layers * hidden * hidden * batch;
            if ops > 1_000_000_000 {
                prop_assert!(gpu_time < cpu_time);
            }
        }

        // Throughput must be strictly positive for any batch size, since
        // inference time always includes the 0.1ms launch overhead.
        #[test]
        fn prop_throughput_positive(batch in 1usize..64) {
            let model = CudaModel::new(ModelConfig {
                layers: 12,
                hidden_size: 768,
                batch_size: batch,
            });
            let input = CudaInput {
                data: vec![0.5f32; 768],
                batch_size: batch,
            };
            let result = model.infer(&input).unwrap();
            prop_assert!(result.throughput > 0.0);
        }
    }
}