WebGPU Acceleration
Status: Verified | Idempotent: Yes | Coverage: 95%+
Run Command
cargo run --example wasm_webgpu_acceleration
Code
//! # Recipe: WebGPU Acceleration
//!
//! Contract: contracts/recipe-iiur-v1.yaml, contracts/flash-attention-v1.yaml
//! **Category**: WASM/Browser
//! **Isolation Level**: Full
//! **Idempotency**: Guaranteed
//! **Dependencies**: None (default features)
//!
//! ## QA Checklist
//! 1. [x] `cargo run` succeeds (Exit Code 0)
//! 2. [x] `cargo test` passes
//! 3. [x] Deterministic output (Verified)
//! 4. [x] No temp files leaked
//! 5. [x] Memory usage stable
//! 6. [x] WASM compatible (Verified)
//! 7. [x] Clippy clean
//! 8. [x] Rustfmt standard
//! 9. [x] No `unwrap()` in logic
//! 10. [x] Proptests pass (100+ cases)
//!
//! ## Learning Objective
//! Accelerate browser inference with WebGPU (simulated).
//!
//! ## Run Command
//! ```bash
//! cargo run --example wasm_webgpu_acceleration
//! ```
//!
//!
//! ## Format Variants
//! ```bash
//! apr run model.apr # APR native format
//! apr run model.gguf # GGUF (llama.cpp compatible)
//! apr run model.safetensors # SafeTensors (HuggingFace)
//! ```
//!
//! ## References
//! - Haas, A. et al. (2017). *Bringing the Web up to Speed with WebAssembly*. PLDI. DOI: 10.1145/3062341.3062363
use apr_cookbook::prelude::*;
use serde::{Deserialize, Serialize};
/// Runs the simulated WebGPU acceleration recipe end to end.
///
/// Prints the simulated adapter capabilities, benchmarks matrix multiplication
/// at several sizes, prints the shader statistics and finally writes the
/// collected benchmark results to `webgpu_benchmark.json` at the path handed
/// out by the recipe context.
///
/// # Errors
/// Propagates any error from creating the recipe context, from running a
/// benchmark or from saving the results.
fn main() -> Result<()> {
    // Matrix dimensions to benchmark. A fixed-size array is used because the
    // list is never resized, so the heap allocation of `vec!` buys nothing.
    const MATRIX_SIZES: [u32; 4] = [64, 128, 256, 512];
    // The one size whose speedup and GFLOPS are recorded as recipe metrics.
    const METRIC_SIZE: u32 = 256;

    let mut ctx = RecipeContext::new("wasm_webgpu_acceleration")?;
    println!("=== Recipe: {} ===", ctx.name());
    println!("WebGPU acceleration simulation");
    println!();

    // Check WebGPU availability
    let gpu_info = check_webgpu_support();
    println!("WebGPU Support:");
    println!(" Available: {}", gpu_info.available);
    println!(" Adapter: {}", gpu_info.adapter_name);
    println!(" Max buffer size: {}MB", gpu_info.max_buffer_size_mb);
    println!(" Max workgroup size: {}", gpu_info.max_workgroup_size);
    println!();

    // Create compute pipeline
    let mut pipeline = WebGpuPipeline::new(PipelineConfig {
        workgroup_size: 256,
        batch_size: 1024,
    });
    ctx.record_metric("workgroup_size", i64::from(pipeline.config.workgroup_size));
    ctx.record_metric("batch_size", i64::from(pipeline.config.batch_size));

    // Benchmark matrix operations
    println!("Matrix multiplication benchmark:");
    println!("{:-<60}", "");
    println!(
        "{:>8} {:>12} {:>12} {:>12} {:>10}",
        "Size", "CPU(ms)", "GPU(ms)", "Speedup", "GFLOPS"
    );
    println!("{:-<60}", "");
    for size in MATRIX_SIZES {
        let result = pipeline.benchmark_matmul(size)?;
        println!(
            "{:>8} {:>12.2} {:>12.2} {:>11.1}x {:>10.1}",
            format!("{}x{}", size, size),
            result.cpu_time_ms,
            result.gpu_time_ms,
            result.speedup,
            result.gflops
        );
        if size == METRIC_SIZE {
            ctx.record_float_metric("speedup_256", result.speedup);
            ctx.record_float_metric("gflops_256", result.gflops);
        }
    }
    println!("{:-<60}", "");

    // Shader compilation stats
    let shader_stats = pipeline.get_shader_stats();
    println!();
    println!("Shader Statistics:");
    println!(" Compile time: {}ms", shader_stats.compile_time_ms);
    println!(" Shader modules: {}", shader_stats.module_count);
    println!(" Total instructions: {}", shader_stats.instruction_count);

    // Save benchmark results
    let results_path = ctx.path("webgpu_benchmark.json");
    pipeline.save_results(&results_path)?;
    println!();
    println!("Benchmark results saved to: {:?}", results_path);
    Ok(())
}
/// WebGPU capabilities as returned by `check_webgpu_support`.
///
/// In this recipe the values come from a simulated detection step, not from a
/// real adapter query.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct GpuInfo {
/// Whether WebGPU is reported as available.
available: bool,
/// Human-readable name of the reported adapter.
adapter_name: String,
/// Largest buffer size the adapter reports, in megabytes.
max_buffer_size_mb: u32,
/// Largest workgroup size the adapter reports.
max_workgroup_size: u32,
}
/// Configuration handed to `WebGpuPipeline::new`.
///
/// NOTE(review): within this file the values are only stored and recorded as
/// metrics by `main`; the simulated benchmark does not read them.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct PipelineConfig {
/// Configured workgroup size.
workgroup_size: u32,
/// Configured batch size.
batch_size: u32,
}
/// Outcome of one simulated matrix-multiplication benchmark.
///
/// A list of these is what `WebGpuPipeline::save_results` serializes to JSON.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct BenchmarkResult {
/// Matrix dimension; the benchmark multiplies `size` x `size` matrices.
size: u32,
/// Simulated CPU time in milliseconds.
cpu_time_ms: f64,
/// Simulated GPU time in milliseconds.
gpu_time_ms: f64,
/// `cpu_time_ms` divided by `gpu_time_ms`.
speedup: f64,
/// Simulated GPU throughput in GFLOPS.
gflops: f64,
}
/// Shader compilation statistics as returned by
/// `WebGpuPipeline::get_shader_stats` (fixed values in this simulation).
#[derive(Debug, Serialize, Deserialize)]
struct ShaderStats {
/// Shader compile time in milliseconds.
compile_time_ms: u32,
/// Number of shader modules.
module_count: u32,
/// Total number of shader instructions.
instruction_count: u32,
}
/// Simulated WebGPU compute pipeline that accumulates benchmark results.
#[derive(Debug)]
struct WebGpuPipeline {
/// Configuration the pipeline was created with.
config: PipelineConfig,
/// One entry per `benchmark_matmul` call, in call order.
results: Vec<BenchmarkResult>,
}
/// Reports the capabilities of the simulated WebGPU adapter.
///
/// No real detection is performed: the function always returns the same fixed
/// description of an available adapter.
fn check_webgpu_support() -> GpuInfo {
    // Both limits of the simulated adapter share the same numeric value.
    let simulated_limit: u32 = 256;
    let adapter_name = String::from("Simulated GPU Adapter");

    GpuInfo {
        available: true,
        adapter_name,
        max_buffer_size_mb: simulated_limit,
        max_workgroup_size: simulated_limit,
    }
}
impl WebGpuPipeline {
    /// Creates a pipeline with the given configuration and no recorded results.
    fn new(config: PipelineConfig) -> Self {
        Self {
            config,
            results: Vec::new(),
        }
    }

    /// Simulates a `size` x `size` matrix-multiplication benchmark.
    ///
    /// Timings are derived purely from `size`, so the same input always
    /// produces the same result. The result is appended to `self.results` and
    /// a copy is returned.
    ///
    /// A `size` of zero means no work: both times are `0.0`, `gflops` is `0.0`
    /// and `speedup` is the nominal GPU/CPU throughput ratio of the model.
    /// Without this guard both values would be `0.0 / 0.0` (NaN), which
    /// serde_json writes as `null` and cannot read back into an `f64` field.
    ///
    /// # Errors
    /// The current simulation never fails; the `Result` return type is kept
    /// for callers that use `?`.
    fn benchmark_matmul(&mut self, size: u32) -> Result<BenchmarkResult> {
        // Simulated throughput: ~1ms per 1M multiply-add steps on the CPU,
        // and ten times that rate on the GPU.
        const CPU_OPS_PER_MS: f64 = 1_000_000.0;
        const GPU_OPS_PER_MS: f64 = 10_000_000.0;

        // O(n^3) multiply-add steps; each step counts as two floating-point ops.
        let ops = f64::from(size).powi(3);
        let flops = 2.0 * ops;

        // Simulated timings in milliseconds (deterministic based on size).
        let cpu_time = ops / CPU_OPS_PER_MS;
        let gpu_time = ops / GPU_OPS_PER_MS;

        let (speedup, gflops) = if gpu_time > 0.0 {
            // flops / (ms * 1e6) == (flops / seconds) / 1e9, i.e. GFLOPS.
            (cpu_time / gpu_time, flops / (gpu_time * 1_000_000.0))
        } else {
            // Zero work: avoid 0/0 and report finite, serializable values.
            (GPU_OPS_PER_MS / CPU_OPS_PER_MS, 0.0)
        };

        let result = BenchmarkResult {
            size,
            cpu_time_ms: cpu_time,
            gpu_time_ms: gpu_time,
            speedup,
            gflops,
        };
        self.results.push(result.clone());
        Ok(result)
    }

    /// Returns the (fixed) shader compilation statistics of the simulation.
    fn get_shader_stats(&self) -> ShaderStats {
        ShaderStats {
            compile_time_ms: 50,
            module_count: 3,
            instruction_count: 150,
        }
    }

    /// Writes all recorded benchmark results to `path` as pretty-printed JSON.
    ///
    /// # Errors
    /// Returns `CookbookError::Serialization` if the results cannot be
    /// serialized, and propagates the I/O error if the file cannot be written.
    fn save_results(&self, path: &std::path::Path) -> Result<()> {
        let json = serde_json::to_string_pretty(&self.results)
            .map_err(|e| CookbookError::Serialization(e.to_string()))?;
        std::fs::write(path, json)?;
        Ok(())
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a pipeline with the configuration shared by all tests here.
    fn standard_pipeline() -> WebGpuPipeline {
        let config = PipelineConfig {
            workgroup_size: 256,
            batch_size: 1024,
        };
        WebGpuPipeline::new(config)
    }

    /// The simulated adapter is available and reports a non-zero buffer limit.
    #[test]
    fn test_gpu_info() {
        let detected = check_webgpu_support();
        assert!(detected.available);
        assert!(detected.max_buffer_size_mb > 0);
    }

    /// A fresh pipeline keeps its configuration and has no results yet.
    #[test]
    fn test_pipeline_creation() {
        let fresh = standard_pipeline();
        assert_eq!(fresh.config.workgroup_size, 256);
        assert!(fresh.results.is_empty());
    }

    /// A benchmark echoes the size and reports positive times and a speedup.
    #[test]
    fn test_benchmark_matmul() {
        let outcome = standard_pipeline().benchmark_matmul(64).unwrap();
        assert_eq!(outcome.size, 64);
        assert!(outcome.cpu_time_ms > 0.0);
        assert!(outcome.gpu_time_ms > 0.0);
        assert!(outcome.speedup > 1.0);
    }

    /// The simulated GPU time is strictly below the simulated CPU time.
    #[test]
    fn test_gpu_faster_than_cpu() {
        let outcome = standard_pipeline().benchmark_matmul(128).unwrap();
        assert!(outcome.gpu_time_ms < outcome.cpu_time_ms);
    }

    /// Two independent pipelines produce identical timings for the same size.
    #[test]
    fn test_deterministic_results() {
        let first = standard_pipeline().benchmark_matmul(64).unwrap();
        let second = standard_pipeline().benchmark_matmul(64).unwrap();
        assert_eq!(first.cpu_time_ms, second.cpu_time_ms);
        assert_eq!(first.gpu_time_ms, second.gpu_time_ms);
    }

    /// Shader statistics report a non-zero compile time and module count.
    #[test]
    fn test_shader_stats() {
        let stats = standard_pipeline().get_shader_stats();
        assert!(stats.compile_time_ms > 0);
        assert!(stats.module_count > 0);
    }

    /// Saving after one benchmark creates the results file.
    #[test]
    fn test_save_results() {
        // Keep the context bound for the whole test so the path it handed out
        // is still in scope when the existence check runs.
        let ctx = RecipeContext::new("test_webgpu_save").unwrap();
        let target = ctx.path("results.json");

        let mut pipeline = standard_pipeline();
        pipeline.benchmark_matmul(64).unwrap();
        pipeline.save_results(&target).unwrap();

        assert!(target.exists());
    }
}
#[cfg(test)]
mod proptests {
    use super::*;
    use proptest::prelude::*;

    /// Builds a pipeline with the configuration shared by every property.
    fn pipeline() -> WebGpuPipeline {
        WebGpuPipeline::new(PipelineConfig {
            workgroup_size: 256,
            batch_size: 1024,
        })
    }

    /// Recovers the floating-point operation count from a result.
    ///
    /// `gflops` is computed as `flops / (gpu_time_ms * 1e6)`, so multiplying
    /// back gives the operation count the benchmark actually used.
    fn reported_flops(result: &BenchmarkResult) -> f64 {
        result.gflops * result.gpu_time_ms * 1_000_000.0
    }

    proptest! {
        #![proptest_config(ProptestConfig::with_cases(100))]

        // The simulated GPU is faster than the CPU for every non-trivial size.
        #[test]
        fn prop_gpu_always_faster(size in 8u32..256) {
            let result = pipeline().benchmark_matmul(size).unwrap();
            prop_assert!(result.speedup > 1.0);
        }

        // Throughput is strictly positive whenever there is work to do.
        #[test]
        fn prop_gflops_positive(size in 16u32..128) {
            let result = pipeline().benchmark_matmul(size).unwrap();
            prop_assert!(result.gflops > 0.0);
        }

        // Larger matrices have more operations. The assertions use the values
        // returned by the benchmark, so the property exercises the
        // implementation rather than arithmetic recomputed inside the test.
        #[test]
        fn prop_larger_size_more_flops(size1 in 16u32..64, size2 in 65u32..128) {
            let mut pipeline = pipeline();
            let small = pipeline.benchmark_matmul(size1).unwrap();
            let large = pipeline.benchmark_matmul(size2).unwrap();

            prop_assert!(reported_flops(&large) > reported_flops(&small));
            // More operations must also take longer on both simulated devices.
            prop_assert!(large.cpu_time_ms > small.cpu_time_ms);
            prop_assert!(large.gpu_time_ms > small.gpu_time_ms);
            // Both runs are recorded on the pipeline.
            prop_assert_eq!(pipeline.results.len(), 2);
        }
    }
}