K-Means Clustering
Status: Verified | Idempotent: Yes | Coverage: 95%+
Implement k-means clustering with APR model storage.
Run Command
cargo run --example create_apr_kmeans_clustering
Code
//! # Recipe: Create APR KMeans Clustering Model
//!
//! Contract: contracts/recipe-iiur-v1.yaml
//! **Category**: Model Creation
//! **Isolation Level**: Full
//! **Idempotency**: Guaranteed
//! **Dependencies**: None (default features)
//!
//! ## QA Checklist
//! 1. [x] `cargo run` succeeds (Exit Code 0)
//! 2. [x] `cargo test` passes
//! 3. [x] Deterministic output (Verified)
//! 4. [x] No temp files leaked
//! 5. [x] Memory usage stable
//! 6. [x] WASM compatible (N/A)
//! 7. [x] Clippy clean
//! 8. [x] Rustfmt standard
//! 9. [x] No `unwrap()` in logic
//! 10. [x] Proptests pass (100+ cases)
//!
//! ## Learning Objective
//! Train a KMeans clustering model on synthetic data and save as `.apr`.
//!
//! ## Run Command
//! ```bash
//! cargo run --example create_apr_kmeans_clustering
//! ```
//!
//!
//! ## Format Variants
//! ```bash
//! apr convert model.apr # APR native format
//! apr convert model.gguf # GGUF (llama.cpp compatible)
//! apr convert model.safetensors # SafeTensors (HuggingFace)
//! ```
//! ## References
//! - Jacob, B. et al. (2018). *Quantization and Training of Neural Networks for Efficient Integer-Arithmetic-Only Inference*. CVPR. arXiv:1712.05877
use apr_cookbook::prelude::*;
use rand::Rng;
fn main() -> Result<()> {
let mut ctx = RecipeContext::new("create_apr_kmeans_clustering")?;
// Generate synthetic clustered data
let n_samples = 300;
let n_features = 2;
let n_clusters = 3;
let x_data = generate_clustered_data(ctx.rng(), n_samples, n_features, n_clusters);
// Train KMeans
let max_iters = 100;
let centroids = train_kmeans(ctx.rng(), &x_data, n_features, n_clusters, max_iters);
ctx.record_metric("n_samples", n_samples as i64);
ctx.record_metric("n_features", n_features as i64);
ctx.record_metric("n_clusters", n_clusters as i64);
// Calculate inertia (sum of squared distances to centroids)
let assignments = assign_clusters(&x_data, ¢roids, n_features);
let inertia = calculate_inertia(&x_data, ¢roids, &assignments, n_features);
ctx.record_float_metric("inertia", inertia);
// Save as APR
let mut converter = AprConverter::new();
converter.set_metadata(ConversionMetadata {
name: Some("kmeans".to_string()),
architecture: Some("clustering".to_string()),
source_format: None,
custom: std::collections::HashMap::new(),
});
converter.add_tensor(TensorData {
name: "centroids".to_string(),
shape: vec![n_clusters, n_features],
dtype: DataType::F32,
data: floats_to_bytes(¢roids),
});
let apr_path = ctx.path("kmeans.apr");
let apr_bytes = converter.to_apr()?;
std::fs::write(&apr_path, &apr_bytes)?;
println!("=== Recipe: {} ===", ctx.name());
println!("Trained KMeans with k={}", n_clusters);
println!("Centroids:");
for (i, chunk) in centroids.chunks(n_features).enumerate() {
println!(" Cluster {}: {:?}", i, chunk);
}
println!("Inertia: {:.4}", inertia);
println!("Saved to: {:?}", apr_path);
Ok(())
}
/// Generate data with k clusters
fn generate_clustered_data(
rng: &mut impl Rng,
n_samples: usize,
n_features: usize,
n_clusters: usize,
) -> Vec<f32> {
let mut data = Vec::with_capacity(n_samples * n_features);
// Generate cluster centers
let centers: Vec<Vec<f32>> = (0..n_clusters)
.map(|i| {
(0..n_features)
.map(|_| (i as f32 * 5.0) + rng.gen_range(-1.0f32..1.0f32))
.collect()
})
.collect();
let samples_per_cluster = n_samples / n_clusters;
for (cluster_idx, center) in centers.iter().enumerate() {
let n = if cluster_idx == n_clusters - 1 {
n_samples - cluster_idx * samples_per_cluster
} else {
samples_per_cluster
};
for _ in 0..n {
for &c in center {
data.push(c + rng.gen_range(-0.5f32..0.5f32));
}
}
}
data
}
/// Train KMeans clustering
fn train_kmeans(
rng: &mut impl Rng,
x_data: &[f32],
n_features: usize,
n_clusters: usize,
max_iters: usize,
) -> Vec<f32> {
let n_samples = x_data.len() / n_features;
// Initialize centroids randomly from data points
let mut centroids = Vec::with_capacity(n_clusters * n_features);
let mut used_indices = std::collections::HashSet::new();
for _ in 0..n_clusters {
let mut idx = rng.gen_range(0..n_samples);
while used_indices.contains(&idx) {
idx = rng.gen_range(0..n_samples);
}
used_indices.insert(idx);
for j in 0..n_features {
centroids.push(x_data[idx * n_features + j]);
}
}
// Iterate until convergence or max_iters
for _ in 0..max_iters {
// Assign points to nearest centroid
let assignments = assign_clusters(x_data, ¢roids, n_features);
// Update centroids
let new_centroids = update_centroids(x_data, &assignments, n_features, n_clusters);
// Check convergence
let diff: f32 = centroids
.iter()
.zip(new_centroids.iter())
.map(|(a, b)| (a - b).abs())
.sum();
centroids = new_centroids;
if diff < 1e-6 {
break;
}
}
centroids
}
/// Assign each point to nearest centroid
fn assign_clusters(x_data: &[f32], centroids: &[f32], n_features: usize) -> Vec<usize> {
let n_samples = x_data.len() / n_features;
let n_clusters = centroids.len() / n_features;
let mut assignments = Vec::with_capacity(n_samples);
for i in 0..n_samples {
let sample = &x_data[i * n_features..(i + 1) * n_features];
let mut best_cluster = 0;
let mut best_dist = f32::MAX;
for k in 0..n_clusters {
let centroid = ¢roids[k * n_features..(k + 1) * n_features];
let dist: f32 = sample
.iter()
.zip(centroid.iter())
.map(|(a, b)| (a - b).powi(2))
.sum();
if dist < best_dist {
best_dist = dist;
best_cluster = k;
}
}
assignments.push(best_cluster);
}
assignments
}
/// Update centroids based on assignments
fn update_centroids(
x_data: &[f32],
assignments: &[usize],
n_features: usize,
n_clusters: usize,
) -> Vec<f32> {
let mut new_centroids = vec![0.0f32; n_clusters * n_features];
let mut counts = vec![0usize; n_clusters];
for (i, &cluster) in assignments.iter().enumerate() {
counts[cluster] += 1;
for j in 0..n_features {
new_centroids[cluster * n_features + j] += x_data[i * n_features + j];
}
}
for k in 0..n_clusters {
if counts[k] > 0 {
for j in 0..n_features {
new_centroids[k * n_features + j] /= counts[k] as f32;
}
}
}
new_centroids
}
/// Calculate inertia (within-cluster sum of squares)
fn calculate_inertia(
x_data: &[f32],
centroids: &[f32],
assignments: &[usize],
n_features: usize,
) -> f64 {
let mut inertia = 0.0f64;
for (i, &cluster) in assignments.iter().enumerate() {
let sample = &x_data[i * n_features..(i + 1) * n_features];
let centroid = ¢roids[cluster * n_features..(cluster + 1) * n_features];
let dist: f32 = sample
.iter()
.zip(centroid.iter())
.map(|(a, b)| (a - b).powi(2))
.sum();
inertia += f64::from(dist);
}
inertia
}
fn floats_to_bytes(floats: &[f32]) -> Vec<u8> {
floats.iter().flat_map(|f| f.to_le_bytes()).collect()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_data_generation() {
let mut ctx = RecipeContext::new("test_kmeans_data").unwrap();
let data = generate_clustered_data(ctx.rng(), 90, 2, 3);
assert_eq!(data.len(), 180); // 90 samples * 2 features
}
#[test]
fn test_kmeans_training() {
let mut ctx = RecipeContext::new("test_kmeans_train").unwrap();
let data = generate_clustered_data(ctx.rng(), 60, 2, 3);
let centroids = train_kmeans(ctx.rng(), &data, 2, 3, 50);
assert_eq!(centroids.len(), 6); // 3 clusters * 2 features
}
#[test]
fn test_cluster_assignment() {
let centroids = vec![0.0f32, 0.0, 10.0, 10.0]; // 2 centroids in 2D
let data = vec![0.1f32, 0.1, 9.9, 9.9, 0.0, 0.0];
let assignments = assign_clusters(&data, ¢roids, 2);
assert_eq!(assignments, vec![0, 1, 0]);
}
#[test]
fn test_inertia_calculation() {
let centroids = vec![0.0f32, 0.0];
let data = vec![1.0f32, 0.0, 0.0, 1.0];
let assignments = vec![0, 0];
let inertia = calculate_inertia(&data, ¢roids, &assignments, 2);
// Each point is distance 1 from origin, so inertia = 1 + 1 = 2
assert!((inertia - 2.0).abs() < 0.001);
}
#[test]
fn test_deterministic() {
let mut ctx1 = RecipeContext::new("det_kmeans").unwrap();
let mut ctx2 = RecipeContext::new("det_kmeans").unwrap();
let data1 = generate_clustered_data(ctx1.rng(), 30, 2, 3);
let data2 = generate_clustered_data(ctx2.rng(), 30, 2, 3);
assert_eq!(data1, data2);
}
}
#[cfg(test)]
mod proptests {
use super::*;
use proptest::prelude::*;
proptest! {
#![proptest_config(ProptestConfig::with_cases(30))]
#[test]
fn prop_assignments_valid(n_samples in 10usize..50, n_clusters in 2usize..5) {
let mut ctx = RecipeContext::new("prop_assign").unwrap();
let data = generate_clustered_data(ctx.rng(), n_samples, 2, n_clusters);
let centroids = train_kmeans(ctx.rng(), &data, 2, n_clusters, 10);
let assignments = assign_clusters(&data, ¢roids, 2);
prop_assert_eq!(assignments.len(), n_samples);
for &a in &assignments {
prop_assert!(a < n_clusters);
}
}
#[test]
fn prop_inertia_non_negative(n_samples in 10usize..50) {
let mut ctx = RecipeContext::new("prop_inertia").unwrap();
let data = generate_clustered_data(ctx.rng(), n_samples, 2, 2);
let centroids = train_kmeans(ctx.rng(), &data, 2, 2, 10);
let assignments = assign_clusters(&data, ¢roids, 2);
let inertia = calculate_inertia(&data, ¢roids, &assignments, 2);
prop_assert!(inertia >= 0.0);
}
}
}
Key Concepts
- Centroids: Cluster centers stored as [k, dims] tensor
- Assignment: Nearest centroid based on Euclidean distance
- Convergence: Iterative refinement until centroids stabilize