Q4 Quantization

Status: Verified | Idempotent: Yes | Coverage: 95%+

Apply 4-bit quantization for maximum size reduction.

Run Command

cargo run --example bundle_apr_quantized_q4

Code

//! # Recipe: Bundle Quantized Q4_0 Model
//!
//! Contract: contracts/recipe-iiur-v1.yaml, contracts/int4-quantization-v1.yaml
//! **Category**: Binary Bundling
//! **Isolation Level**: Full
//! **Idempotency**: Guaranteed
//! **Dependencies**: None (default features)
//!
//! ## QA Checklist
//! 1. [x] `cargo run` succeeds (Exit Code 0)
//! 2. [x] `cargo test` passes
//! 3. [x] Deterministic output (Verified)
//! 4. [x] No temp files leaked
//! 5. [x] Memory usage stable
//! 6. [x] WASM compatible (N/A)
//! 7. [x] Clippy clean
//! 8. [x] Rustfmt standard
//! 9. [x] No `unwrap()` in logic
//! 10. [x] Proptests pass (100+ cases)
//!
//! ## Learning Objective
//! Bundle a Q4_0 quantized model for roughly 86% size reduction (~7.1x: 18 bytes per 32 weights vs 128 bytes of F32).
//!
//! ## Run Command
//! ```bash
//! cargo run --example bundle_apr_quantized_q4
//! ```
//!
//!
//! ## Format Variants
//! ```bash
//! apr convert model.apr          # APR native format
//! apr convert model.gguf         # GGUF (llama.cpp compatible)
//! apr convert model.safetensors  # SafeTensors (HuggingFace)
//! ```
//! ## References
//! - Jacob, B. et al. (2018). *Quantization and Training of Neural Networks for Efficient Integer-Arithmetic-Only Inference*. CVPR. arXiv:1712.05877

use apr_cookbook::prelude::*;
use rand::Rng;

/// Recipe entry point: generate random F32 weights, quantize them to Q4_0,
/// measure compression ratio and reconstruction error (MSE), then bundle the
/// quantized tensor into an `.apr` file inside the recipe workspace.
fn main() -> Result<()> {
    // RecipeContext provides a seeded RNG, metric recording, and a scratch
    // path — presumably keyed by the recipe name for determinism (TODO confirm).
    let mut ctx = RecipeContext::new("bundle_apr_quantized_q4")?;

    // Create original F32 weights
    let n_params = 65536; // 64K parameters
    let original_weights = generate_f32_weights(ctx.rng(), n_params);
    let original_size = n_params * 4; // 4 bytes per f32

    ctx.record_metric("n_params", n_params as i64);
    ctx.record_metric("original_size_bytes", original_size as i64);

    // Quantize to Q4_0 (4-bit quantization). Each 32-weight block costs
    // 18 bytes (2-byte scale + 16 packed bytes), so the expected ratio is
    // 128 / 18 ~= 7.1x, not a pure 8x.
    let quantized = quantize_to_q4_0(&original_weights);
    let quantized_size = quantized.len();
    let compression_ratio = original_size as f64 / quantized_size as f64;

    ctx.record_metric("quantized_size_bytes", quantized_size as i64);
    ctx.record_float_metric("compression_ratio", compression_ratio);

    // Calculate quantization error: round-trip through dequantization and
    // compare against the originals.
    let dequantized = dequantize_q4_0(&quantized, n_params);
    let mse = calculate_mse(&original_weights, &dequantized);
    ctx.record_float_metric("quantization_mse", mse);

    // Bundle quantized model: one Q4_0 tensor plus descriptive metadata.
    let mut converter = AprConverter::new();
    converter.set_metadata(ConversionMetadata {
        name: Some("quantized-model-q4".to_string()),
        architecture: Some("mlp-quantized".to_string()),
        source_format: None,
        custom: std::collections::HashMap::new(),
    });

    converter.add_tensor(TensorData {
        name: "weights_q4".to_string(),
        // Logical element count; the byte payload is the packed Q4_0 blocks.
        shape: vec![n_params],
        dtype: DataType::Q4_0,
        data: quantized,
    });

    let apr_path = ctx.path("quantized_model.apr");
    let apr_bytes = converter.to_apr()?;
    std::fs::write(&apr_path, &apr_bytes)?;

    println!("=== Recipe: {} ===", ctx.name());
    println!("Original model:");
    println!("  Parameters: {}", n_params);
    println!("  Size: {} bytes (F32)", original_size);
    println!();
    println!("Quantized model (Q4_0):");
    println!("  Size: {} bytes", quantized_size);
    println!("  Compression: {:.1}x", compression_ratio);
    println!(
        "  Size reduction: {:.1}%",
        (1.0 - 1.0 / compression_ratio) * 100.0
    );
    println!("  Quantization MSE: {:.6}", mse);
    println!();
    println!("Saved to: {:?}", apr_path);

    Ok(())
}

/// Generate `n` pseudo-random F32 weights, each drawn uniformly from [-1, 1).
fn generate_f32_weights(rng: &mut impl Rng, n: usize) -> Vec<f32> {
    let mut weights = Vec::with_capacity(n);
    for _ in 0..n {
        weights.push(rng.gen_range(-1.0f32..1.0f32));
    }
    weights
}

/// Q4_0 block structure: 32 values packed with a per-block scale factor
const Q4_0_BLOCK_SIZE: usize = 32;

/// Quantize F32 weights to Q4_0 format.
///
/// Layout per block: 2 bytes of scale followed by 16 bytes of packed nibbles
/// (even index -> low nibble, odd index -> high nibble). The scale is stored
/// as the *upper* two bytes of its f32 representation — a bf16-style
/// truncation that keeps the sign, the full exponent, and the top 7 mantissa
/// bits (max relative error ~2^-7). Stable Rust has no `f16`, and the
/// previous approach of storing the *lower* two bytes kept only mantissa
/// bits, which decoded to a denormal ~0 and destroyed reconstruction.
///
/// NOTE(review): real GGUF Q4_0 stores an IEEE f16 scale; this recipe's
/// bf16 truncation is only compatible with `read_q4_scale` below — confirm
/// no external reader consumes these bytes as f16.
fn quantize_to_q4_0(weights: &[f32]) -> Vec<u8> {
    let n_blocks = weights.len().div_ceil(Q4_0_BLOCK_SIZE);
    // Each block: 2 bytes scale (bf16-truncated f32) + 16 bytes data (32 x 4-bit)
    let mut result = Vec::with_capacity(n_blocks * 18);

    for block in weights.chunks(Q4_0_BLOCK_SIZE) {
        // Find max absolute value for scale; 7 quantization steps on each
        // side of the center code 8 cover [-max_abs, max_abs].
        let max_abs = block.iter().map(|x| x.abs()).fold(0.0f32, f32::max);
        let scale = if max_abs > 0.0 { max_abs / 7.0 } else { 1.0 };

        // Store the high 16 bits of the f32 scale (sign + exponent + top
        // mantissa bits). Keeping the exponent byte is essential for a
        // meaningful round-trip.
        let scale_bytes = scale.to_le_bytes();
        result.push(scale_bytes[2]);
        result.push(scale_bytes[3]);

        // Quantize each value to 4 bits (0-15, centered at 8).
        let mut packed = [0u8; 16];
        for (i, &val) in block.iter().enumerate() {
            let quantized = ((val / scale) + 8.0).round().clamp(0.0, 15.0) as u8;
            if i % 2 == 0 {
                packed[i / 2] |= quantized; // low nibble
            } else {
                packed[i / 2] |= quantized << 4; // high nibble
            }
        }
        result.extend_from_slice(&packed);
    }

    result
}

/// Read the scale factor from a Q4_0 block starting at `offset`.
///
/// Reassembles the bf16-truncated scale into the high bytes of an f32;
/// the dropped low mantissa bits are zero-filled. A stored zero (possible
/// only for corrupt input, since the encoder never emits scale 0) falls
/// back to 1.0 to avoid a degenerate block.
fn read_q4_scale(data: &[u8], offset: usize) -> f32 {
    let stored_scale = f32::from_le_bytes([0, 0, data[offset], data[offset + 1]]);
    if stored_scale == 0.0 {
        1.0
    } else {
        stored_scale
    }
}

/// Unpack a single 4-bit value from a packed byte.
///
/// Even positions occupy the low nibble, odd positions the high nibble.
fn unpack_q4_value(packed: u8, index: usize) -> u8 {
    let shift = (index % 2) * 4;
    (packed >> shift) & 0x0F
}

/// Dequantize a single Q4_0 block.
///
/// Reads up to `Q4_0_BLOCK_SIZE` values from the packed payload that starts
/// two bytes past `offset` (after the scale prefix). Stops early once the
/// caller's running total (`current_count` plus values produced here) would
/// reach `n_values`, or when the packed data runs out.
fn dequantize_q4_block(
    data: &[u8],
    offset: usize,
    scale: f32,
    n_values: usize,
    current_count: usize,
) -> Vec<f32> {
    let mut out = Vec::with_capacity(Q4_0_BLOCK_SIZE);
    let payload = offset + 2; // skip the 2-byte scale prefix
    for i in 0..Q4_0_BLOCK_SIZE {
        if current_count + out.len() >= n_values {
            break;
        }
        let Some(&byte) = data.get(payload + i / 2) else {
            break;
        };
        // Even index -> low nibble, odd index -> high nibble; codes are
        // centered at 8.
        let nibble = if i % 2 == 0 { byte & 0x0F } else { byte >> 4 };
        out.push((f32::from(nibble) - 8.0) * scale);
    }
    out
}

/// Dequantize Q4_0 back to F32.
///
/// Walks consecutive 18-byte blocks (2-byte scale + 16 packed bytes),
/// producing at most `n_values` outputs and stopping early if `data` is
/// too short to hold a whole block.
fn dequantize_q4_0(data: &[u8], n_values: usize) -> Vec<f32> {
    const BLOCK_BYTES: usize = 18; // scale prefix + 32 packed 4-bit values
    let n_blocks = n_values.div_ceil(Q4_0_BLOCK_SIZE);
    let mut result = Vec::with_capacity(n_values);

    for offset in (0..n_blocks * BLOCK_BYTES).step_by(BLOCK_BYTES) {
        if data.len() < offset + BLOCK_BYTES {
            break;
        }
        let scale = read_q4_scale(data, offset);
        result.extend(dequantize_q4_block(data, offset, scale, n_values, result.len()));
    }

    result
}

/// Calculate the mean squared error between two slices.
///
/// Only the overlapping prefix (the shorter length) is compared; if either
/// slice is empty the result is 0.0. Accumulation is done in f64 to avoid
/// precision loss over long inputs.
fn calculate_mse(a: &[f32], b: &[f32]) -> f64 {
    let n = a.len().min(b.len());
    if n == 0 {
        return 0.0;
    }

    let mut sum = 0.0f64;
    for (&x, &y) in a.iter().zip(b.iter()) {
        let diff = f64::from(x) - f64::from(y);
        sum += diff * diff;
    }

    sum / n as f64
}

#[cfg(test)]
mod tests {
    use super::*;

    // Q4_0 output must always be smaller than the F32 input.
    #[test]
    fn test_quantization_size_reduction() {
        let mut ctx = RecipeContext::new("test_quant_size").unwrap();
        let weights = generate_f32_weights(ctx.rng(), 1024);
        let quantized = quantize_to_q4_0(&weights);

        // Q4_0 should be roughly 18/32 = 0.5625 of block count
        // For 1024 values: 32 blocks * 18 bytes = 576 bytes
        // Original: 1024 * 4 = 4096 bytes
        // Ratio: ~7x compression
        assert!(quantized.len() < weights.len() * 4);
    }

    // Quantize -> dequantize must preserve the element count and keep the
    // reconstruction error bounded.
    #[test]
    fn test_quantization_roundtrip() {
        let mut ctx = RecipeContext::new("test_quant_roundtrip").unwrap();
        let original = generate_f32_weights(ctx.rng(), 256);
        let quantized = quantize_to_q4_0(&original);
        let dequantized = dequantize_q4_0(&quantized, 256);

        // Should have same number of values
        assert_eq!(dequantized.len(), original.len());

        // Verify reasonable reconstruction error.
        // NOTE(review): 0.35 is a very loose bound — it is barely above the
        // variance of U(-1,1) (~0.333), i.e. the MSE of reconstructing
        // everything as zero. A working Q4_0 round-trip should land far
        // below this; consider tightening.
        let mse = calculate_mse(&original, &dequantized);
        if mse > 0.35 {
            panic!("MSE too high: {}", mse);
        }
    }

    // Same recipe name -> same seeded RNG -> identical weights and bytes.
    #[test]
    fn test_deterministic_quantization() {
        let mut ctx1 = RecipeContext::new("det_quant").unwrap();
        let mut ctx2 = RecipeContext::new("det_quant").unwrap();

        let weights1 = generate_f32_weights(ctx1.rng(), 128);
        let weights2 = generate_f32_weights(ctx2.rng(), 128);

        assert_eq!(weights1, weights2);

        let q1 = quantize_to_q4_0(&weights1);
        let q2 = quantize_to_q4_0(&weights2);

        assert_eq!(q1, q2);
    }

    // All-zero input uses the fallback scale (1.0) and the center code,
    // so the round-trip must stay near zero.
    #[test]
    fn test_zero_weights() {
        let zeros = vec![0.0f32; 64];
        let quantized = quantize_to_q4_0(&zeros);
        let dequantized = dequantize_q4_0(&quantized, 64);

        // All zeros should stay close to zero
        for &v in &dequantized {
            assert!(v.abs() < 0.1);
        }
    }
}

#[cfg(test)]
mod proptests {
    use super::*;
    use proptest::prelude::*;

    proptest! {
        // 50 cases per property keeps the suite fast while still sweeping
        // a range of block-aligned and ragged sizes.
        #![proptest_config(ProptestConfig::with_cases(50))]

        // Property: Q4_0 output is strictly smaller than the F32 input for
        // any size >= one block.
        #[test]
        fn prop_quantized_smaller(n_params in 32usize..1024) {
            let mut ctx = RecipeContext::new("prop_smaller").unwrap();
            let weights = generate_f32_weights(ctx.rng(), n_params);
            let quantized = quantize_to_q4_0(&weights);

            let original_size = n_params * 4;
            prop_assert!(quantized.len() < original_size);
        }

        // Property: dequantization returns exactly n_params values, even
        // when the final block is only partially filled.
        #[test]
        fn prop_roundtrip_length(n_params in 32usize..512) {
            let mut ctx = RecipeContext::new("prop_length").unwrap();
            let weights = generate_f32_weights(ctx.rng(), n_params);
            let quantized = quantize_to_q4_0(&weights);
            let dequantized = dequantize_q4_0(&quantized, n_params);

            prop_assert_eq!(dequantized.len(), n_params);
        }
    }
}

Q4 Format

  • 4 bits per weight value
  • Block-wise scaling factors
  • ~7x size reduction from FP32 (4.5 bits per weight including block scales)