SIMD Dispatch
Overview
trueno-zram uses runtime CPU feature detection to select the optimal SIMD implementation.
Detection
#![allow(unused)]
fn main() {
// src/simd/detect.rs
/// Probes the CPU this process is running on and reports which SIMD
/// instruction sets may be used.
///
/// On x86_64 every flag comes from a runtime feature query. On AArch64 the
/// NEON flag is set unconditionally.
pub fn detect() -> SimdFeatures {
    #[cfg(target_arch = "x86_64")]
    {
        // AVX-512 is only reported when the F, BW and VL subsets are all present.
        let has_avx512 = is_x86_feature_detected!("avx512f")
            && is_x86_feature_detected!("avx512bw")
            && is_x86_feature_detected!("avx512vl");
        let has_avx2 = is_x86_feature_detected!("avx2");
        let has_sse42 = is_x86_feature_detected!("sse4.2");

        SimdFeatures {
            avx512: has_avx512,
            avx2: has_avx2,
            sse42: has_sse42,
        }
    }
    #[cfg(target_arch = "aarch64")]
    {
        // Always available on AArch64
        SimdFeatures { neon: true }
    }
}
}
Dispatch Pattern
#![allow(unused)]
fn main() {
// src/simd/dispatch.rs
/// Compresses `input` into `output`, picking a SIMD backend at runtime.
///
/// Backends for the current architecture are tried from widest to narrowest
/// (AVX-512, then AVX2 on x86_64; NEON on AArch64). When none is selected the
/// scalar implementation handles the call. The chosen backend's result is
/// returned unchanged.
pub fn compress(input: &[u8], output: &mut [u8]) -> Result<usize> {
    let cpu = detect();

    #[cfg(target_arch = "x86_64")]
    {
        if cpu.has_avx512() {
            // SAFETY: only reached after the runtime AVX-512 check succeeded
            // (assumes `has_avx512` reflects what `detect()` found — confirm).
            return unsafe { avx512::compress(input, output) };
        } else if cpu.has_avx2() {
            // SAFETY: only reached after the runtime AVX2 check succeeded
            // (assumes `has_avx2` reflects what `detect()` found — confirm).
            return unsafe { avx2::compress(input, output) };
        }
    }

    #[cfg(target_arch = "aarch64")]
    {
        if cpu.has_neon() {
            // SAFETY: only reached after the NEON check succeeded
            // (assumes `has_neon` reflects what `detect()` found — confirm).
            return unsafe { neon::compress(input, output) };
        }
    }

    // No SIMD backend was selected: use the portable implementation.
    scalar::compress(input, output)
}
}
Implementation Structure
Each algorithm has separate implementations:
lz4/
├── mod.rs # Public API, dispatch
├── compress.rs # Core algorithm logic
├── decompress.rs # Decompression
├── avx512.rs # AVX-512 specialization
├── avx2.rs # AVX2 specialization
├── neon.rs # ARM NEON specialization
└── scalar.rs # Fallback implementation
AVX-512 Implementation
#![allow(unused)]
fn main() {
// lz4/avx512.rs
/// AVX-512 specialization of LZ4 compression (illustrative excerpt; the
/// remainder of the body is elided).
///
/// # Safety
///
/// The caller must ensure the running CPU supports `avx512f`, `avx512bw` and
/// `avx512vl`, e.g. by checking the runtime detection result first.
///
/// NOTE(review): the 64-byte load and store shown below need `input` and
/// `output` to hold at least 64 bytes; presumably the elided code guards
/// this — confirm.
#[target_feature(enable = "avx512f,avx512bw,avx512vl")]
pub unsafe fn compress(input: &[u8], output: &mut [u8]) -> Result<usize> {
    // 64-byte hash table lookups
    // The unaligned load takes a vector-typed pointer, so the byte pointer is cast.
    let hash_chunk = _mm512_loadu_si512(input.as_ptr().cast());
    // Parallel match finding
    // AVX-512 comparisons yield a bitmask (one bit per 32-bit lane), not a vector.
    let matches = _mm512_cmpeq_epi32_mask(hash_chunk, pattern);
    // Wide literal copies
    _mm512_storeu_si512(output.as_mut_ptr().cast(), data);
    // ...
}
}
AVX2 Implementation
#![allow(unused)]
fn main() {
// lz4/avx2.rs
/// AVX2 specialization of LZ4 compression (illustrative excerpt; the
/// remainder of the body is elided).
///
/// # Safety
///
/// The caller must ensure the running CPU supports `avx2`, e.g. by checking
/// the runtime detection result first.
///
/// NOTE(review): the 32-byte load shown below needs `input` to hold at least
/// 32 bytes; presumably the elided code guards this — confirm.
#[target_feature(enable = "avx2")]
pub unsafe fn compress(input: &[u8], output: &mut [u8]) -> Result<usize> {
    // 32-byte operations
    // The unaligned load takes a `*const __m256i`, so the byte pointer is cast.
    let chunk = _mm256_loadu_si256(input.as_ptr().cast());
    // Match extension
    // Byte-wise equality, then one mask bit per byte lane.
    let cmp = _mm256_cmpeq_epi8(src, dst);
    let mask = _mm256_movemask_epi8(cmp);
    // ...
}
}
NEON Implementation
#![allow(unused)]
fn main() {
// lz4/neon.rs
/// NEON specialization of LZ4 compression (illustrative excerpt; the
/// remainder of the body is elided).
///
/// Only compiled when targeting AArch64.
///
/// # Safety
///
/// NOTE(review): the function is declared `unsafe`, but this excerpt does not
/// spell out the caller's obligations. The 16-byte load below presumably
/// requires `input` to hold at least 16 bytes — confirm against the full
/// implementation.
#[cfg(target_arch = "aarch64")]
pub unsafe fn compress(input: &[u8], output: &mut [u8]) -> Result<usize> {
use std::arch::aarch64::*;
// 16-byte operations
// Loads one 128-bit vector of bytes starting at the beginning of `input`.
let chunk = vld1q_u8(input.as_ptr());
// Parallel comparison
// Lane-wise byte equality; `src` and `dst` are not defined in this excerpt.
let cmp = vceqq_u8(src, dst);
// ...
}
}
Benchmarking Dispatch
#![allow(unused)]
fn main() {
use trueno_zram_core::simd::{detect, SimdBackend};
/// Benchmarks page compression on every backend the current CPU supports.
///
/// The x86 SIMD backends are benchmarked only when compiled for x86_64 and
/// when the runtime detection result reports the feature; the scalar backend
/// is always benchmarked.
fn benchmark_all_backends() {
    let features = detect();
    let input = [0xAA; PAGE_SIZE];
    let mut output = [0u8; PAGE_SIZE * 2];

    // Benchmark available backends
    #[cfg(target_arch = "x86_64")]
    {
        if features.has_avx512() {
            // SAFETY: the AVX-512 backend is an `unsafe fn`; it is only called
            // after the runtime feature check above succeeded.
            bench("AVX-512", || unsafe { avx512::compress(&input, &mut output) });
        }
        if features.has_avx2() {
            // SAFETY: the AVX2 backend is an `unsafe fn`; it is only called
            // after the runtime feature check above succeeded.
            bench("AVX2", || unsafe { avx2::compress(&input, &mut output) });
        }
    }

    bench("Scalar", || scalar::compress(&input, &mut output));
}
}
Compile-Time Optimization
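For maximum performance, enable target features at compile time (note that the resulting binary will only run on CPUs that support the enabled features):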
# .cargo/config.toml
[build]
rustflags = ["-C", "target-cpu=native"]
Or for specific features:
rustflags = ["-C", "target-feature=+avx2,+avx512f,+avx512bw,+avx512vl"]