DataLoader & Batching (Examples 11-20)
This section covers the DataLoader for ML training workflows.
Example 11: Basic Batching
use alimentar::{ArrowDataset, DataLoader};
// Load the dataset from a Parquet file. `?` propagates a load failure, so this
// snippet must live inside a function that returns a `Result`.
let dataset = ArrowDataset::from_parquet("data.parquet")?;
// Hand the dataset to a loader configured with a batch size of 100.
let loader = DataLoader::new(dataset).batch_size(100);
// The loader is consumed by the `for` loop; each item exposes `num_rows()`.
for batch in loader {
println!("Batch rows: {}", batch.num_rows());
}
Example 12: Shuffle with Determinism
use alimentar::{ArrowDataset, DataLoader};
// Load the dataset; `?` requires an enclosing function that returns a `Result`.
let dataset = ArrowDataset::from_parquet("data.parquet")?;
// Enable shuffling and pin the seed so repeated runs see the same order.
let loader = DataLoader::new(dataset)
.batch_size(100)
.shuffle(true)
.seed(42); // Fixed seed => reproducible shuffle
// Drain the loader and keep every batch in a Vec.
let batches: Vec<_> = loader.into_iter().collect();
Example 13: Drop Last
use alimentar::{ArrowDataset, DataLoader};
// Load the dataset; `?` requires an enclosing function that returns a `Result`.
let dataset = ArrowDataset::from_parquet("data.parquet")?;
// Assuming data.parquet holds 1000 rows: batch_size 300 yields 3 full batches
// plus a partial batch of 100 rows.
let loader = DataLoader::new(dataset)
.batch_size(300)
.drop_last(true); // Drop the incomplete last batch
let batches: Vec<_> = loader.into_iter().collect();
// Only the 3 full batches remain; the 100-row remainder was dropped.
assert_eq!(batches.len(), 3);
Examples 14-15: Parallel and Prefetch
use alimentar::{ArrowDataset, DataLoader};
// Load the dataset; `?` requires an enclosing function that returns a `Result`.
let dataset = ArrowDataset::from_parquet("data.parquet")?;
// Configure parallel loading and prefetching on top of a batch size of 100.
let loader = DataLoader::new(dataset)
.batch_size(100)
.num_workers(4) // Parallel loading with 4 workers
.prefetch_factor(2); // Prefetch factor of 2 — NOTE(review): confirm the unit (batches per worker vs. total)
Examples 16-17: Weighted and Stratified Sampling
use alimentar::{ArrowDataset, DataLoader, WeightedSampler};
// Load the dataset once; both loaders below are built from it. `?` propagates
// a load failure, so this snippet must live inside a function returning `Result`.
let dataset = ArrowDataset::from_parquet("data.parquet")?;

// Weighted sampling by column
let sampler = WeightedSampler::from_column("weight");
// `DataLoader::new` is given the dataset by value, so pass a clone here to keep
// `dataset` available for the stratified loader below.
let loader = DataLoader::new(dataset.clone())
    .batch_size(100)
    .sampler(sampler);

// Stratified by label (this `loader` shadows the weighted one above)
let loader = DataLoader::new(dataset)
    .batch_size(100)
    .stratify_by("label");
Examples 18-19: Infinite Iterator and Collate
use alimentar::{ArrowDataset, DataLoader};
// Load the dataset once; both loaders below are built from it. `?` propagates
// a load failure, so this snippet must live inside a function returning `Result`.
let dataset = ArrowDataset::from_parquet("data.parquet")?;

// Infinite iteration for training
// `DataLoader::new` is given the dataset by value, so pass a clone here to keep
// `dataset` available for the collate example below.
// NOTE(review): presumably this loader never ends on its own — bound the
// consuming loop (e.g. `loader.into_iter().take(steps)`).
let loader = DataLoader::new(dataset.clone())
    .batch_size(100)
    .infinite(true);

// Custom collate function (this `loader` shadows the infinite one above)
let loader = DataLoader::new(dataset)
    .batch_size(100)
    .collate_fn(|batches| {
        // Custom batch merging logic
        // NOTE(review): `concat_batches` is not imported in this snippet —
        // confirm where it comes from and add the matching `use`.
        Ok(concat_batches(batches)?)
    });
Example 20: Batch Size Benchmark
use alimentar::{ArrowDataset, DataLoader};
use std::time::Instant;
// Load the dataset once, outside the timed loop; `?` requires an enclosing
// function that returns a `Result`.
let dataset = ArrowDataset::from_parquet("large.parquet")?;
// Time one full pass over the dataset for each candidate batch size.
for batch_size in [32, 64, 128, 256, 512] {
// The timer starts before the loader is built, so the measured time also
// includes `dataset.clone()` and loader construction.
let start = Instant::now();
// Each iteration needs its own copy because the loader takes the dataset by value.
let loader = DataLoader::new(dataset.clone()).batch_size(batch_size);
// Drain the loader; collecting keeps every batch alive until the Vec is dropped.
let _: Vec<_> = loader.into_iter().collect();
println!("batch_size={}: {:?}", batch_size, start.elapsed());
}
Key Concepts
- Batch size: Controls memory/compute tradeoff
- Shuffling: Set a seed to make the shuffled order reproducible across training runs
- Drop last: Ensures uniform batch sizes
- Prefetch: Overlaps data loading with compute