Chapter 19: Text Processing and Tokenization
Contract: apr-book-ch19
Run: cargo run -p aprender-core --example ch19_text
#![allow(clippy::disallowed_methods)]
//! Chapter 19: Text Processing and Tokenization
//!
//! Demonstrates cosine similarity, Jaccard similarity, and edit distance.
//! Citation: Sennrich et al., "Neural Machine Translation of Rare Words with Subword Units," arXiv:1508.07909
//! Contract: contracts/apr-book-ch19-v1.yaml (v2 — api_calls enforced)
use aprender::primitives::Vector;
use aprender::text::similarity::{cosine_similarity, edit_distance, jaccard_similarity};
/// Runs the Chapter 19 checks for the three text-similarity helpers.
///
/// Each section calls one of `cosine_similarity`, `jaccard_similarity`, or
/// `edit_distance` on small fixed inputs, prints the result, and asserts the
/// expected value. A failed call (via `expect`) or a failed assertion panics.
fn main() {
    // --- Cosine similarity ---
    // Two vectors with the same components -> cosine = 1.0
    let lhs = Vector::from_vec(vec![1.0_f64, 2.0, 3.0]);
    let rhs = Vector::from_vec(vec![1.0_f64, 2.0, 3.0]);
    let cos_identical = cosine_similarity(&lhs, &rhs).expect("cosine similarity");
    println!("Cosine similarity:");
    println!(" Identical vectors: {cos_identical:.4}");
    let identical_gap = (cos_identical - 1.0).abs();
    assert!(
        identical_gap < 1e-6,
        "Identical vectors must have cosine = 1.0"
    );

    // Unit vectors along different axes -> cosine = 0.0
    let x_axis = Vector::from_vec(vec![1.0_f64, 0.0, 0.0]);
    let y_axis = Vector::from_vec(vec![0.0_f64, 1.0, 0.0]);
    let cos_orthogonal = cosine_similarity(&x_axis, &y_axis).expect("cosine similarity");
    println!(" Orthogonal vectors: {cos_orthogonal:.4}");
    let orthogonal_gap = cos_orthogonal.abs();
    assert!(
        orthogonal_gap < 1e-6,
        "Orthogonal vectors must have cosine = 0.0"
    );

    // --- Jaccard similarity ---
    // The two token lists share some words but not all of them.
    let first_tokens = vec!["the", "cat", "sat", "on", "the", "mat"];
    let second_tokens = vec!["the", "cat", "sat", "on", "a", "hat"];
    let jaccard =
        jaccard_similarity(&first_tokens, &second_tokens).expect("jaccard similarity");
    println!("\nJaccard similarity:");
    let first_sentence = first_tokens.join(" ");
    let second_sentence = second_tokens.join(" ");
    println!(" '{}' vs '{}'", first_sentence, second_sentence);
    println!(" Jaccard: {jaccard:.4}");
    assert!(
        0.0 < jaccard && jaccard < 1.0,
        "Jaccard in (0,1) for overlapping sets"
    );

    // Comparing a token list with itself must score 1.0.
    let self_score =
        jaccard_similarity(&first_tokens, &first_tokens).expect("jaccard self-similarity");
    let self_gap = (self_score - 1.0).abs();
    assert!(self_gap < 1e-6, "Self-similarity must be 1.0");

    // --- Edit distance (Levenshtein) ---
    let (s1, s2) = ("kitten", "sitting");
    let dist = edit_distance(s1, s2).expect("edit distance");
    println!("\nEdit distance:");
    println!(" '{s1}' -> '{s2}': {dist}");
    assert_eq!(dist, 3, "kitten->sitting requires 3 edits");

    // A string compared with itself needs no edits.
    let unchanged = edit_distance("hello", "hello").expect("edit distance");
    assert_eq!(unchanged, 0, "Same string must have distance 0");

    println!("\nChapter 19 contracts: PASSED");
}