pub fn jaro_winkler_distance(str1: &str, str2: &str) -> f64 {
if str1.is_empty() || str2.is_empty() {
return 0.0;
}
fn get_matched_characters(s1: &str, s2: &str) -> String {
let mut s2 = s2.to_string();
let mut matched: Vec<char> = Vec::new();
let limit = std::cmp::min(s1.len(), s2.len()) / 2;
for (i, l) in s1.chars().enumerate() {
let left = std::cmp::max(0, i as i32 - limit as i32) as usize;
let right = std::cmp::min(i + limit + 1, s2.len());
if s2[left..right].contains(l) {
matched.push(l);
let a = &s2[0..s2.find(l).expect("this exists")];
let b = &s2[(s2.find(l).expect("this exists") + 1)..];
s2 = format!("{a} {b}");
}
}
matched.iter().collect::<String>()
}
let matching_1 = get_matched_characters(str1, str2);
let matching_2 = get_matched_characters(str2, str1);
let match_count = matching_1.len();
let transpositions = {
let mut count = 0;
for (c1, c2) in matching_1.chars().zip(matching_2.chars()) {
if c1 != c2 {
count += 1;
}
}
count / 2
};
let jaro: f64 = {
if match_count == 0 {
return 0.0;
} else {
(1_f64 / 3_f64)
* (match_count as f64 / str1.len() as f64
+ match_count as f64 / str2.len() as f64
+ (match_count - transpositions) as f64 / match_count as f64)
}
};
let mut prefix_len = 0.0;
let bound = std::cmp::min(std::cmp::min(str1.len(), str2.len()), 4);
for (c1, c2) in str1[..bound].chars().zip(str2[..bound].chars()) {
if c1 == c2 {
prefix_len += 1.0;
} else {
break;
}
}
jaro + (0.1 * prefix_len * (1.0 - jaro))
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_jaro_winkler_distance() {
let a = jaro_winkler_distance("hello", "world");
assert_eq!(a, 0.4666666666666666);
let a = jaro_winkler_distance("martha", "marhta");
assert_eq!(a, 0.9611111111111111);
let a = jaro_winkler_distance("martha", "marhat");
assert_eq!(a, 0.9611111111111111);
let a = jaro_winkler_distance("test", "test");
assert_eq!(a, 1.0);
let a = jaro_winkler_distance("test", "");
assert_eq!(a, 0.0);
let a = jaro_winkler_distance("hello world", "HeLLo W0rlD");
assert_eq!(a, 0.6363636363636364);
}
}