From 3276ff3bcdf2509cc550b279ac0eedef979abb07 Mon Sep 17 00:00:00 2001 From: Sander Hautvast Date: Thu, 28 Apr 2022 16:21:13 +0200 Subject: [PATCH] draft for levenshtein edit-distance --- src/edit_distance.rs | 97 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 src/edit_distance.rs diff --git a/src/edit_distance.rs b/src/edit_distance.rs new file mode 100644 index 0000000..79af0fd --- /dev/null +++ b/src/edit_distance.rs @@ -0,0 +1,97 @@ +use unicode_segmentation::UnicodeSegmentation; + +//could also be powers of 2 that are combined using bitwise-or +// enum Backtrace { +// LEFT, +// DOWN, +// DIAGONAL, +// } + +struct Element { + value: usize, + // backtraces: Vec, +} + +impl Element { + fn new() -> Self { + Self { + value: 0, + // backtraces: Vec::new(), + } + } +} + +pub fn get_levenshtein_distance(word1: &str, word2: &str) -> usize { + get_edit_distance_table(word1, word2)[word1.len()][word2.len()].value +} + +// non recursive implementation requires a table +// my guess is that this is more efficient (should check) +fn get_edit_distance_table(word1: &str, word2: &str) -> Vec> { + // create table + let mut table = Vec::new(); + for _ in 0..=word1.len() { + let mut row = Vec::new(); + for _ in 0..=word2.len() { + row.push(Element::new()) + } + table.push(row); + } + + // set the boundaries + for i in 0..=word1.len() { + table[i][0].value = i; + } + for i in 1..=word2.len() { + table[0][i].value = i; + } + + for (i1, g1) in word1.graphemes(true).enumerate() { + for (i2, g2) in word2.graphemes(true).enumerate() { + let d_del = table[i1][i2 + 1].value + 1; //deletion + let d_ins = table[i1 + 1][i2].value + 1; //insertion + let d_sub = table[i1][i2].value + (if g1 == g2 { 0 } else { 2 }); // substitution + let min = usize::min(d_del, usize::min(d_ins, d_sub)); + let element = table[i1 + 1].get_mut(i2 + 1).unwrap(); + element.value = min; + // if d_del == min { + // element.backtraces.push(Backtrace::DOWN); + // } + // if d_ins == min { + // element.backtraces.push(Backtrace::LEFT); + // } + // if d_sub == min { + // element.backtraces.push(Backtrace::DIAGONAL); + // } + } + } + table +} + +#[cfg(test)] +mod tests { + use super::{get_edit_distance_table, get_levenshtein_distance}; + + #[test] + fn test_get_levenshtein_distance() { + assert_eq!(get_levenshtein_distance("intention", "execution"), 8); + } + + #[test] + fn test_get_edit_distance_table() { + // example from Stanford NLP course: https://youtu.be/kgcEaoM_QJA + let word1 = "intention"; + let word2 = "execution"; + + let outcome: [[usize; 10]; 10] = [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [1, 2, 3, 4, 5, 6, 7, 6, 7, 8], [2, 3, 4, 5, 6, 7, 8, 7, 8, 7], [3, 4, 5, 6, 7, 8, 7, 8, 9, 8], [4, 3, 4, 5, 6, 7, 8, 9, 10, 9], + [5, 4, 5, 6, 7, 8, 9, 10, 11, 10], [6, 5, 6, 7, 8, 9, 8, 9, 10, 11], [7, 6, 7, 8, 9, 10, 9, 8, 9, 10], [8, 7, 8, 9, 10, 11, 10, 9, 8, 9], [9, 8, 9, 10, 11, 12, 11, 10, 9, 8]]; + + let tab = get_edit_distance_table(word1, word2); + + for (rowindex, row) in tab.iter().enumerate() { + for (colindex, element) in row.iter().enumerate() { + assert_eq!(outcome[rowindex][colindex], element.value); + } + } + } +} \ No newline at end of file