From 294158a640b4c5951706a5cdbadb77ed9419d80f Mon Sep 17 00:00:00 2001 From: Sander Hautvast Date: Mon, 9 May 2022 22:17:30 +0200 Subject: [PATCH] added flatten, moved edit_distance to the right place --- src/edit_distance.rs | 99 ---------------------- src/lib.rs | 5 +- src/lm/mod.rs | 2 +- src/metrics/distance.rs | 52 ++++++++++++ src/metrics/mod.rs | 33 ++++++++ src/ngrams.rs | 179 ---------------------------------------- src/util/mod.rs | 36 ++++++-- src/util/ngrams.rs | 40 ++++++++- 8 files changed, 153 insertions(+), 293 deletions(-) delete mode 100644 src/edit_distance.rs create mode 100644 src/metrics/distance.rs create mode 100644 src/metrics/mod.rs delete mode 100644 src/ngrams.rs diff --git a/src/edit_distance.rs b/src/edit_distance.rs deleted file mode 100644 index bc57ca7..0000000 --- a/src/edit_distance.rs +++ /dev/null @@ -1,99 +0,0 @@ -// sandbox, to be removed - -use unicode_segmentation::UnicodeSegmentation; - -//could also be powers of 2 that are combined using bitwise-or -// enum Backtrace { -// LEFT, -// DOWN, -// DIAGONAL, -// } - -struct Element { - value: usize, - // backtraces: Vec, -} - -impl Element { - fn new() -> Self { - Self { - value: 0, - // backtraces: Vec::new(), - } - } -} - -pub fn get_levenshtein_distance(word1: &str, word2: &str) -> usize { - get_edit_distance_table(word1, word2)[word1.len()][word2.len()].value -} - -// non recursive implementation requires a table -// my guess is that this is more efficient (should check) -fn get_edit_distance_table(word1: &str, word2: &str) -> Vec> { - // create table - let mut table = Vec::new(); - for _ in 0..=word1.len() { - let mut row = Vec::new(); - for _ in 0..=word2.len() { - row.push(Element::new()) - } - table.push(row); - } - - // set the boundaries - for i in 0..=word1.len() { - table[i][0].value = i; - } - for i in 1..=word2.len() { - table[0][i].value = i; - } - - for (i1, g1) in word1.graphemes(true).enumerate() { - for (i2, g2) in word2.graphemes(true).enumerate() { - let 
d_del = table[i1][i2 + 1].value + 1; //deletion - let d_ins = table[i1 + 1][i2].value + 1; //insertion - let d_sub = table[i1][i2].value + (if g1 == g2 { 0 } else { 2 }); // substitution - let min = usize::min(d_del, usize::min(d_ins, d_sub)); - let element = table[i1 + 1].get_mut(i2 + 1).unwrap(); - element.value = min; - // if d_del == min { - // element.backtraces.push(Backtrace::DOWN); - // } - // if d_ins == min { - // element.backtraces.push(Backtrace::LEFT); - // } - // if d_sub == min { - // element.backtraces.push(Backtrace::DIAGONAL); - // } - } - } - table -} - -#[cfg(test)] -mod tests { - use super::{get_edit_distance_table, get_levenshtein_distance}; - - #[test] - fn test_get_levenshtein_distance() { - assert_eq!(get_levenshtein_distance("intention", "execution"), 8); - } - - #[test] - fn test_get_edit_distance_table() { - // example from Stanford NLP course: https://youtu.be/kgcEaoM_QJA - let word1 = "intention"; - let word2 = "execution"; - - let outcome: [[usize; 10]; 10] = [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [1, 2, 3, 4, 5, 6, 7, 6, 7, 8], [2, 3, 4, 5, 6, 7, 8, 7, 8, 7], [3, 4, 5, 6, 7, 8, 7, 8, 9, 8], [4, 3, 4, 5, 6, 7, 8, 9, 10, 9], - [5, 4, 5, 6, 7, 8, 9, 10, 11, 10], [6, 5, 6, 7, 8, 9, 8, 9, 10, 11], [7, 6, 7, 8, 9, 10, 9, 8, 9, 10], [8, 7, 8, 9, 10, 11, 10, 9, 8, 9], [9, 8, 9, 10, 11, 12, 11, 10, 9, 8]]; - - let tab = get_edit_distance_table(word1, word2); - - for (rowindex, row) in tab.iter().enumerate() { - for (colindex, element) in row.iter().enumerate() { - assert_eq!(outcome[rowindex][colindex], element.value); - } - } - } -} \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index 9ab8083..4cd840c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,3 @@ -mod edit_distance; // to be removed -mod ngrams;// to be removed pub mod lm; -pub mod util; \ No newline at end of file +pub mod util; +pub mod metrics; \ No newline at end of file diff --git a/src/lm/mod.rs b/src/lm/mod.rs index c8db2af..fbfe084 100644 --- a/src/lm/mod.rs 
+++ b/src/lm/mod.rs @@ -1 +1 @@ -mod preprocessing; \ No newline at end of file +pub mod preprocessing; \ No newline at end of file diff --git a/src/metrics/distance.rs b/src/metrics/distance.rs new file mode 100644 index 0000000..64fab4d --- /dev/null +++ b/src/metrics/distance.rs @@ -0,0 +1,52 @@ +// sandbox, to be removed + +use unicode_segmentation::UnicodeSegmentation; + +pub(crate) struct Element { + pub(crate) value: usize, +} + +impl Element { + fn new() -> Self { + Self { + value: 0, + } + } +} + + + +// non recursive implementation requires a table +// my guess is that this is more efficient (should check) +pub(crate) fn get_edit_distance_table(word1: &str, word2: &str) -> Vec> { + // create table + let mut table = Vec::new(); + for _ in 0..=word1.len() { + let mut row = Vec::new(); + for _ in 0..=word2.len() { + row.push(Element::new()) + } + table.push(row); + } + + // set the boundaries + for i in 0..=word1.len() { + table[i][0].value = i; + } + for i in 1..=word2.len() { + table[0][i].value = i; + } + + for (i1, g1) in word1.graphemes(true).enumerate() { + for (i2, g2) in word2.graphemes(true).enumerate() { + let d_del = table[i1][i2 + 1].value + 1; //deletion + let d_ins = table[i1 + 1][i2].value + 1; //insertion + let d_sub = table[i1][i2].value + (if g1 == g2 { 0 } else { 2 }); // substitution + let min = usize::min(d_del, usize::min(d_ins, d_sub)); + let element = table[i1 + 1].get_mut(i2 + 1).unwrap(); + element.value = min; + } + } + table +} + diff --git a/src/metrics/mod.rs b/src/metrics/mod.rs new file mode 100644 index 0000000..04f720c --- /dev/null +++ b/src/metrics/mod.rs @@ -0,0 +1,33 @@ +pub mod distance; + +pub fn edit_distance(s1: &str, s2: &str) -> usize { + distance::get_edit_distance_table(s1, s2)[s1.len()][s2.len()].value +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_get_levenshtein_distance() { + assert_eq!(edit_distance("intention", "execution"), 8); + } + + #[test] + fn test_get_edit_distance_table() { + 
// example from Stanford NLP course: https://youtu.be/kgcEaoM_QJA + let word1 = "intention"; + let word2 = "execution"; + + let outcome: [[usize; 10]; 10] = [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [1, 2, 3, 4, 5, 6, 7, 6, 7, 8], [2, 3, 4, 5, 6, 7, 8, 7, 8, 7], [3, 4, 5, 6, 7, 8, 7, 8, 9, 8], [4, 3, 4, 5, 6, 7, 8, 9, 10, 9], + [5, 4, 5, 6, 7, 8, 9, 10, 11, 10], [6, 5, 6, 7, 8, 9, 8, 9, 10, 11], [7, 6, 7, 8, 9, 10, 9, 8, 9, 10], [8, 7, 8, 9, 10, 11, 10, 9, 8, 9], [9, 8, 9, 10, 11, 12, 11, 10, 9, 8]]; + + let tab = distance::get_edit_distance_table(word1, word2); + + for (rowindex, row) in tab.iter().enumerate() { + for (colindex, element) in row.iter().enumerate() { + assert_eq!(outcome[rowindex][colindex], element.value); + } + } + } +} \ No newline at end of file diff --git a/src/ngrams.rs b/src/ngrams.rs deleted file mode 100644 index d0f58bd..0000000 --- a/src/ngrams.rs +++ /dev/null @@ -1,179 +0,0 @@ -// sandbox, to be removed -use std::cmp::Ordering; -use std::collections::BTreeMap; - -#[derive(Debug, Eq, Ord)] -pub struct NGram(Vec<&'static str>); - -impl NGram { - pub fn new(elements: Vec<&'static str>) -> Self { - Self { - 0: elements - } - } - - pub fn new_bigram(element1: &'static str, element2: &'static str) -> Self { - Self { - 0: vec![element1, element2] - } - } - - pub fn new_trigram(element1: &'static str, element2: &'static str, element3: &'static str) -> Self { - Self { - 0: vec![element1, element2, element3] - } - } - - pub fn tail(&self) -> Self { - Self { - 0: self.0[1..].to_vec() - } - } - - pub fn len(&self) -> usize { - self.0.len() - } - - pub fn get(&self, index: usize) -> &'static str { - unsafe { - self.0.get_unchecked(index) - } - } -} - -impl PartialEq for NGram { - fn eq(&self, other: &Self) -> bool { - if self.len() != other.len() { - return false; - } else { - for (i, element) in self.0.iter().enumerate() { - if *element != other.get(i) { - return false; - } - } - } - - true - } -} - -impl PartialOrd for NGram { - fn partial_cmp(&self, 
other: &Self) -> Option { - if self.eq(other) { - Some(Ordering::Equal) - } else { - for (i, element) in self.0.iter().enumerate() { - if let Some(ordering) = element.partial_cmp(&other.get(i)) { - if ordering != Ordering::Equal { - return Some(ordering); - } - } - } - Some(Ordering::Equal) - } - } -} - -pub struct Model { - word_counts: BTreeMap<&'static str, u32>, - ngram_counts: BTreeMap, -} - -impl Model { - pub fn calc_digrams(corpus: Vec>) -> Self { - let mut word_counts = BTreeMap::new(); - let mut ngram_counts = BTreeMap::new(); - for sentence in corpus { - for word in sentence.iter() { - let count = word_counts.entry(*word).or_insert(0); - *count += 1; - } - - for i in 0..sentence.len() - 1 { - let ngram = NGram::new(vec![sentence[i], sentence[i + 1]]); - let count = ngram_counts.entry(ngram).or_insert(0); - *count += 1; - } - } - Self { - ngram_counts, - word_counts, - } - } - - // only tested for 2-grams, and that's only happy cases - pub fn p(&self, ngram: NGram) -> Option { - // let mut probability = (*self.word_counts.get(ngram.get(0)).unwrap() as f64) / self.word_counts.len() as f64; - // - // - // for index in 0..ngram.len() - 1 { - // println!("{}", probability); - // - // - // - // println!("{}", ng_p); - // probability = ng_p * probability; - // } - // println!("{}", probability); - // Some(probability) - None - } - - fn p_ngram(&self, ngram: NGram, intermediate: f64) -> f64 { - // for index in 0..ngram.len() - 1 { - // self.ngram_counts.get(&ngram) - // .map(|count| (*count as f64) / (self.word_counts[ngram.0.get(0).unwrap()] as f64)).unwrap() - // } - // - // if ngram.len() > 2 { - // println!("{}", intermediate); - // intermediate * self.pp(ngram.tail(), intermediate) //TODO - // } else { - // self.ngram_counts.get(&ngram) - // .map(|count| (*count as f64) / (self.word_counts[ngram.0.get(0).unwrap()] as f64)).unwrap() //TODO - // } - 0.0 - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_create_model() { - let corpus = 
vec![ - vec!["", "I", "am", "Sam", ""], - vec!["", "Sam", "I", "am", ""], - vec!["", "I", "do", "not", "like", "eggs", "and", "ham", ""], - ]; - - let model = Model::calc_digrams(corpus); - - // assert_eq!(model.p(NGram::new(vec!["", "I"])), Some(0.6666666666666666_f64)); - // assert_eq!(model.p(NGram::new(vec!["Sam", ""])), Some(0.5_f64)); - // assert_eq!(model.p(NGram::new(vec!["", "Sam"])), Some(0.33333333333333333_f64)); - // assert_eq!(model.p(NGram::new(vec!["am", "Sam"])), Some(0.5_f64)); - // assert_eq!(model.p(NGram::new(vec!["I", "am"])), Some(0.6666666666666666_f64)); - - println!("{:?}", model.p(NGram::new(vec!["I", "am", "Sam"]))); - } - - #[test] - fn test_ngram_eq() { - let n1 = NGram::new(vec!["1", "2"]); - let n2 = NGram::new(vec!["1", "2"]); - let n3 = NGram::new(vec!["3", "4"]); - - assert_eq!(n1, n2); - assert_ne!(n1, n3); - assert_ne!(n2, n3); - } - - #[test] - fn test_ngram_tail() { - let n1 = NGram::new(vec!["1", "2", "3"]); - let n2 = NGram::new(vec!["2", "3"]); - assert_eq!(n1.tail(), n2); - } -} \ No newline at end of file diff --git a/src/util/mod.rs b/src/util/mod.rs index 5994e01..abe0288 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -1,5 +1,5 @@ pub(crate) mod padding; -mod ngrams; +pub(crate) mod ngrams; use padding::Padder; @@ -64,10 +64,14 @@ pub fn trigrams<'a>(sequence: impl Iterator + 'a) -> impl Iter ngrams::NGramSequenceIter::new(sequence, 3) } -pub fn everygrams<'a>(sequence: impl Iterator + 'a, n: usize) -> impl Iterator + 'a> + 'a { +pub fn everygrams<'a>(sequence: impl Iterator + 'a, n: usize) -> impl Iterator + 'a>> + 'a { ngrams::EveryGramSequenceIter::everygrams(sequence, n) } +pub fn flatten<'a>(ngrams: impl Iterator + 'a>> + 'a) -> impl Iterator + 'a { + ngrams::FlatteningIter::new(ngrams) +} + #[cfg(test)] mod tests { use std::slice::Iter; @@ -77,14 +81,14 @@ mod tests { fn test_pad_both_ends_default_n2() { let text = vec!["a", "b", "c"].into_iter(); let padded = pad_sequence(text, true, "", true, "", 2); - 
should_be_equal_lists(padded, vec!["", "a", "b", "c", ""]); + should_be_equal_lists2(padded, vec!["", "a", "b", "c", ""]); } #[test] fn test_pad_left() { let text = vec!["a", "b", "c"].into_iter(); let padded = pad_sequence_left(text, "", 2); - should_be_equal_lists(padded, vec!["", "a", "b", "c"]); + should_be_equal_lists2(padded, vec!["", "a", "b", "c"]); } #[test] @@ -92,14 +96,14 @@ mod tests { let text = vec!["a", "b", "c"].into_iter(); let padded = pad_sequence_right(text, "", 2); - should_be_equal_lists(padded, vec!["a", "b", "c", ""]); + should_be_equal_lists2(padded, vec!["a", "b", "c", ""]); } #[test] fn test_pad_both_ends_default_n_eq_3() { let text = vec!["a", "b", "c"].into_iter(); let padded = pad_sequence(text, true, "", true, "", 3); - should_be_equal_lists(padded, vec!["", "", "a", "b", "c", "", ""]); + should_be_equal_lists2(padded, vec!["", "", "a", "b", "c", "", ""]); } #[test] @@ -107,7 +111,7 @@ mod tests { let text = vec!["a", "b", "c"].into_iter(); let padded = pad_sequence(text, true, "left", true, "right", 2); - should_be_equal_lists(padded, vec!["left", "a", "b", "c", "right"]); + should_be_equal_lists2(padded, vec!["left", "a", "b", "c", "right"]); } #[test] @@ -192,8 +196,16 @@ mod tests { should_be_equal_list_of_lists(&mut bigrams, expected) } + #[test] + fn test_flatten(){ + let sequence = vec!["a", "b", "c", "d", "e"]; + let expected = vec!["a", "a", "b", "a", "b", "c", "b", "b", "c", "b", "c", "d", "c", "c", "d", "c", "d", "e"]; + + should_be_equal_lists(flatten(everygrams(sequence.iter(), 3)), expected); + } + fn should_be_equal_list_of_lists<'a>(actual: &mut impl Iterator>, expected: Vec>) { - for (mut actual_outer, expected_outer) in actual.zip(expected.into_iter()) { + for (actual_outer, expected_outer) in actual.zip(expected.into_iter()) { for (actual_inner, expected_inner) in actual_outer.zip(expected_outer) { // println!("{} {}", actual_inner, expected_inner); assert_eq!(actual_inner, expected_inner); @@ -201,7 +213,13 @@ mod 
/// Adapter that flattens a stream of boxed n-gram iterators into a single
/// stream of tokens: every element of every n-gram, in order. Empty n-grams
/// are skipped transparently.
pub struct FlatteningIter<'a> {
    // Source of n-grams still waiting to be drained.
    ngrams: Box<dyn Iterator<Item = Box<dyn Iterator<Item = &'a &'a str> + 'a>> + 'a>,
    // The n-gram currently being yielded from, if any.
    current_ngram: Option<Box<dyn Iterator<Item = &'a &'a str> + 'a>>,
}

impl<'a> FlatteningIter<'a> {
    /// Wraps `ngrams`; nothing is pulled from it until the first `next()`.
    pub(crate) fn new(
        ngrams: impl Iterator<Item = Box<dyn Iterator<Item = &'a &'a str> + 'a>> + 'a,
    ) -> Self {
        Self {
            ngrams: Box::new(ngrams),
            current_ngram: None,
        }
    }
}

impl<'a> Iterator for FlatteningIter<'a> {
    type Item = &'a &'a str;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            match self.current_ngram.as_mut() {
                // No n-gram in progress: pull the next one, or stop for good
                // once the source is exhausted.
                None => match self.ngrams.next() {
                    Some(gram) => self.current_ngram = Some(gram),
                    None => return None,
                },
                // An n-gram is in progress: yield from it, or replace it when
                // it runs dry and re-check on the next pass of the loop.
                Some(gram) => match gram.next() {
                    Some(token) => return Some(token),
                    None => self.current_ngram = self.ngrams.next(),
                },
            }
        }
    }
}