diff --git a/src/edit_distance.rs b/src/edit_distance.rs
index 79af0fd..bc57ca7 100644
--- a/src/edit_distance.rs
+++ b/src/edit_distance.rs
@@ -1,3 +1,5 @@
+// sandbox, to be removed
+
 use unicode_segmentation::UnicodeSegmentation;
 
 //could also be powers of 2 that are combined using bitwise-or
diff --git a/src/lib.rs b/src/lib.rs
index 4d4af65..9ab8083 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,4 +1,4 @@
-mod edit_distance;
-mod ngrams;
+mod edit_distance; // to be removed
+mod ngrams; // to be removed
 pub mod lm;
 pub mod util;
\ No newline at end of file
diff --git a/src/ngrams.rs b/src/ngrams.rs
index cfdaadc..d0f58bd 100644
--- a/src/ngrams.rs
+++ b/src/ngrams.rs
@@ -1,3 +1,4 @@
+// sandbox, to be removed
 use std::cmp::Ordering;
 use std::collections::BTreeMap;
 
diff --git a/src/util/mod.rs b/src/util/mod.rs
index fa59681..5994e01 100644
--- a/src/util/mod.rs
+++ b/src/util/mod.rs
@@ -64,6 +64,15 @@ pub fn trigrams<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a) -> impl Iter
     ngrams::NGramSequenceIter::new(sequence, 3)
 }
 
+/// Returns the "everygrams" of `sequence` up to order `n`.
+///
+/// For every window of `n` consecutive tokens, the window's prefixes of length `1..=n`
+/// are yielded shortest first before the window slides forward by one token. Nothing is
+/// yielded when `n` is zero or when `sequence` holds fewer than `n` tokens.
+pub fn everygrams<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a, n: usize) -> impl Iterator<Item=impl Iterator<Item=&'a &'a str> + 'a> + 'a {
+    ngrams::EveryGramSequenceIter::everygrams(sequence, n)
+}
+
 #[cfg(test)]
 mod tests {
     use std::slice::Iter;
@@ -154,10 +163,59 @@ mod tests {
         should_be_equal_list_of_lists(&mut bigrams, expected)
     }
 
-    fn should_be_equal_list_of_lists<'a>(bigrams: &mut impl Iterator<Item=impl Iterator<Item=&'a &'a str>>, expected: Vec<Iter<'a, &'a str>>) {
-        for (mut left_outer, mut right_outer) in bigrams.zip(expected.into_iter()) {
-            for (left_inner, right_inner) in left_outer.zip(right_outer) {
-                assert_eq!(left_inner, right_inner);
+
+    #[test]
+    fn test_everygrams_n_eq_2() {
+        let sequence = vec!["a", "b", "c", "d"];
+        let expected = vec![
+            vec!["a"], vec!["a", "b"],
+            vec!["b"], vec!["b", "c"],
+            vec!["c"], vec!["c", "d"],
+        ];
+
+        assert_eq!(collect_grams(everygrams(sequence.iter(), 2)), expected);
+    }
+
+    #[test]
+    fn test_everygrams_n_eq_3() {
+        let sequence = vec!["a", "b", "c", "d", "e"];
+        let expected = vec![
+            vec!["a"], vec!["a", "b"], vec!["a", "b", "c"],
+            vec!["b"], vec!["b", "c"], vec!["b", "c", "d"],
+            vec!["c"], vec!["c", "d"], vec!["c", "d", "e"],
+        ];
+
+        assert_eq!(collect_grams(everygrams(sequence.iter(), 3)), expected);
+    }
+
+    #[test]
+    fn test_everygrams_n_eq_0_yields_nothing() {
+        let sequence = vec!["a", "b"];
+
+        assert_eq!(everygrams(sequence.iter(), 0).count(), 0);
+    }
+
+    #[test]
+    fn test_everygrams_n_gt_len_yields_nothing() {
+        let sequence = vec!["a", "b"];
+        let mut grams = everygrams(sequence.iter(), 3);
+
+        assert!(grams.next().is_none());
+        // Polling again must not resurrect a partially filled window.
+        assert!(grams.next().is_none());
+    }
+
+    /// Materialises every gram so a test can compare exact contents and lengths.
+    fn collect_grams<'a>(grams: impl Iterator<Item=impl Iterator<Item=&'a &'a str>>) -> Vec<Vec<&'a str>> {
+        grams.map(|gram| gram.copied().collect()).collect()
+    }
+
+    /// Compares grams pairwise. NOTE: `zip` stops at the shorter side, so a missing or
+    /// surplus gram/token goes unnoticed; use `collect_grams` when lengths matter.
+    fn should_be_equal_list_of_lists<'a>(actual: &mut impl Iterator<Item=impl Iterator<Item=&'a &'a str>>, expected: Vec<Iter<'a, &'a str>>) {
+        for (actual_outer, expected_outer) in actual.zip(expected.into_iter()) {
+            for (actual_inner, expected_inner) in actual_outer.zip(expected_outer) {
+                assert_eq!(actual_inner, expected_inner);
             }
         }
     }
diff --git a/src/util/ngrams.rs b/src/util/ngrams.rs
index d63d6f5..b72ea30 100644
--- a/src/util/ngrams.rs
+++ b/src/util/ngrams.rs
@@ -4,7 +4,7 @@ pub struct NGramSequenceIter<'a> {
     current_ngram: Vec<&'a &'a str>,
 }
 
-impl <'a> NGramSequenceIter<'a> {
+impl<'a> NGramSequenceIter<'a> {
     pub(crate) fn new(sequence: impl Iterator<Item=&'a &'a str> + 'a, n: usize) -> Self {
         Self {
             sequence: Box::new(sequence),
@@ -40,3 +40,76 @@ impl<'a> Iterator for NGramSequenceIter<'a> {
         };
     }
 }
+
+/// Iterator over the "everygrams" of a token sequence.
+///
+/// For every window of `n` consecutive tokens it yields the window's prefixes of
+/// length `1..=n`, shortest first, before sliding the window forward by one token.
+/// Once it has returned `None` it keeps returning `None`.
+pub struct EveryGramSequenceIter<'a> {
+    sequence: Box<dyn Iterator<Item=&'a &'a str> + 'a>,
+    n: usize,
+    current_ngram: Vec<&'a &'a str>,
+    current_size: usize,
+}
+
+impl<'a> EveryGramSequenceIter<'a> {
+    pub(crate) fn everygrams(sequence: impl Iterator<Item=&'a &'a str> + 'a, n: usize) -> Self {
+        Self {
+            // Fused so that polling after exhaustion can never resume the source.
+            sequence: Box::new(sequence.fuse()),
+            n,
+            current_ngram: Vec::new(),
+            current_size: 0,
+        }
+    }
+
+    /// Drops the window state so that every later call to `next` returns `None`.
+    fn finish(&mut self) {
+        self.current_ngram.clear();
+        self.current_size = 0;
+    }
+}
+
+impl<'a> Iterator for EveryGramSequenceIter<'a> {
+    type Item = Box<dyn Iterator<Item=&'a &'a str> + 'a>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        // A zero-width window has no grams; bail out before touching the window.
+        if self.n == 0 {
+            return None;
+        }
+
+        if self.current_ngram.is_empty() {
+            // Fill the first window; a source shorter than `n` yields nothing.
+            for _ in 0..self.n {
+                match self.sequence.next() {
+                    Some(item) => self.current_ngram.push(item),
+                    None => {
+                        self.finish();
+                        return None;
+                    }
+                }
+            }
+        }
+
+        self.current_size += 1;
+
+        if self.current_size > self.n {
+            // All prefixes of this window were emitted: slide forward by one token.
+            match self.sequence.next() {
+                Some(item) => {
+                    self.current_ngram.remove(0);
+                    self.current_ngram.push(item);
+                    self.current_size = 1;
+                }
+                None => {
+                    self.finish();
+                    return None;
+                }
+            }
+        }
+
+        Some(Box::new(self.current_ngram[..self.current_size].to_vec().into_iter()))
+    }
+}
\ No newline at end of file