diff --git a/README.md b/README.md index 6aae3c1..b20703e 100644 --- a/README.md +++ b/README.md @@ -9,14 +9,14 @@ _So as to avoid re-creating the text in memory, both train and vocab are lazy it rltk has the same philosophy: everything is done using iterators (on iterators) on string slices. Currently in it's infancy (but growing): -* rltk::lm::preprocessing::pad_both_ends(\["a","b","c"], 2) -> "\", "a", "b", "c", "\"] -* rltk::util::pad_sequence == same as above with customisation -* rltk::util::pad_sequence_left == same -* rltk::util::pad_sequence_right == same -* rltk::util::ngrams(\["a","b","c"],2) -> \[\["a", "b"], \["b", "c"]] -* rltk::util::bigrams(\["a","b","c"]) == ngrams(..., 2) -* rltk::util::trigrams(\["a","b","c"]) == ngrams(..., 3) -* rltk::util::everygrams(\["a","b","c"],2) == \[\["a"], \["a", "b"], \["b"], \["b", "c"]] -* rltk::util::flatten(\[\["a"], \["a", "b"], \["b"], \["b", "c"]]) == \[\"a", "a", "b", "b", "b", "c"] -* rltk::metrics::distance::edit_distance(): calculate the levenshtein distance between two words (see doc) +* rltk::lm::preprocessing::pad_both_ends +* rltk::util::pad_sequence +* rltk::util::pad_sequence_left +* rltk::util::pad_sequence_right +* rltk::util::ngrams +* rltk::util::bigrams +* rltk::util::trigrams +* rltk::util::everygrams +* rltk::util::flatten +* rltk::metrics::distance::edit_distance diff --git a/src/lib.rs b/src/lib.rs index 4cd840c..623b242 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,5 @@ pub mod lm; pub mod util; -pub mod metrics; \ No newline at end of file +pub mod metrics; +#[cfg(test)] +pub(crate) mod test; \ No newline at end of file diff --git a/src/lm/preprocessing.rs b/src/lm/preprocessing.rs index 760dd8e..7f5c114 100644 --- a/src/lm/preprocessing.rs +++ b/src/lm/preprocessing.rs @@ -2,10 +2,40 @@ /// /// sentence: sequence of words, tokens, to pad, in the form of an Iterator of string slices. /// n: the n in n-grams; so for bigrams set to 2, etc -pub fn pad_both_ends<'a>(text: impl Iterator + 'static, n: usize) -> impl Iterator { - crate::util::padding::Padder::new(Box::new(text), true, "", true,"", n) +pub fn pad_both_ends<'a>(text: impl Iterator + 'a, n: usize) -> impl Iterator { + crate::util::padding::Padder::new(Box::new(text), true, &"", true,&"", n) } +pub fn padded_everygrams<'a>(sentence: impl Iterator + 'a, n: usize) -> impl Iterator + 'a>> + 'a { + crate::util::everygrams(pad_both_ends(sentence, n), n) +} +#[cfg(test)] +mod tests{ + use super::*; + #[test] + fn test(){ + let sentence = vec!["a","b", "c"]; + let bigrams = padded_everygrams(sentence.iter(),2); + for b in bigrams.into_iter(){ + for o in b{ + print!("{}, ",o); + } + println!(); + } + // let bigram1 = vec![""]; + // let bigram2 = vec!["", "a"]; + // let bigram3 = vec!["a"]; + // let bigram4 = vec!["a", "b"]; + // let bigram5 = vec!["b"]; + // let bigram6 = vec!["b", "c"]; + // let bigram7 = vec!["c"]; + // let bigram8 = vec!["c", ""]; + // let bigram9 = vec![""]; + // let expected = vec![bigram1.iter(), bigram2.iter(), bigram3.iter(), bigram4.iter(), bigram5.iter(), bigram6.iter(),bigram7.iter(),bigram8.iter(),bigram9.iter()]; + // + // crate::test::should_be_equal_list_of_lists(&mut bigrams, expected) + } +} \ No newline at end of file diff --git a/src/test.rs b/src/test.rs new file mode 100644 index 0000000..6641280 --- /dev/null +++ b/src/test.rs @@ -0,0 +1,16 @@ +use std::slice::Iter; + +pub fn should_be_equal_lists<'a>(left: impl Iterator, right: Vec<&'a str>) { + for (left, right) in left.zip(right.into_iter()) { + assert_eq!(*left, right); + } +} + +pub fn should_be_equal_list_of_lists<'a>(actual: &mut impl Iterator>, expected: Vec>) { + for (actual_outer, expected_outer) in actual.zip(expected.into_iter()) { + for (actual_inner, expected_inner) in actual_outer.zip(expected_outer) { + // println!("{} {}", actual_inner, expected_inner); + assert_eq!(actual_inner, expected_inner); + } + } +} \ No newline at end of file diff --git a/src/util/mod.rs b/src/util/mod.rs index abe0288..c1ceb9d 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -1,6 +1,5 @@ pub(crate) mod padding; pub(crate) mod ngrams; - use padding::Padder; /// Returns a padded sequence of items before ngram extraction. @@ -11,7 +10,7 @@ use padding::Padder; /// pad_right: if set to true, appends a padding symbol after the sentence /// right_pad_symbol: the padding symbol to append /// n: the n in n-grams; so for bigrams set to 2, etc -pub fn pad_sequence<'a>(sentence: impl Iterator + 'static, pad_left: bool, left_pad_symbol: &'static str, pad_right: bool, right_pad_symbol: &'static str, n: usize) -> impl Iterator { +pub fn pad_sequence<'a>(sentence: impl Iterator + 'a, pad_left: bool, left_pad_symbol: &'a &'a str, pad_right: bool, right_pad_symbol: &'a &'a str, n: usize) -> impl Iterator { Padder::new(Box::new(sentence), pad_left, left_pad_symbol, pad_right, right_pad_symbol, n) } @@ -19,8 +18,8 @@ pub fn pad_sequence<'a>(sentence: impl Iterator + 'static, pad_lef /// sequence: sequence of items to pad, in the form of an Iterator of string slices. /// left_pad_symbol: the padding symbol to prepend /// n: the n in n-grams; so for bigrams set to 2, etc -pub fn pad_sequence_left<'a>(sequence: impl Iterator + 'static, left_pad_symbol: &'static str, n: usize) -> impl Iterator { - Padder::new(Box::new(sequence), true, left_pad_symbol, false, "", n) +pub fn pad_sequence_left<'a>(sequence: impl Iterator + 'a, left_pad_symbol: &'a &'a str, n: usize) -> impl Iterator { + Padder::new(Box::new(sequence), true, left_pad_symbol, false, &"", n) } /// Returns a padded sequence of items before ngram extraction, right-padding only. Convenience function that prevents useless arguments @@ -29,8 +28,8 @@ pub fn pad_sequence_left<'a>(sequence: impl Iterator + 'static, le /// pad_right: if set to true, appends a padding symbol after the sentence /// right_pad_symbol: the padding symbol to append /// n: the n in n-grams; so for bigrams set to 2, etc -pub fn pad_sequence_right<'a>(sequence: impl Iterator + 'static, right_pad_symbol: &'static str, n: usize) -> impl Iterator + 'a { - Padder::new(Box::new(sequence), false, "", true, right_pad_symbol, n) +pub fn pad_sequence_right<'a>(sequence: impl Iterator + 'a, right_pad_symbol: &'a &'a str, n: usize) -> impl Iterator + 'a { + Padder::new(Box::new(sequence), false, &"", true, right_pad_symbol, n) } /// Return the ngrams generated from a sequence of items, as an iterator. @@ -74,44 +73,44 @@ pub fn flatten<'a>(ngrams: impl Iterator #[cfg(test)] mod tests { - use std::slice::Iter; use super::*; + use crate::test::*; #[test] fn test_pad_both_ends_default_n2() { - let text = vec!["a", "b", "c"].into_iter(); - let padded = pad_sequence(text, true, "", true, "", 2); - should_be_equal_lists2(padded, vec!["", "a", "b", "c", ""]); + let text = vec!["a", "b", "c"]; + let padded = pad_sequence(text.iter(), true, &"", true, &"", 2); + should_be_equal_lists(padded, vec!["", "a", "b", "c", ""]); } #[test] fn test_pad_left() { - let text = vec!["a", "b", "c"].into_iter(); - let padded = pad_sequence_left(text, "", 2); - should_be_equal_lists2(padded, vec!["", "a", "b", "c"]); + let text = vec!["a", "b", "c"]; + let padded = pad_sequence_left(text.iter(), &"", 2); + should_be_equal_lists(padded, vec!["", "a", "b", "c"]); } #[test] fn test_pad_right() { - let text = vec!["a", "b", "c"].into_iter(); - let padded = pad_sequence_right(text, "", 2); + let text = vec!["a", "b", "c"]; + let padded = pad_sequence_right(text.iter(), &"", 2); - should_be_equal_lists2(padded, vec!["a", "b", "c", ""]); + should_be_equal_lists(padded, vec!["a", "b", "c", ""]); } #[test] fn test_pad_both_ends_default_n_eq_3() { - let text = vec!["a", "b", "c"].into_iter(); - let padded = pad_sequence(text, true, "", true, "", 3); - should_be_equal_lists2(padded, vec!["", "", "a", "b", "c", "", ""]); + let text = vec!["a", "b", "c"]; + let padded = pad_sequence(text.iter(), true, &"", true, &"", 3); + should_be_equal_lists(padded, vec!["", "", "a", "b", "c", "", ""]); } #[test] fn test_pad_both_ends_non_default_symbols() { - let text = vec!["a", "b", "c"].into_iter(); - let padded = pad_sequence(text, true, "left", true, "right", 2); + let text = vec!["a", "b", "c"]; + let padded = pad_sequence(text.iter(), true, &"left", true, &"right", 2); - should_be_equal_lists2(padded, vec!["left", "a", "b", "c", "right"]); + should_be_equal_lists(padded, vec!["left", "a", "b", "c", "right"]); } #[test] @@ -159,7 +158,7 @@ mod tests { let bigram1 = vec!["a", "b"]; let expected = vec![bigram1.iter()]; - should_be_equal_list_of_lists(&mut bigrams, expected) + should_be_equal_list_of_lists(&mut bigrams, expected); } @@ -175,7 +174,7 @@ mod tests { let gram6 = vec!["c", "d"]; let expected = vec![gram1.iter(), gram2.iter(), gram3.iter(), gram4.iter(), gram5.iter(), gram6.iter()]; - should_be_equal_list_of_lists(&mut bigrams, expected) + should_be_equal_list_of_lists(&mut bigrams, expected); } #[test] @@ -193,7 +192,7 @@ mod tests { let gram9 = vec!["c", "d", "e"]; let expected = vec![gram1.iter(), gram2.iter(), gram3.iter(), gram4.iter(), gram5.iter(), gram6.iter(), gram7.iter(), gram8.iter(), gram9.iter()]; - should_be_equal_list_of_lists(&mut bigrams, expected) + should_be_equal_list_of_lists(&mut bigrams, expected); } #[test] @@ -203,25 +202,4 @@ mod tests { should_be_equal_lists(flatten(everygrams(sequence.iter(), 3)), expected); } - - fn should_be_equal_list_of_lists<'a>(actual: &mut impl Iterator>, expected: Vec>) { - for (actual_outer, expected_outer) in actual.zip(expected.into_iter()) { - for (actual_inner, expected_inner) in actual_outer.zip(expected_outer) { - // println!("{} {}", actual_inner, expected_inner); - assert_eq!(actual_inner, expected_inner); - } - } - } - - fn should_be_equal_lists<'a>(left: impl Iterator, right: Vec<&'a str>) { - for (left, right) in left.zip(right.into_iter()) { - assert_eq!(*left, right); - } - } - - fn should_be_equal_lists2<'a>(left: impl Iterator, right: Vec<&'a str>) { - for (left, right) in left.zip(right.into_iter()) { - assert_eq!(left, right); - } - } } \ No newline at end of file diff --git a/src/util/ngrams.rs b/src/util/ngrams.rs index edfeb52..e4d6ddc 100644 --- a/src/util/ngrams.rs +++ b/src/util/ngrams.rs @@ -82,7 +82,12 @@ impl<'a> Iterator for EveryGramSequenceIter<'a> { let maybe_next = self.sequence.next(); if maybe_next.is_some() { self.current_ngram.push(&maybe_next.unwrap()); - } else { return None; } + } else { + self.n = 0; // not pretty, but ensures that the following next will be the last + if self.current_ngram.len() == 0 { + return None; + } + } } return Some(Box::new(self.current_ngram.clone().into_iter().take(self.current_size))); diff --git a/src/util/padding.rs b/src/util/padding.rs index 78d6572..6311629 100644 --- a/src/util/padding.rs +++ b/src/util/padding.rs @@ -1,16 +1,16 @@ pub struct Padder<'a> { n: usize, - text: Box>, + text: Box + 'a>, pad_left: bool, left_index: isize, - left_pad_symbol: &'static str, + left_pad_symbol: &'a &'a str, pad_right: bool, right_index: isize, - right_pad_symbol: &'static str, + right_pad_symbol: &'a &'a str, } impl<'a> Iterator for Padder<'a> { - type Item = &'a str; + type Item = &'a &'a str; fn next(&mut self) -> Option { if self.pad_left && self.left_index < self.n as isize { @@ -33,8 +33,8 @@ impl<'a> Iterator for Padder<'a> { } impl<'a> Padder<'a> { - pub(crate) fn new(text: Box>, pad_left: bool, left_pad_symbol: &'static str, - pad_right: bool, right_pad_symbol: &'static str, n: usize, ) -> Self { + pub(crate) fn new(text: Box + 'a>, pad_left: bool, left_pad_symbol:&'a &'a str, + pad_right: bool, right_pad_symbol: &'a &'a str, n: usize, ) -> Self { Self { text, n, pad_left, left_index: 1, left_pad_symbol, pad_right, right_index: 1, right_pad_symbol } } }