From b9fc3386f0793afa3e06a63073a3d8de8679e300 Mon Sep 17 00:00:00 2001 From: Sander Hautvast Date: Wed, 11 May 2022 15:37:12 +0200 Subject: [PATCH] minor improvements --- src/lm/preprocessing.rs | 16 ++++-- src/test.rs | 4 +- src/util/mod.rs | 108 +++++++++++++++++----------------------- 3 files changed, 60 insertions(+), 68 deletions(-) diff --git a/src/lm/preprocessing.rs b/src/lm/preprocessing.rs index 4a370dc..f4cff45 100644 --- a/src/lm/preprocessing.rs +++ b/src/lm/preprocessing.rs @@ -1,13 +1,19 @@ +use crate::util::flatten; + /// Pads a sequence of words with defaults; prepends "" and appends "" /// /// sentence: sequence of words, tokens, to pad, in the form of an Iterator of string slices. /// n: the n in n-grams; so for bigrams set to 2, etc -pub fn pad_both_ends<'a>(text: impl Iterator + 'a, n: usize) -> impl Iterator { - crate::util::padding::Padder::new(Box::new(text), true, &"", true,&"", n) +pub fn pad_both_ends<'a>(text: impl Iterator + 'a, order: usize) -> impl Iterator { + crate::util::padding::Padder::new(Box::new(text), true, &"", true, &"", order) } -pub fn padded_everygrams<'a>(sentence: impl Iterator + 'a, n: usize) -> impl Iterator + 'a>> + 'a { - crate::util::everygrams(pad_both_ends(sentence, n), n) +pub fn padded_everygrams<'a>(sentence: impl Iterator + 'a, order: usize) -> impl Iterator + 'a>> + 'a { + crate::util::everygrams(pad_both_ends(sentence, order), order) +} + +pub fn padded_everygram_pipeline<'a>(text: impl Iterator + 'a, order: usize) -> (impl Iterator){ + (text.iter().map(|sent| rltk::lm::preprocessing::pad_both_ends(sent.iter(), order)).flatten())//vocab } #[cfg(test)] @@ -17,7 +23,7 @@ mod tests{ #[test] fn test(){ - let sentence = vec!["a","b", "c"]; + let sentence = ["a","b", "c"]; let mut bigrams = padded_everygrams(sentence.iter(),2); let bigram1 = vec![""]; diff --git a/src/test.rs b/src/test.rs index 9150a0d..1ec2bda 100644 --- a/src/test.rs +++ b/src/test.rs @@ -1,8 +1,8 @@ use std::slice::Iter; -pub fn should_be_equal_lists<'a>(left: impl Iterator, right: Vec<&'a str>) { +pub fn should_be_equal_lists<'a>(left: impl Iterator, right: &[&'a str]) { for (left, right) in left.zip(right.into_iter()) { - assert_eq!(*left, right); + assert_eq!(left, right); } } diff --git a/src/util/mod.rs b/src/util/mod.rs index 9a1274c..dbe54e5 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -29,7 +29,7 @@ pub fn pad_sequence_left<'a>(sequence: impl Iterator + 'a, lef /// pad_right: if set to true, appends a padding symbol after the sentence /// right_pad_symbol: the padding symbol to append /// n: the n in n-grams; so for bigrams set to 2, etc -pub fn pad_sequence_right<'a>(sequence: impl Iterator + 'a, right_pad_symbol: &'a &'a str, n: usize) -> impl Iterator + 'a { +pub fn pad_sequence_right<'a>(sequence: impl Iterator + 'a, right_pad_symbol: &'a &'a str, n: usize) -> impl Iterator { Padder::new(Box::new(sequence), false, &"", true, right_pad_symbol, n) } @@ -52,72 +52,71 @@ pub fn pad_sequence_right<'a>(sequence: impl Iterator + 'a, ri /// } /// ``` /// -pub fn ngrams<'a>(sequence: impl Iterator + 'a, n: usize) -> impl Iterator + 'a> + 'a { +pub fn ngrams<'a>(sequence: impl Iterator + 'a, n: usize) -> impl Iterator + 'a> { ngrams::NGramSequenceIter::new(sequence, n) } -pub fn bigrams<'a>(sequence: impl Iterator + 'a) -> impl Iterator + 'a> + 'a { +pub fn bigrams<'a>(sequence: impl Iterator + 'a) -> impl Iterator + 'a> { ngrams::NGramSequenceIter::new(sequence, 2) } -pub fn trigrams<'a>(sequence: impl Iterator + 'a) -> impl Iterator + 'a> + 'a { +pub fn trigrams<'a>(sequence: impl Iterator + 'a) -> impl Iterator + 'a> { ngrams::NGramSequenceIter::new(sequence, 3) } -pub fn everygrams<'a>(sequence: impl Iterator + 'a, n: usize) -> impl Iterator + 'a>> + 'a { +pub fn everygrams<'a>(sequence: impl Iterator + 'a, n: usize) -> impl Iterator + 'a>> { ngrams::EveryGramSequenceIter::everygrams(sequence, n) } -pub fn flatten<'a>(ngrams: impl Iterator + 'a>> + 'a) -> impl Iterator + 'a { +pub fn flatten<'a>(ngrams: impl Iterator + 'a>> + 'a) -> impl Iterator { ngrams::FlatteningIter::new(ngrams) } #[cfg(test)] mod tests { - use crate::lm::preprocessing::pad_both_ends; use super::*; use crate::test::*; #[test] fn test_pad_both_ends_default_n2() { - let text = vec!["a", "b", "c"]; + let text = ["a", "b", "c"]; let padded = pad_sequence(text.iter(), true, &"", true, &"", 2); - should_be_equal_lists(padded, vec!["", "a", "b", "c", ""]); + should_be_equal_lists(padded, &["", "a", "b", "c", ""]); } #[test] fn test_pad_left() { - let text = vec!["a", "b", "c"]; + let text = ["a", "b", "c"]; let padded = pad_sequence_left(text.iter(), &"", 2); - should_be_equal_lists(padded, vec!["", "a", "b", "c"]); + should_be_equal_lists(padded, &["", "a", "b", "c"]); } #[test] fn test_pad_right() { - let text = vec!["a", "b", "c"]; + let text = ["a", "b", "c"]; let padded = pad_sequence_right(text.iter(), &"", 2); - should_be_equal_lists(padded, vec!["a", "b", "c", ""]); + should_be_equal_lists(padded, &["a", "b", "c", ""]); } #[test] fn test_pad_both_ends_default_n_eq_3() { - let text = vec!["a", "b", "c"]; + let text = ["a", "b", "c"]; let padded = pad_sequence(text.iter(), true, &"", true, &"", 3); - should_be_equal_lists(padded, vec!["", "", "a", "b", "c", "", ""]); + should_be_equal_lists(padded, &["", "", "a", "b", "c", "", ""]); } #[test] fn test_pad_both_ends_non_default_symbols() { - let text = vec!["a", "b", "c"]; + let text = ["a", "b", "c"]; let padded = pad_sequence(text.iter(), true, &"left", true, &"right", 2); - should_be_equal_lists(padded, vec!["left", "a", "b", "c", "right"]); + should_be_equal_lists(padded, &["left", "a", "b", "c", "right"]); } #[test] fn test_bigrams() { - let sequence = vec!["a", "b", "c", "d"]; + let sequence = ["a", "b", "c", "d"]; let mut bigrams = ngrams(sequence.iter(), 2); let bigram1 = vec!["a", "b"]; let bigram2 = vec!["b", "c"]; @@ -129,7 +128,7 @@ mod tests { #[test] fn test_trigrams() { - let sequence = vec!["a", "b", "c", "d", "e"]; + let sequence = ["a", "b", "c", "d", "e"]; let mut bigrams = ngrams(sequence.iter(), 3); let trigram1 = vec!["a", "b", "c"]; let trigram2 = vec!["b", "c", "d"]; @@ -141,21 +140,21 @@ mod tests { #[test] fn test_bigrams_n_gt_len() { - let sequence = vec!["a"]; + let sequence = ["a"]; let mut bigrams = ngrams(sequence.iter(), 2); assert!(bigrams.next().is_none()); } #[test] fn test_bigrams_empty_sequence() { - let sequence = vec![]; + let sequence: Vec<&str> = vec![]; let mut bigrams = ngrams(sequence.iter(), 10); assert!(bigrams.next().is_none()); } #[test] fn test_bigrams_n_eq_len() { - let sequence = vec!["a", "b"]; + let sequence = ["a", "b"]; let mut bigrams = ngrams(sequence.iter(), 2); let bigram1 = vec!["a", "b"]; let expected = vec![bigram1.iter()]; @@ -166,43 +165,37 @@ mod tests { #[test] fn test_everygrams_n_eq_2() { - let sequence = vec!["a", "b", "c", "d"]; + let sequence = ["a", "b", "c", "d"]; let mut grams = everygrams(sequence.iter(), 2); - // let gram1 = vec!["a"]; - // let gram2 = vec!["a", "b"]; - // let gram3 = vec!["b"]; - // let gram4 = vec!["b", "c"]; - // let gram5 = vec!["c"]; - // let gram6 = vec!["c", "d"]; - // let gram7 = vec!["d"]; - // let expected = vec![gram1.iter(), gram2.iter(), gram3.iter(), gram4.iter(), gram5.iter(), gram6.iter(), gram7.iter()]; + let gram1 = ["a"]; + let gram2 = ["a", "b"]; + let gram3 = ["b"]; + let gram4 = ["b", "c"]; + let gram5 = ["c"]; + let gram6 = ["c", "d"]; + let gram7 = ["d"]; + let expected = vec![gram1.iter(), gram2.iter(), gram3.iter(), gram4.iter(), gram5.iter(), gram6.iter(), gram7.iter()]; - for i in grams{ - for j in i{ - print!("{},",j); - } - println!(); - } - // should_be_equal_list_of_lists(&mut bigrams, expected); + should_be_equal_list_of_lists(&mut grams, expected); } #[test] fn test_everygrams_n_eq_3() { - let sequence = vec!["a", "b", "c", "d", "e"]; + let sequence = ["a", "b", "c", "d", "e"]; let mut bigrams = everygrams(sequence.iter(), 3); - let gram1 = vec!["a"]; - let gram2 = vec!["a", "b"]; - let gram3 = vec!["a", "b", "c"]; - let gram4 = vec!["b"]; - let gram5 = vec!["b", "c"]; - let gram6 = vec!["b", "c", "d"]; - let gram7 = vec!["c"]; - let gram8 = vec!["c", "d"]; - let gram9 = vec!["c", "d", "e"]; - let gram10 = vec!["d"]; - let gram11 = vec!["d", "e"]; - let gram12 = vec!["e"]; + let gram1 = ["a"]; + let gram2 = ["a", "b"]; + let gram3 = ["a", "b", "c"]; + let gram4 = ["b"]; + let gram5 = ["b", "c"]; + let gram6 = ["b", "c", "d"]; + let gram7 = ["c"]; + let gram8 = ["c", "d"]; + let gram9 = ["c", "d", "e"]; + let gram10 = ["d"]; + let gram11 = ["d", "e"]; + let gram12 = ["e"]; let expected = vec![gram1.iter(), gram2.iter(), gram3.iter(), gram4.iter(), gram5.iter(), gram6.iter(), gram7.iter(), gram8.iter(), gram9.iter(), gram10.iter(), gram11.iter(), gram12.iter()]; @@ -211,16 +204,9 @@ mod tests { #[test] fn test_flatten() { - let sequence = vec!["a", "b", "c", "d", "e"]; - let expected = vec!["a", "a", "b", "a", "b", "c", "b", "b", "c", "b", "c", "d", "c", "c", "d", "c", "d", "e"]; + let sequence = ["a", "b", "c", "d", "e"]; + let expected = ["a", "a", "b", "a", "b", "c", "b", "b", "c", "b", "c", "d", "c", "c", "d", "c", "d", "e"]; - should_be_equal_lists(flatten(everygrams(sequence.iter(), 3)), expected); - } - - #[test] - fn example() { - let text = vec![vec!["a", "b", "c"], vec!["a", "c", "d", "c", "e", "f"]]; - let result: Vec<&&str> = text.iter().map(|sent|pad_both_ends(sent.iter(),2)).flatten().collect(); - println!("{:?}", result); + should_be_equal_lists(flatten(everygrams(sequence.iter(), 3)), &expected); } } \ No newline at end of file