diff --git a/examples/lm.rs b/examples/lm.rs new file mode 100644 index 0000000..5d1f729 --- /dev/null +++ b/examples/lm.rs @@ -0,0 +1,50 @@ + +/// These examples are taken from +/// https://www.nltk.org/api/nltk.lm.html +fn main() { + let text = vec![vec!["a", "b", "c"], vec!["a", "c", "d", "c", "e", "f"]]; + + println!("bigrams of {:?}:", text[0]); + let bigrams = rltk::util::bigrams(text[0].iter()); + print(bigrams); + + println!("\npadding {:?}", text[0]); + let padded: Vec<&&str> = rltk::util::pad_sequence(text[0].iter(), true, &"", true, &"", 2).collect(); + println!("{:?}", padded); + + println!("\ncombining bigrams and padding"); + let combined = rltk::util::bigrams(rltk::lm::preprocessing::pad_both_ends(text[0].iter(),2)); + print(combined); + + // padded_bigrams = list(pad_both_ends(text[0], n=2)) + // >>> list(everygrams(padded_bigrams, max_len=2)) + println!("\neverygrams:"); + let padded_bigrams: Vec<&&str> = rltk::lm::preprocessing::pad_both_ends(text[0].iter(),2).collect(); + println!("padded {:?}",padded_bigrams); + let everygrams = rltk::util::everygrams(padded_bigrams.into_iter(), 2); + print(everygrams); + + print!("or the same with padded_everygrams: "); + let padded_everygrams = rltk::lm::preprocessing::padded_everygrams(text[0].iter(),2); + print(padded_everygrams); + + + println!("\ncombining padding and flattening: {:?}:", text); + let flattened: Vec<&&str> = text.iter().map(|sent| rltk::lm::preprocessing::pad_both_ends(sent.iter(), 2)).flatten().collect(); + println!("{:?}", flattened); + + +} + +fn print<'a>(nested: impl Iterator>) { + print!("["); + + for group in nested { + print!("["); + for word in group { + print!("{},", word); + } + print!("],"); + } + println!("]"); +} diff --git a/src/util/mod.rs b/src/util/mod.rs index aa33196..9a1274c 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -1,5 +1,6 @@ pub(crate) mod padding; pub(crate) mod ngrams; + use padding::Padder; /// Returns a padded sequence of items before ngram extraction. @@ -73,6 +74,7 @@ pub fn flatten<'a>(ngrams: impl Iterator #[cfg(test)] mod tests { + use crate::lm::preprocessing::pad_both_ends; use super::*; use crate::test::*; @@ -165,17 +167,23 @@ mod tests { #[test] fn test_everygrams_n_eq_2() { let sequence = vec!["a", "b", "c", "d"]; - let mut bigrams = everygrams(sequence.iter(), 2); - let gram1 = vec!["a"]; - let gram2 = vec!["a", "b"]; - let gram3 = vec!["b"]; - let gram4 = vec!["b", "c"]; - let gram5 = vec!["c"]; - let gram6 = vec!["c", "d"]; - let gram7 = vec!["d"]; - let expected = vec![gram1.iter(), gram2.iter(), gram3.iter(), gram4.iter(), gram5.iter(), gram6.iter(), gram7.iter()]; + let mut grams = everygrams(sequence.iter(), 2); + // let gram1 = vec!["a"]; + // let gram2 = vec!["a", "b"]; + // let gram3 = vec!["b"]; + // let gram4 = vec!["b", "c"]; + // let gram5 = vec!["c"]; + // let gram6 = vec!["c", "d"]; + // let gram7 = vec!["d"]; + // let expected = vec![gram1.iter(), gram2.iter(), gram3.iter(), gram4.iter(), gram5.iter(), gram6.iter(), gram7.iter()]; - should_be_equal_list_of_lists(&mut bigrams, expected); + for i in grams{ + for j in i{ + print!("{},",j); + } + println!(); + } + // should_be_equal_list_of_lists(&mut bigrams, expected); } #[test] @@ -202,10 +210,17 @@ mod tests { } #[test] - fn test_flatten(){ + fn test_flatten() { let sequence = vec!["a", "b", "c", "d", "e"]; - let expected = vec!["a", "a", "b", "a", "b", "c", "b", "b", "c", "b", "c", "d", "c", "c", "d", "c", "d", "e"]; + let expected = vec!["a", "a", "b", "a", "b", "c", "b", "b", "c", "b", "c", "d", "c", "c", "d", "c", "d", "e"]; should_be_equal_lists(flatten(everygrams(sequence.iter(), 3)), expected); } + + #[test] + fn example() { + let text = vec![vec!["a", "b", "c"], vec!["a", "c", "d", "c", "e", "f"]]; + let result: Vec<&&str> = text.iter().map(|sent|pad_both_ends(sent.iter(),2)).flatten().collect(); + println!("{:?}", result); + } } \ No newline at end of file diff --git a/src/util/ngrams.rs b/src/util/ngrams.rs index e0a2327..4720ed6 100644 --- a/src/util/ngrams.rs +++ b/src/util/ngrams.rs @@ -43,18 +43,18 @@ impl<'a> Iterator for NGramSequenceIter<'a> { pub struct EveryGramSequenceIter<'a> { sequence: Box + 'a>, - n: usize, + max_order: usize, current_ngram: Vec<&'a &'a str>, - current_size: usize, + current_order: usize, } impl<'a> EveryGramSequenceIter<'a> { - pub(crate) fn everygrams(sequence: impl Iterator + 'a, n: usize) -> Self { + pub(crate) fn everygrams(sequence: impl Iterator + 'a, max_order: usize) -> Self { Self { sequence: Box::new(sequence), - n, + max_order, current_ngram: Vec::new(), - current_size: 0, + current_order: 0, } } } @@ -64,8 +64,9 @@ impl<'a> Iterator for EveryGramSequenceIter<'a> { //noinspection DuplicatedCode, hard to deduplicate because of early return fn next(&mut self) -> Option { + // initiate a temp buffer (current_ngram) from which if self.current_ngram.len() == 0 { - for _ in 0..self.n { + for _ in 0..self.max_order { if let Some(item) = self.sequence.next() { self.current_ngram.push(item); } else { @@ -74,36 +75,40 @@ impl<'a> Iterator for EveryGramSequenceIter<'a> { } } - self.current_size += 1; + self.current_order += 1; - if self.current_size > self.n { - self.current_size = 1; - self.current_ngram.remove(0); - let maybe_next = self.sequence.next(); + // slide window to the right in the sentence, if all ngrams of desired max order have been iterated + // and accomodate for end of sentence + if self.current_order > self.max_order { // last item of current ngram reached + self.current_order = 1; // start again with 1 + self.current_ngram.remove(0); // first item is not part of any coming ngrams, and can be removed + let maybe_next = self.sequence.next(); // next item in source if maybe_next.is_some() { self.current_ngram.push(&maybe_next.unwrap()); } else { - self.n -= 1; // not pretty, but ensures correct ending - if self.current_ngram.len() == 0 { + self.max_order -= 1; // the desired max ngram length gets shorter at the end where there are no more new items in the iterator + // theoretically it would be better if we do not mutate max_order and a use a new variable "desired_max_order" oder so etwas. + if self.current_ngram.len() == 0 { // all items have been removed and no new have been added, we're at the end return None; } } } - - return Some(Box::new(self.current_ngram.clone().into_iter().take(self.current_size))); + // take n items from the ngram where n (current_order) is incremented (unigram, bigram, trigram etc) + return Some(Box::new(self.current_ngram.clone().into_iter().take(self.current_order))); } } +/// like flatmap fn pub struct FlatteningIter<'a> { - ngrams: Box + 'a>> + 'a>, - current_ngram: Option + 'a>>, + list_of_lists: Box + 'a>> + 'a>, + current: Option + 'a>>, } impl<'a> FlatteningIter<'a> { pub(crate) fn new(ngrams: impl Iterator + 'a>> + 'a) -> Self { Self { - ngrams: Box::new(ngrams), - current_ngram: None, + list_of_lists: Box::new(ngrams), + current: None, } } } @@ -112,16 +117,16 @@ impl<'a> Iterator for FlatteningIter<'a> { type Item = &'a &'a str; fn next(&mut self) -> Option { - if self.current_ngram.is_none() { - self.current_ngram = self.ngrams.next(); + if self.current.is_none() { + self.current = self.list_of_lists.next(); } - while let Some(ref mut current_ngram) = self.current_ngram { + while let Some(ref mut current_ngram) = self.current { let current_item = current_ngram.next(); if current_item.is_some() { return current_item; } else { - self.current_ngram = self.ngrams.next(); + self.current = self.list_of_lists.next(); } }