diff --git a/examples/lm.rs b/examples/lm.rs
new file mode 100644
index 0000000..5d1f729
--- /dev/null
+++ b/examples/lm.rs
@@ -0,0 +1,50 @@
+
+/// These examples are taken from
+/// https://www.nltk.org/api/nltk.lm.html
+fn main() {
+ let text = vec![vec!["a", "b", "c"], vec!["a", "c", "d", "c", "e", "f"]];
+
+ println!("bigrams of {:?}:", text[0]);
+ let bigrams = rltk::util::bigrams(text[0].iter());
+ print(bigrams);
+
+ println!("\npadding {:?}", text[0]);
+ let padded: Vec<&&str> = rltk::util::pad_sequence(text[0].iter(), true, &"", true, &"", 2).collect();
+ println!("{:?}", padded);
+
+ println!("\ncombining bigrams and padding");
+ let combined = rltk::util::bigrams(rltk::lm::preprocessing::pad_both_ends(text[0].iter(),2));
+ print(combined);
+
+ // NLTK (Python) equivalent: padded_bigrams = list(pad_both_ends(text[0], n=2))
+ // >>> list(everygrams(padded_bigrams, max_len=2))
+ println!("\neverygrams:");
+ let padded_bigrams: Vec<&&str> = rltk::lm::preprocessing::pad_both_ends(text[0].iter(),2).collect();
+ println!("padded {:?}",padded_bigrams);
+ let everygrams = rltk::util::everygrams(padded_bigrams.into_iter(), 2);
+ print(everygrams);
+
+ print!("or the same with padded_everygrams: ");
+ let padded_everygrams = rltk::lm::preprocessing::padded_everygrams(text[0].iter(),2);
+ print(padded_everygrams);
+
+
+ println!("\ncombining padding and flattening: {:?}:", text);
+ let flattened: Vec<&&str> = text.iter().map(|sent| rltk::lm::preprocessing::pad_both_ends(sent.iter(), 2)).flatten().collect();
+ println!("{:?}", flattened);
+
+
+}
+
+fn print<'a>(nested: impl Iterator- >) {
+ print!("[");
+
+ for group in nested {
+ print!("[");
+ for word in group {
+ print!("{},", word);
+ }
+ print!("],");
+ }
+ println!("]");
+}
diff --git a/src/util/mod.rs b/src/util/mod.rs
index aa33196..9a1274c 100644
--- a/src/util/mod.rs
+++ b/src/util/mod.rs
@@ -1,5 +1,6 @@
pub(crate) mod padding;
pub(crate) mod ngrams;
+
use padding::Padder;
/// Returns a padded sequence of items before ngram extraction.
@@ -73,6 +74,7 @@ pub fn flatten<'a>(ngrams: impl Iterator
-
#[cfg(test)]
mod tests {
+ use crate::lm::preprocessing::pad_both_ends;
use super::*;
use crate::test::*;
@@ -165,17 +167,23 @@ mod tests {
#[test]
fn test_everygrams_n_eq_2() {
let sequence = vec!["a", "b", "c", "d"];
- let mut bigrams = everygrams(sequence.iter(), 2);
- let gram1 = vec!["a"];
- let gram2 = vec!["a", "b"];
- let gram3 = vec!["b"];
- let gram4 = vec!["b", "c"];
- let gram5 = vec!["c"];
- let gram6 = vec!["c", "d"];
- let gram7 = vec!["d"];
- let expected = vec![gram1.iter(), gram2.iter(), gram3.iter(), gram4.iter(), gram5.iter(), gram6.iter(), gram7.iter()];
+ let mut grams = everygrams(sequence.iter(), 2);
+ // let gram1 = vec!["a"];
+ // let gram2 = vec!["a", "b"];
+ // let gram3 = vec!["b"];
+ // let gram4 = vec!["b", "c"];
+ // let gram5 = vec!["c"];
+ // let gram6 = vec!["c", "d"];
+ // let gram7 = vec!["d"];
+ // let expected = vec![gram1.iter(), gram2.iter(), gram3.iter(), gram4.iter(), gram5.iter(), gram6.iter(), gram7.iter()];
- should_be_equal_list_of_lists(&mut bigrams, expected);
+ for i in grams{
+ for j in i{
+ print!("{},",j);
+ }
+ println!();
+ }
+ // should_be_equal_list_of_lists(&mut bigrams, expected);
}
#[test]
@@ -202,10 +210,17 @@ mod tests {
}
#[test]
- fn test_flatten(){
+ fn test_flatten() {
let sequence = vec!["a", "b", "c", "d", "e"];
- let expected = vec!["a", "a", "b", "a", "b", "c", "b", "b", "c", "b", "c", "d", "c", "c", "d", "c", "d", "e"];
+ let expected = vec!["a", "a", "b", "a", "b", "c", "b", "b", "c", "b", "c", "d", "c", "c", "d", "c", "d", "e"];
should_be_equal_lists(flatten(everygrams(sequence.iter(), 3)), expected);
}
+
+ #[test]
+ fn example() {
+ let text = vec![vec!["a", "b", "c"], vec!["a", "c", "d", "c", "e", "f"]];
+ let result: Vec<&&str> = text.iter().map(|sent|pad_both_ends(sent.iter(),2)).flatten().collect();
+ println!("{:?}", result);
+ }
}
\ No newline at end of file
diff --git a/src/util/ngrams.rs b/src/util/ngrams.rs
index e0a2327..4720ed6 100644
--- a/src/util/ngrams.rs
+++ b/src/util/ngrams.rs
@@ -43,18 +43,18 @@ impl<'a> Iterator for NGramSequenceIter<'a> {
pub struct EveryGramSequenceIter<'a> {
sequence: Box + 'a>,
- n: usize,
+ max_order: usize,
current_ngram: Vec<&'a &'a str>,
- current_size: usize,
+ current_order: usize,
}
impl<'a> EveryGramSequenceIter<'a> {
- pub(crate) fn everygrams(sequence: impl Iterator
- + 'a, n: usize) -> Self {
+ pub(crate) fn everygrams(sequence: impl Iterator
- + 'a, max_order: usize) -> Self {
Self {
sequence: Box::new(sequence),
- n,
+ max_order,
current_ngram: Vec::new(),
- current_size: 0,
+ current_order: 0,
}
}
}
@@ -64,8 +64,9 @@ impl<'a> Iterator for EveryGramSequenceIter<'a> {
//noinspection DuplicatedCode, hard to deduplicate because of early return
fn next(&mut self) -> Option {
+ // initialise a temp buffer (current_ngram) from which the ngrams of increasing order are taken
if self.current_ngram.len() == 0 {
- for _ in 0..self.n {
+ for _ in 0..self.max_order {
if let Some(item) = self.sequence.next() {
self.current_ngram.push(item);
} else {
@@ -74,36 +75,40 @@ impl<'a> Iterator for EveryGramSequenceIter<'a> {
}
}
- self.current_size += 1;
+ self.current_order += 1;
- if self.current_size > self.n {
- self.current_size = 1;
- self.current_ngram.remove(0);
- let maybe_next = self.sequence.next();
+ // slide the window one item to the right once all ngrams up to the desired max order have been emitted,
+ // and accommodate the end of the sentence
+ if self.current_order > self.max_order { // last item of current ngram reached
+ self.current_order = 1; // start again with 1
+ self.current_ngram.remove(0); // first item is not part of any coming ngrams, and can be removed
+ let maybe_next = self.sequence.next(); // next item in source
if maybe_next.is_some() {
self.current_ngram.push(&maybe_next.unwrap());
} else {
- self.n -= 1; // not pretty, but ensures correct ending
- if self.current_ngram.len() == 0 {
+ self.max_order -= 1; // the desired max ngram length gets shorter at the end where there are no more new items in the iterator
+ // theoretically it would be better if we did not mutate max_order and used a separate variable ("desired_max_order" or something like that).
+ if self.current_ngram.len() == 0 { // all items have been removed and no new have been added, we're at the end
return None;
}
}
}
-
- return Some(Box::new(self.current_ngram.clone().into_iter().take(self.current_size)));
+ // take n items from the ngram where n (current_order) is incremented (unigram, bigram, trigram etc)
+ return Some(Box::new(self.current_ngram.clone().into_iter().take(self.current_order)));
}
}
+/// Flattens an iterator of iterators into a single iterator over their items (like `flat_map`).
pub struct FlatteningIter<'a> {
- ngrams: Box + 'a>> + 'a>,
- current_ngram: Option + 'a>>,
+ list_of_lists: Box + 'a>> + 'a>,
+ current: Option + 'a>>,
}
impl<'a> FlatteningIter<'a> {
pub(crate) fn new(ngrams: impl Iterator
- + 'a>> + 'a) -> Self {
Self {
- ngrams: Box::new(ngrams),
- current_ngram: None,
+ list_of_lists: Box::new(ngrams),
+ current: None,
}
}
}
@@ -112,16 +117,16 @@ impl<'a> Iterator for FlatteningIter<'a> {
type Item = &'a &'a str;
fn next(&mut self) -> Option {
- if self.current_ngram.is_none() {
- self.current_ngram = self.ngrams.next();
+ if self.current.is_none() {
+ self.current = self.list_of_lists.next();
}
- while let Some(ref mut current_ngram) = self.current_ngram {
+ while let Some(ref mut current_ngram) = self.current {
let current_item = current_ngram.next();
if current_item.is_some() {
return current_item;
} else {
- self.current_ngram = self.ngrams.next();
+ self.current = self.list_of_lists.next();
}
}