added example

This commit is contained in:
Sander Hautvast 2022-05-10 21:56:05 +02:00
parent b9a5490325
commit 67166b8ad1
3 changed files with 105 additions and 35 deletions

50
examples/lm.rs Normal file
View file

@ -0,0 +1,50 @@
/// These examples are taken from
/// https://www.nltk.org/api/nltk.lm.html
fn main() {
let text = vec![vec!["a", "b", "c"], vec!["a", "c", "d", "c", "e", "f"]];
println!("bigrams of {:?}:", text[0]);
let bigrams = rltk::util::bigrams(text[0].iter());
print(bigrams);
println!("\npadding {:?}", text[0]);
let padded: Vec<&&str> = rltk::util::pad_sequence(text[0].iter(), true, &"<s>", true, &"</s>", 2).collect();
println!("{:?}", padded);
println!("\ncombining bigrams and padding");
let combined = rltk::util::bigrams(rltk::lm::preprocessing::pad_both_ends(text[0].iter(),2));
print(combined);
// padded_bigrams = list(pad_both_ends(text[0], n=2))
// >>> list(everygrams(padded_bigrams, max_len=2))
println!("\neverygrams:");
let padded_bigrams: Vec<&&str> = rltk::lm::preprocessing::pad_both_ends(text[0].iter(),2).collect();
println!("padded {:?}",padded_bigrams);
let everygrams = rltk::util::everygrams(padded_bigrams.into_iter(), 2);
print(everygrams);
print!("or the same with padded_everygrams: ");
let padded_everygrams = rltk::lm::preprocessing::padded_everygrams(text[0].iter(),2);
print(padded_everygrams);
println!("\ncombining padding and flattening: {:?}:", text);
let flattened: Vec<&&str> = text.iter().map(|sent| rltk::lm::preprocessing::pad_both_ends(sent.iter(), 2)).flatten().collect();
println!("{:?}", flattened);
}
fn print<'a>(nested: impl Iterator<Item=impl Iterator<Item=&'a &'a str>>) {
print!("[");
for group in nested {
print!("[");
for word in group {
print!("{},", word);
}
print!("],");
}
println!("]");
}

View file

@ -1,5 +1,6 @@
pub(crate) mod padding; pub(crate) mod padding;
pub(crate) mod ngrams; pub(crate) mod ngrams;
use padding::Padder; use padding::Padder;
/// Returns a padded sequence of items before ngram extraction. /// Returns a padded sequence of items before ngram extraction.
@ -73,6 +74,7 @@ pub fn flatten<'a>(ngrams: impl Iterator<Item=Box<dyn Iterator<Item=&'a &'a str>
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::lm::preprocessing::pad_both_ends;
use super::*; use super::*;
use crate::test::*; use crate::test::*;
@ -165,17 +167,23 @@ mod tests {
#[test] #[test]
fn test_everygrams_n_eq_2() { fn test_everygrams_n_eq_2() {
let sequence = vec!["a", "b", "c", "d"]; let sequence = vec!["a", "b", "c", "d"];
let mut bigrams = everygrams(sequence.iter(), 2); let mut grams = everygrams(sequence.iter(), 2);
let gram1 = vec!["a"]; // let gram1 = vec!["a"];
let gram2 = vec!["a", "b"]; // let gram2 = vec!["a", "b"];
let gram3 = vec!["b"]; // let gram3 = vec!["b"];
let gram4 = vec!["b", "c"]; // let gram4 = vec!["b", "c"];
let gram5 = vec!["c"]; // let gram5 = vec!["c"];
let gram6 = vec!["c", "d"]; // let gram6 = vec!["c", "d"];
let gram7 = vec!["d"]; // let gram7 = vec!["d"];
let expected = vec![gram1.iter(), gram2.iter(), gram3.iter(), gram4.iter(), gram5.iter(), gram6.iter(), gram7.iter()]; // let expected = vec![gram1.iter(), gram2.iter(), gram3.iter(), gram4.iter(), gram5.iter(), gram6.iter(), gram7.iter()];
should_be_equal_list_of_lists(&mut bigrams, expected); for i in grams{
for j in i{
print!("{},",j);
}
println!();
}
// should_be_equal_list_of_lists(&mut bigrams, expected);
} }
#[test] #[test]
@ -202,10 +210,17 @@ mod tests {
} }
#[test] #[test]
fn test_flatten(){ fn test_flatten() {
let sequence = vec!["a", "b", "c", "d", "e"]; let sequence = vec!["a", "b", "c", "d", "e"];
let expected = vec!["a", "a", "b", "a", "b", "c", "b", "b", "c", "b", "c", "d", "c", "c", "d", "c", "d", "e"]; let expected = vec!["a", "a", "b", "a", "b", "c", "b", "b", "c", "b", "c", "d", "c", "c", "d", "c", "d", "e"];
should_be_equal_lists(flatten(everygrams(sequence.iter(), 3)), expected); should_be_equal_lists(flatten(everygrams(sequence.iter(), 3)), expected);
} }
#[test]
fn example() {
let text = vec![vec!["a", "b", "c"], vec!["a", "c", "d", "c", "e", "f"]];
let result: Vec<&&str> = text.iter().map(|sent|pad_both_ends(sent.iter(),2)).flatten().collect();
println!("{:?}", result);
}
} }

View file

@ -43,18 +43,18 @@ impl<'a> Iterator for NGramSequenceIter<'a> {
pub struct EveryGramSequenceIter<'a> { pub struct EveryGramSequenceIter<'a> {
sequence: Box<dyn Iterator<Item=&'a &'a str> + 'a>, sequence: Box<dyn Iterator<Item=&'a &'a str> + 'a>,
n: usize, max_order: usize,
current_ngram: Vec<&'a &'a str>, current_ngram: Vec<&'a &'a str>,
current_size: usize, current_order: usize,
} }
impl<'a> EveryGramSequenceIter<'a> { impl<'a> EveryGramSequenceIter<'a> {
pub(crate) fn everygrams(sequence: impl Iterator<Item=&'a &'a str> + 'a, n: usize) -> Self { pub(crate) fn everygrams(sequence: impl Iterator<Item=&'a &'a str> + 'a, max_order: usize) -> Self {
Self { Self {
sequence: Box::new(sequence), sequence: Box::new(sequence),
n, max_order,
current_ngram: Vec::new(), current_ngram: Vec::new(),
current_size: 0, current_order: 0,
} }
} }
} }
@ -64,8 +64,9 @@ impl<'a> Iterator for EveryGramSequenceIter<'a> {
//noinspection DuplicatedCode, hard to deduplicate because of early return //noinspection DuplicatedCode, hard to deduplicate because of early return
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
// initiate a temp buffer (current_ngram) from which
if self.current_ngram.len() == 0 { if self.current_ngram.len() == 0 {
for _ in 0..self.n { for _ in 0..self.max_order {
if let Some(item) = self.sequence.next() { if let Some(item) = self.sequence.next() {
self.current_ngram.push(item); self.current_ngram.push(item);
} else { } else {
@ -74,36 +75,40 @@ impl<'a> Iterator for EveryGramSequenceIter<'a> {
} }
} }
self.current_size += 1; self.current_order += 1;
if self.current_size > self.n { // slide window to the right in the sentence, if all ngrams of desired max order have been iterated
self.current_size = 1; // and accomodate for end of sentence
self.current_ngram.remove(0); if self.current_order > self.max_order { // last item of current ngram reached
let maybe_next = self.sequence.next(); self.current_order = 1; // start again with 1
self.current_ngram.remove(0); // first item is not part of any coming ngrams, and can be removed
let maybe_next = self.sequence.next(); // next item in source
if maybe_next.is_some() { if maybe_next.is_some() {
self.current_ngram.push(&maybe_next.unwrap()); self.current_ngram.push(&maybe_next.unwrap());
} else { } else {
self.n -= 1; // not pretty, but ensures correct ending self.max_order -= 1; // the desired max ngram length gets shorter at the end where there are no more new items in the iterator
if self.current_ngram.len() == 0 { // theoretically it would be better if we do not mutate max_order and a use a new variable "desired_max_order" oder so etwas.
if self.current_ngram.len() == 0 { // all items have been removed and no new have been added, we're at the end
return None; return None;
} }
} }
} }
// take n items from the ngram where n (current_order) is incremented (unigram, bigram, trigram etc)
return Some(Box::new(self.current_ngram.clone().into_iter().take(self.current_size))); return Some(Box::new(self.current_ngram.clone().into_iter().take(self.current_order)));
} }
} }
/// like flatmap fn
pub struct FlatteningIter<'a> { pub struct FlatteningIter<'a> {
ngrams: Box<dyn Iterator<Item=Box<dyn Iterator<Item=&'a &'a str> + 'a>> + 'a>, list_of_lists: Box<dyn Iterator<Item=Box<dyn Iterator<Item=&'a &'a str> + 'a>> + 'a>,
current_ngram: Option<Box<dyn Iterator<Item=&'a &'a str> + 'a>>, current: Option<Box<dyn Iterator<Item=&'a &'a str> + 'a>>,
} }
impl<'a> FlatteningIter<'a> { impl<'a> FlatteningIter<'a> {
pub(crate) fn new(ngrams: impl Iterator<Item=Box<dyn Iterator<Item=&'a &'a str> + 'a>> + 'a) -> Self { pub(crate) fn new(ngrams: impl Iterator<Item=Box<dyn Iterator<Item=&'a &'a str> + 'a>> + 'a) -> Self {
Self { Self {
ngrams: Box::new(ngrams), list_of_lists: Box::new(ngrams),
current_ngram: None, current: None,
} }
} }
} }
@ -112,16 +117,16 @@ impl<'a> Iterator for FlatteningIter<'a> {
type Item = &'a &'a str; type Item = &'a &'a str;
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
if self.current_ngram.is_none() { if self.current.is_none() {
self.current_ngram = self.ngrams.next(); self.current = self.list_of_lists.next();
} }
while let Some(ref mut current_ngram) = self.current_ngram { while let Some(ref mut current_ngram) = self.current {
let current_item = current_ngram.next(); let current_item = current_ngram.next();
if current_item.is_some() { if current_item.is_some() {
return current_item; return current_item;
} else { } else {
self.current_ngram = self.ngrams.next(); self.current = self.list_of_lists.next();
} }
} }