added padded_everygrams, refactored lifetimes

This commit is contained in:
Sander Hautvast 2022-05-10 15:19:33 +02:00
parent f72da25396
commit 0fb2b4bb42
7 changed files with 97 additions and 66 deletions

View file

@ -9,14 +9,14 @@ _So as to avoid re-creating the text in memory, both train and vocab are lazy it
rltk has the same philosophy: everything is done using iterators (on iterators) on string slices. rltk has the same philosophy: everything is done using iterators (on iterators) on string slices.
Currently in its infancy (but growing): Currently in its infancy (but growing):
* rltk::lm::preprocessing::pad_both_ends(\["a","b","c"], 2) -> "\<s>", "a", "b", "c", "\</s>"] * rltk::lm::preprocessing::pad_both_ends
* rltk::util::pad_sequence == same as above with customisation * rltk::util::pad_sequence
* rltk::util::pad_sequence_left == same * rltk::util::pad_sequence_left
* rltk::util::pad_sequence_right == same * rltk::util::pad_sequence_right
* rltk::util::ngrams(\["a","b","c"],2) -> \[\["a", "b"], \["b", "c"]] * rltk::util::ngrams
* rltk::util::bigrams(\["a","b","c"]) == ngrams(..., 2) * rltk::util::bigrams
* rltk::util::trigrams(\["a","b","c"]) == ngrams(..., 3) * rltk::util::trigrams
* rltk::util::everygrams(\["a","b","c"],2) == \[\["a"], \["a", "b"], \["b"], \["b", "c"]] * rltk::util::everygrams
* rltk::util::flatten(\[\["a"], \["a", "b"], \["b"], \["b", "c"]]) == \[\"a", "a", "b", "b", "b", "c"] * rltk::util::flatten
* rltk::metrics::distance::edit_distance(): calculate the levenshtein distance between two words (see doc) * rltk::metrics::distance::edit_distance

View file

@ -1,3 +1,5 @@
pub mod lm; pub mod lm;
pub mod util; pub mod util;
pub mod metrics; pub mod metrics;
#[cfg(test)]
pub(crate) mod test;

View file

@ -2,10 +2,40 @@
/// ///
/// sentence: sequence of words, tokens, to pad, in the form of an Iterator of string slices. /// sentence: sequence of words, tokens, to pad, in the form of an Iterator of string slices.
/// n: the n in n-grams; so for bigrams set to 2, etc /// n: the n in n-grams; so for bigrams set to 2, etc
pub fn pad_both_ends<'a>(text: impl Iterator<Item=&'a str> + 'static, n: usize) -> impl Iterator<Item=&'a str> { pub fn pad_both_ends<'a>(text: impl Iterator<Item=&'a &'a str> + 'a, n: usize) -> impl Iterator<Item=&'a &'a str> {
crate::util::padding::Padder::new(Box::new(text), true, "<s>", true,"</s>", n) crate::util::padding::Padder::new(Box::new(text), true, &"<s>", true,&"</s>", n)
} }
/// Pads a sentence on both ends and returns the everygrams of the padded sequence.
///
/// sentence: sequence of words, tokens, in the form of an Iterator of references to string slices.
/// n: passed unchanged to both `pad_both_ends` and `crate::util::everygrams`
///
/// Each item of the returned iterator is itself a boxed iterator over the tokens of one gram.
pub fn padded_everygrams<'a>(sentence: impl Iterator<Item=&'a &'a str> + 'a, n: usize) -> impl Iterator<Item=Box<dyn Iterator<Item=&'a &'a str> + 'a>> + 'a {
    // First add the padding symbols, then generate the grams from the padded stream.
    let padded = pad_both_ends(sentence, n);
    crate::util::everygrams(padded, n)
}
#[cfg(test)]
mod tests{
    use super::*;

    /// Smoke test: prints every gram produced by `padded_everygrams` for "a b c" with n = 2,
    /// one gram per line, each token followed by ", ". It makes no assertions.
    #[test]
    fn test(){
        let words = vec!["a","b", "c"];
        padded_everygrams(words.iter(), 2).for_each(|gram| {
            gram.for_each(|token| print!("{}, ", token));
            println!();
        });
        // NOTE(review): the assertion for this test is still disabled. The intended
        // expectation was, in order:
        //   ["<s>"], ["<s>", "a"], ["a"], ["a", "b"], ["b"], ["b", "c"],
        //   ["c"], ["c", "</s>"], ["</s>"]
        // to be checked with crate::test::should_be_equal_list_of_lists.
        // TODO: confirm the actual output matches and turn this into a real assertion.
    }
}

16
src/test.rs Normal file
View file

@ -0,0 +1,16 @@
use std::slice::Iter;
/// Test helper: asserts that `left` yields exactly the string slices in `right`, in order.
///
/// left: the actual sequence, as an Iterator of references to string slices.
/// right: the expected items.
///
/// Panics on the first differing item, and also when one side is longer than the other.
/// Both sides are advanced in lock-step (instead of `zip`, which stops silently at the
/// shorter side), so a sequence that is too short, too long or empty cannot pass unnoticed.
pub fn should_be_equal_lists<'a>(left: impl Iterator<Item=&'a &'a str>, right: Vec<&'a str>) {
    let mut actual = left;
    let mut expected = right.into_iter();
    let mut index = 0usize;
    loop {
        match (actual.next(), expected.next()) {
            (Some(got), Some(want)) => assert_eq!(*got, want, "lists differ at index {}", index),
            // Both sides are exhausted at the same time: the lists are equal.
            (None, None) => break,
            (Some(extra), None) => panic!("actual has more items than expected; extra item at index {}: {:?}", index, extra),
            (None, Some(missing)) => panic!("actual ended early; missing item at index {}: {:?}", index, missing),
        }
        index += 1;
    }
}
/// Test helper: asserts that `actual` yields exactly the lists in `expected`, in order,
/// and that every inner list contains exactly the expected items, in order.
///
/// actual: the actual sequence of sequences (e.g. a stream of ngrams).
/// expected: one slice iterator per expected inner list.
///
/// Panics on the first differing item, and also when the number of lists or the length
/// of any inner list differs. Both levels are advanced in lock-step (instead of `zip`,
/// which stops silently at the shorter side), so missing or surplus grams are detected.
pub fn should_be_equal_list_of_lists<'a>(actual: &mut impl Iterator<Item=impl Iterator<Item=&'a &'a str>>, expected: Vec<Iter<&'a str>>) {
    let mut expected = expected.into_iter();
    let mut list_index = 0usize;
    loop {
        match (actual.next(), expected.next()) {
            (Some(mut actual_inner), Some(mut expected_inner)) => {
                let mut item_index = 0usize;
                loop {
                    match (actual_inner.next(), expected_inner.next()) {
                        (Some(got), Some(want)) => assert_eq!(got, want, "list {} differs at item {}", list_index, item_index),
                        // Inner lists end together: this list matches.
                        (None, None) => break,
                        (got, want) => panic!("list {} differs in length at item {}: actual {:?}, expected {:?}", list_index, item_index, got, want),
                    }
                    item_index += 1;
                }
            }
            // Outer sequences end together: everything matched.
            (None, None) => break,
            (Some(_), None) => panic!("actual has more lists than expected; extra list at index {}", list_index),
            (None, Some(_)) => panic!("actual ended early; missing list at index {}", list_index),
        }
        list_index += 1;
    }
}

View file

@ -1,6 +1,5 @@
pub(crate) mod padding; pub(crate) mod padding;
pub(crate) mod ngrams; pub(crate) mod ngrams;
use padding::Padder; use padding::Padder;
/// Returns a padded sequence of items before ngram extraction. /// Returns a padded sequence of items before ngram extraction.
@ -11,7 +10,7 @@ use padding::Padder;
/// pad_right: if set to true, appends a padding symbol after the sentence /// pad_right: if set to true, appends a padding symbol after the sentence
/// right_pad_symbol: the padding symbol to append /// right_pad_symbol: the padding symbol to append
/// n: the n in n-grams; so for bigrams set to 2, etc /// n: the n in n-grams; so for bigrams set to 2, etc
pub fn pad_sequence<'a>(sentence: impl Iterator<Item=&'a str> + 'static, pad_left: bool, left_pad_symbol: &'static str, pad_right: bool, right_pad_symbol: &'static str, n: usize) -> impl Iterator<Item=&'a str> { pub fn pad_sequence<'a>(sentence: impl Iterator<Item=&'a &'a str> + 'a, pad_left: bool, left_pad_symbol: &'a &'a str, pad_right: bool, right_pad_symbol: &'a &'a str, n: usize) -> impl Iterator<Item=&'a &'a str> {
Padder::new(Box::new(sentence), pad_left, left_pad_symbol, pad_right, right_pad_symbol, n) Padder::new(Box::new(sentence), pad_left, left_pad_symbol, pad_right, right_pad_symbol, n)
} }
@ -19,8 +18,8 @@ pub fn pad_sequence<'a>(sentence: impl Iterator<Item=&'a str> + 'static, pad_lef
/// sequence: sequence of items to pad, in the form of an Iterator of string slices. /// sequence: sequence of items to pad, in the form of an Iterator of string slices.
/// left_pad_symbol: the padding symbol to prepend /// left_pad_symbol: the padding symbol to prepend
/// n: the n in n-grams; so for bigrams set to 2, etc /// n: the n in n-grams; so for bigrams set to 2, etc
pub fn pad_sequence_left<'a>(sequence: impl Iterator<Item=&'a str> + 'static, left_pad_symbol: &'static str, n: usize) -> impl Iterator<Item=&'a str> { pub fn pad_sequence_left<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a, left_pad_symbol: &'a &'a str, n: usize) -> impl Iterator<Item=&'a &'a str> {
Padder::new(Box::new(sequence), true, left_pad_symbol, false, "", n) Padder::new(Box::new(sequence), true, left_pad_symbol, false, &"", n)
} }
/// Returns a padded sequence of items before ngram extraction, right-padding only. Convenience function that prevents useless arguments /// Returns a padded sequence of items before ngram extraction, right-padding only. Convenience function that prevents useless arguments
@ -29,8 +28,8 @@ pub fn pad_sequence_left<'a>(sequence: impl Iterator<Item=&'a str> + 'static, le
/// pad_right: if set to true, appends a padding symbol after the sentence /// pad_right: if set to true, appends a padding symbol after the sentence
/// right_pad_symbol: the padding symbol to append /// right_pad_symbol: the padding symbol to append
/// n: the n in n-grams; so for bigrams set to 2, etc /// n: the n in n-grams; so for bigrams set to 2, etc
pub fn pad_sequence_right<'a>(sequence: impl Iterator<Item=&'a str> + 'static, right_pad_symbol: &'static str, n: usize) -> impl Iterator<Item=&'a str> + 'a { pub fn pad_sequence_right<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a, right_pad_symbol: &'a &'a str, n: usize) -> impl Iterator<Item=&'a &'a str> + 'a {
Padder::new(Box::new(sequence), false, "", true, right_pad_symbol, n) Padder::new(Box::new(sequence), false, &"", true, right_pad_symbol, n)
} }
/// Return the ngrams generated from a sequence of items, as an iterator. /// Return the ngrams generated from a sequence of items, as an iterator.
@ -74,44 +73,44 @@ pub fn flatten<'a>(ngrams: impl Iterator<Item=Box<dyn Iterator<Item=&'a &'a str>
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use std::slice::Iter;
use super::*; use super::*;
use crate::test::*;
#[test] #[test]
fn test_pad_both_ends_default_n2() { fn test_pad_both_ends_default_n2() {
let text = vec!["a", "b", "c"].into_iter(); let text = vec!["a", "b", "c"];
let padded = pad_sequence(text, true, "<s>", true, "</s>", 2); let padded = pad_sequence(text.iter(), true, &"<s>", true, &"</s>", 2);
should_be_equal_lists2(padded, vec!["<s>", "a", "b", "c", "</s>"]); should_be_equal_lists(padded, vec!["<s>", "a", "b", "c", "</s>"]);
} }
#[test] #[test]
fn test_pad_left() { fn test_pad_left() {
let text = vec!["a", "b", "c"].into_iter(); let text = vec!["a", "b", "c"];
let padded = pad_sequence_left(text, "<s>", 2); let padded = pad_sequence_left(text.iter(), &"<s>", 2);
should_be_equal_lists2(padded, vec!["<s>", "a", "b", "c"]); should_be_equal_lists(padded, vec!["<s>", "a", "b", "c"]);
} }
#[test] #[test]
fn test_pad_right() { fn test_pad_right() {
let text = vec!["a", "b", "c"].into_iter(); let text = vec!["a", "b", "c"];
let padded = pad_sequence_right(text, "</s>", 2); let padded = pad_sequence_right(text.iter(), &"</s>", 2);
should_be_equal_lists2(padded, vec!["a", "b", "c", "</s>"]); should_be_equal_lists(padded, vec!["a", "b", "c", "</s>"]);
} }
#[test] #[test]
fn test_pad_both_ends_default_n_eq_3() { fn test_pad_both_ends_default_n_eq_3() {
let text = vec!["a", "b", "c"].into_iter(); let text = vec!["a", "b", "c"];
let padded = pad_sequence(text, true, "<s>", true, "</s>", 3); let padded = pad_sequence(text.iter(), true, &"<s>", true, &"</s>", 3);
should_be_equal_lists2(padded, vec!["<s>", "<s>", "a", "b", "c", "</s>", "</s>"]); should_be_equal_lists(padded, vec!["<s>", "<s>", "a", "b", "c", "</s>", "</s>"]);
} }
#[test] #[test]
fn test_pad_both_ends_non_default_symbols() { fn test_pad_both_ends_non_default_symbols() {
let text = vec!["a", "b", "c"].into_iter(); let text = vec!["a", "b", "c"];
let padded = pad_sequence(text, true, "left", true, "right", 2); let padded = pad_sequence(text.iter(), true, &"left", true, &"right", 2);
should_be_equal_lists2(padded, vec!["left", "a", "b", "c", "right"]); should_be_equal_lists(padded, vec!["left", "a", "b", "c", "right"]);
} }
#[test] #[test]
@ -159,7 +158,7 @@ mod tests {
let bigram1 = vec!["a", "b"]; let bigram1 = vec!["a", "b"];
let expected = vec![bigram1.iter()]; let expected = vec![bigram1.iter()];
should_be_equal_list_of_lists(&mut bigrams, expected) should_be_equal_list_of_lists(&mut bigrams, expected);
} }
@ -175,7 +174,7 @@ mod tests {
let gram6 = vec!["c", "d"]; let gram6 = vec!["c", "d"];
let expected = vec![gram1.iter(), gram2.iter(), gram3.iter(), gram4.iter(), gram5.iter(), gram6.iter()]; let expected = vec![gram1.iter(), gram2.iter(), gram3.iter(), gram4.iter(), gram5.iter(), gram6.iter()];
should_be_equal_list_of_lists(&mut bigrams, expected) should_be_equal_list_of_lists(&mut bigrams, expected);
} }
#[test] #[test]
@ -193,7 +192,7 @@ mod tests {
let gram9 = vec!["c", "d", "e"]; let gram9 = vec!["c", "d", "e"];
let expected = vec![gram1.iter(), gram2.iter(), gram3.iter(), gram4.iter(), gram5.iter(), gram6.iter(), gram7.iter(), gram8.iter(), gram9.iter()]; let expected = vec![gram1.iter(), gram2.iter(), gram3.iter(), gram4.iter(), gram5.iter(), gram6.iter(), gram7.iter(), gram8.iter(), gram9.iter()];
should_be_equal_list_of_lists(&mut bigrams, expected) should_be_equal_list_of_lists(&mut bigrams, expected);
} }
#[test] #[test]
@ -203,25 +202,4 @@ mod tests {
should_be_equal_lists(flatten(everygrams(sequence.iter(), 3)), expected); should_be_equal_lists(flatten(everygrams(sequence.iter(), 3)), expected);
} }
fn should_be_equal_list_of_lists<'a>(actual: &mut impl Iterator<Item=impl Iterator<Item=&'a &'a str>>, expected: Vec<Iter<&'a str>>) {
for (actual_outer, expected_outer) in actual.zip(expected.into_iter()) {
for (actual_inner, expected_inner) in actual_outer.zip(expected_outer) {
// println!("{} {}", actual_inner, expected_inner);
assert_eq!(actual_inner, expected_inner);
}
}
}
fn should_be_equal_lists<'a>(left: impl Iterator<Item=&'a &'a str>, right: Vec<&'a str>) {
for (left, right) in left.zip(right.into_iter()) {
assert_eq!(*left, right);
}
}
fn should_be_equal_lists2<'a>(left: impl Iterator<Item=&'a str>, right: Vec<&'a str>) {
for (left, right) in left.zip(right.into_iter()) {
assert_eq!(left, right);
}
}
} }

View file

@ -82,7 +82,12 @@ impl<'a> Iterator for EveryGramSequenceIter<'a> {
let maybe_next = self.sequence.next(); let maybe_next = self.sequence.next();
if maybe_next.is_some() { if maybe_next.is_some() {
self.current_ngram.push(&maybe_next.unwrap()); self.current_ngram.push(&maybe_next.unwrap());
} else { return None; } } else {
self.n = 0; // not pretty, but ensures that the following next will be the last
if self.current_ngram.len() == 0 {
return None;
}
}
} }
return Some(Box::new(self.current_ngram.clone().into_iter().take(self.current_size))); return Some(Box::new(self.current_ngram.clone().into_iter().take(self.current_size)));

View file

@ -1,16 +1,16 @@
pub struct Padder<'a> { pub struct Padder<'a> {
n: usize, n: usize,
text: Box<dyn Iterator<Item=&'a str>>, text: Box<dyn Iterator<Item=&'a &'a str> + 'a>,
pad_left: bool, pad_left: bool,
left_index: isize, left_index: isize,
left_pad_symbol: &'static str, left_pad_symbol: &'a &'a str,
pad_right: bool, pad_right: bool,
right_index: isize, right_index: isize,
right_pad_symbol: &'static str, right_pad_symbol: &'a &'a str,
} }
impl<'a> Iterator for Padder<'a> { impl<'a> Iterator for Padder<'a> {
type Item = &'a str; type Item = &'a &'a str;
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
if self.pad_left && self.left_index < self.n as isize { if self.pad_left && self.left_index < self.n as isize {
@ -33,8 +33,8 @@ impl<'a> Iterator for Padder<'a> {
} }
impl<'a> Padder<'a> { impl<'a> Padder<'a> {
pub(crate) fn new(text: Box<dyn Iterator<Item=&'a str>>, pad_left: bool, left_pad_symbol: &'static str, pub(crate) fn new(text: Box<dyn Iterator<Item=&'a &'a str> + 'a>, pad_left: bool, left_pad_symbol:&'a &'a str,
pad_right: bool, right_pad_symbol: &'static str, n: usize, ) -> Self { pad_right: bool, right_pad_symbol: &'a &'a str, n: usize, ) -> Self {
Self { text, n, pad_left, left_index: 1, left_pad_symbol, pad_right, right_index: 1, right_pad_symbol } Self { text, n, pad_left, left_index: 1, left_pad_symbol, pad_right, right_index: 1, right_pad_symbol }
} }
} }