diff --git a/README.md b/README.md
index 6aae3c1..b20703e 100644
--- a/README.md
+++ b/README.md
@@ -9,14 +9,14 @@ _So as to avoid re-creating the text in memory, both train and vocab are lazy it
rltk has the same philosophy: everything is done using iterators (on iterators) on string slices.
Currently in it's infancy (but growing):
-* rltk::lm::preprocessing::pad_both_ends(\["a","b","c"], 2) -> "\<s>", "a", "b", "c", "\</s>"]
-* rltk::util::pad_sequence == same as above with customisation
-* rltk::util::pad_sequence_left == same
-* rltk::util::pad_sequence_right == same
-* rltk::util::ngrams(\["a","b","c"],2) -> \[\["a", "b"], \["b", "c"]]
-* rltk::util::bigrams(\["a","b","c"]) == ngrams(..., 2)
-* rltk::util::trigrams(\["a","b","c"]) == ngrams(..., 3)
-* rltk::util::everygrams(\["a","b","c"],2) == \[\["a"], \["a", "b"], \["b"], \["b", "c"]]
-* rltk::util::flatten(\[\["a"], \["a", "b"], \["b"], \["b", "c"]]) == \[\"a", "a", "b", "b", "b", "c"]
-* rltk::metrics::distance::edit_distance(): calculate the levenshtein distance between two words (see doc)
+* rltk::lm::preprocessing::pad_both_ends
+* rltk::util::pad_sequence
+* rltk::util::pad_sequence_left
+* rltk::util::pad_sequence_right
+* rltk::util::ngrams
+* rltk::util::bigrams
+* rltk::util::trigrams
+* rltk::util::everygrams
+* rltk::util::flatten
+* rltk::metrics::distance::edit_distance
diff --git a/src/lib.rs b/src/lib.rs
index 4cd840c..623b242 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,3 +1,5 @@
pub mod lm;
pub mod util;
-pub mod metrics;
\ No newline at end of file
+pub mod metrics;
+#[cfg(test)]
+pub(crate) mod test;
\ No newline at end of file
diff --git a/src/lm/preprocessing.rs b/src/lm/preprocessing.rs
index 760dd8e..7f5c114 100644
--- a/src/lm/preprocessing.rs
+++ b/src/lm/preprocessing.rs
@@ -2,10 +2,40 @@
///
/// sentence: sequence of words, tokens, to pad, in the form of an Iterator of string slices.
/// n: the n in n-grams; so for bigrams set to 2, etc
-pub fn pad_both_ends<'a>(text: impl Iterator- + 'static, n: usize) -> impl Iterator
- {
- crate::util::padding::Padder::new(Box::new(text), true, "
", true,"", n)
+pub fn pad_both_ends<'a>(text: impl Iterator- + 'a, n: usize) -> impl Iterator
- {
+ crate::util::padding::Padder::new(Box::new(text), true, &"
", true,&"", n)
}
+pub fn padded_everygrams<'a>(sentence: impl Iterator- + 'a, n: usize) -> impl Iterator
- + 'a>> + 'a {
+ crate::util::everygrams(pad_both_ends(sentence, n), n)
+}
+#[cfg(test)]
+mod tests{
+ use super::*;
+ #[test]
+ fn test(){
+ let sentence = vec!["a","b", "c"];
+ let bigrams = padded_everygrams(sentence.iter(),2);
+ for b in bigrams.into_iter(){
+ for o in b{
+ print!("{}, ",o);
+ }
+ println!();
+ }
+ // let bigram1 = vec!["<s>"];
+ // let bigram2 = vec!["<s>", "a"];
+ // let bigram3 = vec!["a"];
+ // let bigram4 = vec!["a", "b"];
+ // let bigram5 = vec!["b"];
+ // let bigram6 = vec!["b", "c"];
+ // let bigram7 = vec!["c"];
+ // let bigram8 = vec!["c", "</s>"];
+ // let bigram9 = vec!["</s>"];
+ // let expected = vec![bigram1.iter(), bigram2.iter(), bigram3.iter(), bigram4.iter(), bigram5.iter(), bigram6.iter(),bigram7.iter(),bigram8.iter(),bigram9.iter()];
+ //
+ // crate::test::should_be_equal_list_of_lists(&mut bigrams, expected)
+ }
+}
\ No newline at end of file
diff --git a/src/test.rs b/src/test.rs
new file mode 100644
index 0000000..6641280
--- /dev/null
+++ b/src/test.rs
@@ -0,0 +1,16 @@
+use std::slice::Iter;
+
+pub fn should_be_equal_lists<'a>(left: impl Iterator- , right: Vec<&'a str>) {
+ for (left, right) in left.zip(right.into_iter()) {
+ assert_eq!(*left, right);
+ }
+}
+
+pub fn should_be_equal_list_of_lists<'a>(actual: &mut impl Iterator
- >, expected: Vec>) {
+ for (actual_outer, expected_outer) in actual.zip(expected.into_iter()) {
+ for (actual_inner, expected_inner) in actual_outer.zip(expected_outer) {
+ // println!("{} {}", actual_inner, expected_inner);
+ assert_eq!(actual_inner, expected_inner);
+ }
+ }
+}
\ No newline at end of file
diff --git a/src/util/mod.rs b/src/util/mod.rs
index abe0288..c1ceb9d 100644
--- a/src/util/mod.rs
+++ b/src/util/mod.rs
@@ -1,6 +1,5 @@
pub(crate) mod padding;
pub(crate) mod ngrams;
-
use padding::Padder;
/// Returns a padded sequence of items before ngram extraction.
@@ -11,7 +10,7 @@ use padding::Padder;
/// pad_right: if set to true, appends a padding symbol after the sentence
/// right_pad_symbol: the padding symbol to append
/// n: the n in n-grams; so for bigrams set to 2, etc
-pub fn pad_sequence<'a>(sentence: impl Iterator
- + 'static, pad_left: bool, left_pad_symbol: &'static str, pad_right: bool, right_pad_symbol: &'static str, n: usize) -> impl Iterator
- {
+pub fn pad_sequence<'a>(sentence: impl Iterator
- + 'a, pad_left: bool, left_pad_symbol: &'a &'a str, pad_right: bool, right_pad_symbol: &'a &'a str, n: usize) -> impl Iterator
- {
Padder::new(Box::new(sentence), pad_left, left_pad_symbol, pad_right, right_pad_symbol, n)
}
@@ -19,8 +18,8 @@ pub fn pad_sequence<'a>(sentence: impl Iterator
- + 'static, pad_lef
/// sequence: sequence of items to pad, in the form of an Iterator of string slices.
/// left_pad_symbol: the padding symbol to prepend
/// n: the n in n-grams; so for bigrams set to 2, etc
-pub fn pad_sequence_left<'a>(sequence: impl Iterator
- + 'static, left_pad_symbol: &'static str, n: usize) -> impl Iterator
- {
- Padder::new(Box::new(sequence), true, left_pad_symbol, false, "", n)
+pub fn pad_sequence_left<'a>(sequence: impl Iterator
- + 'a, left_pad_symbol: &'a &'a str, n: usize) -> impl Iterator
- {
+ Padder::new(Box::new(sequence), true, left_pad_symbol, false, &"", n)
}
/// Returns a padded sequence of items before ngram extraction, right-padding only. Convenience function that prevents useless arguments
@@ -29,8 +28,8 @@ pub fn pad_sequence_left<'a>(sequence: impl Iterator
- + 'static, le
/// pad_right: if set to true, appends a padding symbol after the sentence
/// right_pad_symbol: the padding symbol to append
/// n: the n in n-grams; so for bigrams set to 2, etc
-pub fn pad_sequence_right<'a>(sequence: impl Iterator
- + 'static, right_pad_symbol: &'static str, n: usize) -> impl Iterator
- + 'a {
- Padder::new(Box::new(sequence), false, "", true, right_pad_symbol, n)
+pub fn pad_sequence_right<'a>(sequence: impl Iterator
- + 'a, right_pad_symbol: &'a &'a str, n: usize) -> impl Iterator
- + 'a {
+ Padder::new(Box::new(sequence), false, &"", true, right_pad_symbol, n)
}
/// Return the ngrams generated from a sequence of items, as an iterator.
@@ -74,44 +73,44 @@ pub fn flatten<'a>(ngrams: impl Iterator
-
#[cfg(test)]
mod tests {
- use std::slice::Iter;
use super::*;
+ use crate::test::*;
#[test]
fn test_pad_both_ends_default_n2() {
- let text = vec!["a", "b", "c"].into_iter();
- let padded = pad_sequence(text, true, "
", true, "", 2);
- should_be_equal_lists2(padded, vec!["", "a", "b", "c", ""]);
+ let text = vec!["a", "b", "c"];
+ let padded = pad_sequence(text.iter(), true, &"", true, &"", 2);
+ should_be_equal_lists(padded, vec!["", "a", "b", "c", ""]);
}
#[test]
fn test_pad_left() {
- let text = vec!["a", "b", "c"].into_iter();
- let padded = pad_sequence_left(text, "", 2);
- should_be_equal_lists2(padded, vec!["", "a", "b", "c"]);
+ let text = vec!["a", "b", "c"];
+ let padded = pad_sequence_left(text.iter(), &"", 2);
+ should_be_equal_lists(padded, vec!["", "a", "b", "c"]);
}
#[test]
fn test_pad_right() {
- let text = vec!["a", "b", "c"].into_iter();
- let padded = pad_sequence_right(text, "", 2);
+ let text = vec!["a", "b", "c"];
+ let padded = pad_sequence_right(text.iter(), &"", 2);
- should_be_equal_lists2(padded, vec!["a", "b", "c", ""]);
+ should_be_equal_lists(padded, vec!["a", "b", "c", ""]);
}
#[test]
fn test_pad_both_ends_default_n_eq_3() {
- let text = vec!["a", "b", "c"].into_iter();
- let padded = pad_sequence(text, true, "", true, "", 3);
- should_be_equal_lists2(padded, vec!["", "", "a", "b", "c", "", ""]);
+ let text = vec!["a", "b", "c"];
+ let padded = pad_sequence(text.iter(), true, &"", true, &"", 3);
+ should_be_equal_lists(padded, vec!["", "", "a", "b", "c", "", ""]);
}
#[test]
fn test_pad_both_ends_non_default_symbols() {
- let text = vec!["a", "b", "c"].into_iter();
- let padded = pad_sequence(text, true, "left", true, "right", 2);
+ let text = vec!["a", "b", "c"];
+ let padded = pad_sequence(text.iter(), true, &"left", true, &"right", 2);
- should_be_equal_lists2(padded, vec!["left", "a", "b", "c", "right"]);
+ should_be_equal_lists(padded, vec!["left", "a", "b", "c", "right"]);
}
#[test]
@@ -159,7 +158,7 @@ mod tests {
let bigram1 = vec!["a", "b"];
let expected = vec![bigram1.iter()];
- should_be_equal_list_of_lists(&mut bigrams, expected)
+ should_be_equal_list_of_lists(&mut bigrams, expected);
}
@@ -175,7 +174,7 @@ mod tests {
let gram6 = vec!["c", "d"];
let expected = vec![gram1.iter(), gram2.iter(), gram3.iter(), gram4.iter(), gram5.iter(), gram6.iter()];
- should_be_equal_list_of_lists(&mut bigrams, expected)
+ should_be_equal_list_of_lists(&mut bigrams, expected);
}
#[test]
@@ -193,7 +192,7 @@ mod tests {
let gram9 = vec!["c", "d", "e"];
let expected = vec![gram1.iter(), gram2.iter(), gram3.iter(), gram4.iter(), gram5.iter(), gram6.iter(), gram7.iter(), gram8.iter(), gram9.iter()];
- should_be_equal_list_of_lists(&mut bigrams, expected)
+ should_be_equal_list_of_lists(&mut bigrams, expected);
}
#[test]
@@ -203,25 +202,4 @@ mod tests {
should_be_equal_lists(flatten(everygrams(sequence.iter(), 3)), expected);
}
-
- fn should_be_equal_list_of_lists<'a>(actual: &mut impl Iterator- >, expected: Vec>) {
- for (actual_outer, expected_outer) in actual.zip(expected.into_iter()) {
- for (actual_inner, expected_inner) in actual_outer.zip(expected_outer) {
- // println!("{} {}", actual_inner, expected_inner);
- assert_eq!(actual_inner, expected_inner);
- }
- }
- }
-
- fn should_be_equal_lists<'a>(left: impl Iterator
- , right: Vec<&'a str>) {
- for (left, right) in left.zip(right.into_iter()) {
- assert_eq!(*left, right);
- }
- }
-
- fn should_be_equal_lists2<'a>(left: impl Iterator
- , right: Vec<&'a str>) {
- for (left, right) in left.zip(right.into_iter()) {
- assert_eq!(left, right);
- }
- }
}
\ No newline at end of file
diff --git a/src/util/ngrams.rs b/src/util/ngrams.rs
index edfeb52..e4d6ddc 100644
--- a/src/util/ngrams.rs
+++ b/src/util/ngrams.rs
@@ -82,7 +82,12 @@ impl<'a> Iterator for EveryGramSequenceIter<'a> {
let maybe_next = self.sequence.next();
if maybe_next.is_some() {
self.current_ngram.push(&maybe_next.unwrap());
- } else { return None; }
+ } else {
+ self.n = 0; // not pretty, but ensures that the following next will be the last
+ if self.current_ngram.len() == 0 {
+ return None;
+ }
+ }
}
return Some(Box::new(self.current_ngram.clone().into_iter().take(self.current_size)));
diff --git a/src/util/padding.rs b/src/util/padding.rs
index 78d6572..6311629 100644
--- a/src/util/padding.rs
+++ b/src/util/padding.rs
@@ -1,16 +1,16 @@
pub struct Padder<'a> {
n: usize,
- text: Box>,
+ text: Box + 'a>,
pad_left: bool,
left_index: isize,
- left_pad_symbol: &'static str,
+ left_pad_symbol: &'a &'a str,
pad_right: bool,
right_index: isize,
- right_pad_symbol: &'static str,
+ right_pad_symbol: &'a &'a str,
}
impl<'a> Iterator for Padder<'a> {
- type Item = &'a str;
+ type Item = &'a &'a str;
fn next(&mut self) -> Option {
if self.pad_left && self.left_index < self.n as isize {
@@ -33,8 +33,8 @@ impl<'a> Iterator for Padder<'a> {
}
impl<'a> Padder<'a> {
- pub(crate) fn new(text: Box>, pad_left: bool, left_pad_symbol: &'static str,
- pad_right: bool, right_pad_symbol: &'static str, n: usize, ) -> Self {
+ pub(crate) fn new(text: Box + 'a>, pad_left: bool, left_pad_symbol:&'a &'a str,
+ pad_right: bool, right_pad_symbol: &'a &'a str, n: usize, ) -> Self {
Self { text, n, pad_left, left_index: 1, left_pad_symbol, pad_right, right_index: 1, right_pad_symbol }
}
}