added padded_everygrams, refactored lifetimes
This commit is contained in:
parent
f72da25396
commit
0fb2b4bb42
7 changed files with 97 additions and 66 deletions
20
README.md
20
README.md
|
|
@ -9,14 +9,14 @@ _So as to avoid re-creating the text in memory, both train and vocab are lazy it
|
|||
rltk has the same philosophy: everything is done using iterators (on iterators) on string slices.
|
||||
|
||||
Currently in its infancy (but growing):
|
||||
* rltk::lm::preprocessing::pad_both_ends(\["a","b","c"], 2) -> \["\<s>", "a", "b", "c", "\</s>"]
|
||||
* rltk::util::pad_sequence == same as above with customisation
|
||||
* rltk::util::pad_sequence_left == same
|
||||
* rltk::util::pad_sequence_right == same
|
||||
* rltk::util::ngrams(\["a","b","c"],2) -> \[\["a", "b"], \["b", "c"]]
|
||||
* rltk::util::bigrams(\["a","b","c"]) == ngrams(..., 2)
|
||||
* rltk::util::trigrams(\["a","b","c"]) == ngrams(..., 3)
|
||||
* rltk::util::everygrams(\["a","b","c"],2) == \[\["a"], \["a", "b"], \["b"], \["b", "c"]]
|
||||
* rltk::util::flatten(\[\["a"], \["a", "b"], \["b"], \["b", "c"]]) == \["a", "a", "b", "b", "b", "c"]
|
||||
* rltk::metrics::distance::edit_distance(): calculate the levenshtein distance between two words (see doc)
|
||||
* rltk::lm::preprocessing::pad_both_ends
|
||||
* rltk::util::pad_sequence
|
||||
* rltk::util::pad_sequence_left
|
||||
* rltk::util::pad_sequence_right
|
||||
* rltk::util::ngrams
|
||||
* rltk::util::bigrams
|
||||
* rltk::util::trigrams
|
||||
* rltk::util::everygrams
|
||||
* rltk::util::flatten
|
||||
* rltk::metrics::distance::edit_distance
|
||||
|
||||
|
|
|
|||
|
|
@ -1,3 +1,5 @@
|
|||
pub mod lm;
|
||||
pub mod util;
|
||||
pub mod metrics;
|
||||
pub mod metrics;
|
||||
#[cfg(test)]
|
||||
pub(crate) mod test;
|
||||
|
|
@ -2,10 +2,40 @@
|
|||
///
|
||||
/// sentence: sequence of words, tokens, to pad, in the form of an Iterator of string slices.
|
||||
/// n: the n in n-grams; so for bigrams set to 2, etc
|
||||
pub fn pad_both_ends<'a>(text: impl Iterator<Item=&'a str> + 'static, n: usize) -> impl Iterator<Item=&'a str> {
|
||||
crate::util::padding::Padder::new(Box::new(text), true, "<s>", true,"</s>", n)
|
||||
pub fn pad_both_ends<'a>(text: impl Iterator<Item=&'a &'a str> + 'a, n: usize) -> impl Iterator<Item=&'a &'a str> {
|
||||
crate::util::padding::Padder::new(Box::new(text), true, &"<s>", true,&"</s>", n)
|
||||
}
|
||||
|
||||
pub fn padded_everygrams<'a>(sentence: impl Iterator<Item=&'a &'a str> + 'a, n: usize) -> impl Iterator<Item=Box<dyn Iterator<Item=&'a &'a str> + 'a>> + 'a {
|
||||
crate::util::everygrams(pad_both_ends(sentence, n), n)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests{
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test(){
|
||||
let sentence = vec!["a","b", "c"];
|
||||
let bigrams = padded_everygrams(sentence.iter(),2);
|
||||
for b in bigrams.into_iter(){
|
||||
for o in b{
|
||||
print!("{}, ",o);
|
||||
}
|
||||
println!();
|
||||
}
|
||||
// let bigram1 = vec!["<s>"];
|
||||
// let bigram2 = vec!["<s>", "a"];
|
||||
// let bigram3 = vec!["a"];
|
||||
// let bigram4 = vec!["a", "b"];
|
||||
// let bigram5 = vec!["b"];
|
||||
// let bigram6 = vec!["b", "c"];
|
||||
// let bigram7 = vec!["c"];
|
||||
// let bigram8 = vec!["c", "</s>"];
|
||||
// let bigram9 = vec!["</s>"];
|
||||
// let expected = vec![bigram1.iter(), bigram2.iter(), bigram3.iter(), bigram4.iter(), bigram5.iter(), bigram6.iter(),bigram7.iter(),bigram8.iter(),bigram9.iter()];
|
||||
//
|
||||
// crate::test::should_be_equal_list_of_lists(&mut bigrams, expected)
|
||||
}
|
||||
}
|
||||
16
src/test.rs
Normal file
16
src/test.rs
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
use std::slice::Iter;
|
||||
|
||||
/// Asserts that an iterator of string-slice references yields exactly the
/// items in `right`, in the same order.
///
/// left: the actual sequence, as an iterator over `&&str`.
/// right: the expected items.
///
/// Both sequences are compared in full, so a missing or an extra item on
/// either side fails the assertion (a plain `zip` would silently stop at the
/// shorter of the two and let the mismatch pass).
///
/// # Panics
///
/// Panics if the two sequences differ in length or in any item.
pub fn should_be_equal_lists<'a>(left: impl Iterator<Item = &'a &'a str>, right: Vec<&'a str>) {
    let actual: Vec<&str> = left.copied().collect();
    assert_eq!(actual, right);
}
|
||||
|
||||
/// Asserts that an iterator of iterators yields exactly the same nested
/// sequences as `expected`, in the same order.
///
/// actual: the actual sequences; each item is itself an iterator over `&&str`.
/// expected: the expected sequences, as slice iterators over `&str`.
///
/// Both levels are collected and compared in full, so a missing or an extra
/// inner item, or a missing or an extra inner sequence, fails the assertion
/// (zipping the two would silently stop at the shorter side).
///
/// # Panics
///
/// Panics if the nested sequences differ in length or in any item.
pub fn should_be_equal_list_of_lists<'a>(
    actual: &mut impl Iterator<Item = impl Iterator<Item = &'a &'a str>>,
    expected: Vec<Iter<&'a str>>,
) {
    let actual: Vec<Vec<&str>> = actual
        .map(|inner| inner.copied().collect::<Vec<&str>>())
        .collect();
    let expected: Vec<Vec<&str>> = expected
        .into_iter()
        .map(|inner| inner.copied().collect::<Vec<&str>>())
        .collect();
    assert_eq!(actual, expected);
}
|
||||
|
|
@ -1,6 +1,5 @@
|
|||
pub(crate) mod padding;
|
||||
pub(crate) mod ngrams;
|
||||
|
||||
use padding::Padder;
|
||||
|
||||
/// Returns a padded sequence of items before ngram extraction.
|
||||
|
|
@ -11,7 +10,7 @@ use padding::Padder;
|
|||
/// pad_right: if set to true, appends a padding symbol after the sentence
|
||||
/// right_pad_symbol: the padding symbol to append
|
||||
/// n: the n in n-grams; so for bigrams set to 2, etc
|
||||
pub fn pad_sequence<'a>(sentence: impl Iterator<Item=&'a str> + 'static, pad_left: bool, left_pad_symbol: &'static str, pad_right: bool, right_pad_symbol: &'static str, n: usize) -> impl Iterator<Item=&'a str> {
|
||||
pub fn pad_sequence<'a>(sentence: impl Iterator<Item=&'a &'a str> + 'a, pad_left: bool, left_pad_symbol: &'a &'a str, pad_right: bool, right_pad_symbol: &'a &'a str, n: usize) -> impl Iterator<Item=&'a &'a str> {
|
||||
Padder::new(Box::new(sentence), pad_left, left_pad_symbol, pad_right, right_pad_symbol, n)
|
||||
}
|
||||
|
||||
|
|
@ -19,8 +18,8 @@ pub fn pad_sequence<'a>(sentence: impl Iterator<Item=&'a str> + 'static, pad_lef
|
|||
/// sequence: sequence of items to pad, in the form of an Iterator of string slices.
|
||||
/// left_pad_symbol: the padding symbol to prepend
|
||||
/// n: the n in n-grams; so for bigrams set to 2, etc
|
||||
pub fn pad_sequence_left<'a>(sequence: impl Iterator<Item=&'a str> + 'static, left_pad_symbol: &'static str, n: usize) -> impl Iterator<Item=&'a str> {
|
||||
Padder::new(Box::new(sequence), true, left_pad_symbol, false, "", n)
|
||||
pub fn pad_sequence_left<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a, left_pad_symbol: &'a &'a str, n: usize) -> impl Iterator<Item=&'a &'a str> {
|
||||
Padder::new(Box::new(sequence), true, left_pad_symbol, false, &"", n)
|
||||
}
|
||||
|
||||
/// Returns a padded sequence of items before ngram extraction, right-padding only. Convenience function that prevents useless arguments
|
||||
|
|
@ -29,8 +28,8 @@ pub fn pad_sequence_left<'a>(sequence: impl Iterator<Item=&'a str> + 'static, le
|
|||
/// pad_right: if set to true, appends a padding symbol after the sentence
|
||||
/// right_pad_symbol: the padding symbol to append
|
||||
/// n: the n in n-grams; so for bigrams set to 2, etc
|
||||
pub fn pad_sequence_right<'a>(sequence: impl Iterator<Item=&'a str> + 'static, right_pad_symbol: &'static str, n: usize) -> impl Iterator<Item=&'a str> + 'a {
|
||||
Padder::new(Box::new(sequence), false, "", true, right_pad_symbol, n)
|
||||
pub fn pad_sequence_right<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a, right_pad_symbol: &'a &'a str, n: usize) -> impl Iterator<Item=&'a &'a str> + 'a {
|
||||
Padder::new(Box::new(sequence), false, &"", true, right_pad_symbol, n)
|
||||
}
|
||||
|
||||
/// Return the ngrams generated from a sequence of items, as an iterator.
|
||||
|
|
@ -74,44 +73,44 @@ pub fn flatten<'a>(ngrams: impl Iterator<Item=Box<dyn Iterator<Item=&'a &'a str>
|
|||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::slice::Iter;
|
||||
use super::*;
|
||||
use crate::test::*;
|
||||
|
||||
#[test]
|
||||
fn test_pad_both_ends_default_n2() {
|
||||
let text = vec!["a", "b", "c"].into_iter();
|
||||
let padded = pad_sequence(text, true, "<s>", true, "</s>", 2);
|
||||
should_be_equal_lists2(padded, vec!["<s>", "a", "b", "c", "</s>"]);
|
||||
let text = vec!["a", "b", "c"];
|
||||
let padded = pad_sequence(text.iter(), true, &"<s>", true, &"</s>", 2);
|
||||
should_be_equal_lists(padded, vec!["<s>", "a", "b", "c", "</s>"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_pad_left() {
|
||||
let text = vec!["a", "b", "c"].into_iter();
|
||||
let padded = pad_sequence_left(text, "<s>", 2);
|
||||
should_be_equal_lists2(padded, vec!["<s>", "a", "b", "c"]);
|
||||
let text = vec!["a", "b", "c"];
|
||||
let padded = pad_sequence_left(text.iter(), &"<s>", 2);
|
||||
should_be_equal_lists(padded, vec!["<s>", "a", "b", "c"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_pad_right() {
|
||||
let text = vec!["a", "b", "c"].into_iter();
|
||||
let padded = pad_sequence_right(text, "</s>", 2);
|
||||
let text = vec!["a", "b", "c"];
|
||||
let padded = pad_sequence_right(text.iter(), &"</s>", 2);
|
||||
|
||||
should_be_equal_lists2(padded, vec!["a", "b", "c", "</s>"]);
|
||||
should_be_equal_lists(padded, vec!["a", "b", "c", "</s>"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_pad_both_ends_default_n_eq_3() {
|
||||
let text = vec!["a", "b", "c"].into_iter();
|
||||
let padded = pad_sequence(text, true, "<s>", true, "</s>", 3);
|
||||
should_be_equal_lists2(padded, vec!["<s>", "<s>", "a", "b", "c", "</s>", "</s>"]);
|
||||
let text = vec!["a", "b", "c"];
|
||||
let padded = pad_sequence(text.iter(), true, &"<s>", true, &"</s>", 3);
|
||||
should_be_equal_lists(padded, vec!["<s>", "<s>", "a", "b", "c", "</s>", "</s>"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_pad_both_ends_non_default_symbols() {
|
||||
let text = vec!["a", "b", "c"].into_iter();
|
||||
let padded = pad_sequence(text, true, "left", true, "right", 2);
|
||||
let text = vec!["a", "b", "c"];
|
||||
let padded = pad_sequence(text.iter(), true, &"left", true, &"right", 2);
|
||||
|
||||
should_be_equal_lists2(padded, vec!["left", "a", "b", "c", "right"]);
|
||||
should_be_equal_lists(padded, vec!["left", "a", "b", "c", "right"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -159,7 +158,7 @@ mod tests {
|
|||
let bigram1 = vec!["a", "b"];
|
||||
let expected = vec![bigram1.iter()];
|
||||
|
||||
should_be_equal_list_of_lists(&mut bigrams, expected)
|
||||
should_be_equal_list_of_lists(&mut bigrams, expected);
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -175,7 +174,7 @@ mod tests {
|
|||
let gram6 = vec!["c", "d"];
|
||||
let expected = vec![gram1.iter(), gram2.iter(), gram3.iter(), gram4.iter(), gram5.iter(), gram6.iter()];
|
||||
|
||||
should_be_equal_list_of_lists(&mut bigrams, expected)
|
||||
should_be_equal_list_of_lists(&mut bigrams, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -193,7 +192,7 @@ mod tests {
|
|||
let gram9 = vec!["c", "d", "e"];
|
||||
let expected = vec![gram1.iter(), gram2.iter(), gram3.iter(), gram4.iter(), gram5.iter(), gram6.iter(), gram7.iter(), gram8.iter(), gram9.iter()];
|
||||
|
||||
should_be_equal_list_of_lists(&mut bigrams, expected)
|
||||
should_be_equal_list_of_lists(&mut bigrams, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -203,25 +202,4 @@ mod tests {
|
|||
|
||||
should_be_equal_lists(flatten(everygrams(sequence.iter(), 3)), expected);
|
||||
}
|
||||
|
||||
fn should_be_equal_list_of_lists<'a>(actual: &mut impl Iterator<Item=impl Iterator<Item=&'a &'a str>>, expected: Vec<Iter<&'a str>>) {
|
||||
for (actual_outer, expected_outer) in actual.zip(expected.into_iter()) {
|
||||
for (actual_inner, expected_inner) in actual_outer.zip(expected_outer) {
|
||||
// println!("{} {}", actual_inner, expected_inner);
|
||||
assert_eq!(actual_inner, expected_inner);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn should_be_equal_lists<'a>(left: impl Iterator<Item=&'a &'a str>, right: Vec<&'a str>) {
|
||||
for (left, right) in left.zip(right.into_iter()) {
|
||||
assert_eq!(*left, right);
|
||||
}
|
||||
}
|
||||
|
||||
fn should_be_equal_lists2<'a>(left: impl Iterator<Item=&'a str>, right: Vec<&'a str>) {
|
||||
for (left, right) in left.zip(right.into_iter()) {
|
||||
assert_eq!(left, right);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -82,7 +82,12 @@ impl<'a> Iterator for EveryGramSequenceIter<'a> {
|
|||
let maybe_next = self.sequence.next();
|
||||
if maybe_next.is_some() {
|
||||
self.current_ngram.push(&maybe_next.unwrap());
|
||||
} else { return None; }
|
||||
} else {
|
||||
self.n = 0; // not pretty, but ensures that the following next will be the last
|
||||
if self.current_ngram.len() == 0 {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return Some(Box::new(self.current_ngram.clone().into_iter().take(self.current_size)));
|
||||
|
|
|
|||
|
|
@ -1,16 +1,16 @@
|
|||
pub struct Padder<'a> {
|
||||
n: usize,
|
||||
text: Box<dyn Iterator<Item=&'a str>>,
|
||||
text: Box<dyn Iterator<Item=&'a &'a str> + 'a>,
|
||||
pad_left: bool,
|
||||
left_index: isize,
|
||||
left_pad_symbol: &'static str,
|
||||
left_pad_symbol: &'a &'a str,
|
||||
pad_right: bool,
|
||||
right_index: isize,
|
||||
right_pad_symbol: &'static str,
|
||||
right_pad_symbol: &'a &'a str,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for Padder<'a> {
|
||||
type Item = &'a str;
|
||||
type Item = &'a &'a str;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
if self.pad_left && self.left_index < self.n as isize {
|
||||
|
|
@ -33,8 +33,8 @@ impl<'a> Iterator for Padder<'a> {
|
|||
}
|
||||
|
||||
impl<'a> Padder<'a> {
|
||||
pub(crate) fn new(text: Box<dyn Iterator<Item=&'a str>>, pad_left: bool, left_pad_symbol: &'static str,
|
||||
pad_right: bool, right_pad_symbol: &'static str, n: usize, ) -> Self {
|
||||
pub(crate) fn new(text: Box<dyn Iterator<Item=&'a &'a str> + 'a>, pad_left: bool, left_pad_symbol:&'a &'a str,
|
||||
pad_right: bool, right_pad_symbol: &'a &'a str, n: usize, ) -> Self {
|
||||
Self { text, n, pad_left, left_index: 1, left_pad_symbol, pad_right, right_index: 1, right_pad_symbol }
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue