minor improvements

This commit is contained in:
Sander Hautvast 2022-05-11 15:37:12 +02:00
parent 2c78b4cf6d
commit b9fc3386f0
3 changed files with 60 additions and 68 deletions

View file

@ -1,13 +1,19 @@
use crate::util::flatten;
/// Pads a sequence of words with defaults; prepends "<s>" and appends "</s>" /// Pads a sequence of words with defaults; prepends "<s>" and appends "</s>"
/// ///
/// text: sequence of words, tokens, to pad, in the form of an Iterator of string slices. /// text: sequence of words, tokens, to pad, in the form of an Iterator of string slices.
/// n: the n in n-grams; so for bigrams set to 2, etc /// order: the n in n-grams; so for bigrams set to 2, etc
pub fn pad_both_ends<'a>(text: impl Iterator<Item=&'a &'a str> + 'a, n: usize) -> impl Iterator<Item=&'a &'a str> { pub fn pad_both_ends<'a>(text: impl Iterator<Item=&'a &'a str> + 'a, order: usize) -> impl Iterator<Item=&'a &'a str> {
crate::util::padding::Padder::new(Box::new(text), true, &"<s>", true,&"</s>", n) crate::util::padding::Padder::new(Box::new(text), true, &"<s>", true, &"</s>", order)
} }
pub fn padded_everygrams<'a>(sentence: impl Iterator<Item=&'a &'a str> + 'a, n: usize) -> impl Iterator<Item=Box<dyn Iterator<Item=&'a &'a str> + 'a>> + 'a { pub fn padded_everygrams<'a>(sentence: impl Iterator<Item=&'a &'a str> + 'a, order: usize) -> impl Iterator<Item=Box<dyn Iterator<Item=&'a &'a str> + 'a>> + 'a {
crate::util::everygrams(pad_both_ends(sentence, n), n) crate::util::everygrams(pad_both_ends(sentence, order), order)
}
pub fn padded_everygram_pipeline<'a>(text: impl Iterator<Item=&'a &'a str> + 'a, order: usize) -> (impl Iterator<Item=&'a &'a str>){
(text.iter().map(|sent| rltk::lm::preprocessing::pad_both_ends(sent.iter(), order)).flatten())//vocab
} }
#[cfg(test)] #[cfg(test)]
@ -17,7 +23,7 @@ mod tests{
#[test] #[test]
fn test(){ fn test(){
let sentence = vec!["a","b", "c"]; let sentence = ["a","b", "c"];
let mut bigrams = padded_everygrams(sentence.iter(),2); let mut bigrams = padded_everygrams(sentence.iter(),2);
let bigram1 = vec!["<s>"]; let bigram1 = vec!["<s>"];

View file

@ -1,8 +1,8 @@
use std::slice::Iter; use std::slice::Iter;
pub fn should_be_equal_lists<'a>(left: impl Iterator<Item=&'a &'a str>, right: Vec<&'a str>) { pub fn should_be_equal_lists<'a>(left: impl Iterator<Item=&'a &'a str>, right: &[&'a str]) {
for (left, right) in left.zip(right.into_iter()) { for (left, right) in left.zip(right.into_iter()) {
assert_eq!(*left, right); assert_eq!(left, right);
} }
} }

View file

@ -29,7 +29,7 @@ pub fn pad_sequence_left<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a, lef
/// pad_right: if set to true, appends a padding symbol after the sentence /// pad_right: if set to true, appends a padding symbol after the sentence
/// right_pad_symbol: the padding symbol to append /// right_pad_symbol: the padding symbol to append
/// n: the n in n-grams; so for bigrams set to 2, etc /// n: the n in n-grams; so for bigrams set to 2, etc
pub fn pad_sequence_right<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a, right_pad_symbol: &'a &'a str, n: usize) -> impl Iterator<Item=&'a &'a str> + 'a { pub fn pad_sequence_right<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a, right_pad_symbol: &'a &'a str, n: usize) -> impl Iterator<Item=&'a &'a str> {
Padder::new(Box::new(sequence), false, &"", true, right_pad_symbol, n) Padder::new(Box::new(sequence), false, &"", true, right_pad_symbol, n)
} }
@ -52,72 +52,71 @@ pub fn pad_sequence_right<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a, ri
/// } /// }
/// ``` /// ```
/// ///
pub fn ngrams<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a, n: usize) -> impl Iterator<Item=impl Iterator<Item=&'a &'a str> + 'a> + 'a { pub fn ngrams<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a, n: usize) -> impl Iterator<Item=impl Iterator<Item=&'a &'a str> + 'a> {
ngrams::NGramSequenceIter::new(sequence, n) ngrams::NGramSequenceIter::new(sequence, n)
} }
pub fn bigrams<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a) -> impl Iterator<Item=impl Iterator<Item=&'a &'a str> + 'a> + 'a { pub fn bigrams<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a) -> impl Iterator<Item=impl Iterator<Item=&'a &'a str> + 'a> {
ngrams::NGramSequenceIter::new(sequence, 2) ngrams::NGramSequenceIter::new(sequence, 2)
} }
pub fn trigrams<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a) -> impl Iterator<Item=impl Iterator<Item=&'a &'a str> + 'a> + 'a { pub fn trigrams<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a) -> impl Iterator<Item=impl Iterator<Item=&'a &'a str> + 'a> {
ngrams::NGramSequenceIter::new(sequence, 3) ngrams::NGramSequenceIter::new(sequence, 3)
} }
pub fn everygrams<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a, n: usize) -> impl Iterator<Item=Box<dyn Iterator<Item=&'a &'a str> + 'a>> + 'a { pub fn everygrams<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a, n: usize) -> impl Iterator<Item=Box<dyn Iterator<Item=&'a &'a str> + 'a>> {
ngrams::EveryGramSequenceIter::everygrams(sequence, n) ngrams::EveryGramSequenceIter::everygrams(sequence, n)
} }
pub fn flatten<'a>(ngrams: impl Iterator<Item=Box<dyn Iterator<Item=&'a &'a str> + 'a>> + 'a) -> impl Iterator<Item=&'a &'a str> + 'a { pub fn flatten<'a>(ngrams: impl Iterator<Item=Box<dyn Iterator<Item=&'a &'a str> + 'a>> + 'a) -> impl Iterator<Item=&'a &'a str> {
ngrams::FlatteningIter::new(ngrams) ngrams::FlatteningIter::new(ngrams)
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::lm::preprocessing::pad_both_ends;
use super::*; use super::*;
use crate::test::*; use crate::test::*;
#[test] #[test]
fn test_pad_both_ends_default_n2() { fn test_pad_both_ends_default_n2() {
let text = vec!["a", "b", "c"]; let text = ["a", "b", "c"];
let padded = pad_sequence(text.iter(), true, &"<s>", true, &"</s>", 2); let padded = pad_sequence(text.iter(), true, &"<s>", true, &"</s>", 2);
should_be_equal_lists(padded, vec!["<s>", "a", "b", "c", "</s>"]); should_be_equal_lists(padded, &["<s>", "a", "b", "c", "</s>"]);
} }
#[test] #[test]
fn test_pad_left() { fn test_pad_left() {
let text = vec!["a", "b", "c"]; let text = ["a", "b", "c"];
let padded = pad_sequence_left(text.iter(), &"<s>", 2); let padded = pad_sequence_left(text.iter(), &"<s>", 2);
should_be_equal_lists(padded, vec!["<s>", "a", "b", "c"]); should_be_equal_lists(padded, &["<s>", "a", "b", "c"]);
} }
#[test] #[test]
fn test_pad_right() { fn test_pad_right() {
let text = vec!["a", "b", "c"]; let text = ["a", "b", "c"];
let padded = pad_sequence_right(text.iter(), &"</s>", 2); let padded = pad_sequence_right(text.iter(), &"</s>", 2);
should_be_equal_lists(padded, vec!["a", "b", "c", "</s>"]); should_be_equal_lists(padded, &["a", "b", "c", "</s>"]);
} }
#[test] #[test]
fn test_pad_both_ends_default_n_eq_3() { fn test_pad_both_ends_default_n_eq_3() {
let text = vec!["a", "b", "c"]; let text = ["a", "b", "c"];
let padded = pad_sequence(text.iter(), true, &"<s>", true, &"</s>", 3); let padded = pad_sequence(text.iter(), true, &"<s>", true, &"</s>", 3);
should_be_equal_lists(padded, vec!["<s>", "<s>", "a", "b", "c", "</s>", "</s>"]); should_be_equal_lists(padded, &["<s>", "<s>", "a", "b", "c", "</s>", "</s>"]);
} }
#[test] #[test]
fn test_pad_both_ends_non_default_symbols() { fn test_pad_both_ends_non_default_symbols() {
let text = vec!["a", "b", "c"]; let text = ["a", "b", "c"];
let padded = pad_sequence(text.iter(), true, &"left", true, &"right", 2); let padded = pad_sequence(text.iter(), true, &"left", true, &"right", 2);
should_be_equal_lists(padded, vec!["left", "a", "b", "c", "right"]); should_be_equal_lists(padded, &["left", "a", "b", "c", "right"]);
} }
#[test] #[test]
fn test_bigrams() { fn test_bigrams() {
let sequence = vec!["a", "b", "c", "d"]; let sequence = ["a", "b", "c", "d"];
let mut bigrams = ngrams(sequence.iter(), 2); let mut bigrams = ngrams(sequence.iter(), 2);
let bigram1 = vec!["a", "b"]; let bigram1 = vec!["a", "b"];
let bigram2 = vec!["b", "c"]; let bigram2 = vec!["b", "c"];
@ -129,7 +128,7 @@ mod tests {
#[test] #[test]
fn test_trigrams() { fn test_trigrams() {
let sequence = vec!["a", "b", "c", "d", "e"]; let sequence = ["a", "b", "c", "d", "e"];
let mut bigrams = ngrams(sequence.iter(), 3); let mut bigrams = ngrams(sequence.iter(), 3);
let trigram1 = vec!["a", "b", "c"]; let trigram1 = vec!["a", "b", "c"];
let trigram2 = vec!["b", "c", "d"]; let trigram2 = vec!["b", "c", "d"];
@ -141,21 +140,21 @@ mod tests {
#[test] #[test]
fn test_bigrams_n_gt_len() { fn test_bigrams_n_gt_len() {
let sequence = vec!["a"]; let sequence = ["a"];
let mut bigrams = ngrams(sequence.iter(), 2); let mut bigrams = ngrams(sequence.iter(), 2);
assert!(bigrams.next().is_none()); assert!(bigrams.next().is_none());
} }
#[test] #[test]
fn test_bigrams_empty_sequence() { fn test_bigrams_empty_sequence() {
let sequence = vec![]; let sequence: Vec<&str> = vec![];
let mut bigrams = ngrams(sequence.iter(), 10); let mut bigrams = ngrams(sequence.iter(), 10);
assert!(bigrams.next().is_none()); assert!(bigrams.next().is_none());
} }
#[test] #[test]
fn test_bigrams_n_eq_len() { fn test_bigrams_n_eq_len() {
let sequence = vec!["a", "b"]; let sequence = ["a", "b"];
let mut bigrams = ngrams(sequence.iter(), 2); let mut bigrams = ngrams(sequence.iter(), 2);
let bigram1 = vec!["a", "b"]; let bigram1 = vec!["a", "b"];
let expected = vec![bigram1.iter()]; let expected = vec![bigram1.iter()];
@ -166,43 +165,37 @@ mod tests {
#[test] #[test]
fn test_everygrams_n_eq_2() { fn test_everygrams_n_eq_2() {
let sequence = vec!["a", "b", "c", "d"]; let sequence = ["a", "b", "c", "d"];
let mut grams = everygrams(sequence.iter(), 2); let mut grams = everygrams(sequence.iter(), 2);
// let gram1 = vec!["a"]; let gram1 = ["a"];
// let gram2 = vec!["a", "b"]; let gram2 = ["a", "b"];
// let gram3 = vec!["b"]; let gram3 = ["b"];
// let gram4 = vec!["b", "c"]; let gram4 = ["b", "c"];
// let gram5 = vec!["c"]; let gram5 = ["c"];
// let gram6 = vec!["c", "d"]; let gram6 = ["c", "d"];
// let gram7 = vec!["d"]; let gram7 = ["d"];
// let expected = vec![gram1.iter(), gram2.iter(), gram3.iter(), gram4.iter(), gram5.iter(), gram6.iter(), gram7.iter()]; let expected = vec![gram1.iter(), gram2.iter(), gram3.iter(), gram4.iter(), gram5.iter(), gram6.iter(), gram7.iter()];
for i in grams{ should_be_equal_list_of_lists(&mut grams, expected);
for j in i{
print!("{},",j);
}
println!();
}
// should_be_equal_list_of_lists(&mut bigrams, expected);
} }
#[test] #[test]
fn test_everygrams_n_eq_3() { fn test_everygrams_n_eq_3() {
let sequence = vec!["a", "b", "c", "d", "e"]; let sequence = ["a", "b", "c", "d", "e"];
let mut bigrams = everygrams(sequence.iter(), 3); let mut bigrams = everygrams(sequence.iter(), 3);
let gram1 = vec!["a"]; let gram1 = ["a"];
let gram2 = vec!["a", "b"]; let gram2 = ["a", "b"];
let gram3 = vec!["a", "b", "c"]; let gram3 = ["a", "b", "c"];
let gram4 = vec!["b"]; let gram4 = ["b"];
let gram5 = vec!["b", "c"]; let gram5 = ["b", "c"];
let gram6 = vec!["b", "c", "d"]; let gram6 = ["b", "c", "d"];
let gram7 = vec!["c"]; let gram7 = ["c"];
let gram8 = vec!["c", "d"]; let gram8 = ["c", "d"];
let gram9 = vec!["c", "d", "e"]; let gram9 = ["c", "d", "e"];
let gram10 = vec!["d"]; let gram10 = ["d"];
let gram11 = vec!["d", "e"]; let gram11 = ["d", "e"];
let gram12 = vec!["e"]; let gram12 = ["e"];
let expected = vec![gram1.iter(), gram2.iter(), gram3.iter(), gram4.iter(), gram5.iter(), gram6.iter(), gram7.iter(), gram8.iter(), gram9.iter(), gram10.iter(), gram11.iter(), gram12.iter()]; let expected = vec![gram1.iter(), gram2.iter(), gram3.iter(), gram4.iter(), gram5.iter(), gram6.iter(), gram7.iter(), gram8.iter(), gram9.iter(), gram10.iter(), gram11.iter(), gram12.iter()];
@ -211,16 +204,9 @@ mod tests {
#[test] #[test]
fn test_flatten() { fn test_flatten() {
let sequence = vec!["a", "b", "c", "d", "e"]; let sequence = ["a", "b", "c", "d", "e"];
let expected = vec!["a", "a", "b", "a", "b", "c", "b", "b", "c", "b", "c", "d", "c", "c", "d", "c", "d", "e"]; let expected = ["a", "a", "b", "a", "b", "c", "b", "b", "c", "b", "c", "d", "c", "c", "d", "c", "d", "e"];
should_be_equal_lists(flatten(everygrams(sequence.iter(), 3)), expected); should_be_equal_lists(flatten(everygrams(sequence.iter(), 3)), &expected);
}
#[test]
fn example() {
let text = vec![vec!["a", "b", "c"], vec!["a", "c", "d", "c", "e", "f"]];
let result: Vec<&&str> = text.iter().map(|sent|pad_both_ends(sent.iter(),2)).flatten().collect();
println!("{:?}", result);
} }
} }