minor improvements

This commit is contained in:
Sander Hautvast 2022-05-11 15:37:12 +02:00
parent 2c78b4cf6d
commit b9fc3386f0
3 changed files with 60 additions and 68 deletions

View file

@ -1,13 +1,19 @@
use crate::util::flatten;
/// Pads a sequence of words with defaults; prepends "<s>" and appends "</s>"
///
/// text: sequence of words, tokens, to pad, in the form of an Iterator of string slices.
/// order (`n` in the earlier signature): the n in n-grams; so for bigrams set to 2, etc
pub fn pad_both_ends<'a>(text: impl Iterator<Item=&'a &'a str> + 'a, n: usize) -> impl Iterator<Item=&'a &'a str> {
crate::util::padding::Padder::new(Box::new(text), true, &"<s>", true,&"</s>", n)
pub fn pad_both_ends<'a>(text: impl Iterator<Item=&'a &'a str> + 'a, order: usize) -> impl Iterator<Item=&'a &'a str> {
crate::util::padding::Padder::new(Box::new(text), true, &"<s>", true, &"</s>", order)
}
/// Pads `sentence` with `pad_both_ends` and hands the padded sequence to
/// `crate::util::everygrams`; the same order value is used for both steps.
///
/// sentence: the tokens to pad, as an Iterator over `&&str`.
/// order (`n` in the earlier signature): forwarded unchanged to `pad_both_ends` and `everygrams`.
pub fn padded_everygrams<'a>(sentence: impl Iterator<Item=&'a &'a str> + 'a, n: usize) -> impl Iterator<Item=Box<dyn Iterator<Item=&'a &'a str> + 'a>> + 'a {
crate::util::everygrams(pad_both_ends(sentence, n), n)
pub fn padded_everygrams<'a>(sentence: impl Iterator<Item=&'a &'a str> + 'a, order: usize) -> impl Iterator<Item=Box<dyn Iterator<Item=&'a &'a str> + 'a>> + 'a {
crate::util::everygrams(pad_both_ends(sentence, order), order)
}
/// Maps every element of `text` through `pad_both_ends` with the given `order` and flattens
/// the per-element results into one iterator.
///
/// NOTE(review): as written this looks like it cannot compile. `text` is declared as an
/// `impl Iterator`, which has no `iter()` method, and its items are `&&str`, which have no
/// `iter()` either. The call also goes through the path `rltk::lm::preprocessing::` even
/// though a `pad_both_ends` is defined in this file. Presumably `text` was meant to be a
/// collection of sentences (each itself a collection of tokens) — TODO confirm and adjust
/// the signature accordingly.
pub fn padded_everygram_pipeline<'a>(text: impl Iterator<Item=&'a &'a str> + 'a, order: usize) -> (impl Iterator<Item=&'a &'a str>){
(text.iter().map(|sent| rltk::lm::preprocessing::pad_both_ends(sent.iter(), order)).flatten())//vocab
}
#[cfg(test)]
@ -17,7 +23,7 @@ mod tests{
#[test]
fn test(){
let sentence = vec!["a","b", "c"];
let sentence = ["a","b", "c"];
let mut bigrams = padded_everygrams(sentence.iter(),2);
let bigram1 = vec!["<s>"];

View file

@ -1,8 +1,8 @@
use std::slice::Iter;
/// Assertion helper: walks `left` and `right` in lockstep and asserts that each pair of
/// elements is equal, panicking (via `assert_eq!`) on the first pair that differs.
///
/// NOTE(review): the loop is driven by `zip`, which stops as soon as either side is
/// exhausted. A difference in length (missing or extra trailing elements on either side)
/// is therefore never reported; only the common prefix is compared.
pub fn should_be_equal_lists<'a>(left: impl Iterator<Item=&'a &'a str>, right: Vec<&'a str>) {
pub fn should_be_equal_lists<'a>(left: impl Iterator<Item=&'a &'a str>, right: &[&'a str]) {
for (left, right) in left.zip(right.into_iter()) {
assert_eq!(*left, right);
assert_eq!(left, right);
}
}

View file

@ -29,7 +29,7 @@ pub fn pad_sequence_left<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a, lef
/// pad_right: if set to true, appends a padding symbol after the sentence
/// right_pad_symbol: the padding symbol to append
/// n: the n in n-grams; so for bigrams set to 2, etc
///
/// NOTE(review): the signature has no `pad_right` flag. The call below passes the fixed
/// values `false`, `&""` and `true`, `right_pad_symbol` to `Padder::new` — presumably
/// "left padding off, right padding on"; confirm against `Padder::new`.
pub fn pad_sequence_right<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a, right_pad_symbol: &'a &'a str, n: usize) -> impl Iterator<Item=&'a &'a str> + 'a {
pub fn pad_sequence_right<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a, right_pad_symbol: &'a &'a str, n: usize) -> impl Iterator<Item=&'a &'a str> {
Padder::new(Box::new(sequence), false, &"", true, right_pad_symbol, n)
}
@ -52,72 +52,71 @@ pub fn pad_sequence_right<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a, ri
/// }
/// ```
///
/// Implemented by handing `sequence` and `n` to `ngrams::NGramSequenceIter::new`; each item
/// of the returned iterator is itself an iterator over the tokens of one n-gram.
pub fn ngrams<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a, n: usize) -> impl Iterator<Item=impl Iterator<Item=&'a &'a str> + 'a> + 'a {
pub fn ngrams<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a, n: usize) -> impl Iterator<Item=impl Iterator<Item=&'a &'a str> + 'a> {
ngrams::NGramSequenceIter::new(sequence, n)
}
/// Convenience wrapper: builds an `ngrams::NGramSequenceIter` over `sequence` with n fixed to 2.
///
/// sequence: the tokens, as an Iterator over `&&str`.
pub fn bigrams<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a) -> impl Iterator<Item=impl Iterator<Item=&'a &'a str> + 'a> + 'a {
pub fn bigrams<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a) -> impl Iterator<Item=impl Iterator<Item=&'a &'a str> + 'a> {
ngrams::NGramSequenceIter::new(sequence, 2)
}
/// Convenience wrapper: builds an `ngrams::NGramSequenceIter` over `sequence` with n fixed to 3.
///
/// sequence: the tokens, as an Iterator over `&&str`.
pub fn trigrams<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a) -> impl Iterator<Item=impl Iterator<Item=&'a &'a str> + 'a> + 'a {
pub fn trigrams<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a) -> impl Iterator<Item=impl Iterator<Item=&'a &'a str> + 'a> {
ngrams::NGramSequenceIter::new(sequence, 3)
}
/// Delegates to `ngrams::EveryGramSequenceIter::everygrams(sequence, n)`. Every item of the
/// returned iterator is a boxed iterator over the tokens of one gram.
///
/// The tests in this file expect, for each start position in turn, the grams of length 1 up
/// to `n` (fewer near the end of the sequence, where not enough tokens remain).
pub fn everygrams<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a, n: usize) -> impl Iterator<Item=Box<dyn Iterator<Item=&'a &'a str> + 'a>> + 'a {
pub fn everygrams<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a, n: usize) -> impl Iterator<Item=Box<dyn Iterator<Item=&'a &'a str> + 'a>> {
ngrams::EveryGramSequenceIter::everygrams(sequence, n)
}
/// Wraps `ngrams` (an iterator of boxed per-gram token iterators) in `ngrams::FlatteningIter`,
/// so the caller gets a single iterator of tokens instead of an iterator of iterators.
pub fn flatten<'a>(ngrams: impl Iterator<Item=Box<dyn Iterator<Item=&'a &'a str> + 'a>> + 'a) -> impl Iterator<Item=&'a &'a str> + 'a {
pub fn flatten<'a>(ngrams: impl Iterator<Item=Box<dyn Iterator<Item=&'a &'a str> + 'a>> + 'a) -> impl Iterator<Item=&'a &'a str> {
ngrams::FlatteningIter::new(ngrams)
}
#[cfg(test)]
mod tests {
use crate::lm::preprocessing::pad_both_ends;
use super::*;
use crate::test::*;
// With n = 2 and both sides enabled, one "<s>" is expected in front and one "</s>" behind.
#[test]
fn test_pad_both_ends_default_n2() {
let text = vec!["a", "b", "c"];
let text = ["a", "b", "c"];
let padded = pad_sequence(text.iter(), true, &"<s>", true, &"</s>", 2);
should_be_equal_lists(padded, vec!["<s>", "a", "b", "c", "</s>"]);
should_be_equal_lists(padded, &["<s>", "a", "b", "c", "</s>"]);
}
// Left-only padding with n = 2: a single "<s>" in front, nothing appended.
#[test]
fn test_pad_left() {
let text = vec!["a", "b", "c"];
let text = ["a", "b", "c"];
let padded = pad_sequence_left(text.iter(), &"<s>", 2);
should_be_equal_lists(padded, vec!["<s>", "a", "b", "c"]);
should_be_equal_lists(padded, &["<s>", "a", "b", "c"]);
}
// Right-only padding with n = 2: a single "</s>" appended, nothing in front.
#[test]
fn test_pad_right() {
let text = vec!["a", "b", "c"];
let text = ["a", "b", "c"];
let padded = pad_sequence_right(text.iter(), &"</s>", 2);
should_be_equal_lists(padded, vec!["a", "b", "c", "</s>"]);
should_be_equal_lists(padded, &["a", "b", "c", "</s>"]);
}
// With n = 3 the expected output carries two pad symbols on each side.
#[test]
fn test_pad_both_ends_default_n_eq_3() {
let text = vec!["a", "b", "c"];
let text = ["a", "b", "c"];
let padded = pad_sequence(text.iter(), true, &"<s>", true, &"</s>", 3);
should_be_equal_lists(padded, vec!["<s>", "<s>", "a", "b", "c", "</s>", "</s>"]);
should_be_equal_lists(padded, &["<s>", "<s>", "a", "b", "c", "</s>", "</s>"]);
}
// Caller-supplied pad symbols ("left"/"right") are expected in place of "<s>"/"</s>".
#[test]
fn test_pad_both_ends_non_default_symbols() {
let text = vec!["a", "b", "c"];
let text = ["a", "b", "c"];
let padded = pad_sequence(text.iter(), true, &"left", true, &"right", 2);
should_be_equal_lists(padded, vec!["left", "a", "b", "c", "right"]);
should_be_equal_lists(padded, &["left", "a", "b", "c", "right"]);
}
#[test]
fn test_bigrams() {
let sequence = vec!["a", "b", "c", "d"];
let sequence = ["a", "b", "c", "d"];
let mut bigrams = ngrams(sequence.iter(), 2);
let bigram1 = vec!["a", "b"];
let bigram2 = vec!["b", "c"];
@ -129,7 +128,7 @@ mod tests {
#[test]
fn test_trigrams() {
let sequence = vec!["a", "b", "c", "d", "e"];
let sequence = ["a", "b", "c", "d", "e"];
let mut bigrams = ngrams(sequence.iter(), 3);
let trigram1 = vec!["a", "b", "c"];
let trigram2 = vec!["b", "c", "d"];
@ -141,21 +140,21 @@ mod tests {
// A one-token sequence is shorter than n = 2, so no n-gram at all is expected.
#[test]
fn test_bigrams_n_gt_len() {
let sequence = vec!["a"];
let sequence = ["a"];
let mut bigrams = ngrams(sequence.iter(), 2);
assert!(bigrams.next().is_none());
}
// An empty sequence is expected to produce no n-grams (n = 10 here).
// The explicit `Vec<&str>` annotation gives the empty `vec![]` an element type.
#[test]
fn test_bigrams_empty_sequence() {
let sequence = vec![];
let sequence: Vec<&str> = vec![];
let mut bigrams = ngrams(sequence.iter(), 10);
assert!(bigrams.next().is_none());
}
#[test]
fn test_bigrams_n_eq_len() {
let sequence = vec!["a", "b"];
let sequence = ["a", "b"];
let mut bigrams = ngrams(sequence.iter(), 2);
let bigram1 = vec!["a", "b"];
let expected = vec![bigram1.iter()];
@ -166,43 +165,37 @@ mod tests {
// everygrams with n = 2 over four tokens: for each start position the 1-gram and, where a
// next token exists, the 2-gram are expected, in that order.
#[test]
fn test_everygrams_n_eq_2() {
let sequence = vec!["a", "b", "c", "d"];
let sequence = ["a", "b", "c", "d"];
let mut grams = everygrams(sequence.iter(), 2);
// let gram1 = vec!["a"];
// let gram2 = vec!["a", "b"];
// let gram3 = vec!["b"];
// let gram4 = vec!["b", "c"];
// let gram5 = vec!["c"];
// let gram6 = vec!["c", "d"];
// let gram7 = vec!["d"];
// let expected = vec![gram1.iter(), gram2.iter(), gram3.iter(), gram4.iter(), gram5.iter(), gram6.iter(), gram7.iter()];
let gram1 = ["a"];
let gram2 = ["a", "b"];
let gram3 = ["b"];
let gram4 = ["b", "c"];
let gram5 = ["c"];
let gram6 = ["c", "d"];
let gram7 = ["d"];
let expected = vec![gram1.iter(), gram2.iter(), gram3.iter(), gram4.iter(), gram5.iter(), gram6.iter(), gram7.iter()];
for i in grams{
for j in i{
print!("{},",j);
}
println!();
}
// should_be_equal_list_of_lists(&mut bigrams, expected);
should_be_equal_list_of_lists(&mut grams, expected);
}
#[test]
fn test_everygrams_n_eq_3() {
let sequence = vec!["a", "b", "c", "d", "e"];
let sequence = ["a", "b", "c", "d", "e"];
let mut bigrams = everygrams(sequence.iter(), 3);
let gram1 = vec!["a"];
let gram2 = vec!["a", "b"];
let gram3 = vec!["a", "b", "c"];
let gram4 = vec!["b"];
let gram5 = vec!["b", "c"];
let gram6 = vec!["b", "c", "d"];
let gram7 = vec!["c"];
let gram8 = vec!["c", "d"];
let gram9 = vec!["c", "d", "e"];
let gram10 = vec!["d"];
let gram11 = vec!["d", "e"];
let gram12 = vec!["e"];
let gram1 = ["a"];
let gram2 = ["a", "b"];
let gram3 = ["a", "b", "c"];
let gram4 = ["b"];
let gram5 = ["b", "c"];
let gram6 = ["b", "c", "d"];
let gram7 = ["c"];
let gram8 = ["c", "d"];
let gram9 = ["c", "d", "e"];
let gram10 = ["d"];
let gram11 = ["d", "e"];
let gram12 = ["e"];
let expected = vec![gram1.iter(), gram2.iter(), gram3.iter(), gram4.iter(), gram5.iter(), gram6.iter(), gram7.iter(), gram8.iter(), gram9.iter(), gram10.iter(), gram11.iter(), gram12.iter()];
@ -211,16 +204,9 @@ mod tests {
// Flattening the everygrams (n = 3) of a five-token sequence into one token stream.
//
// NOTE(review): `expected` holds 18 tokens and ends with the grams that start at "c".
// The grams listed in test_everygrams_n_eq_3 for the same input (12 grams, 22 tokens) also
// include "d", "d e" and "e". If `flatten` yields those too, the missing tail goes unnoticed
// because should_be_equal_lists zips both sides and stops at the shorter one — confirm
// whether `expected` should be extended.
#[test]
fn test_flatten() {
let sequence = vec!["a", "b", "c", "d", "e"];
let expected = vec!["a", "a", "b", "a", "b", "c", "b", "b", "c", "b", "c", "d", "c", "c", "d", "c", "d", "e"];
let sequence = ["a", "b", "c", "d", "e"];
let expected = ["a", "a", "b", "a", "b", "c", "b", "b", "c", "b", "c", "d", "c", "c", "d", "c", "d", "e"];
should_be_equal_lists(flatten(everygrams(sequence.iter(), 3)), expected);
}
#[test]
fn example() {
let text = vec![vec!["a", "b", "c"], vec!["a", "c", "d", "c", "e", "f"]];
let result: Vec<&&str> = text.iter().map(|sent|pad_both_ends(sent.iter(),2)).flatten().collect();
println!("{:?}", result);
should_be_equal_lists(flatten(everygrams(sequence.iter(), 3)), &expected);
}
}