minor improvements
This commit is contained in:
parent
2c78b4cf6d
commit
b9fc3386f0
3 changed files with 60 additions and 68 deletions
|
|
@ -1,13 +1,19 @@
|
||||||
|
use crate::util::flatten;
|
||||||
|
|
||||||
/// Pads a sequence of words with defaults; prepends "<s>" and appends "</s>"
|
/// Pads a sequence of words with defaults; prepends "<s>" and appends "</s>"
|
||||||
///
|
///
|
||||||
/// text: the sequence of words (tokens) to pad, in the form of an Iterator of string slices.
|
/// text: the sequence of words (tokens) to pad, in the form of an Iterator of string slices.
|
||||||
/// n: the n in n-grams; so for bigrams set to 2, etc
|
/// order: the n in n-grams; so for bigrams set to 2, etc
|
||||||
pub fn pad_both_ends<'a>(text: impl Iterator<Item=&'a &'a str> + 'a, n: usize) -> impl Iterator<Item=&'a &'a str> {
|
pub fn pad_both_ends<'a>(text: impl Iterator<Item=&'a &'a str> + 'a, order: usize) -> impl Iterator<Item=&'a &'a str> {
|
||||||
crate::util::padding::Padder::new(Box::new(text), true, &"<s>", true,&"</s>", n)
|
crate::util::padding::Padder::new(Box::new(text), true, &"<s>", true, &"</s>", order)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn padded_everygrams<'a>(sentence: impl Iterator<Item=&'a &'a str> + 'a, n: usize) -> impl Iterator<Item=Box<dyn Iterator<Item=&'a &'a str> + 'a>> + 'a {
|
pub fn padded_everygrams<'a>(sentence: impl Iterator<Item=&'a &'a str> + 'a, order: usize) -> impl Iterator<Item=Box<dyn Iterator<Item=&'a &'a str> + 'a>> + 'a {
|
||||||
crate::util::everygrams(pad_both_ends(sentence, n), n)
|
crate::util::everygrams(pad_both_ends(sentence, order), order)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn padded_everygram_pipeline<'a>(text: impl Iterator<Item=&'a &'a str> + 'a, order: usize) -> (impl Iterator<Item=&'a &'a str>){
|
||||||
|
(text.iter().map(|sent| rltk::lm::preprocessing::pad_both_ends(sent.iter(), order)).flatten())//vocab
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
|
@ -17,7 +23,7 @@ mod tests{
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test(){
|
fn test(){
|
||||||
let sentence = vec!["a","b", "c"];
|
let sentence = ["a","b", "c"];
|
||||||
let mut bigrams = padded_everygrams(sentence.iter(),2);
|
let mut bigrams = padded_everygrams(sentence.iter(),2);
|
||||||
|
|
||||||
let bigram1 = vec!["<s>"];
|
let bigram1 = vec!["<s>"];
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,8 @@
|
||||||
use std::slice::Iter;
|
use std::slice::Iter;
|
||||||
|
|
||||||
pub fn should_be_equal_lists<'a>(left: impl Iterator<Item=&'a &'a str>, right: Vec<&'a str>) {
|
pub fn should_be_equal_lists<'a>(left: impl Iterator<Item=&'a &'a str>, right: &[&'a str]) {
|
||||||
for (left, right) in left.zip(right.into_iter()) {
|
for (left, right) in left.zip(right.into_iter()) {
|
||||||
assert_eq!(*left, right);
|
assert_eq!(left, right);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
108
src/util/mod.rs
108
src/util/mod.rs
|
|
@ -29,7 +29,7 @@ pub fn pad_sequence_left<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a, lef
|
||||||
/// pad_right: if set to true, appends a padding symbol after the sentence
|
/// pad_right: if set to true, appends a padding symbol after the sentence
|
||||||
/// right_pad_symbol: the padding symbol to append
|
/// right_pad_symbol: the padding symbol to append
|
||||||
/// n: the n in n-grams; so for bigrams set to 2, etc
|
/// n: the n in n-grams; so for bigrams set to 2, etc
|
||||||
pub fn pad_sequence_right<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a, right_pad_symbol: &'a &'a str, n: usize) -> impl Iterator<Item=&'a &'a str> + 'a {
|
pub fn pad_sequence_right<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a, right_pad_symbol: &'a &'a str, n: usize) -> impl Iterator<Item=&'a &'a str> {
|
||||||
Padder::new(Box::new(sequence), false, &"", true, right_pad_symbol, n)
|
Padder::new(Box::new(sequence), false, &"", true, right_pad_symbol, n)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -52,72 +52,71 @@ pub fn pad_sequence_right<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a, ri
|
||||||
/// }
|
/// }
|
||||||
/// ```
|
/// ```
|
||||||
///
|
///
|
||||||
pub fn ngrams<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a, n: usize) -> impl Iterator<Item=impl Iterator<Item=&'a &'a str> + 'a> + 'a {
|
pub fn ngrams<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a, n: usize) -> impl Iterator<Item=impl Iterator<Item=&'a &'a str> + 'a> {
|
||||||
ngrams::NGramSequenceIter::new(sequence, n)
|
ngrams::NGramSequenceIter::new(sequence, n)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn bigrams<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a) -> impl Iterator<Item=impl Iterator<Item=&'a &'a str> + 'a> + 'a {
|
pub fn bigrams<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a) -> impl Iterator<Item=impl Iterator<Item=&'a &'a str> + 'a> {
|
||||||
ngrams::NGramSequenceIter::new(sequence, 2)
|
ngrams::NGramSequenceIter::new(sequence, 2)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn trigrams<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a) -> impl Iterator<Item=impl Iterator<Item=&'a &'a str> + 'a> + 'a {
|
pub fn trigrams<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a) -> impl Iterator<Item=impl Iterator<Item=&'a &'a str> + 'a> {
|
||||||
ngrams::NGramSequenceIter::new(sequence, 3)
|
ngrams::NGramSequenceIter::new(sequence, 3)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn everygrams<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a, n: usize) -> impl Iterator<Item=Box<dyn Iterator<Item=&'a &'a str> + 'a>> + 'a {
|
pub fn everygrams<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a, n: usize) -> impl Iterator<Item=Box<dyn Iterator<Item=&'a &'a str> + 'a>> {
|
||||||
ngrams::EveryGramSequenceIter::everygrams(sequence, n)
|
ngrams::EveryGramSequenceIter::everygrams(sequence, n)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn flatten<'a>(ngrams: impl Iterator<Item=Box<dyn Iterator<Item=&'a &'a str> + 'a>> + 'a) -> impl Iterator<Item=&'a &'a str> + 'a {
|
pub fn flatten<'a>(ngrams: impl Iterator<Item=Box<dyn Iterator<Item=&'a &'a str> + 'a>> + 'a) -> impl Iterator<Item=&'a &'a str> {
|
||||||
ngrams::FlatteningIter::new(ngrams)
|
ngrams::FlatteningIter::new(ngrams)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use crate::lm::preprocessing::pad_both_ends;
|
|
||||||
use super::*;
|
use super::*;
|
||||||
use crate::test::*;
|
use crate::test::*;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_pad_both_ends_default_n2() {
|
fn test_pad_both_ends_default_n2() {
|
||||||
let text = vec!["a", "b", "c"];
|
let text = ["a", "b", "c"];
|
||||||
let padded = pad_sequence(text.iter(), true, &"<s>", true, &"</s>", 2);
|
let padded = pad_sequence(text.iter(), true, &"<s>", true, &"</s>", 2);
|
||||||
should_be_equal_lists(padded, vec!["<s>", "a", "b", "c", "</s>"]);
|
should_be_equal_lists(padded, &["<s>", "a", "b", "c", "</s>"]);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_pad_left() {
|
fn test_pad_left() {
|
||||||
let text = vec!["a", "b", "c"];
|
let text = ["a", "b", "c"];
|
||||||
let padded = pad_sequence_left(text.iter(), &"<s>", 2);
|
let padded = pad_sequence_left(text.iter(), &"<s>", 2);
|
||||||
should_be_equal_lists(padded, vec!["<s>", "a", "b", "c"]);
|
should_be_equal_lists(padded, &["<s>", "a", "b", "c"]);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_pad_right() {
|
fn test_pad_right() {
|
||||||
let text = vec!["a", "b", "c"];
|
let text = ["a", "b", "c"];
|
||||||
let padded = pad_sequence_right(text.iter(), &"</s>", 2);
|
let padded = pad_sequence_right(text.iter(), &"</s>", 2);
|
||||||
|
|
||||||
should_be_equal_lists(padded, vec!["a", "b", "c", "</s>"]);
|
should_be_equal_lists(padded, &["a", "b", "c", "</s>"]);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_pad_both_ends_default_n_eq_3() {
|
fn test_pad_both_ends_default_n_eq_3() {
|
||||||
let text = vec!["a", "b", "c"];
|
let text = ["a", "b", "c"];
|
||||||
let padded = pad_sequence(text.iter(), true, &"<s>", true, &"</s>", 3);
|
let padded = pad_sequence(text.iter(), true, &"<s>", true, &"</s>", 3);
|
||||||
should_be_equal_lists(padded, vec!["<s>", "<s>", "a", "b", "c", "</s>", "</s>"]);
|
should_be_equal_lists(padded, &["<s>", "<s>", "a", "b", "c", "</s>", "</s>"]);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_pad_both_ends_non_default_symbols() {
|
fn test_pad_both_ends_non_default_symbols() {
|
||||||
let text = vec!["a", "b", "c"];
|
let text = ["a", "b", "c"];
|
||||||
let padded = pad_sequence(text.iter(), true, &"left", true, &"right", 2);
|
let padded = pad_sequence(text.iter(), true, &"left", true, &"right", 2);
|
||||||
|
|
||||||
should_be_equal_lists(padded, vec!["left", "a", "b", "c", "right"]);
|
should_be_equal_lists(padded, &["left", "a", "b", "c", "right"]);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_bigrams() {
|
fn test_bigrams() {
|
||||||
let sequence = vec!["a", "b", "c", "d"];
|
let sequence = ["a", "b", "c", "d"];
|
||||||
let mut bigrams = ngrams(sequence.iter(), 2);
|
let mut bigrams = ngrams(sequence.iter(), 2);
|
||||||
let bigram1 = vec!["a", "b"];
|
let bigram1 = vec!["a", "b"];
|
||||||
let bigram2 = vec!["b", "c"];
|
let bigram2 = vec!["b", "c"];
|
||||||
|
|
@ -129,7 +128,7 @@ mod tests {
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_trigrams() {
|
fn test_trigrams() {
|
||||||
let sequence = vec!["a", "b", "c", "d", "e"];
|
let sequence = ["a", "b", "c", "d", "e"];
|
||||||
let mut bigrams = ngrams(sequence.iter(), 3);
|
let mut bigrams = ngrams(sequence.iter(), 3);
|
||||||
let trigram1 = vec!["a", "b", "c"];
|
let trigram1 = vec!["a", "b", "c"];
|
||||||
let trigram2 = vec!["b", "c", "d"];
|
let trigram2 = vec!["b", "c", "d"];
|
||||||
|
|
@ -141,21 +140,21 @@ mod tests {
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_bigrams_n_gt_len() {
|
fn test_bigrams_n_gt_len() {
|
||||||
let sequence = vec!["a"];
|
let sequence = ["a"];
|
||||||
let mut bigrams = ngrams(sequence.iter(), 2);
|
let mut bigrams = ngrams(sequence.iter(), 2);
|
||||||
assert!(bigrams.next().is_none());
|
assert!(bigrams.next().is_none());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_bigrams_empty_sequence() {
|
fn test_bigrams_empty_sequence() {
|
||||||
let sequence = vec![];
|
let sequence: Vec<&str> = vec![];
|
||||||
let mut bigrams = ngrams(sequence.iter(), 10);
|
let mut bigrams = ngrams(sequence.iter(), 10);
|
||||||
assert!(bigrams.next().is_none());
|
assert!(bigrams.next().is_none());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_bigrams_n_eq_len() {
|
fn test_bigrams_n_eq_len() {
|
||||||
let sequence = vec!["a", "b"];
|
let sequence = ["a", "b"];
|
||||||
let mut bigrams = ngrams(sequence.iter(), 2);
|
let mut bigrams = ngrams(sequence.iter(), 2);
|
||||||
let bigram1 = vec!["a", "b"];
|
let bigram1 = vec!["a", "b"];
|
||||||
let expected = vec![bigram1.iter()];
|
let expected = vec![bigram1.iter()];
|
||||||
|
|
@ -166,43 +165,37 @@ mod tests {
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_everygrams_n_eq_2() {
|
fn test_everygrams_n_eq_2() {
|
||||||
let sequence = vec!["a", "b", "c", "d"];
|
let sequence = ["a", "b", "c", "d"];
|
||||||
let mut grams = everygrams(sequence.iter(), 2);
|
let mut grams = everygrams(sequence.iter(), 2);
|
||||||
// let gram1 = vec!["a"];
|
let gram1 = ["a"];
|
||||||
// let gram2 = vec!["a", "b"];
|
let gram2 = ["a", "b"];
|
||||||
// let gram3 = vec!["b"];
|
let gram3 = ["b"];
|
||||||
// let gram4 = vec!["b", "c"];
|
let gram4 = ["b", "c"];
|
||||||
// let gram5 = vec!["c"];
|
let gram5 = ["c"];
|
||||||
// let gram6 = vec!["c", "d"];
|
let gram6 = ["c", "d"];
|
||||||
// let gram7 = vec!["d"];
|
let gram7 = ["d"];
|
||||||
// let expected = vec![gram1.iter(), gram2.iter(), gram3.iter(), gram4.iter(), gram5.iter(), gram6.iter(), gram7.iter()];
|
let expected = vec![gram1.iter(), gram2.iter(), gram3.iter(), gram4.iter(), gram5.iter(), gram6.iter(), gram7.iter()];
|
||||||
|
|
||||||
for i in grams{
|
should_be_equal_list_of_lists(&mut grams, expected);
|
||||||
for j in i{
|
|
||||||
print!("{},",j);
|
|
||||||
}
|
|
||||||
println!();
|
|
||||||
}
|
|
||||||
// should_be_equal_list_of_lists(&mut bigrams, expected);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_everygrams_n_eq_3() {
|
fn test_everygrams_n_eq_3() {
|
||||||
let sequence = vec!["a", "b", "c", "d", "e"];
|
let sequence = ["a", "b", "c", "d", "e"];
|
||||||
let mut bigrams = everygrams(sequence.iter(), 3);
|
let mut bigrams = everygrams(sequence.iter(), 3);
|
||||||
|
|
||||||
let gram1 = vec!["a"];
|
let gram1 = ["a"];
|
||||||
let gram2 = vec!["a", "b"];
|
let gram2 = ["a", "b"];
|
||||||
let gram3 = vec!["a", "b", "c"];
|
let gram3 = ["a", "b", "c"];
|
||||||
let gram4 = vec!["b"];
|
let gram4 = ["b"];
|
||||||
let gram5 = vec!["b", "c"];
|
let gram5 = ["b", "c"];
|
||||||
let gram6 = vec!["b", "c", "d"];
|
let gram6 = ["b", "c", "d"];
|
||||||
let gram7 = vec!["c"];
|
let gram7 = ["c"];
|
||||||
let gram8 = vec!["c", "d"];
|
let gram8 = ["c", "d"];
|
||||||
let gram9 = vec!["c", "d", "e"];
|
let gram9 = ["c", "d", "e"];
|
||||||
let gram10 = vec!["d"];
|
let gram10 = ["d"];
|
||||||
let gram11 = vec!["d", "e"];
|
let gram11 = ["d", "e"];
|
||||||
let gram12 = vec!["e"];
|
let gram12 = ["e"];
|
||||||
|
|
||||||
let expected = vec![gram1.iter(), gram2.iter(), gram3.iter(), gram4.iter(), gram5.iter(), gram6.iter(), gram7.iter(), gram8.iter(), gram9.iter(), gram10.iter(), gram11.iter(), gram12.iter()];
|
let expected = vec![gram1.iter(), gram2.iter(), gram3.iter(), gram4.iter(), gram5.iter(), gram6.iter(), gram7.iter(), gram8.iter(), gram9.iter(), gram10.iter(), gram11.iter(), gram12.iter()];
|
||||||
|
|
||||||
|
|
@ -211,16 +204,9 @@ mod tests {
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_flatten() {
|
fn test_flatten() {
|
||||||
let sequence = vec!["a", "b", "c", "d", "e"];
|
let sequence = ["a", "b", "c", "d", "e"];
|
||||||
let expected = vec!["a", "a", "b", "a", "b", "c", "b", "b", "c", "b", "c", "d", "c", "c", "d", "c", "d", "e"];
|
let expected = ["a", "a", "b", "a", "b", "c", "b", "b", "c", "b", "c", "d", "c", "c", "d", "c", "d", "e"];
|
||||||
|
|
||||||
should_be_equal_lists(flatten(everygrams(sequence.iter(), 3)), expected);
|
should_be_equal_lists(flatten(everygrams(sequence.iter(), 3)), &expected);
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn example() {
|
|
||||||
let text = vec![vec!["a", "b", "c"], vec!["a", "c", "d", "c", "e", "f"]];
|
|
||||||
let result: Vec<&&str> = text.iter().map(|sent|pad_both_ends(sent.iter(),2)).flatten().collect();
|
|
||||||
println!("{:?}", result);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Loading…
Add table
Reference in a new issue