reorganised the code
This commit is contained in:
parent
d8b79c2e36
commit
b42dab3c80
5 changed files with 168 additions and 126 deletions
|
|
@ -1,4 +1,4 @@
|
|||
mod edit_distance;
|
||||
mod ngrams;
|
||||
mod lm;
|
||||
mod util;
|
||||
pub mod lm;
|
||||
pub mod util;
|
||||
|
|
@ -3,7 +3,7 @@
|
|||
/// sentence: sequence of words, tokens, to pad, in the form of an Iterator of string slices.
|
||||
/// n: the n in n-grams; so for bigrams set to 2, etc
|
||||
pub fn pad_both_ends<'a>(text: impl Iterator<Item=&'a str> + 'static, n: usize) -> impl Iterator<Item=&'a str> {
|
||||
crate::util::Padder::new(Box::new(text), true, "<s>", true,"</s>", n)
|
||||
crate::util::padding::Padder::new(Box::new(text), true, "<s>", true,"</s>", n)
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
206
src/util/mod.rs
206
src/util/mod.rs
|
|
@ -1,4 +1,7 @@
|
|||
use std::slice::Iter;
|
||||
pub(crate) mod padding;
|
||||
mod ngrams;
|
||||
|
||||
use padding::Padder;
|
||||
|
||||
/// Returns a padded sequence of items before ngram extraction.
|
||||
///
|
||||
|
|
@ -31,173 +34,130 @@ pub fn pad_sequence_right<'a>(sequence: impl Iterator<Item=&'a str> + 'static, r
|
|||
}
|
||||
|
||||
/// Return the ngrams generated from a sequence of items, as an iterator.
|
||||
// this is a windowing function on a list
|
||||
// pub fn ngrams<'a>(mut sequence: impl Iterator<Item=&'a str> + 'static, n: usize) -> impl Iterator<Item=impl Iterator<Item=&'a str> + 'a> + 'a {
|
||||
pub fn ngrams<'a>(sequence: &'a Vec<&'a str>, n: usize) -> impl Iterator<Item=impl Iterator<Item=&'a &'a str> + 'a> + 'a {
|
||||
let mut ngram = Vec::new();
|
||||
|
||||
NGramSequenceIter { sequence: sequence, n, current_ngram: ngram, index: 0, sequence_iter: None }
|
||||
}
|
||||
|
||||
struct NGramSequenceIter<'a> {
|
||||
sequence_iter: Option<Box<dyn Iterator<Item=&'a &'a str> + 'a>>,
|
||||
sequence: &'a Vec<&'a str>,
|
||||
n: usize,
|
||||
current_ngram: Vec<&'a &'a str>,
|
||||
index: usize,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for NGramSequenceIter<'a> {
|
||||
type Item = Box<dyn Iterator<Item=&'a &'a str> + 'a>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
if self.current_ngram.len() == 0 {
|
||||
self.sequence_iter = Some(Box::new(self.sequence.iter()));
|
||||
for i in 0..self.n {
|
||||
self.current_ngram.push(self.sequence_iter.as_mut().unwrap().next().unwrap());
|
||||
self.index += 1;
|
||||
}
|
||||
|
||||
return Some(Box::new(self.current_ngram.clone().into_iter()));
|
||||
} else {
|
||||
self.current_ngram.remove(0);
|
||||
let maybe_next = self.sequence_iter.as_mut().unwrap().next();
|
||||
self.index += 1;
|
||||
return if maybe_next.is_some() {
|
||||
self.current_ngram.push(&maybe_next.unwrap());
|
||||
Some(Box::new(self.current_ngram.clone().into_iter()))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct Padder<'a> {
|
||||
n: usize,
|
||||
text: Box<dyn Iterator<Item=&'a str>>,
|
||||
pad_left: bool,
|
||||
left_index: isize,
|
||||
left_pad_symbol: &'static str,
|
||||
pad_right: bool,
|
||||
right_index: isize,
|
||||
right_pad_symbol: &'static str,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for Padder<'a> {
|
||||
type Item = &'a str;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
if self.pad_left && self.left_index < self.n as isize {
|
||||
self.left_index += 1;
|
||||
return Some(self.left_pad_symbol);
|
||||
} else {
|
||||
let maybe_next = self.text.next();
|
||||
if maybe_next.is_some() {
|
||||
return maybe_next;
|
||||
} else {
|
||||
if self.pad_right && self.right_index < self.n as isize {
|
||||
self.right_index += 1;
|
||||
return Some(self.right_pad_symbol);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Padder<'a> {
|
||||
pub(crate) fn new(text: Box<dyn Iterator<Item=&'a str>>, pad_left: bool, left_pad_symbol: &'static str,
|
||||
pad_right: bool, right_pad_symbol: &'static str, n: usize, ) -> Self {
|
||||
Self { text, n, pad_left, left_index: 1, left_pad_symbol, pad_right, right_index: 1, right_pad_symbol }
|
||||
}
|
||||
///
|
||||
/// sequence: the sequence items in the form of an Iterator over &&str
|
||||
/// use like:
|
||||
/// ```
|
||||
/// let sequence = vec!["a", "b", "c"];
|
||||
/// let mut bigrams = rltk::util::ngrams(sequence.iter(), 2);
|
||||
///
|
||||
/// let bigram1 = vec!["a", "b"];
|
||||
/// let bigram2 = vec!["b", "c"];
|
||||
/// let expected = vec![bigram1.iter(), bigram2.iter()];
|
||||
///
|
||||
/// for (mut left_outer,mut right_outer) in bigrams.zip(expected.into_iter()){
|
||||
/// for (left_inner,right_inner) in left_outer.zip(right_outer){
|
||||
/// assert_eq!(left_inner, right_inner);
|
||||
/// }
|
||||
/// }
|
||||
/// ```
|
||||
///
|
||||
pub fn ngrams<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a, n: usize) -> impl Iterator<Item=impl Iterator<Item=&'a &'a str> + 'a> + 'a {
|
||||
ngrams::NGramSequenceIter::new(sequence, n)
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::slice::Iter;
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_pad_both_ends_default_n2() {
|
||||
let text = vec!["a", "b", "c"].into_iter();
|
||||
let padded = pad_sequence(text, true, "<s>", true, "</s>", 2);
|
||||
assert!(equal(padded, vec!["<s>", "a", "b", "c", "</s>"].into_iter()));
|
||||
should_be_equal_lists(padded, vec!["<s>", "a", "b", "c", "</s>"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_pad_left() {
|
||||
let text = vec!["a", "b", "c"].into_iter();
|
||||
let padded = pad_sequence_left(text, "<s>", 2);
|
||||
assert!(equal(padded, vec!["<s>", "a", "b", "c"].into_iter()));
|
||||
should_be_equal_lists(padded, vec!["<s>", "a", "b", "c"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_pad_right() {
|
||||
let text = vec!["a", "b", "c"].into_iter();
|
||||
let padded = pad_sequence_right(text, "</s>", 2);
|
||||
assert!(equal(padded, vec!["a", "b", "c", "</s>"].into_iter()));
|
||||
|
||||
should_be_equal_lists(padded, vec!["a", "b", "c", "</s>"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_pad_both_ends_default_n_eq_3() {
|
||||
let text = vec!["a", "b", "c"].into_iter();
|
||||
let padded = pad_sequence(text, true, "<s>", true, "</s>", 3);
|
||||
assert!(equal(padded, vec!["<s>", "<s>", "a", "b", "c", "</s>", "</s>"].into_iter()));
|
||||
should_be_equal_lists(padded, vec!["<s>", "<s>", "a", "b", "c", "</s>", "</s>"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_pad_both_ends_non_default_symbols() {
|
||||
let text = vec!["a", "b", "c"].into_iter();
|
||||
let padded = pad_sequence(text, true, "left", true, "right", 2);
|
||||
assert!(equal(padded, vec!["left", "a", "b", "c", "right"].into_iter()));
|
||||
|
||||
should_be_equal_lists(padded, vec!["left", "a", "b", "c", "right"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bigrams() {
|
||||
let sequence = vec!["a", "b", "c", "d"];
|
||||
let mut bigrams = ngrams(&sequence, 2);
|
||||
let mut bigram = bigrams.next().unwrap();
|
||||
let item = bigram.next().unwrap();
|
||||
assert_eq!(*item, "a");
|
||||
let item = bigram.next().unwrap();
|
||||
assert_eq!(*item, "b");
|
||||
assert!(bigram.next().is_none());
|
||||
let mut bigrams = ngrams(sequence.iter(), 2);
|
||||
let bigram1 = vec!["a", "b"];
|
||||
let bigram2 = vec!["b", "c"];
|
||||
let bigram3 = vec!["c", "d"];
|
||||
let expected = vec![bigram1.iter(), bigram2.iter(), bigram3.iter()];
|
||||
|
||||
let mut bigram = bigrams.next().unwrap();
|
||||
let item = bigram.next().unwrap();
|
||||
assert_eq!(*item, "b");
|
||||
let item = bigram.next().unwrap();
|
||||
assert_eq!(*item, "c");
|
||||
assert!(bigram.next().is_none());
|
||||
|
||||
let mut bigram = bigrams.next().unwrap();
|
||||
let item = bigram.next().unwrap();
|
||||
assert_eq!(*item, "c");
|
||||
let item = bigram.next().unwrap();
|
||||
assert_eq!(*item, "d");
|
||||
assert!(bigram.next().is_none());
|
||||
should_be_equal_list_of_lists(&mut bigrams, expected)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_trigrams() {
|
||||
let sequence = vec!["a", "b", "c", "d", "e"];
|
||||
let mut bigrams = ngrams(sequence.iter(), 3);
|
||||
let trigram1 = vec!["a", "b", "c"];
|
||||
let trigram2 = vec!["b", "c", "d"];
|
||||
let trigram3 = vec!["c", "d", "e"];
|
||||
let expected = vec![trigram1.iter(), trigram2.iter(), trigram3.iter()];
|
||||
|
||||
fn equal<'a>(mut l1: impl Iterator<Item=&'a str>, mut l2: impl Iterator<Item=&'a str>) -> bool {
|
||||
loop {
|
||||
let e1 = l1.next();
|
||||
let e2 = l2.next();
|
||||
if e1.is_none() {
|
||||
return if e2.is_none() {
|
||||
true
|
||||
} else {
|
||||
false
|
||||
};
|
||||
} else if e2.is_none() {
|
||||
return false;
|
||||
} else {
|
||||
if e1.unwrap() != e2.unwrap() {
|
||||
return false;
|
||||
}
|
||||
should_be_equal_list_of_lists(&mut bigrams, expected)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bigrams_n_gt_len() {
|
||||
let sequence = vec!["a"];
|
||||
let mut bigrams = ngrams(sequence.iter(), 2);
|
||||
assert!(bigrams.next().is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bigrams_empty_sequence() {
|
||||
let sequence = vec![];
|
||||
let mut bigrams = ngrams(sequence.iter(), 10);
|
||||
assert!(bigrams.next().is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bigrams_n_eq_len() {
|
||||
let sequence = vec!["a", "b"];
|
||||
let mut bigrams = ngrams(sequence.iter(), 2);
|
||||
let bigram1 = vec!["a", "b"];
|
||||
let expected = vec![bigram1.iter()];
|
||||
|
||||
should_be_equal_list_of_lists(&mut bigrams, expected)
|
||||
}
|
||||
|
||||
fn should_be_equal_list_of_lists<'a>(bigrams: &mut impl Iterator<Item=impl Iterator<Item=&'a &'a str>>, expected: Vec<Iter<&'a str>>) {
|
||||
for (mut left_outer, mut right_outer) in bigrams.zip(expected.into_iter()) {
|
||||
for (left_inner, right_inner) in left_outer.zip(right_outer) {
|
||||
assert_eq!(left_inner, right_inner);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn should_be_equal_lists<'a>(left: impl Iterator<Item=&'a str>, right: Vec<&'a str>) {
|
||||
for (left, right) in left.zip(right.into_iter()) {
|
||||
assert_eq!(left, right);
|
||||
}
|
||||
}
|
||||
}
|
||||
42
src/util/ngrams.rs
Normal file
42
src/util/ngrams.rs
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
pub struct NGramSequenceIter<'a> {
|
||||
sequence: Box<dyn Iterator<Item=&'a &'a str> + 'a>,
|
||||
n: usize,
|
||||
current_ngram: Vec<&'a &'a str>,
|
||||
}
|
||||
|
||||
impl <'a> NGramSequenceIter<'a> {
|
||||
pub(crate) fn new(sequence: impl Iterator<Item=&'a &'a str> + 'a, n: usize) -> Self {
|
||||
Self {
|
||||
sequence: Box::new(sequence),
|
||||
n,
|
||||
current_ngram: Vec::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for NGramSequenceIter<'a> {
|
||||
type Item = Box<dyn Iterator<Item=&'a &'a str> + 'a>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
return if self.current_ngram.len() == 0 {
|
||||
for i in 0..self.n {
|
||||
if let Some(item) = self.sequence.next() {
|
||||
self.current_ngram.push(item);
|
||||
} else {
|
||||
return None; // n > len
|
||||
}
|
||||
}
|
||||
|
||||
Some(Box::new(self.current_ngram.clone().into_iter()))
|
||||
} else {
|
||||
self.current_ngram.remove(0);
|
||||
let maybe_next = self.sequence.next();
|
||||
if maybe_next.is_some() {
|
||||
self.current_ngram.push(&maybe_next.unwrap());
|
||||
Some(Box::new(self.current_ngram.clone().into_iter()))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
40
src/util/padding.rs
Normal file
40
src/util/padding.rs
Normal file
|
|
@ -0,0 +1,40 @@
|
|||
pub struct Padder<'a> {
|
||||
n: usize,
|
||||
text: Box<dyn Iterator<Item=&'a str>>,
|
||||
pad_left: bool,
|
||||
left_index: isize,
|
||||
left_pad_symbol: &'static str,
|
||||
pad_right: bool,
|
||||
right_index: isize,
|
||||
right_pad_symbol: &'static str,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for Padder<'a> {
|
||||
type Item = &'a str;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
if self.pad_left && self.left_index < self.n as isize {
|
||||
self.left_index += 1;
|
||||
return Some(self.left_pad_symbol);
|
||||
} else {
|
||||
let maybe_next = self.text.next();
|
||||
if maybe_next.is_some() {
|
||||
return maybe_next;
|
||||
} else {
|
||||
if self.pad_right && self.right_index < self.n as isize {
|
||||
self.right_index += 1;
|
||||
return Some(self.right_pad_symbol);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Padder<'a> {
|
||||
pub(crate) fn new(text: Box<dyn Iterator<Item=&'a str>>, pad_left: bool, left_pad_symbol: &'static str,
|
||||
pad_right: bool, right_pad_symbol: &'static str, n: usize, ) -> Self {
|
||||
Self { text, n, pad_left, left_index: 1, left_pad_symbol, pad_right, right_index: 1, right_pad_symbol }
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Reference in a new issue