added everygrams

This commit is contained in:
Sander Hautvast 2022-05-09 11:48:21 +02:00
parent 7c0a7adb7b
commit d907001acd
5 changed files with 96 additions and 7 deletions

View file

@ -1,3 +1,5 @@
// sandbox, to be removed
use unicode_segmentation::UnicodeSegmentation;
//could also be powers of 2 that are combined using bitwise-or

View file

@ -1,4 +1,4 @@
mod edit_distance;
mod ngrams;
mod edit_distance; // to be removed
mod ngrams;// to be removed
pub mod lm;
pub mod util;

View file

@ -1,3 +1,4 @@
// sandbox, to be removed
use std::cmp::Ordering;
use std::collections::BTreeMap;

View file

@ -64,6 +64,10 @@ pub fn trigrams<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a) -> impl Iter
ngrams::NGramSequenceIter::new(sequence, 3)
}
/// Returns an iterator over the "everygrams" of `sequence` up to length `n`.
///
/// This is a thin public entry point: all of the work is done by
/// `ngrams::EveryGramSequenceIter`, which is constructed here from the token
/// iterator and the maximum gram length and handed back behind an opaque
/// iterator type. Each item produced is itself an iterator over the tokens of
/// one gram.
pub fn everygrams<'a>(
    sequence: impl Iterator<Item=&'a &'a str> + 'a,
    n: usize,
) -> impl Iterator<Item=impl Iterator<Item=&'a &'a str> + 'a> + 'a {
    ngrams::EveryGramSequenceIter::everygrams(sequence, n)
}
#[cfg(test)]
mod tests {
use std::slice::Iter;
@ -154,10 +158,45 @@ mod tests {
should_be_equal_list_of_lists(&mut bigrams, expected)
}
fn should_be_equal_list_of_lists<'a>(bigrams: &mut impl Iterator<Item=impl Iterator<Item=&'a &'a str>>, expected: Vec<Iter<&'a str>>) {
for (mut left_outer, mut right_outer) in bigrams.zip(expected.into_iter()) {
for (left_inner, right_inner) in left_outer.zip(right_outer) {
assert_eq!(left_inner, right_inner);
#[test]
fn test_everygrams_n_eq_2() {
    // For every window of two tokens the grams of length 1 and 2 are expected,
    // in that order, before the window moves one token to the right.
    let tokens = vec!["a", "b", "c", "d"];
    let expected_grams: Vec<Vec<&str>> = vec![
        vec!["a"],
        vec!["a", "b"],
        vec!["b"],
        vec!["b", "c"],
        vec!["c"],
        vec!["c", "d"],
    ];
    let expected = expected_grams.iter().map(|gram| gram.iter()).collect();

    let mut actual = everygrams(tokens.iter(), 2);
    should_be_equal_list_of_lists(&mut actual, expected)
}
#[test]
fn test_everygrams_n_eq_3() {
    // For every window of three tokens the grams of length 1, 2 and 3 are
    // expected, in that order, before the window moves one token to the right.
    let tokens = vec!["a", "b", "c", "d", "e"];
    let expected_grams: Vec<Vec<&str>> = vec![
        vec!["a"],
        vec!["a", "b"],
        vec!["a", "b", "c"],
        vec!["b"],
        vec!["b", "c"],
        vec!["b", "c", "d"],
        vec!["c"],
        vec!["c", "d"],
        vec!["c", "d", "e"],
    ];
    let expected = expected_grams.iter().map(|gram| gram.iter()).collect();

    let mut actual = everygrams(tokens.iter(), 3);
    should_be_equal_list_of_lists(&mut actual, expected)
}
/// Asserts that `actual` yields exactly the grams listed in `expected`.
///
/// Both sides are collected into `Vec<Vec<&str>>` and compared as a whole, so
/// the assertion fails when a gram is missing, when there is an extra gram, or
/// when a gram has the wrong length or content. Pairing the two sides with
/// `zip` would stop at the shorter side and hide all of those mismatches.
///
/// `actual` is consumed up to (and including) its first `None`.
fn should_be_equal_list_of_lists<'a>(actual: &mut impl Iterator<Item=impl Iterator<Item=&'a &'a str>>, expected: Vec<Iter<&'a str>>) {
    let actual_grams: Vec<Vec<&str>> = actual
        .map(|gram| gram.copied().collect::<Vec<&str>>())
        .collect();
    let expected_grams: Vec<Vec<&str>> = expected
        .into_iter()
        .map(|gram| gram.copied().collect::<Vec<&str>>())
        .collect();
    assert_eq!(actual_grams, expected_grams);
}
}

View file

@ -40,3 +40,50 @@ impl<'a> Iterator for NGramSequenceIter<'a> {
};
}
}
/// Iterator over the "everygrams" of a token sequence.
///
/// A window of `n` tokens slides over the source one token at a time. For each
/// window position the prefixes of length `1, 2, ..., n` are yielded in that
/// order, each as its own boxed iterator over the tokens of the gram.
///
/// Iteration ends as soon as the source cannot supply a token that the window
/// needs. Consequently nothing is yielded when the source holds fewer than `n`
/// tokens, and no shorter grams are produced for the tail of the sequence.
/// Once `None` has been returned, every later call returns `None` as well.
pub struct EveryGramSequenceIter<'a> {
    /// Source of tokens; boxed so that any iterator type can be stored.
    sequence: Box<dyn Iterator<Item=&'a &'a str> + 'a>,
    /// Maximum gram length, which is also the window size.
    n: usize,
    /// The current window; empty until the first call to `next`.
    current_ngram: Vec<&'a &'a str>,
    /// Length of the prefix of the window that was yielded last.
    current_size: usize,
    /// Set when iteration has ended so that later calls keep returning `None`.
    exhausted: bool,
}

impl<'a> EveryGramSequenceIter<'a> {
    /// Creates an everygram iterator over `sequence` with maximum gram length `n`.
    ///
    /// With `n == 0` there is no gram to produce, so the iterator is empty.
    pub(crate) fn everygrams(sequence: impl Iterator<Item=&'a &'a str> + 'a, n: usize) -> Self {
        Self {
            sequence: Box::new(sequence),
            n,
            current_ngram: Vec::new(),
            current_size: 0,
            // A zero-sized window would otherwise lead to removing from an empty Vec.
            exhausted: n == 0,
        }
    }
}

impl<'a> Iterator for EveryGramSequenceIter<'a> {
    type Item = Box<dyn Iterator<Item=&'a &'a str> + 'a>;

    fn next(&mut self) -> Option<Self::Item> {
        if self.exhausted {
            return None;
        }

        if self.current_ngram.is_empty() {
            // First call: fill the window with the first `n` tokens.
            for _ in 0..self.n {
                match self.sequence.next() {
                    Some(token) => self.current_ngram.push(token),
                    None => {
                        // Fewer than `n` tokens in total: there is no full window.
                        self.exhausted = true;
                        return None;
                    }
                }
            }
        }

        self.current_size += 1;
        if self.current_size > self.n {
            // All prefixes of this window were yielded; slide it by one token.
            match self.sequence.next() {
                Some(token) => {
                    self.current_ngram.remove(0);
                    self.current_ngram.push(token);
                    self.current_size = 1;
                }
                None => {
                    self.exhausted = true;
                    return None;
                }
            }
        }

        // Copy only the prefix that makes up this gram; `current_size` never
        // exceeds `n`, which is the length of the filled window.
        let gram = self.current_ngram[..self.current_size].to_vec();
        Some(Box::new(gram.into_iter()))
    }
}