added everygrams
This commit is contained in:
parent
7c0a7adb7b
commit
d907001acd
5 changed files with 96 additions and 7 deletions
|
|
@ -1,3 +1,5 @@
|
||||||
|
// sandbox, to be removed
|
||||||
|
|
||||||
use unicode_segmentation::UnicodeSegmentation;
|
use unicode_segmentation::UnicodeSegmentation;
|
||||||
|
|
||||||
//could also be powers of 2 that are combined using bitwise-or
|
//could also be powers of 2 that are combined using bitwise-or
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
mod edit_distance;
|
mod edit_distance; // to be removed
|
||||||
mod ngrams;
|
mod ngrams;// to be removed
|
||||||
pub mod lm;
|
pub mod lm;
|
||||||
pub mod util;
|
pub mod util;
|
||||||
|
|
@ -1,3 +1,4 @@
|
||||||
|
// sandbox, to be removed
|
||||||
use std::cmp::Ordering;
|
use std::cmp::Ordering;
|
||||||
use std::collections::BTreeMap;
|
use std::collections::BTreeMap;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -64,6 +64,10 @@ pub fn trigrams<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a) -> impl Iter
|
||||||
ngrams::NGramSequenceIter::new(sequence, 3)
|
ngrams::NGramSequenceIter::new(sequence, 3)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns an iterator over the "everygrams" of `sequence` with maximum length `n`.
///
/// This is a thin wrapper that delegates to
/// `ngrams::EveryGramSequenceIter::everygrams`. Each yielded item is itself an
/// iterator over the tokens of one gram. For every window of `n` consecutive
/// tokens the grams are the window's prefixes of length `1..=n`, shortest first.
pub fn everygrams<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a, n: usize) -> impl Iterator<Item=impl Iterator<Item=&'a &'a str> + 'a> + 'a {
|
||||||
|
ngrams::EveryGramSequenceIter::everygrams(sequence, n)
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use std::slice::Iter;
|
use std::slice::Iter;
|
||||||
|
|
@ -154,10 +158,45 @@ mod tests {
|
||||||
should_be_equal_list_of_lists(&mut bigrams, expected)
|
should_be_equal_list_of_lists(&mut bigrams, expected)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn should_be_equal_list_of_lists<'a>(bigrams: &mut impl Iterator<Item=impl Iterator<Item=&'a &'a str>>, expected: Vec<Iter<&'a str>>) {
|
|
||||||
for (mut left_outer, mut right_outer) in bigrams.zip(expected.into_iter()) {
|
#[test]
|
||||||
for (left_inner, right_inner) in left_outer.zip(right_outer) {
|
fn test_everygrams_n_eq_2() {
|
||||||
assert_eq!(left_inner, right_inner);
|
let sequence = vec!["a", "b", "c", "d"];
|
||||||
|
let mut bigrams = everygrams(sequence.iter(), 2);
|
||||||
|
let gram1 = vec!["a"];
|
||||||
|
let gram2 = vec!["a", "b"];
|
||||||
|
let gram3 = vec!["b"];
|
||||||
|
let gram4 = vec!["b", "c"];
|
||||||
|
let gram5 = vec!["c"];
|
||||||
|
let gram6 = vec!["c", "d"];
|
||||||
|
let expected = vec![gram1.iter(), gram2.iter(), gram3.iter(), gram4.iter(), gram5.iter(), gram6.iter()];
|
||||||
|
|
||||||
|
should_be_equal_list_of_lists(&mut bigrams, expected)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Everygrams of a five-token sequence with n = 3: every window of three tokens
// contributes its prefixes of length 1, 2 and 3, in that order.
#[test]
fn test_everygrams_n_eq_3() {
    let tokens = vec!["a", "b", "c", "d", "e"];
    let mut actual = everygrams(tokens.iter(), 3);

    // One row per expected gram, listed in emission order.
    let grams: Vec<Vec<&str>> = vec![
        vec!["a"],
        vec!["a", "b"],
        vec!["a", "b", "c"],
        vec!["b"],
        vec!["b", "c"],
        vec!["b", "c", "d"],
        vec!["c"],
        vec!["c", "d"],
        vec!["c", "d", "e"],
    ];
    let expected: Vec<_> = grams.iter().map(|gram| gram.iter()).collect();

    should_be_equal_list_of_lists(&mut actual, expected)
}
|
||||||
|
|
||||||
|
fn should_be_equal_list_of_lists<'a>(actual: &mut impl Iterator<Item=impl Iterator<Item=&'a &'a str>>, expected: Vec<Iter<&'a str>>) {
|
||||||
|
for (mut actual_outer, expected_outer) in actual.zip(expected.into_iter()) {
|
||||||
|
for (actual_inner, expected_inner) in actual_outer.zip(expected_outer) {
|
||||||
|
// println!("{} {}", actual_inner, expected_inner);
|
||||||
|
assert_eq!(actual_inner, expected_inner);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -4,7 +4,7 @@ pub struct NGramSequenceIter<'a> {
|
||||||
current_ngram: Vec<&'a &'a str>,
|
current_ngram: Vec<&'a &'a str>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl <'a> NGramSequenceIter<'a> {
|
impl<'a> NGramSequenceIter<'a> {
|
||||||
pub(crate) fn new(sequence: impl Iterator<Item=&'a &'a str> + 'a, n: usize) -> Self {
|
pub(crate) fn new(sequence: impl Iterator<Item=&'a &'a str> + 'a, n: usize) -> Self {
|
||||||
Self {
|
Self {
|
||||||
sequence: Box::new(sequence),
|
sequence: Box::new(sequence),
|
||||||
|
|
@ -40,3 +40,50 @@ impl<'a> Iterator for NGramSequenceIter<'a> {
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Iterator over the "everygrams" of a token sequence.
///
/// The iterator slides a window of `n` consecutive tokens over the sequence.
/// For every window it yields the window's prefixes of length `1..=n`,
/// shortest first, before moving the window forward by one token.
///
/// For `["a", "b", "c", "d"]` and `n = 2` the yielded grams are
/// `[a]`, `[a, b]`, `[b]`, `[b, c]`, `[c]`, `[c, d]`.
///
/// Only complete windows are expanded: a sequence with fewer than `n` tokens
/// yields nothing, and no shorter grams are emitted for the tail of the input.
pub struct EveryGramSequenceIter<'a> {
    /// Remaining input tokens.
    sequence: Box<dyn Iterator<Item=&'a &'a str> + 'a>,
    /// Window width, i.e. the length of the longest gram.
    n: usize,
    /// Tokens of the current window (`n` items once it has been filled).
    current_ngram: Vec<&'a &'a str>,
    /// Length of the prefix emitted most recently for the current window.
    current_size: usize,
    /// Set once iteration has finished so that later calls keep returning `None`.
    exhausted: bool,
}

impl<'a> EveryGramSequenceIter<'a> {
    /// Creates an everygram iterator over `sequence` with maximum gram length `n`.
    ///
    /// With `n == 0` the iterator is empty.
    pub(crate) fn everygrams(sequence: impl Iterator<Item=&'a &'a str> + 'a, n: usize) -> Self {
        Self {
            sequence: Box::new(sequence),
            n,
            current_ngram: Vec::new(),
            current_size: 0,
            exhausted: false,
        }
    }
}

impl<'a> Iterator for EveryGramSequenceIter<'a> {
    type Item = Box<dyn Iterator<Item=&'a &'a str> + 'a>;

    /// Yields the next gram, or `None` once no complete window remains.
    fn next(&mut self) -> Option<Self::Item> {
        if self.exhausted {
            return None;
        }
        if self.n == 0 {
            // A zero-width window has no prefixes; without this guard the
            // sliding step below would call `remove(0)` on an empty vector.
            self.exhausted = true;
            return None;
        }

        if self.current_ngram.is_empty() {
            // Fill the window. It is only empty on the first call, or when
            // `n == 1` and the previous window was just consumed.
            self.current_ngram.extend(self.sequence.by_ref().take(self.n));
            if self.current_ngram.len() < self.n {
                // Fewer than `n` tokens available: no complete window.
                self.exhausted = true;
                return None;
            }
        }

        if self.current_size == self.n {
            // Every prefix of this window has been emitted: slide by one token.
            let incoming = match self.sequence.next() {
                Some(token) => token,
                None => {
                    self.exhausted = true;
                    return None;
                }
            };
            self.current_ngram.remove(0);
            self.current_ngram.push(incoming);
            self.current_size = 0;
        }
        self.current_size += 1;

        // Copy only the prefix that is handed out, not the whole window.
        let prefix = self.current_ngram[..self.current_size].to_vec();
        Some(Box::new(prefix.into_iter()))
    }
}

// Once `next` has returned `None` it keeps returning `None`.
impl<'a> std::iter::FusedIterator for EveryGramSequenceIter<'a> {}
|
||||||
Loading…
Add table
Reference in a new issue