rltk/src/util/ngrams.rs
2022-05-09 22:17:30 +02:00

125 lines
No EOL
3.6 KiB
Rust

pub struct NGramSequenceIter<'a> {
sequence: Box<dyn Iterator<Item=&'a &'a str> + 'a>,
n: usize,
current_ngram: Vec<&'a &'a str>,
}
impl<'a> NGramSequenceIter<'a> {
pub(crate) fn new(sequence: impl Iterator<Item=&'a &'a str> + 'a, n: usize) -> Self {
Self {
sequence: Box::new(sequence),
n,
current_ngram: Vec::new(),
}
}
}
impl<'a> Iterator for NGramSequenceIter<'a> {
type Item = Box<dyn Iterator<Item=&'a &'a str> + 'a>;
fn next(&mut self) -> Option<Self::Item> {
return if self.current_ngram.len() == 0 {
for _ in 0..self.n {
if let Some(item) = self.sequence.next() {
self.current_ngram.push(item);
} else {
return None; // n > len
}
}
Some(Box::new(self.current_ngram.clone().into_iter()))
} else {
self.current_ngram.remove(0);
let maybe_next = self.sequence.next();
if maybe_next.is_some() {
self.current_ngram.push(&maybe_next.unwrap());
Some(Box::new(self.current_ngram.clone().into_iter()))
} else {
None
}
};
}
}
pub struct EveryGramSequenceIter<'a> {
sequence: Box<dyn Iterator<Item=&'a &'a str> + 'a>,
n: usize,
current_ngram: Vec<&'a &'a str>,
current_size: usize,
}
impl<'a> EveryGramSequenceIter<'a> {
pub(crate) fn everygrams(sequence: impl Iterator<Item=&'a &'a str> + 'a, n: usize) -> Self {
Self {
sequence: Box::new(sequence),
n,
current_ngram: Vec::new(),
current_size: 0,
}
}
}
impl<'a> Iterator for EveryGramSequenceIter<'a> {
type Item = Box<dyn Iterator<Item=&'a &'a str> + 'a>;
//noinspection DuplicatedCode, hard to deduplicate because of early return
fn next(&mut self) -> Option<Self::Item> {
if self.current_ngram.len() == 0 {
for _ in 0..self.n {
if let Some(item) = self.sequence.next() {
self.current_ngram.push(item);
} else {
return None; // n > len
}
}
}
self.current_size += 1;
if self.current_size > self.n {
self.current_size = 1;
self.current_ngram.remove(0);
let maybe_next = self.sequence.next();
if maybe_next.is_some() {
self.current_ngram.push(&maybe_next.unwrap());
} else { return None; }
}
return Some(Box::new(self.current_ngram.clone().into_iter().take(self.current_size)));
}
}
pub struct FlatteningIter<'a> {
ngrams: Box<dyn Iterator<Item=Box<dyn Iterator<Item=&'a &'a str> + 'a>> + 'a>,
current_ngram: Option<Box<dyn Iterator<Item=&'a &'a str> + 'a>>,
}
impl<'a> FlatteningIter<'a> {
pub(crate) fn new(ngrams: impl Iterator<Item=Box<dyn Iterator<Item=&'a &'a str> + 'a>> + 'a) -> Self {
Self {
ngrams: Box::new(ngrams),
current_ngram: None,
}
}
}
impl<'a> Iterator for FlatteningIter<'a> {
type Item = &'a &'a str;
fn next(&mut self) -> Option<Self::Item> {
if self.current_ngram.is_none() {
self.current_ngram = self.ngrams.next();
}
while let Some(ref mut current_ngram) = self.current_ngram {
let current_item = current_ngram.next();
if current_item.is_some() {
return current_item;
} else {
self.current_ngram = self.ngrams.next();
}
}
None
}
}