added flatten, moved edit_distance to the right place
This commit is contained in:
parent
d907001acd
commit
294158a640
8 changed files with 153 additions and 293 deletions
|
|
@ -1,99 +0,0 @@
|
||||||
// sandbox, to be removed
|
|
||||||
|
|
||||||
use unicode_segmentation::UnicodeSegmentation;
|
|
||||||
|
|
||||||
//could also be powers of 2 that are combined using bitwise-or
|
|
||||||
// enum Backtrace {
|
|
||||||
// LEFT,
|
|
||||||
// DOWN,
|
|
||||||
// DIAGONAL,
|
|
||||||
// }
|
|
||||||
|
|
||||||
struct Element {
|
|
||||||
value: usize,
|
|
||||||
// backtraces: Vec<Backtrace>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Element {
|
|
||||||
fn new() -> Self {
|
|
||||||
Self {
|
|
||||||
value: 0,
|
|
||||||
// backtraces: Vec::new(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn get_levenshtein_distance(word1: &str, word2: &str) -> usize {
|
|
||||||
get_edit_distance_table(word1, word2)[word1.len()][word2.len()].value
|
|
||||||
}
|
|
||||||
|
|
||||||
// non recursive implementation requires a table
|
|
||||||
// my guess is that this is more efficient (should check)
|
|
||||||
fn get_edit_distance_table(word1: &str, word2: &str) -> Vec<Vec<Element>> {
|
|
||||||
// create table
|
|
||||||
let mut table = Vec::new();
|
|
||||||
for _ in 0..=word1.len() {
|
|
||||||
let mut row = Vec::new();
|
|
||||||
for _ in 0..=word2.len() {
|
|
||||||
row.push(Element::new())
|
|
||||||
}
|
|
||||||
table.push(row);
|
|
||||||
}
|
|
||||||
|
|
||||||
// set the boundaries
|
|
||||||
for i in 0..=word1.len() {
|
|
||||||
table[i][0].value = i;
|
|
||||||
}
|
|
||||||
for i in 1..=word2.len() {
|
|
||||||
table[0][i].value = i;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (i1, g1) in word1.graphemes(true).enumerate() {
|
|
||||||
for (i2, g2) in word2.graphemes(true).enumerate() {
|
|
||||||
let d_del = table[i1][i2 + 1].value + 1; //deletion
|
|
||||||
let d_ins = table[i1 + 1][i2].value + 1; //insertion
|
|
||||||
let d_sub = table[i1][i2].value + (if g1 == g2 { 0 } else { 2 }); // substitution
|
|
||||||
let min = usize::min(d_del, usize::min(d_ins, d_sub));
|
|
||||||
let element = table[i1 + 1].get_mut(i2 + 1).unwrap();
|
|
||||||
element.value = min;
|
|
||||||
// if d_del == min {
|
|
||||||
// element.backtraces.push(Backtrace::DOWN);
|
|
||||||
// }
|
|
||||||
// if d_ins == min {
|
|
||||||
// element.backtraces.push(Backtrace::LEFT);
|
|
||||||
// }
|
|
||||||
// if d_sub == min {
|
|
||||||
// element.backtraces.push(Backtrace::DIAGONAL);
|
|
||||||
// }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
table
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
mod tests {
|
|
||||||
use super::{get_edit_distance_table, get_levenshtein_distance};
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_get_levenshtein_distance() {
|
|
||||||
assert_eq!(get_levenshtein_distance("intention", "execution"), 8);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_get_edit_distance_table() {
|
|
||||||
// example from Stanford NLP course: https://youtu.be/kgcEaoM_QJA
|
|
||||||
let word1 = "intention";
|
|
||||||
let word2 = "execution";
|
|
||||||
|
|
||||||
let outcome: [[usize; 10]; 10] = [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [1, 2, 3, 4, 5, 6, 7, 6, 7, 8], [2, 3, 4, 5, 6, 7, 8, 7, 8, 7], [3, 4, 5, 6, 7, 8, 7, 8, 9, 8], [4, 3, 4, 5, 6, 7, 8, 9, 10, 9],
|
|
||||||
[5, 4, 5, 6, 7, 8, 9, 10, 11, 10], [6, 5, 6, 7, 8, 9, 8, 9, 10, 11], [7, 6, 7, 8, 9, 10, 9, 8, 9, 10], [8, 7, 8, 9, 10, 11, 10, 9, 8, 9], [9, 8, 9, 10, 11, 12, 11, 10, 9, 8]];
|
|
||||||
|
|
||||||
let tab = get_edit_distance_table(word1, word2);
|
|
||||||
|
|
||||||
for (rowindex, row) in tab.iter().enumerate() {
|
|
||||||
for (colindex, element) in row.iter().enumerate() {
|
|
||||||
assert_eq!(outcome[rowindex][colindex], element.value);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,4 +1,3 @@
|
||||||
mod edit_distance; // to be removed
|
|
||||||
mod ngrams;// to be removed
|
|
||||||
pub mod lm;
|
pub mod lm;
|
||||||
pub mod util;
|
pub mod util;
|
||||||
|
pub mod metrics;
|
||||||
|
|
@ -1 +1 @@
|
||||||
mod preprocessing;
|
pub mod preprocessing;
|
||||||
52
src/metrics/distance.rs
Normal file
52
src/metrics/distance.rs
Normal file
|
|
@ -0,0 +1,52 @@
|
||||||
|
// sandbox, to be removed
|
||||||
|
|
||||||
|
use unicode_segmentation::UnicodeSegmentation;
|
||||||
|
|
||||||
|
/// A single cell of the edit-distance dynamic-programming table.
pub(crate) struct Element {
    // accumulated edit cost at this cell
    pub(crate) value: usize,
}

impl Element {
    /// A fresh cell with zero cost.
    fn new() -> Self {
        Self { value: 0 }
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// non recursive implementation requires a table
|
||||||
|
// my guess is that this is more efficient (should check)
|
||||||
|
pub(crate) fn get_edit_distance_table(word1: &str, word2: &str) -> Vec<Vec<Element>> {
|
||||||
|
// create table
|
||||||
|
let mut table = Vec::new();
|
||||||
|
for _ in 0..=word1.len() {
|
||||||
|
let mut row = Vec::new();
|
||||||
|
for _ in 0..=word2.len() {
|
||||||
|
row.push(Element::new())
|
||||||
|
}
|
||||||
|
table.push(row);
|
||||||
|
}
|
||||||
|
|
||||||
|
// set the boundaries
|
||||||
|
for i in 0..=word1.len() {
|
||||||
|
table[i][0].value = i;
|
||||||
|
}
|
||||||
|
for i in 1..=word2.len() {
|
||||||
|
table[0][i].value = i;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (i1, g1) in word1.graphemes(true).enumerate() {
|
||||||
|
for (i2, g2) in word2.graphemes(true).enumerate() {
|
||||||
|
let d_del = table[i1][i2 + 1].value + 1; //deletion
|
||||||
|
let d_ins = table[i1 + 1][i2].value + 1; //insertion
|
||||||
|
let d_sub = table[i1][i2].value + (if g1 == g2 { 0 } else { 2 }); // substitution
|
||||||
|
let min = usize::min(d_del, usize::min(d_ins, d_sub));
|
||||||
|
let element = table[i1 + 1].get_mut(i2 + 1).unwrap();
|
||||||
|
element.value = min;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
table
|
||||||
|
}
|
||||||
|
|
||||||
33
src/metrics/mod.rs
Normal file
33
src/metrics/mod.rs
Normal file
|
|
@ -0,0 +1,33 @@
|
||||||
|
pub mod distance;
|
||||||
|
|
||||||
|
pub fn edit_distance(s1: &str, s2: &str) -> usize {
|
||||||
|
distance::get_edit_distance_table(s1, s2)[s1.len()][s2.len()].value
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_get_levenshtein_distance() {
        assert_eq!(edit_distance("intention", "execution"), 8);
    }

    #[test]
    fn test_get_edit_distance_table() {
        // example from Stanford NLP course: https://youtu.be/kgcEaoM_QJA
        let expected: [[usize; 10]; 10] = [
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
            [1, 2, 3, 4, 5, 6, 7, 6, 7, 8],
            [2, 3, 4, 5, 6, 7, 8, 7, 8, 7],
            [3, 4, 5, 6, 7, 8, 7, 8, 9, 8],
            [4, 3, 4, 5, 6, 7, 8, 9, 10, 9],
            [5, 4, 5, 6, 7, 8, 9, 10, 11, 10],
            [6, 5, 6, 7, 8, 9, 8, 9, 10, 11],
            [7, 6, 7, 8, 9, 10, 9, 8, 9, 10],
            [8, 7, 8, 9, 10, 11, 10, 9, 8, 9],
            [9, 8, 9, 10, 11, 12, 11, 10, 9, 8],
        ];

        let table = distance::get_edit_distance_table("intention", "execution");

        // every computed cell must match the reference table
        for (rowindex, row) in table.iter().enumerate() {
            for (colindex, element) in row.iter().enumerate() {
                assert_eq!(expected[rowindex][colindex], element.value);
            }
        }
    }
}
|
||||||
179
src/ngrams.rs
179
src/ngrams.rs
|
|
@ -1,179 +0,0 @@
|
||||||
// sandbox, to be removed
|
|
||||||
use std::cmp::Ordering;
|
|
||||||
use std::collections::BTreeMap;
|
|
||||||
|
|
||||||
#[derive(Debug, Eq, Ord)]
|
|
||||||
pub struct NGram(Vec<&'static str>);
|
|
||||||
|
|
||||||
impl NGram {
|
|
||||||
pub fn new(elements: Vec<&'static str>) -> Self {
|
|
||||||
Self {
|
|
||||||
0: elements
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn new_bigram(element1: &'static str, element2: &'static str) -> Self {
|
|
||||||
Self {
|
|
||||||
0: vec![element1, element2]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn new_trigram(element1: &'static str, element2: &'static str, element3: &'static str) -> Self {
|
|
||||||
Self {
|
|
||||||
0: vec![element1, element2, element3]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn tail(&self) -> Self {
|
|
||||||
Self {
|
|
||||||
0: self.0[1..].to_vec()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn len(&self) -> usize {
|
|
||||||
self.0.len()
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn get(&self, index: usize) -> &'static str {
|
|
||||||
unsafe {
|
|
||||||
self.0.get_unchecked(index)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl PartialEq for NGram {
|
|
||||||
fn eq(&self, other: &Self) -> bool {
|
|
||||||
if self.len() != other.len() {
|
|
||||||
return false;
|
|
||||||
} else {
|
|
||||||
for (i, element) in self.0.iter().enumerate() {
|
|
||||||
if *element != other.get(i) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl PartialOrd for NGram {
|
|
||||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
|
||||||
if self.eq(other) {
|
|
||||||
Some(Ordering::Equal)
|
|
||||||
} else {
|
|
||||||
for (i, element) in self.0.iter().enumerate() {
|
|
||||||
if let Some(ordering) = element.partial_cmp(&other.get(i)) {
|
|
||||||
if ordering != Ordering::Equal {
|
|
||||||
return Some(ordering);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Some(Ordering::Equal)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct Model {
|
|
||||||
word_counts: BTreeMap<&'static str, u32>,
|
|
||||||
ngram_counts: BTreeMap<NGram, u32>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Model {
|
|
||||||
pub fn calc_digrams(corpus: Vec<Vec<&'static str>>) -> Self {
|
|
||||||
let mut word_counts = BTreeMap::new();
|
|
||||||
let mut ngram_counts = BTreeMap::new();
|
|
||||||
for sentence in corpus {
|
|
||||||
for word in sentence.iter() {
|
|
||||||
let count = word_counts.entry(*word).or_insert(0);
|
|
||||||
*count += 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
for i in 0..sentence.len() - 1 {
|
|
||||||
let ngram = NGram::new(vec![sentence[i], sentence[i + 1]]);
|
|
||||||
let count = ngram_counts.entry(ngram).or_insert(0);
|
|
||||||
*count += 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Self {
|
|
||||||
ngram_counts,
|
|
||||||
word_counts,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// only tested for 2-grams, and that's only happy cases
|
|
||||||
pub fn p(&self, ngram: NGram) -> Option<f64> {
|
|
||||||
// let mut probability = (*self.word_counts.get(ngram.get(0)).unwrap() as f64) / self.word_counts.len() as f64;
|
|
||||||
//
|
|
||||||
//
|
|
||||||
// for index in 0..ngram.len() - 1 {
|
|
||||||
// println!("{}", probability);
|
|
||||||
//
|
|
||||||
//
|
|
||||||
//
|
|
||||||
// println!("{}", ng_p);
|
|
||||||
// probability = ng_p * probability;
|
|
||||||
// }
|
|
||||||
// println!("{}", probability);
|
|
||||||
// Some(probability)
|
|
||||||
None
|
|
||||||
}
|
|
||||||
|
|
||||||
fn p_ngram(&self, ngram: NGram, intermediate: f64) -> f64 {
|
|
||||||
// for index in 0..ngram.len() - 1 {
|
|
||||||
// self.ngram_counts.get(&ngram)
|
|
||||||
// .map(|count| (*count as f64) / (self.word_counts[ngram.0.get(0).unwrap()] as f64)).unwrap()
|
|
||||||
// }
|
|
||||||
//
|
|
||||||
// if ngram.len() > 2 {
|
|
||||||
// println!("{}", intermediate);
|
|
||||||
// intermediate * self.pp(ngram.tail(), intermediate) //TODO
|
|
||||||
// } else {
|
|
||||||
// self.ngram_counts.get(&ngram)
|
|
||||||
// .map(|count| (*count as f64) / (self.word_counts[ngram.0.get(0).unwrap()] as f64)).unwrap() //TODO
|
|
||||||
// }
|
|
||||||
0.0
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
mod tests {
|
|
||||||
use super::*;
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_create_model() {
|
|
||||||
let corpus = vec![
|
|
||||||
vec!["<s>", "I", "am", "Sam", "</s>"],
|
|
||||||
vec!["<s>", "Sam", "I", "am", "</s>"],
|
|
||||||
vec!["<s>", "I", "do", "not", "like", "eggs", "and", "ham", "</s>"],
|
|
||||||
];
|
|
||||||
|
|
||||||
let model = Model::calc_digrams(corpus);
|
|
||||||
|
|
||||||
// assert_eq!(model.p(NGram::new(vec!["<s>", "I"])), Some(0.6666666666666666_f64));
|
|
||||||
// assert_eq!(model.p(NGram::new(vec!["Sam", "</s>"])), Some(0.5_f64));
|
|
||||||
// assert_eq!(model.p(NGram::new(vec!["<s>", "Sam"])), Some(0.33333333333333333_f64));
|
|
||||||
// assert_eq!(model.p(NGram::new(vec!["am", "Sam"])), Some(0.5_f64));
|
|
||||||
// assert_eq!(model.p(NGram::new(vec!["I", "am"])), Some(0.6666666666666666_f64));
|
|
||||||
|
|
||||||
println!("{:?}", model.p(NGram::new(vec!["I", "am", "Sam"])));
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_ngram_eq() {
|
|
||||||
let n1 = NGram::new(vec!["1", "2"]);
|
|
||||||
let n2 = NGram::new(vec!["1", "2"]);
|
|
||||||
let n3 = NGram::new(vec!["3", "4"]);
|
|
||||||
|
|
||||||
assert_eq!(n1, n2);
|
|
||||||
assert_ne!(n1, n3);
|
|
||||||
assert_ne!(n2, n3);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_ngram_tail() {
|
|
||||||
let n1 = NGram::new(vec!["1", "2", "3"]);
|
|
||||||
let n2 = NGram::new(vec!["2", "3"]);
|
|
||||||
assert_eq!(n1.tail(), n2);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
pub(crate) mod padding;
|
pub(crate) mod padding;
|
||||||
mod ngrams;
|
pub(crate) mod ngrams;
|
||||||
|
|
||||||
use padding::Padder;
|
use padding::Padder;
|
||||||
|
|
||||||
|
|
@ -64,10 +64,14 @@ pub fn trigrams<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a) -> impl Iter
|
||||||
ngrams::NGramSequenceIter::new(sequence, 3)
|
ngrams::NGramSequenceIter::new(sequence, 3)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn everygrams<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a, n: usize) -> impl Iterator<Item=impl Iterator<Item=&'a &'a str> + 'a> + 'a {
|
pub fn everygrams<'a>(sequence: impl Iterator<Item=&'a &'a str> + 'a, n: usize) -> impl Iterator<Item=Box<dyn Iterator<Item=&'a &'a str> + 'a>> + 'a {
|
||||||
ngrams::EveryGramSequenceIter::everygrams(sequence, n)
|
ngrams::EveryGramSequenceIter::everygrams(sequence, n)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Flattens a stream of n-grams (each itself an iterator over tokens) into
/// one continuous stream of tokens, preserving order. Thin wrapper around
/// [`ngrams::FlatteningIter`].
pub fn flatten<'a>(ngrams: impl Iterator<Item=Box<dyn Iterator<Item=&'a &'a str> + 'a>> + 'a) -> impl Iterator<Item=&'a &'a str> + 'a {
    ngrams::FlatteningIter::new(ngrams)
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use std::slice::Iter;
|
use std::slice::Iter;
|
||||||
|
|
@ -77,14 +81,14 @@ mod tests {
|
||||||
fn test_pad_both_ends_default_n2() {
|
fn test_pad_both_ends_default_n2() {
|
||||||
let text = vec!["a", "b", "c"].into_iter();
|
let text = vec!["a", "b", "c"].into_iter();
|
||||||
let padded = pad_sequence(text, true, "<s>", true, "</s>", 2);
|
let padded = pad_sequence(text, true, "<s>", true, "</s>", 2);
|
||||||
should_be_equal_lists(padded, vec!["<s>", "a", "b", "c", "</s>"]);
|
should_be_equal_lists2(padded, vec!["<s>", "a", "b", "c", "</s>"]);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_pad_left() {
|
fn test_pad_left() {
|
||||||
let text = vec!["a", "b", "c"].into_iter();
|
let text = vec!["a", "b", "c"].into_iter();
|
||||||
let padded = pad_sequence_left(text, "<s>", 2);
|
let padded = pad_sequence_left(text, "<s>", 2);
|
||||||
should_be_equal_lists(padded, vec!["<s>", "a", "b", "c"]);
|
should_be_equal_lists2(padded, vec!["<s>", "a", "b", "c"]);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|
@ -92,14 +96,14 @@ mod tests {
|
||||||
let text = vec!["a", "b", "c"].into_iter();
|
let text = vec!["a", "b", "c"].into_iter();
|
||||||
let padded = pad_sequence_right(text, "</s>", 2);
|
let padded = pad_sequence_right(text, "</s>", 2);
|
||||||
|
|
||||||
should_be_equal_lists(padded, vec!["a", "b", "c", "</s>"]);
|
should_be_equal_lists2(padded, vec!["a", "b", "c", "</s>"]);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_pad_both_ends_default_n_eq_3() {
|
fn test_pad_both_ends_default_n_eq_3() {
|
||||||
let text = vec!["a", "b", "c"].into_iter();
|
let text = vec!["a", "b", "c"].into_iter();
|
||||||
let padded = pad_sequence(text, true, "<s>", true, "</s>", 3);
|
let padded = pad_sequence(text, true, "<s>", true, "</s>", 3);
|
||||||
should_be_equal_lists(padded, vec!["<s>", "<s>", "a", "b", "c", "</s>", "</s>"]);
|
should_be_equal_lists2(padded, vec!["<s>", "<s>", "a", "b", "c", "</s>", "</s>"]);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|
@ -107,7 +111,7 @@ mod tests {
|
||||||
let text = vec!["a", "b", "c"].into_iter();
|
let text = vec!["a", "b", "c"].into_iter();
|
||||||
let padded = pad_sequence(text, true, "left", true, "right", 2);
|
let padded = pad_sequence(text, true, "left", true, "right", 2);
|
||||||
|
|
||||||
should_be_equal_lists(padded, vec!["left", "a", "b", "c", "right"]);
|
should_be_equal_lists2(padded, vec!["left", "a", "b", "c", "right"]);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|
@ -192,8 +196,16 @@ mod tests {
|
||||||
should_be_equal_list_of_lists(&mut bigrams, expected)
|
should_be_equal_list_of_lists(&mut bigrams, expected)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_flatten(){
|
||||||
|
let sequence = vec!["a", "b", "c", "d", "e"];
|
||||||
|
let expected = vec!["a", "a", "b", "a", "b", "c", "b", "b", "c", "b", "c", "d", "c", "c", "d", "c", "d", "e"];
|
||||||
|
|
||||||
|
should_be_equal_lists(flatten(everygrams(sequence.iter(), 3)), expected);
|
||||||
|
}
|
||||||
|
|
||||||
fn should_be_equal_list_of_lists<'a>(actual: &mut impl Iterator<Item=impl Iterator<Item=&'a &'a str>>, expected: Vec<Iter<&'a str>>) {
|
fn should_be_equal_list_of_lists<'a>(actual: &mut impl Iterator<Item=impl Iterator<Item=&'a &'a str>>, expected: Vec<Iter<&'a str>>) {
|
||||||
for (mut actual_outer, expected_outer) in actual.zip(expected.into_iter()) {
|
for (actual_outer, expected_outer) in actual.zip(expected.into_iter()) {
|
||||||
for (actual_inner, expected_inner) in actual_outer.zip(expected_outer) {
|
for (actual_inner, expected_inner) in actual_outer.zip(expected_outer) {
|
||||||
// println!("{} {}", actual_inner, expected_inner);
|
// println!("{} {}", actual_inner, expected_inner);
|
||||||
assert_eq!(actual_inner, expected_inner);
|
assert_eq!(actual_inner, expected_inner);
|
||||||
|
|
@ -201,7 +213,13 @@ mod tests {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Asserts that `left` yields exactly the elements of `right`, in order.
///
/// Both sides are materialised and compared as whole vectors: unlike a plain
/// `zip`, this also fails when the lengths differ, so a too-short or
/// too-long iterator can no longer pass silently.
fn should_be_equal_lists<'a>(left: impl Iterator<Item=&'a &'a str>, right: Vec<&'a str>) {
    let left: Vec<&'a str> = left.copied().collect();
    assert_eq!(left, right);
}
|
||||||
|
|
||||||
|
fn should_be_equal_lists2<'a>(left: impl Iterator<Item=&'a str>, right: Vec<&'a str>) {
|
||||||
for (left, right) in left.zip(right.into_iter()) {
|
for (left, right) in left.zip(right.into_iter()) {
|
||||||
assert_eq!(left, right);
|
assert_eq!(left, right);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -19,7 +19,7 @@ impl<'a> Iterator for NGramSequenceIter<'a> {
|
||||||
|
|
||||||
fn next(&mut self) -> Option<Self::Item> {
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
return if self.current_ngram.len() == 0 {
|
return if self.current_ngram.len() == 0 {
|
||||||
for i in 0..self.n {
|
for _ in 0..self.n {
|
||||||
if let Some(item) = self.sequence.next() {
|
if let Some(item) = self.sequence.next() {
|
||||||
self.current_ngram.push(item);
|
self.current_ngram.push(item);
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -62,9 +62,10 @@ impl<'a> EveryGramSequenceIter<'a> {
|
||||||
impl<'a> Iterator for EveryGramSequenceIter<'a> {
|
impl<'a> Iterator for EveryGramSequenceIter<'a> {
|
||||||
type Item = Box<dyn Iterator<Item=&'a &'a str> + 'a>;
|
type Item = Box<dyn Iterator<Item=&'a &'a str> + 'a>;
|
||||||
|
|
||||||
|
//noinspection DuplicatedCode, hard to deduplicate because of early return
|
||||||
fn next(&mut self) -> Option<Self::Item> {
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
if self.current_ngram.len() == 0 {
|
if self.current_ngram.len() == 0 {
|
||||||
for i in 0..self.n {
|
for _ in 0..self.n {
|
||||||
if let Some(item) = self.sequence.next() {
|
if let Some(item) = self.sequence.next() {
|
||||||
self.current_ngram.push(item);
|
self.current_ngram.push(item);
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -86,4 +87,39 @@ impl<'a> Iterator for EveryGramSequenceIter<'a> {
|
||||||
|
|
||||||
return Some(Box::new(self.current_ngram.clone().into_iter().take(self.current_size)));
|
return Some(Box::new(self.current_ngram.clone().into_iter().take(self.current_size)));
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Chains a sequence of boxed n-gram iterators into one continuous token
/// stream, yielding every token of every n-gram in order.
pub struct FlatteningIter<'a> {
    // the remaining n-grams still to be flattened
    ngrams: Box<dyn Iterator<Item=Box<dyn Iterator<Item=&'a &'a str> + 'a>> + 'a>,
    // the n-gram currently being drained, if any
    current_ngram: Option<Box<dyn Iterator<Item=&'a &'a str> + 'a>>,
}

impl<'a> FlatteningIter<'a> {
    /// Wraps `ngrams`; no n-gram is pulled until the first `next()` call.
    pub(crate) fn new(ngrams: impl Iterator<Item=Box<dyn Iterator<Item=&'a &'a str> + 'a>> + 'a) -> Self {
        Self {
            ngrams: Box::new(ngrams),
            current_ngram: None,
        }
    }
}

impl<'a> Iterator for FlatteningIter<'a> {
    type Item = &'a &'a str;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            match self.current_ngram.as_mut() {
                // no n-gram in progress: pull the next one, or finish
                None => self.current_ngram = Some(self.ngrams.next()?),
                // n-gram in progress: yield from it, or move on past it
                Some(ngram) => match ngram.next() {
                    Some(item) => return Some(item),
                    None => self.current_ngram = self.ngrams.next(),
                },
            }
        }
    }
}
|
||||||
Loading…
Add table
Reference in a new issue