From d8b79c2e36933dc8d4fc1faa4faad96dc7786d95 Mon Sep 17 00:00:00 2001 From: Sander Hautvast Date: Fri, 29 Apr 2022 12:27:22 +0200 Subject: [PATCH] getting there --- Cargo.lock | 7 ++++ Cargo.toml | 5 ++- src/util/mod.rs | 93 +++++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 94 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5972a9d..98bb894 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,10 +2,17 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "more-asserts" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7843ec2de400bcbc6a6328c958dc38e5359da6e93e72e37bc5246bf1ae776389" + [[package]] name = "rltk" version = "0.1.0" dependencies = [ + "more-asserts", "unicode-segmentation", ] diff --git a/Cargo.toml b/Cargo.toml index 79f17da..bb580aa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,4 +4,7 @@ version = "0.1.0" edition = "2021" [dependencies] -unicode-segmentation = "1.9.0" \ No newline at end of file +unicode-segmentation = "1.9.0" + +[dev-dependencies] +more-asserts = "0.2.2" \ No newline at end of file diff --git a/src/util/mod.rs b/src/util/mod.rs index 9d2ef9b..eb57a42 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -1,5 +1,8 @@ -/// Pads a sequence of words -/// sentence: sequence to pad, in the form of an Iterator of string slices. +use std::slice::Iter; + +/// Returns a padded sequence of items before ngram extraction. +/// +/// sequence: sequence of items to pad, in the form of an Iterator of string slices. /// pad_left: if set to true, prepends a padding symbol to the sentence /// left_pad_symbol: the padding symbol to prepend /// pad_right: if set to true, appends a padding symbol after the sentence @@ -9,22 +12,65 @@ pub fn pad_sequence<'a>(sentence: impl Iterator + 'static, pad_lef Padder::new(Box::new(sentence), pad_left, left_pad_symbol, pad_right, right_pad_symbol, n) } -/// Pads a sequence of words, left-padding only. Convenience function that prevents useless arguments -/// sentence: sequence to pad, in the form of an Iterator of string slices. +/// Returns a padded sequence of items before ngram extraction, left-padding only. Convenience function that prevents useless arguments +/// sequence: sequence of items to pad, in the form of an Iterator of string slices. /// left_pad_symbol: the padding symbol to prepend /// n: the n in n-grams; so for bigrams set to 2, etc -pub fn pad_sequence_left<'a>(text: impl Iterator + 'static, left_pad_symbol: &'static str, n: usize) -> impl Iterator { - Padder::new(Box::new(text), true, left_pad_symbol, false, "", n) +pub fn pad_sequence_left<'a>(sequence: impl Iterator + 'static, left_pad_symbol: &'static str, n: usize) -> impl Iterator { + Padder::new(Box::new(sequence), true, left_pad_symbol, false, "", n) } -/// Pads a sequence of words, right-padding only. Convenience function that prevents useless arguments +/// Returns a padded sequence of items before ngram extraction, right-padding only. Convenience function that prevents useless arguments /// -/// sentence: sequence to pad, in the form of an Iterator of string slices. +/// sequence: sequence of items to pad, in the form of an Iterator of string slices. /// pad_right: if set to true, appends a padding symbol after the sentence /// right_pad_symbol: the padding symbol to append /// n: the n in n-grams; so for bigrams set to 2, etc -pub fn pad_sequence_right<'a>(text: impl Iterator + 'static, right_pad_symbol: &'static str, n: usize) -> impl Iterator { - Padder::new(Box::new(text), false, "", true, right_pad_symbol, n) +pub fn pad_sequence_right<'a>(sequence: impl Iterator + 'static, right_pad_symbol: &'static str, n: usize) -> impl Iterator + 'a { + Padder::new(Box::new(sequence), false, "", true, right_pad_symbol, n) +} + +/// Return the ngrams generated from a sequence of items, as an iterator. +// this is a windowing function on a list +// pub fn ngrams<'a>(mut sequence: impl Iterator + 'static, n: usize) -> impl Iterator + 'a> + 'a { +pub fn ngrams<'a>(sequence: &'a Vec<&'a str>, n: usize) -> impl Iterator + 'a> + 'a { + let mut ngram = Vec::new(); + + NGramSequenceIter { sequence: sequence, n, current_ngram: ngram, index: 0, sequence_iter: None } +} + +struct NGramSequenceIter<'a> { + sequence_iter: Option + 'a>>, + sequence: &'a Vec<&'a str>, + n: usize, + current_ngram: Vec<&'a &'a str>, + index: usize, +} + +impl<'a> Iterator for NGramSequenceIter<'a> { + type Item = Box + 'a>; + + fn next(&mut self) -> Option { + if self.current_ngram.len() == 0 { + self.sequence_iter = Some(Box::new(self.sequence.iter())); + for i in 0..self.n { + self.current_ngram.push(self.sequence_iter.as_mut().unwrap().next().unwrap()); + self.index += 1; + } + + return Some(Box::new(self.current_ngram.clone().into_iter())); + } else { + self.current_ngram.remove(0); + let maybe_next = self.sequence_iter.as_mut().unwrap().next(); + self.index += 1; + return if maybe_next.is_some() { + self.current_ngram.push(&maybe_next.unwrap()); + Some(Box::new(self.current_ngram.clone().into_iter())) + } else { + None + }; + } + } } pub(crate) struct Padder<'a> { @@ -108,6 +154,33 @@ mod tests { assert!(equal(padded, vec!["left", "a", "b", "c", "right"].into_iter())); } + #[test] + fn test_bigrams() { + let sequence = vec!["a", "b", "c", "d"]; + let mut bigrams = ngrams(&sequence, 2); + let mut bigram = bigrams.next().unwrap(); + let item = bigram.next().unwrap(); + assert_eq!(*item, "a"); + let item = bigram.next().unwrap(); + assert_eq!(*item, "b"); + assert!(bigram.next().is_none()); + + let mut bigram = bigrams.next().unwrap(); + let item = bigram.next().unwrap(); + assert_eq!(*item, "b"); + let item = bigram.next().unwrap(); + assert_eq!(*item, "c"); + assert!(bigram.next().is_none()); + + let mut bigram = bigrams.next().unwrap(); + let item = bigram.next().unwrap(); + assert_eq!(*item, "c"); + let item = bigram.next().unwrap(); + assert_eq!(*item, "d"); + assert!(bigram.next().is_none()); + } + + fn equal<'a>(mut l1: impl Iterator, mut l2: impl Iterator) -> bool { loop { let e1 = l1.next();