getting there

This commit is contained in:
Sander Hautvast 2022-04-29 12:27:22 +02:00
parent 88acea97b7
commit d8b79c2e36
3 changed files with 94 additions and 11 deletions

7
Cargo.lock generated
View file

@ -2,10 +2,17 @@
# It is not intended for manual editing.
version = 3
[[package]]
name = "more-asserts"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7843ec2de400bcbc6a6328c958dc38e5359da6e93e72e37bc5246bf1ae776389"
[[package]]
name = "rltk"
version = "0.1.0"
dependencies = [
"more-asserts",
"unicode-segmentation",
]

View file

@ -5,3 +5,6 @@ edition = "2021"
[dependencies]
unicode-segmentation = "1.9.0"
[dev-dependencies]
more-asserts = "0.2.2"

View file

@ -1,5 +1,8 @@
/// Pads a sequence of words
/// sentence: sequence to pad, in the form of an Iterator of string slices.
use std::slice::Iter;
/// Returns a padded sequence of items before ngram extraction.
///
/// sequence: sequence of items to pad, in the form of an Iterator of string slices.
/// pad_left: if set to true, prepends a padding symbol to the sentence
/// left_pad_symbol: the padding symbol to prepend
/// pad_right: if set to true, appends a padding symbol after the sentence
@ -9,22 +12,65 @@ pub fn pad_sequence<'a>(sentence: impl Iterator<Item=&'a str> + 'static, pad_lef
Padder::new(Box::new(sentence), pad_left, left_pad_symbol, pad_right, right_pad_symbol, n)
}
/// Pads a sequence of words, left-padding only. Convenience function that prevents useless arguments
/// sentence: sequence to pad, in the form of an Iterator of string slices.
/// Returns a padded sequence of items before ngram extraction, left-padding only. Convenience function that prevents useless arguments
/// sequence: sequence of items to pad, in the form of an Iterator of string slices.
/// left_pad_symbol: the padding symbol to prepend
/// n: the n in n-grams; so for bigrams set to 2, etc
pub fn pad_sequence_left<'a>(text: impl Iterator<Item=&'a str> + 'static, left_pad_symbol: &'static str, n: usize) -> impl Iterator<Item=&'a str> {
Padder::new(Box::new(text), true, left_pad_symbol, false, "", n)
pub fn pad_sequence_left<'a>(sequence: impl Iterator<Item=&'a str> + 'static, left_pad_symbol: &'static str, n: usize) -> impl Iterator<Item=&'a str> {
Padder::new(Box::new(sequence), true, left_pad_symbol, false, "", n)
}
/// Pads a sequence of words, right-padding only. Convenience function that prevents useless arguments
/// Returns a padded sequence of items before ngram extraction, right-padding only. Convenience function that prevents useless arguments
///
/// sentence: sequence to pad, in the form of an Iterator of string slices.
/// sequence: sequence of items to pad, in the form of an Iterator of string slices.
/// (right padding is always applied; this function takes no pad_right flag)
/// right_pad_symbol: the padding symbol to append
/// n: the n in n-grams; so for bigrams set to 2, etc
pub fn pad_sequence_right<'a>(text: impl Iterator<Item=&'a str> + 'static, right_pad_symbol: &'static str, n: usize) -> impl Iterator<Item=&'a str> {
Padder::new(Box::new(text), false, "", true, right_pad_symbol, n)
pub fn pad_sequence_right<'a>(sequence: impl Iterator<Item=&'a str> + 'static, right_pad_symbol: &'static str, n: usize) -> impl Iterator<Item=&'a str> + 'a {
Padder::new(Box::new(sequence), false, "", true, right_pad_symbol, n)
}
/// Returns the n-grams generated from a sequence of items, as an iterator.
///
/// This is a sliding window of width `n` over `sequence`: the first n-gram covers items
/// `0..n`, the next one items `1..n + 1`, and so on. Every n-gram is itself an iterator
/// over the items in that window.
///
/// sequence: the items to generate n-grams from
/// n: the n in n-grams; so for bigrams set to 2, etc
///
/// The returned iterator is empty when `n` is 0 or when `sequence` holds fewer than `n`
/// items, and it keeps returning `None` once it is exhausted.
pub fn ngrams<'a>(sequence: &'a Vec<&'a str>, n: usize) -> impl Iterator<Item=impl Iterator<Item=&'a &'a str> + 'a> + 'a {
    NGramSequenceIter { sequence, n, index: 0 }
}

/// Iterator state behind [`ngrams`]: a window of width `n` sliding over `sequence`.
struct NGramSequenceIter<'a> {
    /// The items the window slides over.
    sequence: &'a Vec<&'a str>,
    /// Width of the window, i.e. the number of items per n-gram.
    n: usize,
    /// Position in `sequence` at which the next n-gram starts.
    index: usize,
}

impl<'a> Iterator for NGramSequenceIter<'a> {
    type Item = Box<dyn Iterator<Item=&'a &'a str> + 'a>;

    /// Yields the window starting at `index` and advances by one item, or returns `None`
    /// when no full window is left.
    fn next(&mut self) -> Option<Self::Item> {
        // A window of width 0 would never run out, so treat it as "no n-grams".
        if self.n == 0 {
            return None;
        }

        // Copy the shared reference out of `self` so the window borrows for 'a, not for
        // the duration of this call.
        let sequence: &'a Vec<&'a str> = self.sequence;

        // `get` returns None as soon as the window would run past the end; because `index`
        // is only advanced on success, the iterator stays exhausted afterwards.
        let end = self.index.checked_add(self.n)?;
        let window = sequence.get(self.index..end)?;
        self.index += 1;

        Some(Box::new(window.iter()))
    }
}
pub(crate) struct Padder<'a> {
@ -108,6 +154,33 @@ mod tests {
assert!(equal(padded, vec!["left", "a", "b", "c", "right"].into_iter()));
}
#[test]
fn test_bigrams() {
    let sequence = vec!["a", "b", "c", "d"];
    let mut bigrams = ngrams(&sequence, 2);

    // Each pair is one expected window of width 2, in the order it must be produced.
    for (first, second) in [("a", "b"), ("b", "c"), ("c", "d")] {
        let mut bigram = bigrams.next().unwrap();
        assert_eq!(*bigram.next().unwrap(), first);
        assert_eq!(*bigram.next().unwrap(), second);
        // A bigram holds exactly two items.
        assert!(bigram.next().is_none());
    }
}
fn equal<'a>(mut l1: impl Iterator<Item=&'a str>, mut l2: impl Iterator<Item=&'a str>) -> bool {
loop {
let e1 = l1.next();