getting there
This commit is contained in:
parent
88acea97b7
commit
d8b79c2e36
3 changed files with 94 additions and 11 deletions
7
Cargo.lock
generated
7
Cargo.lock
generated
|
|
@ -2,10 +2,17 @@
|
||||||
# It is not intended for manual editing.
|
# It is not intended for manual editing.
|
||||||
version = 3
|
version = 3
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "more-asserts"
|
||||||
|
version = "0.2.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7843ec2de400bcbc6a6328c958dc38e5359da6e93e72e37bc5246bf1ae776389"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "rltk"
|
name = "rltk"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"more-asserts",
|
||||||
"unicode-segmentation",
|
"unicode-segmentation",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -4,4 +4,7 @@ version = "0.1.0"
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
unicode-segmentation = "1.9.0"
|
unicode-segmentation = "1.9.0"
|
||||||
|
|
||||||
|
[dev-dependencies]
|
||||||
|
more-asserts = "0.2.2"
|
||||||
|
|
@ -1,5 +1,8 @@
|
||||||
/// Pads a sequence of words
|
use std::slice::Iter;
|
||||||
/// sentence: sequence to pad, in the form of an Iterator of string slices.
|
|
||||||
|
/// Returns a padded sequence of items before ngram extraction.
|
||||||
|
///
|
||||||
|
/// sequence: sequence of items to pad, in the form of an Iterator of string slices.
|
||||||
/// pad_left: if set to true, prepends a padding symbol to the sentence
|
/// pad_left: if set to true, prepends a padding symbol to the sentence
|
||||||
/// left_pad_symbol: the padding symbol to prepend
|
/// left_pad_symbol: the padding symbol to prepend
|
||||||
/// pad_right: if set to true, appends a padding symbol after the sentence
|
/// pad_right: if set to true, appends a padding symbol after the sentence
|
||||||
|
|
@ -9,22 +12,65 @@ pub fn pad_sequence<'a>(sentence: impl Iterator<Item=&'a str> + 'static, pad_lef
|
||||||
Padder::new(Box::new(sentence), pad_left, left_pad_symbol, pad_right, right_pad_symbol, n)
|
Padder::new(Box::new(sentence), pad_left, left_pad_symbol, pad_right, right_pad_symbol, n)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Pads a sequence of words, left-padding only. Convenience function that prevents useless arguments
|
/// Returns a padded sequence of items before ngram extraction, left-padding only. Convenience function that prevents useless arguments
|
||||||
/// sentence: sequence to pad, in the form of an Iterator of string slices.
|
/// sequence: sequence of items to pad, in the form of an Iterator of string slices.
|
||||||
/// left_pad_symbol: the padding symbol to prepend
|
/// left_pad_symbol: the padding symbol to prepend
|
||||||
/// n: the n in n-grams; so for bigrams set to 2, etc
|
/// n: the n in n-grams; so for bigrams set to 2, etc
|
||||||
pub fn pad_sequence_left<'a>(text: impl Iterator<Item=&'a str> + 'static, left_pad_symbol: &'static str, n: usize) -> impl Iterator<Item=&'a str> {
|
pub fn pad_sequence_left<'a>(sequence: impl Iterator<Item=&'a str> + 'static, left_pad_symbol: &'static str, n: usize) -> impl Iterator<Item=&'a str> {
|
||||||
Padder::new(Box::new(text), true, left_pad_symbol, false, "", n)
|
Padder::new(Box::new(sequence), true, left_pad_symbol, false, "", n)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Pads a sequence of words, right-padding only. Convenience function that prevents useless arguments
|
/// Returns a padded sequence of items before ngram extraction, right-padding only. Convenience function that prevents useless arguments
|
||||||
///
|
///
|
||||||
/// sentence: sequence to pad, in the form of an Iterator of string slices.
|
/// sequence: sequence of items to pad, in the form of an Iterator of string slices.
|
||||||
/// pad_right: if set to true, appends a padding symbol after the sentence
|
/// Right padding is always applied: a padding symbol is appended after the sequence
|
||||||
/// right_pad_symbol: the padding symbol to append
|
/// right_pad_symbol: the padding symbol to append
|
||||||
/// n: the n in n-grams; so for bigrams set to 2, etc
|
/// n: the n in n-grams; so for bigrams set to 2, etc
|
||||||
pub fn pad_sequence_right<'a>(text: impl Iterator<Item=&'a str> + 'static, right_pad_symbol: &'static str, n: usize) -> impl Iterator<Item=&'a str> {
|
pub fn pad_sequence_right<'a>(sequence: impl Iterator<Item=&'a str> + 'static, right_pad_symbol: &'static str, n: usize) -> impl Iterator<Item=&'a str> + 'a {
|
||||||
Padder::new(Box::new(text), false, "", true, right_pad_symbol, n)
|
Padder::new(Box::new(sequence), false, "", true, right_pad_symbol, n)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the ngrams generated from a sequence of items, as an iterator.
///
/// This is a sliding window of width `n` over `sequence`: the first ngram covers
/// items `0..n`, the next one items `1..n + 1`, and so on. Each ngram is itself an
/// iterator over references to the items it contains.
///
/// sequence: the items to build ngrams from
/// n: the n in n-grams; so for bigrams set to 2, etc
///
/// No ngram is produced when `n` is 0 or when `sequence` holds fewer than `n` items.
pub fn ngrams<'a>(sequence: &'a Vec<&'a str>, n: usize) -> impl Iterator<Item=impl Iterator<Item=&'a &'a str> + 'a> + 'a {
    NGramSequenceIter { sequence, n, index: 0 }
}

/// Iterator over every window of `n` consecutive items of `sequence`.
struct NGramSequenceIter<'a> {
    /// The items the windows are taken from.
    sequence: &'a Vec<&'a str>,
    /// Width of each window (the n in n-grams).
    n: usize,
    /// Position in `sequence` where the next window starts.
    index: usize,
}

impl<'a> Iterator for NGramSequenceIter<'a> {
    type Item = Box<dyn Iterator<Item=&'a &'a str> + 'a>;

    /// Yields the next ngram, or `None` once fewer than `n` items remain.
    ///
    /// Once `None` has been returned, every later call returns `None` as well:
    /// `index` is only advanced when a window was actually produced.
    fn next(&mut self) -> Option<Self::Item> {
        // A zero-width window would yield empty ngrams forever; produce nothing instead.
        if self.n == 0 {
            return None;
        }

        // Copy the reference out so the window borrows from the sequence ('a),
        // not from `self`.
        let sequence: &'a Vec<&'a str> = self.sequence;

        // `checked_add` guards against overflow for huge `n`; `get` returns None
        // (rather than panicking) when the window would run past the end.
        let end = self.index.checked_add(self.n)?;
        let window = sequence.get(self.index..end)?;
        self.index += 1;

        Some(Box::new(window.iter()))
    }
}
|
||||||
|
|
||||||
pub(crate) struct Padder<'a> {
|
pub(crate) struct Padder<'a> {
|
||||||
|
|
@ -108,6 +154,33 @@ mod tests {
|
||||||
assert!(equal(padded, vec!["left", "a", "b", "c", "right"].into_iter()));
|
assert!(equal(padded, vec!["left", "a", "b", "c", "right"].into_iter()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn test_bigrams() {
    let sequence = vec!["a", "b", "c", "d"];
    let mut bigrams = ngrams(&sequence, 2);

    // The bigrams must come out in sequence order, each holding exactly two items.
    for (first, second) in [("a", "b"), ("b", "c"), ("c", "d")] {
        let mut bigram = bigrams.next().unwrap();
        assert_eq!(*bigram.next().unwrap(), first);
        assert_eq!(*bigram.next().unwrap(), second);
        assert!(bigram.next().is_none());
    }
}
|
||||||
|
|
||||||
|
|
||||||
fn equal<'a>(mut l1: impl Iterator<Item=&'a str>, mut l2: impl Iterator<Item=&'a str>) -> bool {
|
fn equal<'a>(mut l1: impl Iterator<Item=&'a str>, mut l2: impl Iterator<Item=&'a str>) -> bool {
|
||||||
loop {
|
loop {
|
||||||
let e1 = l1.next();
|
let e1 = l1.next();
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue