getting there
This commit is contained in:
parent
88acea97b7
commit
d8b79c2e36
3 changed files with 94 additions and 11 deletions
7
Cargo.lock
generated
7
Cargo.lock
generated
|
|
@ -2,10 +2,17 @@
|
|||
# It is not intended for manual editing.
|
||||
version = 3
|
||||
|
||||
[[package]]
|
||||
name = "more-asserts"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7843ec2de400bcbc6a6328c958dc38e5359da6e93e72e37bc5246bf1ae776389"
|
||||
|
||||
[[package]]
|
||||
name = "rltk"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"more-asserts",
|
||||
"unicode-segmentation",
|
||||
]
|
||||
|
||||
|
|
|
|||
|
|
@ -4,4 +4,7 @@ version = "0.1.0"
|
|||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
unicode-segmentation = "1.9.0"
|
||||
unicode-segmentation = "1.9.0"
|
||||
|
||||
[dev-dependencies]
|
||||
more-asserts = "0.2.2"
|
||||
|
|
@ -1,5 +1,8 @@
|
|||
/// Pads a sequence of words
|
||||
/// sentence: sequence to pad, in the form of an Iterator of string slices.
|
||||
use std::slice::Iter;
|
||||
|
||||
/// Returns a padded sequence of items before ngram extraction.
|
||||
///
|
||||
/// sequence: sequence of items to pad, in the form of an Iterator of string slices.
|
||||
/// pad_left: if set to true, prepends a padding symbol to the sentence
|
||||
/// left_pad_symbol: the padding symbol to prepend
|
||||
/// pad_right: if set to true, appends a padding symbol after the sentence
|
||||
|
|
@ -9,22 +12,65 @@ pub fn pad_sequence<'a>(sentence: impl Iterator<Item=&'a str> + 'static, pad_lef
|
|||
Padder::new(Box::new(sentence), pad_left, left_pad_symbol, pad_right, right_pad_symbol, n)
|
||||
}
|
||||
|
||||
/// Pads a sequence of words, left-padding only. Convenience function that prevents useless arguments
|
||||
/// sentence: sequence to pad, in the form of an Iterator of string slices.
|
||||
/// Returns a padded sequence of items before ngram extraction, left-padding only. Convenience function that prevents useless arguments
|
||||
/// sequence: sequence of items to pad, in the form of an Iterator of string slices.
|
||||
/// left_pad_symbol: the padding symbol to prepend
|
||||
/// n: the n in n-grams; so for bigrams set to 2, etc
|
||||
pub fn pad_sequence_left<'a>(text: impl Iterator<Item=&'a str> + 'static, left_pad_symbol: &'static str, n: usize) -> impl Iterator<Item=&'a str> {
|
||||
Padder::new(Box::new(text), true, left_pad_symbol, false, "", n)
|
||||
pub fn pad_sequence_left<'a>(sequence: impl Iterator<Item=&'a str> + 'static, left_pad_symbol: &'static str, n: usize) -> impl Iterator<Item=&'a str> {
|
||||
Padder::new(Box::new(sequence), true, left_pad_symbol, false, "", n)
|
||||
}
|
||||
|
||||
/// Pads a sequence of words, right-padding only. Convenience function that prevents useless arguments
|
||||
/// Returns a padded sequence of items before ngram extraction, right-padding only. Convenience function that prevents useless arguments
|
||||
///
|
||||
/// sentence: sequence to pad, in the form of an Iterator of string slices.
|
||||
/// sequence: sequence of items to pad, in the form of an Iterator of string slices.
|
||||
/// A padding symbol is always appended after the sequence; this function takes no pad_right flag.
|
||||
/// right_pad_symbol: the padding symbol to append
|
||||
/// n: the n in n-grams; so for bigrams set to 2, etc
|
||||
pub fn pad_sequence_right<'a>(text: impl Iterator<Item=&'a str> + 'static, right_pad_symbol: &'static str, n: usize) -> impl Iterator<Item=&'a str> {
|
||||
Padder::new(Box::new(text), false, "", true, right_pad_symbol, n)
|
||||
pub fn pad_sequence_right<'a>(sequence: impl Iterator<Item=&'a str> + 'static, right_pad_symbol: &'static str, n: usize) -> impl Iterator<Item=&'a str> + 'a {
|
||||
Padder::new(Box::new(sequence), false, "", true, right_pad_symbol, n)
|
||||
}
|
||||
|
||||
/// Returns the n-grams generated from a sequence of items, as an iterator.
///
/// Each yielded item is itself an iterator over one n-gram: `n` consecutive
/// items of `sequence`. N-grams are produced in order, each one starting one
/// position after the previous one, so a sequence of `len` items yields
/// `len - n + 1` n-grams.
///
/// sequence: the items to slide the window over
/// n: the n in n-grams; so for bigrams set to 2, etc
///
/// Nothing is yielded when `n` is 0 or when `sequence` holds fewer than `n`
/// items. Once exhausted, the iterator keeps returning `None`.
pub fn ngrams<'a>(sequence: &'a Vec<&'a str>, n: usize) -> impl Iterator<Item=impl Iterator<Item=&'a &'a str> + 'a> + 'a {
    NGramSequenceIter { sequence: sequence.as_slice(), n, index: 0 }
}

/// Sliding window of width `n` over a borrowed sequence of string slices.
///
/// `index` is the position in `sequence` where the next n-gram starts.
#[derive(Debug, Clone)]
struct NGramSequenceIter<'a> {
    sequence: &'a [&'a str],
    n: usize,
    index: usize,
}

impl<'a> Iterator for NGramSequenceIter<'a> {
    type Item = std::slice::Iter<'a, &'a str>;

    fn next(&mut self) -> Option<Self::Item> {
        // A zero-width window has no meaningful n-grams; yielding empty ones
        // would never terminate.
        if self.n == 0 {
            return None;
        }

        // Copy the shared reference out so the window borrows for 'a rather
        // than for the duration of this `&mut self` call.
        let sequence: &'a [&'a str] = self.sequence;

        // `get` returns None as soon as the window would run past the end,
        // which covers both normal exhaustion and sequences shorter than `n`.
        // `index` is only advanced on success, so the iterator stays exhausted.
        let end = self.index.checked_add(self.n)?;
        let window = sequence.get(self.index..end)?;
        self.index += 1;

        Some(window.iter())
    }
}
|
||||
|
||||
pub(crate) struct Padder<'a> {
|
||||
|
|
@ -108,6 +154,33 @@ mod tests {
|
|||
assert!(equal(padded, vec!["left", "a", "b", "c", "right"].into_iter()));
|
||||
}
|
||||
|
||||
#[test]
fn test_bigrams() {
    // Sliding a window of width 2 over four items gives three bigrams;
    // each must contain exactly its two expected items, in order.
    let words = vec!["a", "b", "c", "d"];
    let mut windows = ngrams(&words, 2);

    for expected in [["a", "b"], ["b", "c"], ["c", "d"]] {
        let mut window = windows.next().unwrap();
        for want in expected {
            assert_eq!(*window.next().unwrap(), want);
        }
        // No third item may follow the pair.
        assert!(window.next().is_none());
    }
}
|
||||
|
||||
|
||||
fn equal<'a>(mut l1: impl Iterator<Item=&'a str>, mut l2: impl Iterator<Item=&'a str>) -> bool {
|
||||
loop {
|
||||
let e1 = l1.next();
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue