commit bdd1dcea065a87ba2920db41382179c54c27818b Author: Sander Hautvast Date: Thu Apr 28 16:18:17 2022 +0200 first commit: padding, and draft for ngrams, probably useless diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..7c30d21 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,4 @@ +mod edit_distance; +mod ngrams; +mod lm; +mod util; \ No newline at end of file diff --git a/src/lm/mod.rs b/src/lm/mod.rs new file mode 100644 index 0000000..c8db2af --- /dev/null +++ b/src/lm/mod.rs @@ -0,0 +1 @@ +mod preprocessing; \ No newline at end of file diff --git a/src/lm/preprocessing.rs b/src/lm/preprocessing.rs new file mode 100644 index 0000000..e1e6a14 --- /dev/null +++ b/src/lm/preprocessing.rs @@ -0,0 +1,11 @@ +/// Pads a sequence of words with defaults; prepends "" and appends "" +/// +/// sentence: sequence of words, tokens, to pad, in the form of an Iterator of string slices. +/// n: the n in n-grams; so for bigrams set to 2, etc +pub fn pad_both_ends<'a>(text: impl Iterator + 'static, n: usize) -> impl Iterator { + crate::util::Padder::new(Box::new(text), true, "", true,"", n) +} + + + + diff --git a/src/ngrams.rs b/src/ngrams.rs new file mode 100644 index 0000000..cfdaadc --- /dev/null +++ b/src/ngrams.rs @@ -0,0 +1,178 @@ +use std::cmp::Ordering; +use std::collections::BTreeMap; + +#[derive(Debug, Eq, Ord)] +pub struct NGram(Vec<&'static str>); + +impl NGram { + pub fn new(elements: Vec<&'static str>) -> Self { + Self { + 0: elements + } + } + + pub fn new_bigram(element1: &'static str, element2: &'static str) -> Self { + Self { + 0: vec![element1, element2] + } + } + + pub fn new_trigram(element1: &'static str, element2: &'static str, element3: &'static str) -> Self { + Self { + 0: vec![element1, element2, element3] + } + } + + pub fn tail(&self) -> Self { + Self { + 0: self.0[1..].to_vec() + } + } + + pub fn len(&self) -> usize { + self.0.len() + } + + pub fn get(&self, index: usize) -> &'static str { + unsafe { + self.0.get_unchecked(index) + } + } +} + +impl PartialEq for NGram { + fn eq(&self, other: &Self) -> bool { + if self.len() != other.len() { + return false; + } else { + for (i, element) in self.0.iter().enumerate() { + if *element != other.get(i) { + return false; + } + } + } + + true + } +} + +impl PartialOrd for NGram { + fn partial_cmp(&self, other: &Self) -> Option { + if self.eq(other) { + Some(Ordering::Equal) + } else { + for (i, element) in self.0.iter().enumerate() { + if let Some(ordering) = element.partial_cmp(&other.get(i)) { + if ordering != Ordering::Equal { + return Some(ordering); + } + } + } + Some(Ordering::Equal) + } + } +} + +pub struct Model { + word_counts: BTreeMap<&'static str, u32>, + ngram_counts: BTreeMap, +} + +impl Model { + pub fn calc_digrams(corpus: Vec>) -> Self { + let mut word_counts = BTreeMap::new(); + let mut ngram_counts = BTreeMap::new(); + for sentence in corpus { + for word in sentence.iter() { + let count = word_counts.entry(*word).or_insert(0); + *count += 1; + } + + for i in 0..sentence.len() - 1 { + let ngram = NGram::new(vec![sentence[i], sentence[i + 1]]); + let count = ngram_counts.entry(ngram).or_insert(0); + *count += 1; + } + } + Self { + ngram_counts, + word_counts, + } + } + + // only tested for 2-grams, and that's only happy cases + pub fn p(&self, ngram: NGram) -> Option { + // let mut probability = (*self.word_counts.get(ngram.get(0)).unwrap() as f64) / self.word_counts.len() as f64; + // + // + // for index in 0..ngram.len() - 1 { + // println!("{}", probability); + // + // + // + // println!("{}", ng_p); + // probability = ng_p * probability; + // } + // println!("{}", probability); + // Some(probability) + None + } + + fn p_ngram(&self, ngram: NGram, intermediate: f64) -> f64 { + // for index in 0..ngram.len() - 1 { + // self.ngram_counts.get(&ngram) + // .map(|count| (*count as f64) / (self.word_counts[ngram.0.get(0).unwrap()] as f64)).unwrap() + // } + // + // if ngram.len() > 2 { + // println!("{}", intermediate); + // intermediate * self.pp(ngram.tail(), intermediate) //TODO + // } else { + // self.ngram_counts.get(&ngram) + // .map(|count| (*count as f64) / (self.word_counts[ngram.0.get(0).unwrap()] as f64)).unwrap() //TODO + // } + 0.0 + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_create_model() { + let corpus = vec![ + vec!["", "I", "am", "Sam", ""], + vec!["", "Sam", "I", "am", ""], + vec!["", "I", "do", "not", "like", "eggs", "and", "ham", ""], + ]; + + let model = Model::calc_digrams(corpus); + + // assert_eq!(model.p(NGram::new(vec!["", "I"])), Some(0.6666666666666666_f64)); + // assert_eq!(model.p(NGram::new(vec!["Sam", ""])), Some(0.5_f64)); + // assert_eq!(model.p(NGram::new(vec!["", "Sam"])), Some(0.33333333333333333_f64)); + // assert_eq!(model.p(NGram::new(vec!["am", "Sam"])), Some(0.5_f64)); + // assert_eq!(model.p(NGram::new(vec!["I", "am"])), Some(0.6666666666666666_f64)); + + println!("{:?}", model.p(NGram::new(vec!["I", "am", "Sam"]))); + } + + #[test] + fn test_ngram_eq() { + let n1 = NGram::new(vec!["1", "2"]); + let n2 = NGram::new(vec!["1", "2"]); + let n3 = NGram::new(vec!["3", "4"]); + + assert_eq!(n1, n2); + assert_ne!(n1, n3); + assert_ne!(n2, n3); + } + + #[test] + fn test_ngram_tail() { + let n1 = NGram::new(vec!["1", "2", "3"]); + let n2 = NGram::new(vec!["2", "3"]); + assert_eq!(n1.tail(), n2); + } +} \ No newline at end of file diff --git a/src/util/mod.rs b/src/util/mod.rs new file mode 100644 index 0000000..9d2ef9b --- /dev/null +++ b/src/util/mod.rs @@ -0,0 +1,130 @@ +/// Pads a sequence of words +/// sentence: sequence to pad, in the form of an Iterator of string slices. +/// pad_left: if set to true, prepends a padding symbol to the sentence +/// left_pad_symbol: the padding symbol to prepend +/// pad_right: if set to true, appends a padding symbol after the sentence +/// right_pad_symbol: the padding symbol to append +/// n: the n in n-grams; so for bigrams set to 2, etc +pub fn pad_sequence<'a>(sentence: impl Iterator + 'static, pad_left: bool, left_pad_symbol: &'static str, pad_right: bool, right_pad_symbol: &'static str, n: usize) -> impl Iterator { + Padder::new(Box::new(sentence), pad_left, left_pad_symbol, pad_right, right_pad_symbol, n) +} + +/// Pads a sequence of words, left-padding only. Convenience function that prevents useless arguments +/// sentence: sequence to pad, in the form of an Iterator of string slices. +/// left_pad_symbol: the padding symbol to prepend +/// n: the n in n-grams; so for bigrams set to 2, etc +pub fn pad_sequence_left<'a>(text: impl Iterator + 'static, left_pad_symbol: &'static str, n: usize) -> impl Iterator { + Padder::new(Box::new(text), true, left_pad_symbol, false, "", n) +} + +/// Pads a sequence of words, right-padding only. Convenience function that prevents useless arguments +/// +/// sentence: sequence to pad, in the form of an Iterator of string slices. +/// pad_right: if set to true, appends a padding symbol after the sentence +/// right_pad_symbol: the padding symbol to append +/// n: the n in n-grams; so for bigrams set to 2, etc +pub fn pad_sequence_right<'a>(text: impl Iterator + 'static, right_pad_symbol: &'static str, n: usize) -> impl Iterator { + Padder::new(Box::new(text), false, "", true, right_pad_symbol, n) +} + +pub(crate) struct Padder<'a> { + n: usize, + text: Box>, + pad_left: bool, + left_index: isize, + left_pad_symbol: &'static str, + pad_right: bool, + right_index: isize, + right_pad_symbol: &'static str, +} + +impl<'a> Iterator for Padder<'a> { + type Item = &'a str; + + fn next(&mut self) -> Option { + if self.pad_left && self.left_index < self.n as isize { + self.left_index += 1; + return Some(self.left_pad_symbol); + } else { + let maybe_next = self.text.next(); + if maybe_next.is_some() { + return maybe_next; + } else { + if self.pad_right && self.right_index < self.n as isize { + self.right_index += 1; + return Some(self.right_pad_symbol); + } + } + } + + None + } +} + +impl<'a> Padder<'a> { + pub(crate) fn new(text: Box>, pad_left: bool, left_pad_symbol: &'static str, + pad_right: bool, right_pad_symbol: &'static str, n: usize, ) -> Self { + Self { text, n, pad_left, left_index: 1, left_pad_symbol, pad_right, right_index: 1, right_pad_symbol } + } +} + + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_pad_both_ends_default_n2() { + let text = vec!["a", "b", "c"].into_iter(); + let padded = pad_sequence(text, true, "", true, "", 2); + assert!(equal(padded, vec!["", "a", "b", "c", ""].into_iter())); + } + + #[test] + fn test_pad_left() { + let text = vec!["a", "b", "c"].into_iter(); + let padded = pad_sequence_left(text, "", 2); + assert!(equal(padded, vec!["", "a", "b", "c"].into_iter())); + } + + #[test] + fn test_pad_right() { + let text = vec!["a", "b", "c"].into_iter(); + let padded = pad_sequence_right(text, "", 2); + assert!(equal(padded, vec!["a", "b", "c", ""].into_iter())); + } + + #[test] + fn test_pad_both_ends_default_n_eq_3() { + let text = vec!["a", "b", "c"].into_iter(); + let padded = pad_sequence(text, true, "", true, "", 3); + assert!(equal(padded, vec!["", "", "a", "b", "c", "", ""].into_iter())); + } + + #[test] + fn test_pad_both_ends_non_default_symbols() { + let text = vec!["a", "b", "c"].into_iter(); + let padded = pad_sequence(text, true, "left", true, "right", 2); + assert!(equal(padded, vec!["left", "a", "b", "c", "right"].into_iter())); + } + + fn equal<'a>(mut l1: impl Iterator, mut l2: impl Iterator) -> bool { + loop { + let e1 = l1.next(); + let e2 = l2.next(); + if e1.is_none() { + return if e2.is_none() { + true + } else { + false + }; + } else if e2.is_none() { + return false; + } else { + if e1.unwrap() != e2.unwrap() { + return false; + } + } + } + } +} \ No newline at end of file