first commit: padding, and draft for ngrams, probably useless
This commit is contained in:
commit
bdd1dcea06
6 changed files with 332 additions and 0 deletions
8
.idea/.gitignore
generated
vendored
Normal file
8
.idea/.gitignore
generated
vendored
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
# Default ignored files
|
||||
/shelf/
|
||||
/workspace.xml
|
||||
# Editor-based HTTP Client requests
|
||||
/httpRequests/
|
||||
# Datasource local storage ignored files
|
||||
/dataSources/
|
||||
/dataSources.local.xml
|
||||
4
src/lib.rs
Normal file
4
src/lib.rs
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
mod edit_distance;
|
||||
mod ngrams;
|
||||
mod lm;
|
||||
mod util;
|
||||
1
src/lm/mod.rs
Normal file
1
src/lm/mod.rs
Normal file
|
|
@ -0,0 +1 @@
|
|||
mod preprocessing;
|
||||
11
src/lm/preprocessing.rs
Normal file
11
src/lm/preprocessing.rs
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
/// Pads a sequence of words with defaults; prepends "<s>" and appends "<s>"
|
||||
///
|
||||
/// sentence: sequence of words, tokens, to pad, in the form of an Iterator of string slices.
|
||||
/// n: the n in n-grams; so for bigrams set to 2, etc
|
||||
pub fn pad_both_ends<'a>(text: impl Iterator<Item=&'a str> + 'static, n: usize) -> impl Iterator<Item=&'a str> {
|
||||
crate::util::Padder::new(Box::new(text), true, "<s>", true,"</s>", n)
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
178
src/ngrams.rs
Normal file
178
src/ngrams.rs
Normal file
|
|
@ -0,0 +1,178 @@
|
|||
use std::cmp::Ordering;
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
/// An n-gram: an ordered sequence of tokens.
///
/// Equality and ordering are those of the underlying token sequence, i.e.
/// element-wise (lexicographic) comparison in which a strict prefix sorts
/// before any longer n-gram that starts with it.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct NGram(Vec<&'static str>);

impl NGram {
    /// Creates an n-gram from `elements`, keeping their order.
    pub fn new(elements: Vec<&'static str>) -> Self {
        Self(elements)
    }

    /// Creates a 2-gram from two tokens.
    pub fn new_bigram(element1: &'static str, element2: &'static str) -> Self {
        Self(vec![element1, element2])
    }

    /// Creates a 3-gram from three tokens.
    pub fn new_trigram(element1: &'static str, element2: &'static str, element3: &'static str) -> Self {
        Self(vec![element1, element2, element3])
    }

    /// Returns a new n-gram holding every token except the first.
    ///
    /// The tail of an empty n-gram is an empty n-gram.
    pub fn tail(&self) -> Self {
        // `skip(1)` instead of `[1..]` so that an empty n-gram does not panic.
        Self(self.0.iter().skip(1).copied().collect())
    }

    /// Returns the number of tokens in this n-gram.
    pub fn len(&self) -> usize {
        self.0.len()
    }

    /// Returns `true` when this n-gram holds no tokens.
    pub fn is_empty(&self) -> bool {
        self.0.is_empty()
    }

    /// Returns the token at position `index`.
    ///
    /// # Panics
    ///
    /// Panics if `index >= self.len()`.
    pub fn get(&self, index: usize) -> &'static str {
        // Checked indexing: an out-of-range index is a caller bug and must
        // panic rather than read past the end of the vector.
        self.0[index]
    }
}

impl Ord for NGram {
    /// Lexicographic comparison of the token sequences.
    fn cmp(&self, other: &Self) -> Ordering {
        self.0.cmp(&other.0)
    }
}

impl PartialOrd for NGram {
    /// Always agrees with [`Ord::cmp`], as the `PartialOrd` contract requires.
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}
|
||||
|
||||
pub struct Model {
|
||||
word_counts: BTreeMap<&'static str, u32>,
|
||||
ngram_counts: BTreeMap<NGram, u32>,
|
||||
}
|
||||
|
||||
impl Model {
|
||||
pub fn calc_digrams(corpus: Vec<Vec<&'static str>>) -> Self {
|
||||
let mut word_counts = BTreeMap::new();
|
||||
let mut ngram_counts = BTreeMap::new();
|
||||
for sentence in corpus {
|
||||
for word in sentence.iter() {
|
||||
let count = word_counts.entry(*word).or_insert(0);
|
||||
*count += 1;
|
||||
}
|
||||
|
||||
for i in 0..sentence.len() - 1 {
|
||||
let ngram = NGram::new(vec![sentence[i], sentence[i + 1]]);
|
||||
let count = ngram_counts.entry(ngram).or_insert(0);
|
||||
*count += 1;
|
||||
}
|
||||
}
|
||||
Self {
|
||||
ngram_counts,
|
||||
word_counts,
|
||||
}
|
||||
}
|
||||
|
||||
// only tested for 2-grams, and that's only happy cases
|
||||
pub fn p(&self, ngram: NGram) -> Option<f64> {
|
||||
// let mut probability = (*self.word_counts.get(ngram.get(0)).unwrap() as f64) / self.word_counts.len() as f64;
|
||||
//
|
||||
//
|
||||
// for index in 0..ngram.len() - 1 {
|
||||
// println!("{}", probability);
|
||||
//
|
||||
//
|
||||
//
|
||||
// println!("{}", ng_p);
|
||||
// probability = ng_p * probability;
|
||||
// }
|
||||
// println!("{}", probability);
|
||||
// Some(probability)
|
||||
None
|
||||
}
|
||||
|
||||
fn p_ngram(&self, ngram: NGram, intermediate: f64) -> f64 {
|
||||
// for index in 0..ngram.len() - 1 {
|
||||
// self.ngram_counts.get(&ngram)
|
||||
// .map(|count| (*count as f64) / (self.word_counts[ngram.0.get(0).unwrap()] as f64)).unwrap()
|
||||
// }
|
||||
//
|
||||
// if ngram.len() > 2 {
|
||||
// println!("{}", intermediate);
|
||||
// intermediate * self.pp(ngram.tail(), intermediate) //TODO
|
||||
// } else {
|
||||
// self.ngram_counts.get(&ngram)
|
||||
// .map(|count| (*count as f64) / (self.word_counts[ngram.0.get(0).unwrap()] as f64)).unwrap() //TODO
|
||||
// }
|
||||
0.0
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// The three-sentence toy corpus shared by the model tests.
    fn sample_corpus() -> Vec<Vec<&'static str>> {
        vec![
            vec!["<s>", "I", "am", "Sam", "</s>"],
            vec!["<s>", "Sam", "I", "am", "</s>"],
            vec!["<s>", "I", "do", "not", "like", "eggs", "and", "ham", "</s>"],
        ]
    }

    #[test]
    fn test_create_model() {
        let model = Model::calc_digrams(sample_corpus());

        // Token counts.
        assert_eq!(model.word_counts.get("<s>"), Some(&3));
        assert_eq!(model.word_counts.get("I"), Some(&3));
        assert_eq!(model.word_counts.get("Sam"), Some(&2));
        assert_eq!(model.word_counts.get("am"), Some(&2));

        // Bigram counts.
        assert_eq!(model.ngram_counts.get(&NGram::new(vec!["<s>", "I"])), Some(&2));
        assert_eq!(model.ngram_counts.get(&NGram::new(vec!["I", "am"])), Some(&2));
        assert_eq!(model.ngram_counts.get(&NGram::new(vec!["am", "Sam"])), Some(&1));
        assert_eq!(model.ngram_counts.get(&NGram::new(vec!["Sam", "</s>"])), Some(&1));
        assert_eq!(model.ngram_counts.get(&NGram::new(vec!["eggs", "I"])), None);
    }

    // Kept compiled but skipped by default, so the expected values are ready
    // for the day `Model::p` gets a real implementation.
    #[test]
    #[ignore = "Model::p is still a stub that always returns None"]
    fn test_bigram_probabilities() {
        let model = Model::calc_digrams(sample_corpus());

        assert_eq!(model.p(NGram::new(vec!["<s>", "I"])), Some(0.6666666666666666_f64));
        assert_eq!(model.p(NGram::new(vec!["Sam", "</s>"])), Some(0.5_f64));
        assert_eq!(model.p(NGram::new(vec!["<s>", "Sam"])), Some(0.33333333333333333_f64));
        assert_eq!(model.p(NGram::new(vec!["am", "Sam"])), Some(0.5_f64));
        assert_eq!(model.p(NGram::new(vec!["I", "am"])), Some(0.6666666666666666_f64));
    }

    #[test]
    fn test_ngram_eq() {
        let n1 = NGram::new(vec!["1", "2"]);
        let n2 = NGram::new(vec!["1", "2"]);
        let n3 = NGram::new(vec!["3", "4"]);

        assert_eq!(n1, n2);
        assert_ne!(n1, n3);
        assert_ne!(n2, n3);
    }

    #[test]
    fn test_ngram_tail() {
        let n1 = NGram::new(vec!["1", "2", "3"]);
        let n2 = NGram::new(vec!["2", "3"]);

        assert_eq!(n1.tail(), n2);
    }
}
|
||||
130
src/util/mod.rs
Normal file
130
src/util/mod.rs
Normal file
|
|
@ -0,0 +1,130 @@
|
|||
/// Pads a sequence of words
|
||||
/// sentence: sequence to pad, in the form of an Iterator of string slices.
|
||||
/// pad_left: if set to true, prepends a padding symbol to the sentence
|
||||
/// left_pad_symbol: the padding symbol to prepend
|
||||
/// pad_right: if set to true, appends a padding symbol after the sentence
|
||||
/// right_pad_symbol: the padding symbol to append
|
||||
/// n: the n in n-grams; so for bigrams set to 2, etc
|
||||
pub fn pad_sequence<'a>(sentence: impl Iterator<Item=&'a str> + 'static, pad_left: bool, left_pad_symbol: &'static str, pad_right: bool, right_pad_symbol: &'static str, n: usize) -> impl Iterator<Item=&'a str> {
|
||||
Padder::new(Box::new(sentence), pad_left, left_pad_symbol, pad_right, right_pad_symbol, n)
|
||||
}
|
||||
|
||||
/// Pads a sequence of words, left-padding only. Convenience function that prevents useless arguments
|
||||
/// sentence: sequence to pad, in the form of an Iterator of string slices.
|
||||
/// left_pad_symbol: the padding symbol to prepend
|
||||
/// n: the n in n-grams; so for bigrams set to 2, etc
|
||||
pub fn pad_sequence_left<'a>(text: impl Iterator<Item=&'a str> + 'static, left_pad_symbol: &'static str, n: usize) -> impl Iterator<Item=&'a str> {
|
||||
Padder::new(Box::new(text), true, left_pad_symbol, false, "", n)
|
||||
}
|
||||
|
||||
/// Pads a sequence of words, right-padding only. Convenience function that prevents useless arguments
|
||||
///
|
||||
/// sentence: sequence to pad, in the form of an Iterator of string slices.
|
||||
/// pad_right: if set to true, appends a padding symbol after the sentence
|
||||
/// right_pad_symbol: the padding symbol to append
|
||||
/// n: the n in n-grams; so for bigrams set to 2, etc
|
||||
pub fn pad_sequence_right<'a>(text: impl Iterator<Item=&'a str> + 'static, right_pad_symbol: &'static str, n: usize) -> impl Iterator<Item=&'a str> {
|
||||
Padder::new(Box::new(text), false, "", true, right_pad_symbol, n)
|
||||
}
|
||||
|
||||
/// Iterator that yields left padding, then the wrapped tokens, then right padding.
///
/// Each enabled side is padded with `n - 1` copies of its symbol, so nothing
/// is added when `n` is 0 or 1.
pub(crate) struct Padder<'a> {
    /// The wrapped token stream, fused so that it is never polled again once
    /// it has reported its end.
    text: std::iter::Fuse<Box<dyn Iterator<Item = &'a str>>>,
    /// Left padding symbols still to be emitted.
    left_remaining: usize,
    left_pad_symbol: &'static str,
    /// Right padding symbols still to be emitted.
    right_remaining: usize,
    right_pad_symbol: &'static str,
}

impl<'a> Iterator for Padder<'a> {
    type Item = &'a str;

    fn next(&mut self) -> Option<Self::Item> {
        // Phase 1: left padding.
        if self.left_remaining > 0 {
            self.left_remaining -= 1;
            return Some(self.left_pad_symbol);
        }

        // Phase 2: the wrapped tokens.
        if let Some(token) = self.text.next() {
            return Some(token);
        }

        // Phase 3: right padding, only after the tokens have run out.
        if self.right_remaining > 0 {
            self.right_remaining -= 1;
            return Some(self.right_pad_symbol);
        }

        None
    }
}

impl<'a> Padder<'a> {
    /// Creates a padder around `text`.
    ///
    /// * `pad_left` / `pad_right` - enable padding on the respective side.
    /// * `left_pad_symbol` / `right_pad_symbol` - the symbols emitted as padding.
    /// * `n` - the n in n-grams; each enabled side gets `n - 1` symbols.
    pub(crate) fn new(
        text: Box<dyn Iterator<Item = &'a str>>,
        pad_left: bool,
        left_pad_symbol: &'static str,
        pad_right: bool,
        right_pad_symbol: &'static str,
        n: usize,
    ) -> Self {
        // `saturating_sub` keeps `n == 0` at zero padding instead of underflowing.
        let per_side = n.saturating_sub(1);
        Self {
            text: text.fuse(),
            left_remaining: if pad_left { per_side } else { 0 },
            left_pad_symbol,
            right_remaining: if pad_right { per_side } else { 0 },
            right_pad_symbol,
        }
    }
}
|
||||
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Collects `padded` and compares it with `expected`, so that a failing
    /// test prints both token sequences.
    fn assert_tokens<'a>(padded: impl Iterator<Item = &'a str>, expected: &[&str]) {
        assert_eq!(padded.collect::<Vec<_>>(), expected);
    }

    #[test]
    fn test_pad_both_ends_default_n2() {
        let text = vec!["a", "b", "c"].into_iter();
        let padded = pad_sequence(text, true, "<s>", true, "</s>", 2);
        assert_tokens(padded, &["<s>", "a", "b", "c", "</s>"]);
    }

    #[test]
    fn test_pad_left() {
        let text = vec!["a", "b", "c"].into_iter();
        let padded = pad_sequence_left(text, "<s>", 2);
        assert_tokens(padded, &["<s>", "a", "b", "c"]);
    }

    #[test]
    fn test_pad_right() {
        let text = vec!["a", "b", "c"].into_iter();
        let padded = pad_sequence_right(text, "</s>", 2);
        assert_tokens(padded, &["a", "b", "c", "</s>"]);
    }

    #[test]
    fn test_pad_both_ends_default_n_eq_3() {
        let text = vec!["a", "b", "c"].into_iter();
        let padded = pad_sequence(text, true, "<s>", true, "</s>", 3);
        assert_tokens(padded, &["<s>", "<s>", "a", "b", "c", "</s>", "</s>"]);
    }

    #[test]
    fn test_pad_both_ends_non_default_symbols() {
        let text = vec!["a", "b", "c"].into_iter();
        let padded = pad_sequence(text, true, "left", true, "right", 2);
        assert_tokens(padded, &["left", "a", "b", "c", "right"]);
    }
}
|
||||
Loading…
Add table
Reference in a new issue