diff --git a/.gitignore b/.gitignore index bb38f85..ad1deb9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ /target *.iml -/.idea \ No newline at end of file +/.idea + +nl.txt \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 98bb894..9506d07 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,12 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "anyhow" +version = "1.0.67" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7724808837b77f4b4de9d283820f9d98bcf496d5692934b857a2399d31ff22e6" + [[package]] name = "more-asserts" version = "0.2.2" @@ -12,6 +18,7 @@ checksum = "7843ec2de400bcbc6a6328c958dc38e5359da6e93e72e37bc5246bf1ae776389" name = "rltk" version = "0.1.0" dependencies = [ + "anyhow", "more-asserts", "unicode-segmentation", ] diff --git a/Cargo.toml b/Cargo.toml index bb580aa..a52d734 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,7 +4,13 @@ version = "0.1.0" edition = "2021" [dependencies] -unicode-segmentation = "1.9.0" +unicode-segmentation = "1" +anyhow = "1" +#regex = "1" [dev-dependencies] -more-asserts = "0.2.2" \ No newline at end of file +more-asserts = "0.2.2" + +[[bin]] +name = "count" +path = "src/bin/count.rs" diff --git a/src/bin/count.rs b/src/bin/count.rs index e69de29..1392404 100644 --- a/src/bin/count.rs +++ b/src/bin/count.rs @@ -0,0 +1,3 @@ +fn main(){ + rltk::pipelines::count(); +} \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index 623b242..b9b99af 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,8 @@ pub mod lm; pub mod util; pub mod metrics; +pub mod mat; +pub mod pipelines; + #[cfg(test)] -pub(crate) mod test; \ No newline at end of file +pub(crate) mod test; diff --git a/src/mat.rs b/src/mat.rs deleted file mode 100644 index 327df15..0000000 --- a/src/mat.rs +++ /dev/null @@ -1,70 +0,0 @@ -pub trait Mat { - fn get(&self, row: usize, column: usize) -> T; - fn set(&mut self, row: usize, column: usize, value: T); - fn rows() -> Vec>; -} - -pub trait Csr: Mat { - -} - -pub struct Csr_f64 { - index_pointers: Vec, - indices: Vec, - data: Vec, -} - -impl Csr_f64 { - pub fn new() -> Self { - Self { - index_pointers: Vec::new(), - indices: Vec::new(), - data: Vec::new(), - } - } -} - -impl Csr for Csr_f64 {} - -impl Mat for Csr_f64 { - fn get(&self, row: usize, column: usize) -> f64 { - if row + 2 > self.index_pointers.len() { - 0.0 - } - let start_index = self.index_pointers[row]; - let end_index = self.index_pointers[row + 1]; - - if start_index == end_index { - 0.0 - } else { - let mut index = start_index; - while index < end_index && column != self.indices[index] { - index += 1; - } - if index == end_index { - 0.0 - } else { - self.data[index] - } - } - } - - fn set(&mut self, _row: usize, _column: usize, _value: T) { - panic!("Csr is immutable") - } - - fn rows() -> Vec> { - todo!() - // public double[][] getRows() { - // return toDense().getRows(); - // } - } - - -} - -impl Into> for Csr_f64{ - fn into(self) -> Box> { - todo!() - } -} \ No newline at end of file diff --git a/src/mat/bitmat.rs b/src/mat/bitmat.rs index e69de29..33d208a 100644 --- a/src/mat/bitmat.rs +++ b/src/mat/bitmat.rs @@ -0,0 +1,84 @@ +use std::collections::BTreeMap; + +use crate::mat::{Mat, Numeric, Shape}; + +// Mutable sparse matrix for boolean values +// stored in u128 +struct SparseBitMat { + data: BTreeMap>, +} + +impl SparseBitMat { + fn new() -> Self { + Self { + data: BTreeMap::new() + } + } + + pub fn set_true(&mut self, row_index: usize, col_index: usize) { + let d = self.get_byte(row_index, col_index); + *d |= 1 << (col_index % 128); + } + + pub fn set_false(&mut self, row_index: usize, col_index: usize) { + let d = self.get_byte(row_index, col_index); + *d &= !(1 << (col_index % 128)); + } + + fn get_byte(&mut self, row_index: usize, col_index: usize) -> &mut u128 { + let row_index_b = row_index >> 7; + let col_index_b = col_index >> 7; + let mut row = self.data.entry(row_index_b).or_insert_with(BTreeMap::new); + let d = row.entry(col_index_b).or_insert(0_u128); + d + } + + +} + +impl Mat for SparseBitMat { + fn get(&self, row_index: usize, col_index: usize) -> bool { + let row_index_b = row_index >> 7; + let col_index_b = col_index >> 7; + let row = self.data.get(&row_index_b); + if let Some(row) = row { + let d = row.get(&col_index_b); + if let Some(d) = d { + let bit = 1 << (col_index % 128); + return (*d & bit) != 0; + } + } + false + } + + fn set(&mut self, row_index: usize, col_index: usize, value: bool) { + if value { + self.set_true(row_index, col_index); + } else { + self.set_false(row_index, col_index); + } + } + + + fn shape(&self) -> Shape { + todo!() + } +} + +#[cfg(test)] +mod test { + use crate::mat::Mat; + + #[test] + fn test_get_and_set() { + let mut mat = super::SparseBitMat::new(); + mat.set(15, 15, true); + assert_eq!(mat.get(0, 0), false); //untouched + assert_eq!(mat.get(15, 15), true); //touched + + mat.set(15, 15, false); + assert_eq!(mat.get(15, 15), false); //touched, set to false + mat.set(1001, 1001, false); + assert_eq!(mat.get(1001, 1001), false); //untouched, set to false + } +} \ No newline at end of file diff --git a/src/mat/csr.rs b/src/mat/csr.rs index 2682814..ca1c3ed 100644 --- a/src/mat/csr.rs +++ b/src/mat/csr.rs @@ -1,12 +1,16 @@ use crate::mat::{Mat, Numeric, Shape}; +use crate::mat::sparse::SparseMat; -pub struct Csr where T: Numeric{ +/// Compressed Sparse Row matrix +/// Immutable, can be constructed from Vec>, or SparseMat +/// Better performance when iterating (i think), less memory +pub struct CsrMat where T: Numeric { index_pointers: Vec, indices: Vec, data: Vec, } -impl Csr where T:Numeric{ +impl CsrMat where T: Numeric { pub fn new() -> Self { Self { index_pointers: Vec::new(), @@ -16,23 +20,23 @@ impl Csr where T:Numeric{ } } -impl Mat for Csr where T:Numeric { +impl Mat for CsrMat where T: Numeric { fn get(&self, row: usize, column: usize) -> T { if row + 2 > self.index_pointers.len() { - return Numeric::default_value(); + return Numeric::default::(); } let start_index = self.index_pointers[row]; let end_index = self.index_pointers[row + 1]; if start_index == end_index { - return Numeric::default_value(); + return Numeric::default::(); } else { let mut index = start_index; while index < end_index && column != self.indices[index] { index += 1; } if index == end_index { - return Numeric::default_value(); + return Numeric::default::(); } else { self.data[index] } @@ -49,15 +53,15 @@ impl Mat for Csr where T:Numeric { } } -impl From>> for Csr where T:Numeric + PartialEq{ +impl From>> for CsrMat where T: Numeric + PartialEq { fn from(rows: Vec>) -> Self { let mut this = Self::new(); this.index_pointers.push(0); for row in rows { - for (index,value) in row.into_iter().enumerate(){ - if value != value.default() { + for (index, value) in row.into_iter().enumerate() { + if value != Numeric::default::() { this.data.push(value); this.indices.push(index); } @@ -66,4 +70,44 @@ impl From>> for Csr where T:Numeric + PartialEq{ } this } +} + +impl From>> for CsrMat where T: Numeric + PartialEq { + fn from(this: Box>) -> Self { + let mut csr = Self::new(); + + csr.index_pointers.push(0); + let (rows, cols) = Shape::into(this.shape()); + for row in 0..rows { + for col in 0..cols { + let value = this.get(row, col); + if value != Numeric::default::() { + csr.data.push(value); + csr.indices.push(col); + } + } + csr.index_pointers.push(csr.indices.len()); + } + csr + } +} + + +#[cfg(test)] +mod test { + use crate::mat::{Mat, Shape}; + use crate::mat::csr::CsrMat; + use crate::mat::sparse::SparseMat; + + #[test] + fn test_from_mat() { + let mut mat: Box> = Box::new(SparseMat::new()); + mat.set(1, 1, 1_u32); + mat.set(2, 2, 2_u32); + + let csr: CsrMat = mat.into(); + assert_eq!(csr.get(1, 1), 1); + assert_eq!(csr.get(2, 2), 2); + assert_eq!(csr.shape(), Shape::new(3, 3)); + } } \ No newline at end of file diff --git a/src/mat/lib.rs b/src/mat/lib.rs deleted file mode 100644 index 327df15..0000000 --- a/src/mat/lib.rs +++ /dev/null @@ -1,70 +0,0 @@ -pub trait Mat { - fn get(&self, row: usize, column: usize) -> T; - fn set(&mut self, row: usize, column: usize, value: T); - fn rows() -> Vec>; -} - -pub trait Csr: Mat { - -} - -pub struct Csr_f64 { - index_pointers: Vec, - indices: Vec, - data: Vec, -} - -impl Csr_f64 { - pub fn new() -> Self { - Self { - index_pointers: Vec::new(), - indices: Vec::new(), - data: Vec::new(), - } - } -} - -impl Csr for Csr_f64 {} - -impl Mat for Csr_f64 { - fn get(&self, row: usize, column: usize) -> f64 { - if row + 2 > self.index_pointers.len() { - 0.0 - } - let start_index = self.index_pointers[row]; - let end_index = self.index_pointers[row + 1]; - - if start_index == end_index { - 0.0 - } else { - let mut index = start_index; - while index < end_index && column != self.indices[index] { - index += 1; - } - if index == end_index { - 0.0 - } else { - self.data[index] - } - } - } - - fn set(&mut self, _row: usize, _column: usize, _value: T) { - panic!("Csr is immutable") - } - - fn rows() -> Vec> { - todo!() - // public double[][] getRows() { - // return toDense().getRows(); - // } - } - - -} - -impl Into> for Csr_f64{ - fn into(self) -> Box> { - todo!() - } -} \ No newline at end of file diff --git a/src/mat/mod.rs b/src/mat/mod.rs index 327df15..a30bcec 100644 --- a/src/mat/mod.rs +++ b/src/mat/mod.rs @@ -1,70 +1,120 @@ -pub trait Mat { - fn get(&self, row: usize, column: usize) -> T; - fn set(&mut self, row: usize, column: usize, value: T); - fn rows() -> Vec>; +mod csr; +mod sparse; +mod bitmat; + +pub trait Mat { + fn get(&self, row_index: usize, col_index: usize) -> T; + fn set(&mut self, row_index: usize, col_index: usize, value: T); + fn shape(&self) -> Shape; } -pub trait Csr: Mat { - +#[derive(PartialEq, Eq, Debug)] +pub struct Shape { + rows: usize, + cols: usize, } -pub struct Csr_f64 { - index_pointers: Vec, - indices: Vec, - data: Vec, -} - -impl Csr_f64 { - pub fn new() -> Self { +impl Shape { + pub fn new(rows: usize, cols: usize) -> Self { Self { - index_pointers: Vec::new(), - indices: Vec::new(), - data: Vec::new(), + rows, + cols, } } } -impl Csr for Csr_f64 {} - -impl Mat for Csr_f64 { - fn get(&self, row: usize, column: usize) -> f64 { - if row + 2 > self.index_pointers.len() { - 0.0 - } - let start_index = self.index_pointers[row]; - let end_index = self.index_pointers[row + 1]; - - if start_index == end_index { - 0.0 - } else { - let mut index = start_index; - while index < end_index && column != self.indices[index] { - index += 1; - } - if index == end_index { - 0.0 - } else { - self.data[index] - } - } +impl From for (usize, usize){ + fn from(this: Shape) -> Self { + (this.rows, this.cols) } - - fn set(&mut self, _row: usize, _column: usize, _value: T) { - panic!("Csr is immutable") - } - - fn rows() -> Vec> { - todo!() - // public double[][] getRows() { - // return toDense().getRows(); - // } - } - - } -impl Into> for Csr_f64{ - fn into(self) -> Box> { - todo!() +pub trait Numeric: Copy + Default { + fn default() -> Self; +} + +impl Numeric for f64 { + fn default() -> f64 { 0.0 } +} + +impl Numeric for f32 { + fn default() -> f32 { 0.0 } +} + +impl Numeric for usize { + fn default() -> usize { 0 } +} + +impl Numeric for isize { + fn default() -> isize { 0 } +} + +impl Numeric for i8 { + fn default() -> i8 { 0 } +} + +impl Numeric for u8 { + fn default() -> u8 { 0 } +} + +impl Numeric for i16 { + fn default() -> i16 { 0 } +} + +impl Numeric for u16 { + fn default() -> u16 { 0 } +} + +impl Numeric for i32 { + fn default() -> i32 { 0 } +} + +impl Numeric for u32 { + fn default() -> u32 { 0 } +} + +impl Numeric for i64 { + fn default() -> i64 { 0 } +} + +impl Numeric for u64 { + fn default() -> u64 { 0 } +} + +impl Numeric for i128 { + fn default() -> i128 { 0 } +} + +impl Numeric for bool { + fn default() -> Self { + false + } +} + +impl Numeric for u128 { + fn default() -> u128 { 0 } +} + +#[cfg(test)] +mod test { + use mat::csr::CsrMat; + use mat::Mat; + + use crate::mat; + + #[test] + fn test_i32() { + let rows = vec![vec![1, 0, 0, 0], vec![2]]; + let new_mat = CsrMat::from(rows); + assert_eq!(2, new_mat.get(1, 0)); + assert_eq!(0, new_mat.get(10, 0)); + } + + #[test] + fn test_f64() { + let rows = vec![vec![1.0, 0.0, 0.0, 0.0], vec![2.0]]; + let new_mat = CsrMat::from(rows); + assert_eq!(2.0, new_mat.get(1, 0)); + assert_eq!(0.0, new_mat.get(10, 0)); } } \ No newline at end of file diff --git a/src/mat/sparse.rs b/src/mat/sparse.rs index e69de29..12271f0 100644 --- a/src/mat/sparse.rs +++ b/src/mat/sparse.rs @@ -0,0 +1,64 @@ +use std::collections::BTreeMap; + +use crate::mat::{Mat, Numeric, Shape}; + +/// BTreeMap based implementation, useful for mutating +/// every row is a map and the matrix is a map +/// resulting in a map> +/// uses a BTreeMap to keep the keys (indexes) ordered. +pub struct SparseMat { + data: BTreeMap>, +} + +impl SparseMat { + pub fn new() -> Self { + Self { + data: BTreeMap::new() + } + } +} + +impl Mat for SparseMat { + fn get(&self, row_index: usize, col_index: usize) -> T { + self.data.get(&row_index) + .map(|row| row.get(&col_index) + .map(|v|*v) + .unwrap_or(Numeric::default::())) + .unwrap_or(Numeric::default::()) + } + + fn set(&mut self, row_index: usize, col_index: usize, value: T) { + let row = self.data.entry(row_index).or_insert_with(BTreeMap::new); + row.insert(col_index, value); + } + + fn shape(&self) -> Shape { + let mut max_rows = 0; + let mut max_cols = 0; + for row_index in self.data.keys() { + let row_index = *row_index; + if row_index > max_rows { + max_rows = row_index; + } + let row = self.data.get(&row_index).unwrap(); + let last_col = *row.keys().max().unwrap(); + if last_col > max_cols { + max_cols = last_col; + } + } + Shape::new(max_rows + 1, max_cols + 1) + } +} + +#[cfg(test)] +mod tests { + use crate::mat::{Mat, Shape}; + use crate::mat::sparse::SparseMat; + + #[test] + fn shape() { + let mut mat = SparseMat::new(); + mat.set(10, 11, 1.5); + assert_eq!(mat.shape(), Shape::new(11, 12)); + } +} diff --git a/src/pipelines.rs b/src/pipelines.rs index e69de29..ceb7d70 100644 --- a/src/pipelines.rs +++ b/src/pipelines.rs @@ -0,0 +1,29 @@ +use std::collections::BTreeMap; +use std::collections::hash_map::DefaultHasher; +use std::fs::File; +use std::hash::{Hash, Hasher}; +use std::io::{self, BufRead}; + +pub fn count() -> anyhow::Result<()> { + let mut store: BTreeMap = BTreeMap::new(); + let stdin = io::stdin(); + for line in stdin.lock().lines() { + let line = line?; + for token in line.split(|c: char| c.is_ascii_punctuation() || c.is_whitespace()) { + let count = store.entry(token.to_owned()).or_insert(0); + *count += 1; + } + } + for (key, value) in store{ + println!("{}:{}", key,value); + } + Ok(()) +} + +fn hash(string: &str) -> u64 { + let mut hasher = DefaultHasher::new(); + string.hash(&mut hasher); + hasher.finish() +} + +pub fn create_binary_bow(file: File) {} \ No newline at end of file