This commit is contained in:
Sander Hautvast 2022-12-18 17:27:56 +01:00
parent f3915c0146
commit 9e919336be
12 changed files with 361 additions and 209 deletions

4
.gitignore vendored
View file

@ -1,3 +1,5 @@
/target
*.iml
/.idea
/.idea
nl.txt

7
Cargo.lock generated
View file

@ -2,6 +2,12 @@
# It is not intended for manual editing.
version = 3
[[package]]
name = "anyhow"
version = "1.0.67"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7724808837b77f4b4de9d283820f9d98bcf496d5692934b857a2399d31ff22e6"
[[package]]
name = "more-asserts"
version = "0.2.2"
@ -12,6 +18,7 @@ checksum = "7843ec2de400bcbc6a6328c958dc38e5359da6e93e72e37bc5246bf1ae776389"
name = "rltk"
version = "0.1.0"
dependencies = [
"anyhow",
"more-asserts",
"unicode-segmentation",
]

View file

@ -4,7 +4,13 @@ version = "0.1.0"
edition = "2021"
[dependencies]
unicode-segmentation = "1.9.0"
unicode-segmentation = "1"
anyhow = "1"
#regex = "1"
[dev-dependencies]
more-asserts = "0.2.2"
more-asserts = "0.2.2"
[[bin]]
name = "count"
path = "src/bin/count.rs"

View file

@ -0,0 +1,3 @@
/// Entry point of the `count` binary.
///
/// Runs `rltk::pipelines::count` and, instead of silently discarding the
/// `Result` it returns, reports a failure on stderr and exits with status 1.
fn main() {
    if let Err(err) = rltk::pipelines::count() {
        eprintln!("error: {:?}", err);
        std::process::exit(1);
    }
}

View file

@ -1,5 +1,8 @@
pub mod lm;
pub mod util;
pub mod metrics;
pub mod mat;
pub mod pipelines;
#[cfg(test)]
pub(crate) mod test;
pub(crate) mod test;

View file

@ -1,70 +0,0 @@
/// A two-dimensional matrix whose cells hold values of type `T`.
pub trait Mat<T> {
    /// Returns the value stored at (`row`, `column`).
    fn get(&self, row: usize, column: usize) -> T;

    /// Stores `value` at (`row`, `column`).
    fn set(&mut self, row: usize, column: usize, value: T);

    /// Returns the matrix as rows of values (presumably a dense copy; no
    /// implementation exists yet to confirm).
    ///
    /// Bounded by `Self: Sized` because the function has no receiver; without
    /// the bound the trait could not be used as `dyn Mat<T>`.
    fn rows() -> Vec<Vec<T>>
    where
        Self: Sized;
}

/// Marker trait for matrices stored in compressed sparse row (CSR) form.
pub trait Csr<T>: Mat<T> {}

/// CSR matrix of `f64` values.
///
/// `index_pointers[r]..index_pointers[r + 1]` is the range of positions in
/// `indices`/`data` that belong to row `r`; `indices` holds the column of each
/// stored value and `data` the value itself.
#[allow(non_camel_case_types)] // name kept so existing users keep compiling
pub struct Csr_f64 {
    index_pointers: Vec<usize>,
    indices: Vec<usize>,
    data: Vec<f64>,
}

impl Csr_f64 {
    /// Creates a matrix with no stored values; every cell reads as `0.0`.
    pub fn new() -> Self {
        Self {
            index_pointers: Vec::new(),
            indices: Vec::new(),
            data: Vec::new(),
        }
    }
}

impl Csr<f64> for Csr_f64 {}

impl Mat<f64> for Csr_f64 {
    /// Returns the stored value at (`row`, `column`), or `0.0` when the cell
    /// has no stored value or `row` lies outside the stored rows.
    fn get(&self, row: usize, column: usize) -> f64 {
        // Row `row` needs both `index_pointers[row]` and `index_pointers[row + 1]`.
        if row + 2 > self.index_pointers.len() {
            return 0.0;
        }
        let start = self.index_pointers[row];
        let end = self.index_pointers[row + 1];
        self.indices[start..end]
            .iter()
            .position(|&stored_column| stored_column == column)
            .map_or(0.0, |offset| self.data[start + offset])
    }

    /// Always panics: this matrix type does not support mutation.
    fn set(&mut self, _row: usize, _column: usize, _value: f64) {
        panic!("Csr is immutable")
    }

    /// Not implemented yet. The commented-out Java original delegated to
    /// `toDense().getRows()`.
    fn rows() -> Vec<Vec<f64>> {
        todo!()
    }
}

/// Boxes the matrix as a `Mat<f64>` trait object.
impl From<Csr_f64> for Box<dyn Mat<f64>> {
    fn from(csr: Csr_f64) -> Self {
        Box::new(csr)
    }
}

View file

@ -0,0 +1,84 @@
use std::collections::BTreeMap;
use crate::mat::{Mat, Numeric, Shape};
/// Mutable sparse matrix of boolean values.
///
/// Every row is keyed by its own row index and keeps its bits packed into
/// `u128` words: column `c` lives in word `c / 128` at bit position `c % 128`.
/// Words that were never written are absent and read as `false`.
struct SparseBitMat {
    data: BTreeMap<usize, BTreeMap<usize, u128>>,
}

impl SparseBitMat {
    /// Number of columns packed into one stored word.
    const WORD_BITS: usize = 128;

    /// Creates an empty matrix in which every cell reads as `false`.
    fn new() -> Self {
        Self {
            data: BTreeMap::new(),
        }
    }

    /// Sets the bit at (`row_index`, `col_index`), allocating its word if needed.
    pub fn set_true(&mut self, row_index: usize, col_index: usize) {
        let word = self.word_mut(row_index, col_index);
        *word |= Self::mask(col_index);
    }

    /// Clears the bit at (`row_index`, `col_index`).
    ///
    /// A cell whose word was never stored already reads as `false`, so nothing
    /// is allocated in that case.
    pub fn set_false(&mut self, row_index: usize, col_index: usize) {
        let word = self
            .data
            .get_mut(&row_index)
            .and_then(|row| row.get_mut(&(col_index / Self::WORD_BITS)));
        if let Some(word) = word {
            *word &= !Self::mask(col_index);
        }
    }

    /// Single-bit mask selecting `col_index` inside its word.
    fn mask(col_index: usize) -> u128 {
        1_u128 << (col_index % Self::WORD_BITS)
    }

    /// Returns the word holding (`row_index`, `col_index`), inserting an
    /// all-zero word when none is stored yet.
    ///
    /// The row key is the full row index: keying rows by `row_index >> 7`
    /// would make all 128 rows of a bucket share the same bits.
    fn word_mut(&mut self, row_index: usize, col_index: usize) -> &mut u128 {
        self.data
            .entry(row_index)
            .or_default()
            .entry(col_index / Self::WORD_BITS)
            .or_insert(0)
    }
}

impl Mat<bool> for SparseBitMat {
    /// Returns the bit at (`row_index`, `col_index`); unset cells are `false`.
    fn get(&self, row_index: usize, col_index: usize) -> bool {
        self.data
            .get(&row_index)
            .and_then(|row| row.get(&(col_index / Self::WORD_BITS)))
            .map_or(false, |word| word & Self::mask(col_index) != 0)
    }

    /// Writes `value` to the bit at (`row_index`, `col_index`).
    fn set(&mut self, row_index: usize, col_index: usize, value: bool) {
        if value {
            self.set_true(row_index, col_index);
        } else {
            self.set_false(row_index, col_index);
        }
    }

    /// Not implemented yet.
    fn shape(&self) -> Shape {
        todo!()
    }
}
#[cfg(test)]
mod test {
    use super::SparseBitMat;
    use crate::mat::Mat;

    /// Writes and clears single bits and checks what `get` reports afterwards.
    #[test]
    fn test_get_and_set() {
        let mut bits = SparseBitMat::new();

        bits.set(15, 15, true);
        assert!(!bits.get(0, 0)); // never written
        assert!(bits.get(15, 15)); // written as true

        bits.set(15, 15, false);
        assert!(!bits.get(15, 15)); // written, then cleared

        bits.set(1001, 1001, false);
        assert!(!bits.get(1001, 1001)); // never true, explicitly cleared
    }
}

View file

@ -1,12 +1,16 @@
use crate::mat::{Mat, Numeric, Shape};
use crate::mat::sparse::SparseMat;
pub struct Csr<T> where T: Numeric{
/// Compressed Sparse Row matrix
/// Immutable, can be constructed from Vec<Vec<T>>, or SparseMat<T>
/// Better performance when iterating (i think), less memory
pub struct CsrMat<T> where T: Numeric {
index_pointers: Vec<usize>,
indices: Vec<usize>,
data: Vec<T>,
}
impl <T> Csr<T> where T:Numeric{
impl<T> CsrMat<T> where T: Numeric {
pub fn new() -> Self {
Self {
index_pointers: Vec::new(),
@ -16,23 +20,23 @@ impl <T> Csr<T> where T:Numeric{
}
}
impl <T> Mat<T> for Csr<T> where T:Numeric {
impl<T> Mat<T> for CsrMat<T> where T: Numeric {
fn get(&self, row: usize, column: usize) -> T {
if row + 2 > self.index_pointers.len() {
return Numeric::default_value();
return Numeric::default::<T>();
}
let start_index = self.index_pointers[row];
let end_index = self.index_pointers[row + 1];
if start_index == end_index {
return Numeric::default_value();
return Numeric::default::<T>();
} else {
let mut index = start_index;
while index < end_index && column != self.indices[index] {
index += 1;
}
if index == end_index {
return Numeric::default_value();
return Numeric::default::<T>();
} else {
self.data[index]
}
@ -49,15 +53,15 @@ impl <T> Mat<T> for Csr<T> where T:Numeric {
}
}
impl <T> From<Vec<Vec<T>>> for Csr<T> where T:Numeric + PartialEq{
impl<T> From<Vec<Vec<T>>> for CsrMat<T> where T: Numeric + PartialEq {
fn from(rows: Vec<Vec<T>>) -> Self {
let mut this = Self::new();
this.index_pointers.push(0);
for row in rows {
for (index,value) in row.into_iter().enumerate(){
if value != value.default() {
for (index, value) in row.into_iter().enumerate() {
if value != Numeric::default::<T>() {
this.data.push(value);
this.indices.push(index);
}
@ -66,4 +70,44 @@ impl <T> From<Vec<Vec<T>>> for Csr<T> where T:Numeric + PartialEq{
}
this
}
}
/// Builds a CSR matrix from any boxed [`Mat`] by reading every cell inside the
/// source's reported shape and keeping the values that differ from the
/// `Numeric` default.
impl<T> From<Box<dyn Mat<T>>> for CsrMat<T>
where
    T: Numeric + PartialEq,
{
    fn from(this: Box<dyn Mat<T>>) -> Self {
        let (row_count, col_count): (usize, usize) = this.shape().into();

        let mut out = Self::new();
        out.index_pointers.push(0);

        for r in 0..row_count {
            let stored = (0..col_count)
                .map(|c| (c, this.get(r, c)))
                .filter(|(_, cell)| *cell != Numeric::default::<T>());
            for (c, cell) in stored {
                out.data.push(cell);
                out.indices.push(c);
            }
            // Close the row: the next row starts where this one ended.
            out.index_pointers.push(out.indices.len());
        }
        out
    }
}
#[cfg(test)]
mod test {
    use crate::mat::csr::CsrMat;
    use crate::mat::sparse::SparseMat;
    use crate::mat::{Mat, Shape};

    /// Converts a boxed `SparseMat` into a `CsrMat` and checks values and shape.
    #[test]
    fn test_from_mat() {
        let mut source: Box<dyn Mat<u32>> = Box::new(SparseMat::new());
        source.set(1, 1, 1_u32);
        source.set(2, 2, 2_u32);

        let converted = CsrMat::<u32>::from(source);

        assert_eq!(converted.get(1, 1), 1);
        assert_eq!(converted.get(2, 2), 2);
        assert_eq!(converted.shape(), Shape::new(3, 3));
    }
}

View file

@ -1,70 +0,0 @@
/// A two-dimensional matrix whose cells hold values of type `T`.
pub trait Mat<T> {
    /// Returns the value stored at (`row`, `column`).
    fn get(&self, row: usize, column: usize) -> T;

    /// Stores `value` at (`row`, `column`).
    fn set(&mut self, row: usize, column: usize, value: T);

    /// Returns the matrix as rows of values (presumably a dense copy; no
    /// implementation exists yet to confirm).
    ///
    /// Bounded by `Self: Sized` because the function has no receiver; without
    /// the bound the trait could not be used as `dyn Mat<T>`.
    fn rows() -> Vec<Vec<T>>
    where
        Self: Sized;
}

/// Marker trait for matrices stored in compressed sparse row (CSR) form.
pub trait Csr<T>: Mat<T> {}

/// CSR matrix of `f64` values.
///
/// `index_pointers[r]..index_pointers[r + 1]` is the range of positions in
/// `indices`/`data` that belong to row `r`; `indices` holds the column of each
/// stored value and `data` the value itself.
#[allow(non_camel_case_types)] // name kept so existing users keep compiling
pub struct Csr_f64 {
    index_pointers: Vec<usize>,
    indices: Vec<usize>,
    data: Vec<f64>,
}

impl Csr_f64 {
    /// Creates a matrix with no stored values; every cell reads as `0.0`.
    pub fn new() -> Self {
        Self {
            index_pointers: Vec::new(),
            indices: Vec::new(),
            data: Vec::new(),
        }
    }
}

impl Csr<f64> for Csr_f64 {}

impl Mat<f64> for Csr_f64 {
    /// Returns the stored value at (`row`, `column`), or `0.0` when the cell
    /// has no stored value or `row` lies outside the stored rows.
    fn get(&self, row: usize, column: usize) -> f64 {
        // Row `row` needs both `index_pointers[row]` and `index_pointers[row + 1]`.
        if row + 2 > self.index_pointers.len() {
            return 0.0;
        }
        let start = self.index_pointers[row];
        let end = self.index_pointers[row + 1];
        self.indices[start..end]
            .iter()
            .position(|&stored_column| stored_column == column)
            .map_or(0.0, |offset| self.data[start + offset])
    }

    /// Always panics: this matrix type does not support mutation.
    fn set(&mut self, _row: usize, _column: usize, _value: f64) {
        panic!("Csr is immutable")
    }

    /// Not implemented yet. The commented-out Java original delegated to
    /// `toDense().getRows()`.
    fn rows() -> Vec<Vec<f64>> {
        todo!()
    }
}

/// Boxes the matrix as a `Mat<f64>` trait object.
impl From<Csr_f64> for Box<dyn Mat<f64>> {
    fn from(csr: Csr_f64) -> Self {
        Box::new(csr)
    }
}

View file

@ -1,70 +1,120 @@
pub trait Mat<T> {
fn get(&self, row: usize, column: usize) -> T;
fn set(&mut self, row: usize, column: usize, value: T);
fn rows() -> Vec<Vec<T>>;
mod csr;
mod sparse;
mod bitmat;
pub trait Mat<T: Numeric> {
fn get(&self, row_index: usize, col_index: usize) -> T;
fn set(&mut self, row_index: usize, col_index: usize, value: T);
fn shape(&self) -> Shape;
}
pub trait Csr<T>: Mat<T> {
#[derive(PartialEq, Eq, Debug)]
pub struct Shape {
rows: usize,
cols: usize,
}
pub struct Csr_f64 {
index_pointers: Vec<usize>,
indices: Vec<usize>,
data: Vec<f64>,
}
impl Csr_f64 {
pub fn new() -> Self {
impl Shape {
pub fn new(rows: usize, cols: usize) -> Self {
Self {
index_pointers: Vec::new(),
indices: Vec::new(),
data: Vec::new(),
rows,
cols,
}
}
}
impl Csr<f64> for Csr_f64 {}
impl Mat<f64> for Csr_f64 {
fn get(&self, row: usize, column: usize) -> f64 {
if row + 2 > self.index_pointers.len() {
0.0
}
let start_index = self.index_pointers[row];
let end_index = self.index_pointers[row + 1];
if start_index == end_index {
0.0
} else {
let mut index = start_index;
while index < end_index && column != self.indices[index] {
index += 1;
}
if index == end_index {
0.0
} else {
self.data[index]
}
}
impl From<Shape> for (usize, usize){
fn from(this: Shape) -> Self {
(this.rows, this.cols)
}
fn set(&mut self, _row: usize, _column: usize, _value: T) {
panic!("Csr is immutable")
}
fn rows() -> Vec<Vec<f64>> {
todo!()
// public double[][] getRows() {
// return toDense().getRows();
// }
}
}
impl Into<dyn Mat<T>> for Csr_f64{
fn into(self) -> Box<dyn Mat<T>> {
todo!()
/// Element type that can be stored in a matrix.
pub trait Numeric: Copy + Default {
    /// Returns the value a cell holds when nothing has been stored in it.
    ///
    /// The type parameter `T` is not used by any implementation below; callers
    /// select the implementing type through `Self`.
    fn default<T>() -> Self;
}

/// Implements [`Numeric`] for each listed type with the given default value.
macro_rules! impl_numeric {
    ($($ty:ty => $zero:expr),* $(,)?) => {
        $(
            impl Numeric for $ty {
                fn default<T>() -> Self {
                    $zero
                }
            }
        )*
    };
}

impl_numeric! {
    f64 => 0.0,
    f32 => 0.0,
    usize => 0,
    isize => 0,
    i8 => 0,
    u8 => 0,
    i16 => 0,
    u16 => 0,
    i32 => 0,
    u32 => 0,
    i64 => 0,
    u64 => 0,
    i128 => 0,
    bool => false,
    u128 => 0,
}
#[cfg(test)]
mod test {
    use crate::mat::csr::CsrMat;
    use crate::mat::Mat;

    /// Integer rows: a stored value is read back, an out-of-range row reads as 0.
    #[test]
    fn test_i32() {
        let csr = CsrMat::from(vec![vec![1, 0, 0, 0], vec![2]]);
        assert_eq!(2, csr.get(1, 0));
        assert_eq!(0, csr.get(10, 0));
    }

    /// Float rows: a stored value is read back, an out-of-range row reads as 0.0.
    #[test]
    fn test_f64() {
        let csr = CsrMat::from(vec![vec![1.0, 0.0, 0.0, 0.0], vec![2.0]]);
        assert_eq!(2.0, csr.get(1, 0));
        assert_eq!(0.0, csr.get(10, 0));
    }
}

View file

@ -0,0 +1,64 @@
use std::collections::BTreeMap;
use crate::mat::{Mat, Numeric, Shape};
/// `BTreeMap`-based sparse matrix, suited to being mutated.
///
/// Every row is a map from column index to value, and the matrix is a map from
/// row index to row: `map<row_index, map<col_index, value>>`. `BTreeMap` keeps
/// the keys (indexes) ordered.
pub struct SparseMat<T: Numeric> {
    data: BTreeMap<usize, BTreeMap<usize, T>>,
}

impl<T: Numeric> SparseMat<T> {
    /// Creates a matrix with no stored cells.
    pub fn new() -> Self {
        Self {
            data: BTreeMap::new(),
        }
    }
}

impl<T: Numeric> Mat<T> for SparseMat<T> {
    /// Returns the value stored at (`row_index`, `col_index`), or the
    /// `Numeric` default when nothing has been stored there.
    fn get(&self, row_index: usize, col_index: usize) -> T {
        self.data
            .get(&row_index)
            .and_then(|row| row.get(&col_index))
            .copied()
            .unwrap_or_else(|| <T as Numeric>::default::<T>())
    }

    /// Stores `value` at (`row_index`, `col_index`), creating the row on demand.
    fn set(&mut self, row_index: usize, col_index: usize, value: T) {
        self.data
            .entry(row_index)
            .or_insert_with(BTreeMap::new)
            .insert(col_index, value);
    }

    /// Returns the smallest shape that contains every stored cell:
    /// (largest row index + 1, largest column index + 1).
    ///
    /// A matrix with no stored cells has shape 0 x 0.
    fn shape(&self) -> Shape {
        // Keys are ordered, so the largest index is the last key.
        let rows = self.data.keys().next_back().map_or(0, |last| last + 1);
        let cols = self
            .data
            .values()
            .filter_map(|row| row.keys().next_back())
            .max()
            .map_or(0, |last| last + 1);
        Shape::new(rows, cols)
    }
}
#[cfg(test)]
mod tests {
    use crate::mat::sparse::SparseMat;
    use crate::mat::{Mat, Shape};

    /// A single stored cell at (10, 11) yields an 11 x 12 shape.
    #[test]
    fn shape() {
        let mut sparse = SparseMat::new();
        sparse.set(10, 11, 1.5);

        let expected = Shape::new(11, 12);
        assert_eq!(sparse.shape(), expected);
    }
}

View file

@ -0,0 +1,29 @@
use std::collections::BTreeMap;
use std::collections::hash_map::DefaultHasher;
use std::fs::File;
use std::hash::{Hash, Hasher};
use std::io::{self, BufRead};
/// Counts how often each token occurs on standard input and prints one
/// `token:count` line per distinct token, ordered by token.
///
/// Tokens are the non-empty pieces obtained by splitting each line on ASCII
/// punctuation and whitespace.
///
/// # Errors
///
/// Returns an error if a line cannot be read from standard input.
pub fn count() -> anyhow::Result<()> {
    let mut store: BTreeMap<String, usize> = BTreeMap::new();

    let stdin = io::stdin();
    for line in stdin.lock().lines() {
        tally_tokens(&line?, &mut store);
    }

    for (token, occurrences) in &store {
        println!("{}:{}", token, occurrences);
    }
    Ok(())
}

/// Adds every token of `line` to the running counts in `store`.
fn tally_tokens(line: &str, store: &mut BTreeMap<String, usize>) {
    line.split(|c: char| c.is_ascii_punctuation() || c.is_whitespace())
        // Adjacent separators and blank lines produce empty pieces; those are
        // not tokens and must not be counted under the "" key.
        .filter(|token| !token.is_empty())
        .for_each(|token| *store.entry(token.to_owned()).or_insert(0) += 1);
}
/// Hashes `string` with the standard library's `DefaultHasher` and returns the
/// resulting 64-bit value.
fn hash(string: &str) -> u64 {
    let mut state = DefaultHasher::new();
    Hash::hash(string, &mut state);
    Hasher::finish(&state)
}
/// Stub: takes ownership of `file` and does nothing with it yet.
///
/// NOTE(review): the name suggests this is meant to build a binary
/// bag-of-words representation from the file — confirm the intended behavior
/// before implementing.
pub fn create_binary_bow(file: File) {}