From 9e808b2b478b869638a1b203b6f73b810e9f9425 Mon Sep 17 00:00:00 2001 From: Sander Hautvast Date: Thu, 13 Feb 2025 21:54:41 +0100 Subject: [PATCH] first commit --- .gitignore | 2 + Cargo.lock | 23 +++++ Cargo.toml | 8 ++ README.md | 1 + src/groupby.rs | 19 ++++ src/join.rs | 29 ++++++ src/lib.rs | 265 +++++++++++++++++++++++++++++++++++++++++++++++++ src/main.rs | 21 ++++ src/order.rs | 17 ++++ src/print.rs | 104 +++++++++++++++++++ src/read.rs | 21 ++++ src/value.rs | 76 ++++++++++++++ 12 files changed, 586 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 README.md create mode 100644 src/groupby.rs create mode 100644 src/join.rs create mode 100644 src/lib.rs create mode 100644 src/main.rs create mode 100644 src/order.rs create mode 100644 src/print.rs create mode 100644 src/read.rs create mode 100644 src/value.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0592392 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/target +.DS_Store diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..3d8f336 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,23 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "anyhow" +version = "1.0.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34ac096ce696dc2fcabef30516bb13c0a68a11d30131d3df6f04711467681b04" + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "csv" +version = "0.1.0" +dependencies = [ + "anyhow", + "byteorder", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..3928faa --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "csv" +version = "0.1.0" +edition = "2021" + +[dependencies] +byteorder = "1.5" +anyhow = "1.0" diff --git a/README.md b/README.md new file mode 100644 index 0000000..5f90364 --- /dev/null +++ b/README.md @@ -0,0 +1 @@ +An attempt to query csv files, (like in SQLite after csv import) diff --git a/src/groupby.rs b/src/groupby.rs new file mode 100644 index 0000000..cb89f41 --- /dev/null +++ b/src/groupby.rs @@ -0,0 +1,19 @@ +use crate::Table; + +impl Table { + pub fn group_by(&self, select_expression: &str, group_by_expression: &str) -> Table { + let table = Table::new(""); + + table + } +} + +// fn parse_select(select: &str){ + +// } + +// enum Aggregation { +// Sum(String), +// Max(String), +// Min(String), +// } diff --git a/src/join.rs b/src/join.rs new file mode 100644 index 0000000..e55cde6 --- /dev/null +++ b/src/join.rs @@ -0,0 +1,29 @@ +use crate::Table; + +impl Table { + pub fn left_join(&self, right: &Table, left_col: &str, right_col: &str, outer: bool) -> Table { + join(self, right, left_col, right_col, outer) + } + + pub fn right_join(&self, right: &Table, left_col: &str, right_col: &str, outer: bool) -> Table { + join(right, self, right_col, left_col, outer) + } +} + +pub fn join(left: &Table, right: &Table, left_col: &str, right_col: &str, outer: bool) -> Table { + let mut joined = Table::new("join"); + left.cols.iter().for_each(|c| joined.add_column(c, true)); + right.cols.iter().for_each(|c| joined.add_column(c, true)); + let left_col_index = left.get_index(left_col); + let right_col_index = right.get_index(right_col); + + for record in left.iter_records() { + let lv = record.get(left_col_index); + if let Some(right_record) = right.where_clause(right_col_index, lv) { + joined.add_record(record + right_record); + } else if outer { + joined.add_record(record.clone()); + } + } + joined +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..dc55045 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,265 @@ +pub mod groupby; +pub mod join; +pub mod order; +pub mod print; +pub mod read; +pub mod value; + +use std::{ + cmp::Ordering, + collections::{BTreeMap, HashMap}, + ops::Add, +}; + +use value::Value; + +pub struct Table { + name: String, + cols_by_name: HashMap, + cols: Vec, + records: BTreeMap, +} + +impl Table { + pub fn new(name: impl Into) -> Self { + Self { + name: name.into(), + cols_by_name: HashMap::new(), + cols: vec![], + records: BTreeMap::new(), + } + } + + /// Creates a new table with the same name and columns as self, + /// but without data + // Note to self: be careful, might be dangerous to use once tables can be altered. + // That is not yet implemented. May need full copies + pub fn empty_copy(&self) -> Self { + let mut result = Table::new(self.name.clone()); + result.cols_by_name = self.cols_by_name.clone(); + result.cols = self.cols.clone(); + result + } + + pub fn add_record(&mut self, record: Record) { + let index = self.records.len(); + self.records.insert(Key::integer(index), record); + } + + pub fn has_column(&self, name: impl Into) -> bool { + self.cols_by_name.contains_key(&name.into()) + } + + pub fn add_column(&mut self, name: impl Into, allow_duplicates: bool) { + let col_index = self.cols.len(); + let orig_name: String = name.into(); + + let name = if allow_duplicates { + // append an index when there are duplicate column names + let mut col_name = orig_name.to_string(); + let mut index = 2; + + while self.has_column(&col_name) { + col_name = orig_name.to_string(); + col_name.push_str(format!("{}", index).as_str()); + index += 1; + } + col_name + } else { + orig_name + }; + + self.cols_by_name.insert(name.clone(), col_index); + self.cols.push(name); + } + + fn get_indexes(&self, expression: &str) -> Vec { + expression + .split(",") + .map(|c| self.get_index(c.trim())) + .collect::>() + } + + fn get_index(&self, col_name: &str) -> usize { + *self.cols_by_name.get(col_name).unwrap() + } + + pub fn iter(&self) -> TableIter { + self.iter_records() + } + + pub fn iter_records(&self) -> TableIter { + TableIter { + table_iter: self.records.iter(), + } + } + + pub fn select_columns<'a>(&'a self, columns: &'a Vec<&'a str>) -> OwnedColIter<'a> { + OwnedColIter { + cols: columns, + index: 0, + } + } + + pub fn iter_colums(&self) -> ColIter { + ColIter { + cols: &self.cols, + index: 0, + } + } + + pub fn where_clause(&self, colindex: usize, value: &Value) -> Option<&Record> { + for record in self.iter_records() { + let r = record.get(colindex); + if r == value { + return Some(record); + } + } + None + } +} + +#[derive(Debug, Clone)] +pub struct Record { + values: Vec, +} + +impl Record { + pub fn len(&self) -> usize { + self.values.iter().map(Value::len).sum() + } + + pub fn add_value(&mut self, value: impl Into) { + self.values.push(value.into()); + } + + pub fn get(&self, index: usize) -> &Value { + self.values.get(index).unwrap_or(&Value::NULL) + } +} + +impl Add for &Record { + type Output = Record; + + fn add(self, rhs: Self) -> Self::Output { + let mut sum = Record::default(); + sum.values.append(&mut self.values.clone()); + sum.values.append(&mut rhs.values.clone()); // use refs? + sum + } +} + +impl Default for Record { + fn default() -> Self { + Self { values: vec![] } + } +} + +pub struct TableIter<'a> { + table_iter: std::collections::btree_map::Iter<'a, Key, Record>, +} + +impl<'a> Iterator for TableIter<'a> { + type Item = &'a Record; + + fn next(&mut self) -> Option { + self.table_iter.next().map(|e| e.1) + } +} + +pub struct ColIter<'a> { + cols: &'a Vec, + index: usize, +} + +pub struct OwnedColIter<'a> { + cols: &'a Vec<&'a str>, + index: usize, +} + +impl<'a> Iterator for ColIter<'a> { + type Item = &'a String; + + fn next(&mut self) -> Option { + if let Some(v) = self.cols.get(self.index) { + self.index += 1; + Some(v) + } else { + None + } + } +} + +impl<'a> Iterator for OwnedColIter<'a> { + type Item = &'a str; + + fn next(&mut self) -> Option { + if let Some(v) = self.cols.get(self.index) { + self.index += 1; + Some(v) + } else { + None + } + } +} + +struct Key { + values: Vec, +} + +impl Key { + fn integer(integer: usize) -> Self { + Self { + values: vec![Value::Integer(integer as i64)], + } + } + + fn compound(keys: Vec) -> Self { + Self { values: keys } + } +} +impl Ord for Key { + fn cmp(&self, other: &Self) -> Ordering { + self.partial_cmp(other).unwrap() + } +} +impl Eq for Key {} + +impl PartialEq for Key { + fn eq(&self, other: &Self) -> bool { + if self.values.len() != other.values.len() { + false + } else { + for (l, r) in self.values.iter().zip(&other.values) { + if l != r { + return false; + } + } + true + } + } +} + +impl PartialOrd for Key { + fn partial_cmp(&self, other: &Self) -> Option { + let len = self.values.len().min(other.values.len()); + for i in 0..len { + let ord = self + .values + .get(i) + .unwrap() + .partial_cmp(other.values.get(i).unwrap()) + .unwrap(); + match ord { + Ordering::Less => { + return Some(Ordering::Less); + } + Ordering::Greater => { + return Some(Ordering::Greater); + } + _ => {} + } + } + Some(Ordering::Equal) + } +} diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..4510820 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,21 @@ +use csv::Table; + +fn main() { + let companies = Table::from_csv(include_str!("data/companies.csv"), "\t"); + let remove = Table::from_csv(include_str!("data/remove.csv"), "\t"); + + // companies.pretty_print("*"); + // remove.pretty_print("*"); + let left = Table::from_csv(include_str!("data/left.csv"), "\t"); + let right = Table::from_csv(include_str!("data/right.csv"), "\t"); + // left.pretty_print("*"); + // right.pretty_print("*"); + let join1 = left.left_join(&right, "name", "name", true); + let join2 = left.right_join(&right, "name", "name", true); + // + companies + .left_join(&remove, "aisAccountID", "aisaccountid", false) + .order_by("aisAccountID") + .select("aisAccountID"); + // join2.pretty_print("*"); +} diff --git a/src/order.rs b/src/order.rs new file mode 100644 index 0000000..8b7d3c0 --- /dev/null +++ b/src/order.rs @@ -0,0 +1,17 @@ +use std::collections::BTreeMap; + +use crate::{Key, Table}; + +impl Table { + pub fn order_by(&self, expression: &str) -> Self { + let indexes = self.get_indexes(expression); + let mut sorted_records = BTreeMap::new(); + for record in self.iter() { + let key = indexes.iter().map(|i| record.get(*i).clone()).collect(); + sorted_records.insert(Key::compound(key), record.clone()); + } + let mut ordered = Table::empty_copy(self); + ordered.records = sorted_records; + ordered + } +} diff --git a/src/print.rs b/src/print.rs new file mode 100644 index 0000000..d542b3d --- /dev/null +++ b/src/print.rs @@ -0,0 +1,104 @@ +use std::collections::HashMap; + +use crate::Table; + +impl Table { + /// prints the table contents in nice columns on the command line + pub fn select(&self, expression: &str) { + if expression == "*" { + self.pretty_print_all(); + } else { + let cols = expression + .split(",") + .map(|c| c.trim()) + .collect::>(); + cols.iter() + .filter(|c| !self.has_column(**c)) + .any(|invalid| panic!("{} is not a column in this table", invalid)); + self.pretty_print_select(cols); + } + } + + fn pretty_print_all(&self) { + let column_widths = self.get_column_widths(0, usize::MAX); + // let total = column_widths.values().iter(); + for col in self.iter_colums() { + let w = column_widths.get(col).unwrap_or(&0); + print!("| {:) { + let column_widths = self.select_column_widths(0, usize::MAX, &columns); + // let total = column_widths.values().iter(); + for col in self.select_columns(&columns) { + let w = column_widths.get(col).unwrap_or(&0); + print!("| {: max length of column name/value in any of the rows + /// needed for printing nice columns + /// the following parameters allow for paging views + /// offset: start at rowindex + /// nrecords: take n records after offset + fn get_column_widths(&self, offset: usize, nrecords: usize) -> HashMap<&String, usize> { + let mut widths = HashMap::new(); + // initialize count with the length of the column name + for col in self.iter_colums() { + widths.insert(col, col.len()); + } + for record in self.iter_records().skip(offset).take(nrecords) { + for col in self.iter_colums() { + let e = widths.get_mut(&col).unwrap(); + let index = self.get_index(col); + *e = (*e).max(record.get(index).len()); + } + } + widths + } + + // returns a map of column index -> max length of column name/value in any of the rows + /// needed for printing nice columns + /// the following parameters allow for paging views + /// offset: start at rowindex + /// nrecords: take n records after offset + fn select_column_widths<'a>( + &'a self, + offset: usize, + nrecords: usize, + columns: &'a Vec<&'a str>, + ) -> HashMap<&'a str, usize> { + let mut widths = HashMap::new(); + // initialize count with the length of the column name + for col in self.select_columns(columns) { + widths.insert(col, col.len()); + } + for record in self.iter_records().skip(offset).take(nrecords) { + for col in self.select_columns(columns) { + let e = widths.get_mut(&col).unwrap(); + let index = self.get_index(&col); + *e = (*e).max(record.get(index).len()); + } + } + widths + } +} diff --git a/src/read.rs b/src/read.rs new file mode 100644 index 0000000..d084ecd --- /dev/null +++ b/src/read.rs @@ -0,0 +1,21 @@ +use crate::{Record, Table}; + +impl Table { + pub fn from_csv(csv: &str, separator: &str) -> Self { + let mut table = Table::new("test"); + for (index, row) in csv.split("\n").enumerate() { + if index == 0 { + for col in row.split(separator) { + table.add_column(col, true); + } + } else if row.len() > 0 { + let mut record = Record::default(); + for value in row.split(separator) { + record.add_value(value); + } + table.add_record(record); + } + } + table + } +} diff --git a/src/value.rs b/src/value.rs new file mode 100644 index 0000000..d0eda9e --- /dev/null +++ b/src/value.rs @@ -0,0 +1,76 @@ +use std::fmt::Display; + +#[derive(Debug, PartialEq, PartialOrd, Clone)] +pub enum Value { + Text(String), + Float(f64), + Integer(i64), + NULL, +} + +impl Value { + pub fn len(&self) -> usize { + match self { + Value::Text(text) => text.len(), + Value::Float(float) => format!("{}", float).len(), + Value::Integer(integer) => format!("{}", integer).len(), + Value::NULL => 0, + } + } +} + +impl Display for Value { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let text = match self { + Value::Float(float) => format!("{}", float), + Value::Integer(integer) => format!("{}", integer), + Value::Text(text) => format!("\"{}\"", text), + Value::NULL => "NULL".to_string(), + }; + write!(f, "{}", text) + } +} + +impl Into for &str { + fn into(self) -> Value { + if let Ok(f) = self.parse::() { + Value::Float(f) + } else if let Ok(i) = self.parse::() { + Value::Integer(i) + } else { + Value::Text(strip_quotes(self)) + } + } +} + +impl Into for String { + fn into(self) -> Value { + if let Ok(f) = self.parse::() { + Value::Float(f) + } else if let Ok(i) = self.parse::() { + Value::Integer(i) + } else { + Value::Text(strip_quotes(self)) + } + } +} + +impl Into for f64 { + fn into(self) -> Value { + Value::Float(self) + } +} + +impl Into for i64 { + fn into(self) -> Value { + Value::Integer(self) + } +} + +fn strip_quotes(text: impl Into) -> String { + let mut text = text.into(); + if text.starts_with("\"") && text.ends_with("\"") { + text = text[1..text.len() - 1].to_string(); + } + text +}