From 9e808b2b478b869638a1b203b6f73b810e9f9425 Mon Sep 17 00:00:00 2001
From: Sander Hautvast <sander.hautvast@ing.com>
Date: Thu, 13 Feb 2025 21:54:41 +0100
Subject: [PATCH] first commit

---
 .gitignore     |   2 +
 Cargo.lock     |  23 +++++
 Cargo.toml     |   8 ++
 README.md      |   1 +
 src/groupby.rs |  19 ++++
 src/join.rs    |  29 ++++++
 src/lib.rs     | 265 +++++++++++++++++++++++++++++++++++++++++++++++++
 src/main.rs    |  21 ++++
 src/order.rs   |  17 ++++
 src/print.rs   | 104 +++++++++++++++++++
 src/read.rs    |  21 ++++
 src/value.rs   |  76 ++++++++++++++
 12 files changed, 586 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Cargo.lock
 create mode 100644 Cargo.toml
 create mode 100644 README.md
 create mode 100644 src/groupby.rs
 create mode 100644 src/join.rs
 create mode 100644 src/lib.rs
 create mode 100644 src/main.rs
 create mode 100644 src/order.rs
 create mode 100644 src/print.rs
 create mode 100644 src/read.rs
 create mode 100644 src/value.rs

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0592392
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/target
+.DS_Store
diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000..3d8f336
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,23 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "anyhow"
+version = "1.0.95"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "34ac096ce696dc2fcabef30516bb13c0a68a11d30131d3df6f04711467681b04"
+
+[[package]]
+name = "byteorder"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
+
+[[package]]
+name = "csv"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "byteorder",
+]
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..3928faa
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,8 @@
+[package]
+name = "csv"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+byteorder = "1.5"
+anyhow = "1.0"
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..5f90364
--- /dev/null
+++ b/README.md
@@ -0,0 +1 @@
+An attempt to query csv files, (like in SQLite after csv import)
diff --git a/src/groupby.rs b/src/groupby.rs
new file mode 100644
index 0000000..cb89f41
--- /dev/null
+++ b/src/groupby.rs
@@ -0,0 +1,19 @@
+use crate::Table;
+
+impl Table {
+    /// Placeholder for grouping: both expressions are currently ignored.
+    pub fn group_by(&self, select_expression: &str, group_by_expression: &str) -> Table {
+        // Grouping is not implemented yet; the result is always an empty table named "".
+        Table::new("")
+    }
+}
+
+// fn parse_select(select: &str){
+
+// }
+
+// enum Aggregation {
+//     Sum(String),
+//     Max(String),
+//     Min(String),
+// }
diff --git a/src/join.rs b/src/join.rs
new file mode 100644
index 0000000..e55cde6
--- /dev/null
+++ b/src/join.rs
@@ -0,0 +1,29 @@
+use crate::Table;
+
+impl Table { // join entry points; both delegate to the free function `join`
+    pub fn left_join(&self, right: &Table, left_col: &str, right_col: &str, outer: bool) -> Table {
+        join(self, right, left_col, right_col, outer) // rows of `self` drive the join
+    }
+    /// Mirror of `left_join`: rows of `right` drive the join and the column names swap roles.
+    pub fn right_join(&self, right: &Table, left_col: &str, right_col: &str, outer: bool) -> Table {
+        join(right, self, right_col, left_col, outer)
+    }
+}
+
+/// Joins `left` with `right` on `left_col` = `right_col`; a left row pairs with its first match only.
+pub fn join(left: &Table, right: &Table, left_col: &str, right_col: &str, outer: bool) -> Table {
+    let mut joined = Table::new("join");
+    for name in left.cols.iter().chain(right.cols.iter()) {
+        joined.add_column(name, true);
+    }
+    let (left_at, right_at) = (left.get_index(left_col), right.get_index(right_col));
+    for left_record in left.iter_records() {
+        let wanted = left_record.get(left_at);
+        match right.where_clause(right_at, wanted) {
+            Some(right_record) => joined.add_record(left_record + right_record),
+            None if outer => joined.add_record(left_record.clone()),
+            None => {}
+        }
+    }
+    joined
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..dc55045
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,265 @@
+pub mod groupby;
+pub mod join;
+pub mod order;
+pub mod print;
+pub mod read;
+pub mod value;
+
+use std::{
+    cmp::Ordering,
+    collections::{BTreeMap, HashMap},
+    ops::Add,
+};
+
+use value::Value;
+
+pub struct Table { // an in-memory table: named columns plus keyed records
+    name: String, // table name, as passed to `Table::new`
+    cols_by_name: HashMap<String, usize>, // column name -> position in `cols`
+    cols: Vec<String>, // column names in the order they were added
+    records: BTreeMap<Key, Record>, // rows; iteration follows the ordering of `Key`
+}
+
+impl Table {
+    /// Creates an empty table with the given name.
+    ///
+    /// The new table has no columns and no records.
+    pub fn new(name: impl Into<String>) -> Self {
+        let name = name.into();
+        let (cols, records) = (Vec::new(), BTreeMap::new());
+        Self { name, cols_by_name: HashMap::new(), cols, records }
+    }
+
+    /// Creates a new table with the same name and columns as `self`, but without records.
+    // Note to self: may need full copies once tables can be altered (not implemented yet).
+    pub fn empty_copy(&self) -> Self {
+        Self {
+            name: self.name.clone(),
+            cols_by_name: self.cols_by_name.clone(),
+            cols: self.cols.clone(),
+            records: BTreeMap::new(),
+        }
+    }
+
+    /// Inserts `record` under an integer key equal to the current number of records.
+    pub fn add_record(&mut self, record: Record) {
+        self.records.insert(Key::integer(self.records.len()), record);
+    }
+
+    pub fn has_column(&self, name: impl Into<String>) -> bool { // true when the exact name is known
+        self.cols_by_name.contains_key(&name.into()) // lookup is by owned `String`, hence `into`
+    }
+
+    /// Appends a column called `name`.
+    ///
+    /// With `allow_duplicates`, a name that is already taken gets the first free numeric
+    /// suffix, counting up from 2 (`name2`, `name3`, ...). Without it the name is used as
+    /// is, and the name lookup then points at the newly added column.
+    pub fn add_column(&mut self, name: impl Into<String>, allow_duplicates: bool) {
+        let base: String = name.into();
+        let mut unique = base.clone();
+        if allow_duplicates {
+            // Try `base2`, `base3`, ... until the name is free.
+            let mut suffix = 2;
+            while self.has_column(&unique) {
+                unique = format!("{}{}", base, suffix);
+                suffix += 1;
+            }
+        }
+
+        // The new column goes last, so its position is the current column count.
+        let position = self.cols.len();
+        self.cols_by_name.insert(unique.clone(), position);
+        self.cols.push(unique);
+    }
+
+    /// Maps a comma-separated list of column names to their positions, in the order given.
+    /// Names are trimmed first; an unknown name panics in `get_index`.
+    fn get_indexes(&self, expression: &str) -> Vec<usize> {
+        let names = expression.split(",").map(str::trim);
+        names.map(|name| self.get_index(name)).collect()
+    }
+
+    fn get_index(&self, col_name: &str) -> usize { // position of the column called `col_name`
+        *self.cols_by_name.get(col_name).unwrap() // panics when `col_name` is not a column
+    }
+
+    pub fn iter(&self) -> TableIter { // shorthand for `iter_records`
+        self.iter_records()
+    }
+
+    /// Iterates over the records in key order.
+    pub fn iter_records(&self) -> TableIter {
+        let table_iter = self.records.iter();
+        TableIter { table_iter }
+    }
+
+    /// Iterates over the given `columns`, in the order supplied.
+    ///
+    /// Nothing is read from the table itself; the names are not checked here.
+    pub fn select_columns<'a>(&'a self, columns: &'a Vec<&'a str>) -> OwnedColIter<'a> {
+        OwnedColIter { cols: columns, index: 0 }
+    }
+
+    /// Iterates over the column names of this table, in the order they were added.
+    pub fn iter_colums(&self) -> ColIter {
+        let cols = &self.cols;
+        // The cursor starts at the first column.
+        ColIter { cols, index: 0 }
+    }
+
+    /// Returns the first record, in key order, whose value at `colindex` equals `value`.
+    ///
+    /// Records too short to have that position are compared as `Value::NULL`.
+    ///
+    /// Yields `None` when no record matches.
+    pub fn where_clause(&self, colindex: usize, value: &Value) -> Option<&Record> {
+        let mut records = self.iter_records();
+        records.find(|record| record.get(colindex) == value)
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct Record { // one row of a table
+    values: Vec<Value>, // the row's values; `get` reads them by position
+}
+
+impl Record {
+    pub fn len(&self) -> usize { // sum of `Value::len` over all values, not the number of values
+        self.values.iter().map(Value::len).sum()
+    }
+    /// Appends `value`, converted into a `Value`, after the existing values.
+    pub fn add_value(&mut self, value: impl Into<Value>) {
+        self.values.push(value.into());
+    }
+    /// Returns the value at `index`, or `Value::NULL` when the record has no value there.
+    pub fn get(&self, index: usize) -> &Value {
+        self.values.get(index).unwrap_or(&Value::NULL)
+    }
+}
+
+impl Add for &Record {
+    type Output = Record;
+
+    /// Concatenates two records: the values of `self` followed by the values of `rhs`.
+    /// Both operands are left untouched; every value is cloned into the result.
+    fn add(self, rhs: Self) -> Self::Output {
+        let values = [self.values.as_slice(), rhs.values.as_slice()].concat();
+        Record { values }
+    }
+}
+
+impl Default for Record {
+    fn default() -> Self { // a record without any values
+        Self { values: vec![] }
+    }
+}
+
+pub struct TableIter<'a> { // iterator over the records of a table; built by `Table::iter_records`
+    table_iter: std::collections::btree_map::Iter<'a, Key, Record>, // underlying key/record iterator
+}
+
+impl<'a> Iterator for TableIter<'a> {
+    type Item = &'a Record;
+    /// Yields the next record in key order; the key itself is dropped.
+    fn next(&mut self) -> Option<Self::Item> {
+        self.table_iter.next().map(|(_key, record)| record)
+    }
+}
+
+pub struct ColIter<'a> { // iterator over borrowed column names; built by `Table::iter_colums`
+    cols: &'a Vec<String>, // the names being iterated
+    index: usize, // position of the next name to return
+}
+
+pub struct OwnedColIter<'a> { // iterator over a caller-supplied name list; built by `Table::select_columns`
+    cols: &'a Vec<&'a str>, // the names being iterated
+    index: usize, // position of the next name to return
+}
+
+impl<'a> Iterator for ColIter<'a> {
+    type Item = &'a String;
+
+    /// Returns the column name under the cursor and moves the cursor one step forward.
+    ///
+    /// Once every name has been returned the cursor stays put and `None` is returned.
+    fn next(&mut self) -> Option<Self::Item> {
+        let name = self.cols.get(self.index)?;
+        self.index += 1;
+        Some(name)
+    }
+}
+
+impl<'a> Iterator for OwnedColIter<'a> {
+    type Item = &'a str;
+
+    /// Returns the column name under the cursor and moves the cursor one step forward.
+    ///
+    /// Once every name has been returned the cursor stays put and `None` is returned.
+    fn next(&mut self) -> Option<Self::Item> {
+        let name = *self.cols.get(self.index)?;
+        self.index += 1;
+        Some(name)
+    }
+}
+
+struct Key { // key of a record in `Table::records`
+    values: Vec<Value>, // key parts, compared position by position
+}
+
+impl Key {
+    /// Single-part key holding `integer` as a `Value::Integer`.
+    fn integer(integer: usize) -> Self {
+        Self::compound(vec![Value::Integer(integer as i64)])
+    }
+
+    /// Key made of the given values, in the given order.
+    fn compound(keys: Vec<Value>) -> Self {
+        Self { values: keys }
+    }
+}
+impl Ord for Key { // total order, as required for `BTreeMap` keys
+    fn cmp(&self, other: &Self) -> Ordering {
+        self.partial_cmp(other).unwrap() // panics if `partial_cmp` ever returns `None`
+    }
+}
+impl Eq for Key {} // marker only; equality itself comes from the `PartialEq` impl
+
+impl PartialEq for Key {
+    /// Two keys are equal when they hold the same number of values and the values at
+    /// every position are equal.
+    fn eq(&self, other: &Self) -> bool {
+        // Lengths are checked first, because `zip` stops at the shorter key.
+        if self.values.len() != other.values.len() {
+            return false;
+        }
+        self.values
+            .iter()
+            .zip(&other.values)
+            .all(|(l, r)| l == r)
+    }
+}
+
+impl PartialOrd for Key {
+    /// Orders keys lexicographically, value by value.
+    ///
+    /// When one key is a prefix of the other, the shorter key sorts first. This keeps the
+    /// ordering consistent with `PartialEq`, which treats keys of different length as unequal.
+    ///
+    /// Always returns `Some`: `Ord for Key` unwraps the result, so an incomparable pair
+    /// (NaN floats) must not surface as `None`. NaN sorts after every other float.
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        for (l, r) in self.values.iter().zip(&other.values) {
+            // The derived comparison of `Value` yields `None` only when a NaN float is involved.
+            let ord = l.partial_cmp(r).unwrap_or_else(|| match (l, r) {
+                (Value::Float(a), Value::Float(b)) => a.is_nan().cmp(&b.is_nan()),
+                _ => Ordering::Equal,
+            });
+            if ord != Ordering::Equal {
+                return Some(ord);
+            }
+        }
+        // All shared positions are equal: the key lengths decide.
+        Some(self.values.len().cmp(&other.values.len()))
+    }
+}
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..4510820
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,21 @@
+use csv::Table;
+
+fn main() {
+    let companies = Table::from_csv(include_str!("data/companies.csv"), "\t");
+    let remove = Table::from_csv(include_str!("data/remove.csv"), "\t");
+    // All inputs are embedded at compile time and parsed as tab-separated text.
+    // companies.pretty_print("*");
+    // remove.pretty_print("*");
+    let left = Table::from_csv(include_str!("data/left.csv"), "\t");
+    let right = Table::from_csv(include_str!("data/right.csv"), "\t");
+    // left.pretty_print("*");
+    // right.pretty_print("*");
+    let join1 = left.left_join(&right, "name", "name", true); // result currently unused
+    let join2 = left.right_join(&right, "name", "name", true); // result currently unused
+    // Companies with a matching row in `remove`, sorted by account id; one column is printed.
+    companies
+        .left_join(&remove, "aisAccountID", "aisaccountid", false)
+        .order_by("aisAccountID")
+        .select("aisAccountID");
+    // join2.pretty_print("*");
+}
diff --git a/src/order.rs b/src/order.rs
new file mode 100644
index 0000000..8b7d3c0
--- /dev/null
+++ b/src/order.rs
@@ -0,0 +1,17 @@
+use std::collections::BTreeMap;
+
+use crate::{Key, Table};
+
+impl Table {
+    /// Returns a copy sorted by the comma-separated columns in `expression` (ties keep row order).
+    pub fn order_by(&self, expression: &str) -> Self {
+        let indexes = self.get_indexes(expression);
+        let mut sorted_records = BTreeMap::new();
+        for (position, record) in self.iter().enumerate() {
+            let mut key: Vec<_> = indexes.iter().map(|i| record.get(*i).clone()).collect();
+            key.push(crate::value::Value::Integer(position as i64)); // tie-breaker: keeps duplicates
+            sorted_records.insert(Key::compound(key), record.clone());
+        }
+        Table { records: sorted_records, ..self.empty_copy() }
+    }
+}
diff --git a/src/print.rs b/src/print.rs
new file mode 100644
index 0000000..d542b3d
--- /dev/null
+++ b/src/print.rs
@@ -0,0 +1,104 @@
+use std::collections::HashMap;
+
+use crate::Table;
+
+impl Table {
+    /// Prints the table contents in nice columns on the command line.
+    ///
+    /// `expression` is `*` for all columns, or a comma-separated list of column names.
+    /// Panics when a listed name is not a column of this table.
+    pub fn select(&self, expression: &str) {
+        if expression == "*" {
+            return self.pretty_print_all();
+        }
+        let cols: Vec<&str> = expression.split(",").map(str::trim).collect();
+        // Report the first unknown column before anything is printed.
+        if let Some(invalid) = cols.iter().find(|c| !self.has_column(**c)) {
+            panic!("{} is not a column in this table", invalid);
+        }
+        self.pretty_print_select(cols);
+    }
+
+    /// Prints a header line with every column name, then one line per record; each cell is
+    /// left-aligned and padded to the width computed by `get_column_widths`.
+    fn pretty_print_all(&self) {
+        let column_widths = self.get_column_widths(0, usize::MAX);
+        for col in self.iter_colums() {
+            let w = column_widths.get(col).unwrap_or(&0);
+            print!("| {:<w$} ", col);
+        }
+        println!("|");
+        for record in self.iter_records() {
+            for col in self.iter_colums() {
+                let w = column_widths.get(col).unwrap_or(&0);
+                print!("| {:<w$} ", record.get(self.get_index(col)).to_string());
+            }
+            println!("|");
+        }
+    }
+
+    /// Prints a header line with the given `columns`, then one line per record; each cell is
+    /// left-aligned and padded to the width computed by `select_column_widths`.
+    fn pretty_print_select(&self, columns: Vec<&str>) {
+        let column_widths = self.select_column_widths(0, usize::MAX, &columns);
+        for col in self.select_columns(&columns) {
+            let w = column_widths.get(col).unwrap_or(&0);
+            print!("| {:<w$} ", col);
+        }
+        println!("|");
+        for record in self.iter_records() {
+            for col in self.select_columns(&columns) {
+                let w = column_widths.get(col).unwrap_or(&0);
+                print!("| {:<w$} ", record.get(self.get_index(col)).to_string());
+            }
+            println!("|");
+        }
+    }
+
+    /// Returns a map of column name -> display width, needed for printing nice columns.
+    ///
+    /// The width is the longest of the column name and the printed form of the column's
+    /// values in the selected rows, counted in characters.
+    /// The following parameters allow for paging views:
+    /// offset: start at rowindex
+    /// nrecords: take n records after offset
+    fn get_column_widths(&self, offset: usize, nrecords: usize) -> HashMap<&String, usize> {
+        // Start every column at the width of its own name.
+        let mut widths: HashMap<&String, usize> =
+            self.iter_colums().map(|col| (col, col.chars().count())).collect();
+        for record in self.iter_records().skip(offset).take(nrecords) {
+            for (col, width) in widths.iter_mut() {
+                // Measure what is actually printed (text is quoted, NULL prints as "NULL").
+                let printed = record.get(self.get_index(col)).to_string();
+                *width = (*width).max(printed.chars().count());
+            }
+        }
+        widths
+    }
+
+    /// Returns a map of column name -> display width for the given `columns`, needed for
+    /// printing nice columns.
+    /// The width is the longest of the column name and the printed form of the column's
+    /// values in the selected rows, counted in characters.
+    /// The following parameters allow for paging views:
+    /// offset: start at rowindex
+    /// nrecords: take n records after offset
+    fn select_column_widths<'a>(
+        &'a self,
+        offset: usize,
+        nrecords: usize,
+        columns: &'a Vec<&'a str>,
+    ) -> HashMap<&'a str, usize> {
+        // Start every column at the width of its own name.
+        let mut widths: HashMap<&'a str, usize> =
+            self.select_columns(columns).map(|col| (col, col.chars().count())).collect();
+        for record in self.iter_records().skip(offset).take(nrecords) {
+            for (col, width) in widths.iter_mut() {
+                // Measure what is actually printed (text is quoted, NULL prints as "NULL").
+                let printed = record.get(self.get_index(col)).to_string();
+                *width = (*width).max(printed.chars().count());
+            }
+        }
+        widths
+    }
+}
diff --git a/src/read.rs b/src/read.rs
new file mode 100644
index 0000000..d084ecd
--- /dev/null
+++ b/src/read.rs
@@ -0,0 +1,21 @@
+use crate::{Record, Table};
+
+impl Table {
+    /// Builds a table named "test" from delimited text: the first line holds the column
+    /// names, every later non-empty line is one record. Handles `\n` and `\r\n` endings.
+    pub fn from_csv(csv: &str, separator: &str) -> Self {
+        let mut table = Table::new("test");
+        // `lines()` also strips the `\r` of Windows line endings, which `split("\n")`
+        // left attached to the last column name and the last value of every row.
+        for (index, row) in csv.lines().enumerate() {
+            if index == 0 {
+                row.split(separator).for_each(|col| table.add_column(col, true));
+            } else if !row.is_empty() {
+                let mut record = Record::default();
+                row.split(separator).for_each(|value| record.add_value(value));
+                table.add_record(record);
+            }
+        }
+        table
+    }
+}
diff --git a/src/value.rs b/src/value.rs
new file mode 100644
index 0000000..d0eda9e
--- /dev/null
+++ b/src/value.rs
@@ -0,0 +1,76 @@
+use std::fmt::Display;
+
+#[derive(Debug, PartialEq, PartialOrd, Clone)]
+pub enum Value { // a single cell value; the derived ordering ranks variants in declaration order
+    Text(String), // any text that did not parse as a number
+    Float(f64), // floating-point number
+    Integer(i64), // whole number
+    NULL, // missing value; `Record::get` returns it for positions a record does not have
+}
+
+impl Value {
+    pub fn len(&self) -> usize { // length in bytes of the value's unquoted text
+        match self {
+            Value::Text(text) => text.len(), // the raw string, without the quotes `Display` adds
+            Value::Float(float) => format!("{}", float).len(), // length of the `{}` rendering
+            Value::Integer(integer) => format!("{}", integer).len(), // length of the `{}` rendering
+            Value::NULL => 0, // note: `Display` prints "NULL", but the length here is 0
+        }
+    }
+}
+
+impl Display for Value {
+    /// Numbers print plainly, text is wrapped in double quotes, and `NULL` prints as `NULL`.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Value::Float(float) => write!(f, "{}", float),
+            Value::Integer(integer) => write!(f, "{}", integer),
+            Value::Text(text) => write!(f, "\"{}\"", text),
+            Value::NULL => f.write_str("NULL"),
+        }
+    }
+}
+
+impl From<&str> for Value {
+    /// Converts raw text into a typed value.
+    /// Tries an integer first, then a float; anything else becomes `Value::Text` with one
+    /// pair of surrounding double quotes removed.
+    fn from(text: &str) -> Self {
+        // Integers must be tried first: every integer literal also parses as `f64`, so the
+        // float branch would otherwise swallow them and lose precision on large numbers.
+        if let Ok(i) = text.parse::<i64>() {
+            Value::Integer(i)
+        } else if let Ok(f) = text.parse::<f64>() {
+            Value::Float(f)
+        } else {
+            Value::Text(strip_quotes(text))
+        }
+    }
+}
+
+impl From<String> for Value {
+    /// Same conversion as `From<&str>`, for owned strings.
+    fn from(text: String) -> Self {
+        Value::from(text.as_str())
+    }
+}
+
+impl Into<Value> for f64 {
+    fn into(self) -> Value { // wraps the number as `Value::Float`
+        Value::Float(self)
+    }
+}
+
+impl Into<Value> for i64 {
+    fn into(self) -> Value { // wraps the number as `Value::Integer`
+        Value::Integer(self)
+    }
+}
+
+/// Removes one pair of surrounding double quotes; a lone `"` is returned unchanged.
+fn strip_quotes(text: impl Into<String>) -> String {
+    let text = text.into();
+    let inner = text.strip_prefix('"').and_then(|rest| rest.strip_suffix('"'));
+    // `strip_prefix` consumes the opening quote first, so a lone `"` cannot match twice.
+    inner.map(str::to_string).unwrap_or(text)
+}