first commit

This commit is contained in:
Sander Hautvast 2025-02-13 21:54:41 +01:00
commit 9e808b2b47
12 changed files with 586 additions and 0 deletions

2
.gitignore vendored Normal file
View file

@ -0,0 +1,2 @@
/target
.DS_Store

23
Cargo.lock generated Normal file
View file

@ -0,0 +1,23 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 4
[[package]]
name = "anyhow"
version = "1.0.95"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34ac096ce696dc2fcabef30516bb13c0a68a11d30131d3df6f04711467681b04"
[[package]]
name = "byteorder"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
[[package]]
name = "csv"
version = "0.1.0"
dependencies = [
"anyhow",
"byteorder",
]

8
Cargo.toml Normal file
View file

@ -0,0 +1,8 @@
[package]
name = "csv"
version = "0.1.0"
edition = "2021"
[dependencies]
byteorder = "1.5"
anyhow = "1.0"

1
README.md Normal file
View file

@ -0,0 +1 @@
An attempt to query CSV files (like in SQLite after a CSV import).

19
src/groupby.rs Normal file
View file

@ -0,0 +1,19 @@
use crate::Table;
impl Table {
pub fn group_by(&self, select_expression: &str, group_by_expression: &str) -> Table {
let table = Table::new("");
table
}
}
// fn parse_select(select: &str){
// }
// enum Aggregation {
// Sum(String),
// Max(String),
// Min(String),
// }

29
src/join.rs Normal file
View file

@ -0,0 +1,29 @@
use crate::Table;
impl Table {
    /// Joins `self` with `right`, matching `self.left_col` against `right.right_col`.
    ///
    /// Delegates to [`join`] with `self` as the table whose records are iterated.
    /// When `outer` is `true`, records of `self` without a match are kept as well.
    pub fn left_join(&self, right: &Table, left_col: &str, right_col: &str, outer: bool) -> Table {
        let (driving, lookup) = (self, right);
        join(driving, lookup, left_col, right_col, outer)
    }

    /// Same as [`Table::left_join`] with the roles swapped: the records of `right`
    /// are iterated and matched against `self`, so the columns of `right` come first
    /// in the result.
    pub fn right_join(&self, right: &Table, left_col: &str, right_col: &str, outer: bool) -> Table {
        let (driving, lookup) = (right, self);
        join(driving, lookup, right_col, left_col, outer)
    }
}
/// Joins `left` and `right` on `left.left_col == right.right_col`.
///
/// The result is a new table named `"join"` that holds the columns of `left`
/// followed by the columns of `right`; clashing column names get a numeric suffix
/// (see `Table::add_column`).
///
/// Every record of `left` is combined with **every** record of `right` that has an
/// equal value in the join column, so a left record with several matches yields
/// several joined records. When `outer` is `true`, a left record without any match
/// is still emitted on its own; the missing right-hand cells read as `NULL` because
/// `Record::get` returns `NULL` for absent positions.
///
/// # Panics
/// Panics when `left_col` or `right_col` is not a column of the respective table.
pub fn join(left: &Table, right: &Table, left_col: &str, right_col: &str, outer: bool) -> Table {
    let mut joined = Table::new("join");
    for col in left.cols.iter().chain(right.cols.iter()) {
        joined.add_column(col, true);
    }

    let left_col_index = left.get_index(left_col);
    let right_col_index = right.get_index(right_col);

    for record in left.iter_records() {
        let wanted = record.get(left_col_index);
        let mut matched = false;
        // Emit one joined record per matching right-hand record, not just the first.
        for right_record in right
            .iter_records()
            .filter(|candidate| candidate.get(right_col_index) == wanted)
        {
            joined.add_record(record + right_record);
            matched = true;
        }
        if !matched && outer {
            joined.add_record(record.clone());
        }
    }
    joined
}

265
src/lib.rs Normal file
View file

@ -0,0 +1,265 @@
pub mod groupby;
pub mod join;
pub mod order;
pub mod print;
pub mod read;
pub mod value;
use std::{
cmp::Ordering,
collections::{BTreeMap, HashMap},
ops::Add,
};
use value::Value;
/// An in-memory table: a named list of columns plus the records stored under them.
pub struct Table {
/// Name given to the table when it was created.
name: String,
/// Maps a column name to its position in `cols`.
cols_by_name: HashMap<String, usize>,
/// Column names in the order in which they were added.
cols: Vec<String>,
/// Records stored under their `Key`; iteration follows key order.
records: BTreeMap<Key, Record>,
}
impl Table {
    /// Creates a table with the given name, without columns or records.
    pub fn new(name: impl Into<String>) -> Self {
        Self {
            name: name.into(),
            cols_by_name: HashMap::new(),
            cols: Vec::new(),
            records: BTreeMap::new(),
        }
    }

    /// Creates a new table with the same name and columns as self,
    /// but without data
    // Note to self: be careful, might be dangerous to use once tables can be altered.
    // That is not yet implemented. May need full copies
    pub fn empty_copy(&self) -> Self {
        Self {
            name: self.name.clone(),
            cols_by_name: self.cols_by_name.clone(),
            cols: self.cols.clone(),
            records: BTreeMap::new(),
        }
    }

    /// Stores `record` under an integer key equal to the current number of records.
    pub fn add_record(&mut self, record: Record) {
        let index = self.records.len();
        self.records.insert(Key::integer(index), record);
    }

    /// Returns `true` when a column with exactly this name exists.
    pub fn has_column(&self, name: impl Into<String>) -> bool {
        self.cols_by_name.contains_key(&name.into())
    }

    /// Appends a column.
    ///
    /// With `allow_duplicates == true` a name that is already taken is made unique
    /// by appending a counter starting at 2 (`name`, `name2`, `name3`, ...).
    /// With `allow_duplicates == false` the name is used as is; if it already exists
    /// the name lookup then points at the new column while the older column keeps
    /// its place in the column list.
    pub fn add_column(&mut self, name: impl Into<String>, allow_duplicates: bool) {
        let col_index = self.cols.len();
        let orig_name: String = name.into();
        let name = if allow_duplicates {
            let mut candidate = orig_name.clone();
            let mut suffix = 2;
            while self.cols_by_name.contains_key(&candidate) {
                candidate = format!("{}{}", orig_name, suffix);
                suffix += 1;
            }
            candidate
        } else {
            orig_name
        };
        self.cols_by_name.insert(name.clone(), col_index);
        self.cols.push(name);
    }

    /// Resolves a comma separated list of column names to their indexes.
    ///
    /// Panics when one of the names is not a column of this table.
    fn get_indexes(&self, expression: &str) -> Vec<usize> {
        expression
            .split(',')
            .map(|name| self.get_index(name.trim()))
            .collect()
    }

    /// Returns the index of the column called `col_name`.
    ///
    /// Panics, naming the offending column, when there is no such column.
    fn get_index(&self, col_name: &str) -> usize {
        match self.cols_by_name.get(col_name) {
            Some(index) => *index,
            None => panic!("{} is not a column in this table", col_name),
        }
    }

    /// Iterates over the records in key order; same as [`Table::iter_records`].
    pub fn iter(&self) -> TableIter<'_> {
        self.iter_records()
    }

    /// Iterates over the records in key order.
    pub fn iter_records(&self) -> TableIter<'_> {
        TableIter {
            table_iter: self.records.iter(),
        }
    }

    /// Iterates over the caller supplied column names, in the given order.
    /// The names are not checked against the columns of this table.
    pub fn select_columns<'a>(&'a self, columns: &'a Vec<&'a str>) -> OwnedColIter<'a> {
        OwnedColIter {
            cols: columns,
            index: 0,
        }
    }

    /// Iterates over the names of all columns, in the order they were added.
    pub fn iter_colums(&self) -> ColIter<'_> {
        ColIter {
            cols: &self.cols,
            index: 0,
        }
    }

    /// Returns the first record (in key order) whose value at `colindex` equals
    /// `value`, or `None` when no record matches.
    pub fn where_clause(&self, colindex: usize, value: &Value) -> Option<&Record> {
        self.iter_records()
            .find(|record| record.get(colindex) == value)
    }
}
/// One row of a table: the cell values in column order.
#[derive(Debug, Clone)]
pub struct Record {
    values: Vec<Value>,
}

impl Record {
    /// Sum of `Value::len` over all values of this record
    /// (note: this is not the number of values).
    pub fn len(&self) -> usize {
        self.values
            .iter()
            .fold(0, |total, value| total + value.len())
    }

    /// Converts `value` into a [`Value`] and appends it as the next cell.
    pub fn add_value(&mut self, value: impl Into<Value>) {
        let cell: Value = value.into();
        self.values.push(cell);
    }

    /// Returns the value at `index`, or `NULL` when the record has no such position.
    pub fn get(&self, index: usize) -> &Value {
        match self.values.get(index) {
            Some(value) => value,
            None => &Value::NULL,
        }
    }
}
impl Add for &Record {
type Output = Record;
fn add(self, rhs: Self) -> Self::Output {
let mut sum = Record::default();
sum.values.append(&mut self.values.clone());
sum.values.append(&mut rhs.values.clone()); // use refs?
sum
}
}
impl Default for Record {
fn default() -> Self {
Self { values: vec![] }
}
}
/// Iterator over the records of a table, in the order of their keys.
pub struct TableIter<'a> {
    table_iter: std::collections::btree_map::Iter<'a, Key, Record>,
}

impl<'a> Iterator for TableIter<'a> {
    type Item = &'a Record;

    /// Advances the underlying map iterator and hands out only the record,
    /// dropping the key.
    fn next(&mut self) -> Option<Self::Item> {
        let (_key, record) = self.table_iter.next()?;
        Some(record)
    }
}
/// Iterator over a borrowed list of owned column names.
pub struct ColIter<'a> {
    cols: &'a Vec<String>,
    index: usize,
}

/// Iterator over a borrowed list of borrowed column names.
pub struct OwnedColIter<'a> {
    cols: &'a Vec<&'a str>,
    index: usize,
}

impl<'a> Iterator for ColIter<'a> {
    type Item = &'a String;

    /// Yields the name at the current position and moves one position forward;
    /// returns `None` once the end of the list is reached.
    fn next(&mut self) -> Option<Self::Item> {
        let name = self.cols.get(self.index)?;
        self.index += 1;
        Some(name)
    }
}

impl<'a> Iterator for OwnedColIter<'a> {
    type Item = &'a str;

    /// Yields the name at the current position and moves one position forward;
    /// returns `None` once the end of the list is reached.
    fn next(&mut self) -> Option<Self::Item> {
        let name = *self.cols.get(self.index)?;
        self.index += 1;
        Some(name)
    }
}
/// Key under which a record is stored in a table: one or more values that are
/// compared position by position.
struct Key {
    values: Vec<Value>,
}

impl Key {
    /// Single-value key holding `integer` as a `Value::Integer`.
    fn integer(integer: usize) -> Self {
        Self::compound(vec![Value::Integer(integer as i64)])
    }

    /// Key made of the given values, in the given order.
    fn compound(keys: Vec<Value>) -> Self {
        Key { values: keys }
    }
}
impl Ord for Key {
    /// Compares two keys value by value; when one key is a prefix of the other,
    /// the shorter key sorts first.
    ///
    /// The length tie-break keeps the ordering consistent with `PartialEq`
    /// (keys of different length are never equal), which `BTreeMap` relies on.
    ///
    /// # Panics
    /// Panics when two values at the same position cannot be compared
    /// (for example a floating point NaN).
    fn cmp(&self, other: &Self) -> Ordering {
        for (left, right) in self.values.iter().zip(&other.values) {
            let ordering = left
                .partial_cmp(right)
                .expect("key values must be comparable (NaN cannot be ordered)");
            if ordering != Ordering::Equal {
                return ordering;
            }
        }
        self.values.len().cmp(&other.values.len())
    }
}

impl Eq for Key {}

impl PartialEq for Key {
    /// Two keys are equal when they have the same length and all values are
    /// pairwise equal.
    fn eq(&self, other: &Self) -> bool {
        self.values == other.values
    }
}

impl PartialOrd for Key {
    /// Always `Some`: delegates to the total order defined by [`Ord::cmp`].
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

21
src/main.rs Normal file
View file

@ -0,0 +1,21 @@
use csv::Table;
/// Small demo: loads four tab separated fixtures that are embedded at compile time,
/// joins them and prints one column of the companies/remove join.
fn main() {
    const SEPARATOR: &str = "\t";

    let companies = Table::from_csv(include_str!("data/companies.csv"), SEPARATOR);
    let remove = Table::from_csv(include_str!("data/remove.csv"), SEPARATOR);
    let left = Table::from_csv(include_str!("data/left.csv"), SEPARATOR);
    let right = Table::from_csv(include_str!("data/right.csv"), SEPARATOR);

    // Both joins are computed but their results are not printed.
    let _left_joined = left.left_join(&right, "name", "name", true);
    let _right_joined = left.right_join(&right, "name", "name", true);

    // Companies that also occur in `remove`, ordered by account id.
    let matching = companies.left_join(&remove, "aisAccountID", "aisaccountid", false);
    let ordered = matching.order_by("aisAccountID");
    ordered.select("aisAccountID");
}

17
src/order.rs Normal file
View file

@ -0,0 +1,17 @@
use std::collections::BTreeMap;
use crate::{Key, Table};
impl Table {
pub fn order_by(&self, expression: &str) -> Self {
let indexes = self.get_indexes(expression);
let mut sorted_records = BTreeMap::new();
for record in self.iter() {
let key = indexes.iter().map(|i| record.get(*i).clone()).collect();
sorted_records.insert(Key::compound(key), record.clone());
}
let mut ordered = Table::empty_copy(self);
ordered.records = sorted_records;
ordered
}
}

104
src/print.rs Normal file
View file

@ -0,0 +1,104 @@
use std::collections::HashMap;
use crate::Table;
impl Table {
    /// Prints the table contents in nice columns on the command line.
    ///
    /// `expression` is either `*` (all columns) or a comma separated list of
    /// column names, printed in the given order.
    ///
    /// # Panics
    /// Panics when `expression` names a column that is not part of this table.
    pub fn select(&self, expression: &str) {
        if expression == "*" {
            self.pretty_print_all();
        } else {
            let cols = expression
                .split(',')
                .map(str::trim)
                .collect::<Vec<&str>>();
            if let Some(invalid) = cols.iter().find(|name| !self.has_column(**name)) {
                panic!("{} is not a column in this table", invalid);
            }
            self.pretty_print_select(cols);
        }
    }

    /// Prints a header line followed by one line per record, for all columns.
    fn pretty_print_all(&self) {
        let column_widths = self.get_column_widths(0, usize::MAX);
        for col in self.iter_colums() {
            let w = column_widths.get(col).copied().unwrap_or(0);
            print!("| {:<w$} ", col);
        }
        println!("|");
        for record in self.iter_records() {
            for col in self.iter_colums() {
                let w = column_widths.get(col).copied().unwrap_or(0);
                print!("| {:<w$} ", record.get(self.get_index(col)).to_string());
            }
            println!("|");
        }
    }

    /// Prints a header line followed by one line per record, restricted to `columns`.
    fn pretty_print_select(&self, columns: Vec<&str>) {
        let column_widths = self.select_column_widths(0, usize::MAX, &columns);
        for col in self.select_columns(&columns) {
            let w = column_widths.get(col).copied().unwrap_or(0);
            print!("| {:<w$} ", col);
        }
        println!("|");
        for record in self.iter_records() {
            for col in self.select_columns(&columns) {
                let w = column_widths.get(col).copied().unwrap_or(0);
                print!("| {:<w$} ", record.get(self.get_index(col)).to_string());
            }
            println!("|");
        }
    }

    /// Returns a map of column name -> widest printed cell (header included),
    /// needed for printing nice columns.
    ///
    /// Widths are measured in characters of the text that is actually printed
    /// (`to_string()` of the value, so quotes around text and the word `NULL`
    /// are counted), because that is what the `{:<w$}` padding works on.
    ///
    /// The following parameters allow for paging views:
    /// offset: start at rowindex
    /// nrecords: take n records after offset
    fn get_column_widths(&self, offset: usize, nrecords: usize) -> HashMap<&String, usize> {
        // start with the width of the column name
        let mut widths: HashMap<&String, usize> = self
            .iter_colums()
            .map(|col| (col, col.chars().count()))
            .collect();
        for record in self.iter_records().skip(offset).take(nrecords) {
            for col in self.iter_colums() {
                let printed = record
                    .get(self.get_index(col))
                    .to_string()
                    .chars()
                    .count();
                let width = widths.entry(col).or_insert(0);
                *width = (*width).max(printed);
            }
        }
        widths
    }

    /// Returns a map of column name -> widest printed cell (header included) for
    /// the given `columns`, needed for printing nice columns.
    ///
    /// Widths are measured in characters of the text that is actually printed
    /// (`to_string()` of the value).
    ///
    /// The following parameters allow for paging views:
    /// offset: start at rowindex
    /// nrecords: take n records after offset
    fn select_column_widths<'a>(
        &'a self,
        offset: usize,
        nrecords: usize,
        columns: &'a Vec<&'a str>,
    ) -> HashMap<&'a str, usize> {
        // start with the width of the column name
        let mut widths: HashMap<&'a str, usize> = self
            .select_columns(columns)
            .map(|col| (col, col.chars().count()))
            .collect();
        for record in self.iter_records().skip(offset).take(nrecords) {
            for col in self.select_columns(columns) {
                let printed = record
                    .get(self.get_index(col))
                    .to_string()
                    .chars()
                    .count();
                let width = widths.entry(col).or_insert(0);
                *width = (*width).max(printed);
            }
        }
        widths
    }
}

21
src/read.rs Normal file
View file

@ -0,0 +1,21 @@
use crate::{Record, Table};
impl Table {
    /// Builds a table named `"test"` from delimiter separated text.
    ///
    /// The first line supplies the column names (duplicates get a numeric suffix);
    /// every following non-empty line becomes one record. Cells are split on
    /// `separator` only; there is no support for quoted cells that contain the
    /// separator. Both `\n` and `\r\n` line endings are accepted.
    pub fn from_csv(csv: &str, separator: &str) -> Self {
        let mut table = Table::new("test");
        // `lines()` also strips the `\r` of Windows line endings, which a plain
        // split on "\n" would leave attached to the last cell of every row.
        let mut rows = csv.lines();

        // Empty input still yields a single header cell (""), as a split would.
        let header = rows.next().unwrap_or("");
        for col in header.split(separator) {
            table.add_column(col, true);
        }

        for row in rows.filter(|row| !row.is_empty()) {
            let mut record = Record::default();
            for value in row.split(separator) {
                record.add_value(value);
            }
            table.add_record(record);
        }
        table
    }
}

76
src/value.rs Normal file
View file

@ -0,0 +1,76 @@
use std::fmt::Display;
/// A single cell of a table.
///
/// Equality and ordering are implemented by hand (instead of derived) so that
/// `Integer` and `Float` compare by numeric value. Values of different kinds
/// order as: text, then numbers, then `NULL`.
#[derive(Debug, Clone)]
pub enum Value {
    Text(String),
    Float(f64),
    Integer(i64),
    NULL,
}

impl Value {
    /// Length in bytes of the text for `Text`, of the decimal rendering for
    /// numbers, and `0` for `NULL`.
    pub fn len(&self) -> usize {
        match self {
            Value::Text(text) => text.len(),
            Value::Float(float) => float.to_string().len(),
            Value::Integer(integer) => integer.to_string().len(),
            Value::NULL => 0,
        }
    }

    /// Rank used to order values of different kinds.
    fn kind_rank(&self) -> u8 {
        match self {
            Value::Text(_) => 0,
            Value::Float(_) | Value::Integer(_) => 1,
            Value::NULL => 2,
        }
    }
}

impl PartialEq for Value {
    /// Same-kind values compare by payload; an `Integer` equals a `Float` when
    /// the integer converted to `f64` equals the float.
    fn eq(&self, other: &Self) -> bool {
        match (self, other) {
            (Value::Text(l), Value::Text(r)) => l == r,
            (Value::Float(l), Value::Float(r)) => l == r,
            (Value::Integer(l), Value::Integer(r)) => l == r,
            (Value::Integer(i), Value::Float(f)) | (Value::Float(f), Value::Integer(i)) => {
                (*i as f64) == *f
            }
            (Value::NULL, Value::NULL) => true,
            _ => false,
        }
    }
}

impl PartialOrd for Value {
    /// Numbers compare numerically regardless of kind; everything else compares
    /// by payload within a kind and by kind rank across kinds. Returns `None`
    /// only when a NaN is involved.
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        match (self, other) {
            (Value::Text(l), Value::Text(r)) => l.partial_cmp(r),
            (Value::Float(l), Value::Float(r)) => l.partial_cmp(r),
            (Value::Integer(l), Value::Integer(r)) => l.partial_cmp(r),
            (Value::Integer(l), Value::Float(r)) => (*l as f64).partial_cmp(r),
            (Value::Float(l), Value::Integer(r)) => l.partial_cmp(&(*r as f64)),
            _ => self.kind_rank().partial_cmp(&other.kind_rank()),
        }
    }
}

impl Display for Value {
    /// Numbers print in their default decimal form, text prints wrapped in double
    /// quotes and `NULL` prints as the word `NULL`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Value::Float(float) => write!(f, "{}", float),
            Value::Integer(integer) => write!(f, "{}", integer),
            Value::Text(text) => write!(f, "\"{}\"", text),
            Value::NULL => write!(f, "NULL"),
        }
    }
}

/// Parses `text` as a number. Integers are tried first: every integer literal is
/// also a valid float literal, so trying `f64` first would never produce an
/// `Integer` and would lose precision on large integers.
fn parse_number(text: &str) -> Option<Value> {
    if let Ok(integer) = text.parse::<i64>() {
        Some(Value::Integer(integer))
    } else {
        text.parse::<f64>().ok().map(Value::Float)
    }
}

impl From<&str> for Value {
    /// `Integer` when the text is an `i64`, otherwise `Float` when it parses as
    /// `f64`, otherwise `Text` with one pair of surrounding double quotes removed.
    fn from(text: &str) -> Self {
        parse_number(text).unwrap_or_else(|| Value::Text(strip_quotes(text)))
    }
}

impl From<String> for Value {
    /// Same rules as the `&str` conversion.
    fn from(text: String) -> Self {
        parse_number(&text).unwrap_or_else(|| Value::Text(strip_quotes(text)))
    }
}

impl From<f64> for Value {
    fn from(float: f64) -> Self {
        Value::Float(float)
    }
}

impl From<i64> for Value {
    fn from(integer: i64) -> Self {
        Value::Integer(integer)
    }
}

/// Removes one leading and one trailing double quote when both are present.
/// Text that is not quoted on both sides, including a lone `"`, is returned unchanged.
fn strip_quotes(text: impl Into<String>) -> String {
    let text = text.into();
    let unquoted = text
        .strip_prefix('"')
        .and_then(|rest| rest.strip_suffix('"'))
        .map(str::to_string);
    unquoted.unwrap_or(text)
}