From 3281edc3509dd55e1b4d08d3b8d1d811cd3e93ea Mon Sep 17 00:00:00 2001 From: Sander Hautvast Date: Tue, 18 Feb 2025 22:09:50 +0100 Subject: [PATCH] binary storage, start of vm --- Cargo.lock | 7 + Cargo.toml | 1 + examples/join/main.rs | 6 +- examples/orderby/main.rs | 4 +- src/join.rs | 2 +- src/lib.rs | 263 +---------------------------- src/main.rs | 7 +- src/order.rs | 6 +- src/page.rs | 71 ++++++++ src/print.rs | 6 +- src/read.rs | 57 ++++++- src/sql/scanner.rs | 4 +- src/table.rs | 273 ++++++++++++++++++++++++++++++ src/value.rs | 348 +++++++++++++++++++++++++++++++++++---- src/varint.rs | 132 +++++++++++++++ src/vm/mod.rs | 40 +++++ 16 files changed, 921 insertions(+), 306 deletions(-) create mode 100644 src/page.rs create mode 100644 src/table.rs create mode 100644 src/varint.rs create mode 100644 src/vm/mod.rs diff --git a/Cargo.lock b/Cargo.lock index 00761aa..3d8f336 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8,9 +8,16 @@ version = "1.0.95" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34ac096ce696dc2fcabef30516bb13c0a68a11d30131d3df6f04711467681b04" +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "csv" version = "0.1.0" dependencies = [ "anyhow", + "byteorder", ] diff --git a/Cargo.toml b/Cargo.toml index 9c93cee..3928faa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,4 +4,5 @@ version = "0.1.0" edition = "2021" [dependencies] +byteorder = "1.5" anyhow = "1.0" diff --git a/examples/join/main.rs b/examples/join/main.rs index 0eb9532..0a77ba7 100644 --- a/examples/join/main.rs +++ b/examples/join/main.rs @@ -1,8 +1,8 @@ -use csv::Table; +use csv::table::Table; fn main() { - let left = Table::from_csv(include_str!("data/left.csv"), "\t"); - let right = Table::from_csv(include_str!("data/right.csv"), "\t"); + let left = Table::from_csv(include_str!("data/left.csv"), Some("\t")); + let right = Table::from_csv(include_str!("data/right.csv"), Some("\t")); println!("left:"); left.select("*"); println!("\nright:"); diff --git a/examples/orderby/main.rs b/examples/orderby/main.rs index 4aa8c7d..24ab143 100644 --- a/examples/orderby/main.rs +++ b/examples/orderby/main.rs @@ -1,7 +1,7 @@ -use csv::Table; +use csv::table::Table; fn main() { - let table = Table::from_csv(include_str!("data/table.csv"), "\t"); + let table = Table::from_csv(include_str!("data/table.csv"), Some("\t")); println!("not ordered:"); table.select("*"); diff --git a/src/join.rs b/src/join.rs index e55cde6..c85be63 100644 --- a/src/join.rs +++ b/src/join.rs @@ -1,4 +1,4 @@ -use crate::Table; +use crate::table::Table; impl Table { pub fn left_join(&self, right: &Table, left_col: &str, right_col: &str, outer: bool) -> Table { diff --git a/src/lib.rs b/src/lib.rs index a0faeb5..595d069 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,266 +1,11 @@ pub mod groupby; pub mod join; pub mod order; +mod page; pub mod print; pub mod read; pub mod sql; +pub mod table; pub mod value; - -use std::{ - cmp::Ordering, - collections::{BTreeMap, HashMap}, - ops::Add, -}; - -use value::Value; - -pub struct Table { - name: String, - cols_by_name: HashMap, - cols: Vec, - records: BTreeMap, -} - -impl Table { - pub fn new(name: impl Into) -> Self { - Self { - name: name.into(), - cols_by_name: HashMap::new(), - cols: vec![], - records: BTreeMap::new(), - } - } - - /// Creates a new table with the same name and columns as self, - /// but without data - // Note to self: be careful, might be dangerous to use once tables can be altered. - // That is not yet implemented. May need full copies - pub fn empty_copy(&self) -> Self { - let mut result = Table::new(self.name.clone()); - result.cols_by_name = self.cols_by_name.clone(); - result.cols = self.cols.clone(); - result - } - - pub fn add_record(&mut self, record: Record) { - let index = self.records.len(); - self.records.insert(Key::integer(index), record); - } - - pub fn has_column(&self, name: impl Into) -> bool { - self.cols_by_name.contains_key(&name.into()) - } - - pub fn add_column(&mut self, name: impl Into, allow_duplicates: bool) { - let col_index = self.cols.len(); - let orig_name: String = name.into(); - - let name = if allow_duplicates { - // append an index when there are duplicate column names - let mut col_name = orig_name.to_string(); - let mut index = 2; - - while self.has_column(&col_name) { - col_name = orig_name.to_string(); - col_name.push_str(format!("{}", index).as_str()); - index += 1; - } - col_name - } else { - orig_name - }; - - self.cols_by_name.insert(name.clone(), col_index); - self.cols.push(name); - } - - fn get_indexes(&self, expression: &str) -> Vec { - expression - .split(",") - .map(|c| self.get_index(c.trim())) - .collect::>() - } - - fn get_index(&self, col_name: &str) -> usize { - *self.cols_by_name.get(col_name).unwrap() - } - - pub fn iter(&self) -> TableIter { - self.iter_records() - } - - pub fn iter_records(&self) -> TableIter { - TableIter { - table_iter: self.records.iter(), - } - } - - pub fn select_columns<'a>(&'a self, columns: &'a Vec<&'a str>) -> OwnedColIter<'a> { - OwnedColIter { - cols: columns, - index: 0, - } - } - - pub fn iter_colums(&self) -> ColIter { - ColIter { - cols: &self.cols, - index: 0, - } - } - - pub fn where_clause(&self, colindex: usize, value: &Value) -> Option<&Record> { - for record in self.iter_records() { - let r = record.get(colindex); - if r == value { - return Some(record); - } - } - None - } -} - -#[derive(Debug, Clone)] -pub struct Record { - values: Vec, -} - -impl Record { - pub fn len(&self) -> usize { - self.values.iter().map(Value::len).sum() - } - - pub fn add_value(&mut self, value: impl Into) { - self.values.push(value.into()); - } - - pub fn get(&self, index: usize) -> &Value { - self.values.get(index).unwrap_or(&Value::NULL) - } -} - -impl Add for &Record { - type Output = Record; - - fn add(self, rhs: Self) -> Self::Output { - let mut sum = Record::default(); - sum.values.append(&mut self.values.clone()); - sum.values.append(&mut rhs.values.clone()); // use refs? - sum - } -} - -impl Default for Record { - fn default() -> Self { - Self { values: vec![] } - } -} - -pub struct TableIter<'a> { - table_iter: std::collections::btree_map::Iter<'a, Key, Record>, -} - -impl<'a> Iterator for TableIter<'a> { - type Item = &'a Record; - - fn next(&mut self) -> Option { - self.table_iter.next().map(|e| e.1) - } -} - -pub struct ColIter<'a> { - cols: &'a Vec, - index: usize, -} - -pub struct OwnedColIter<'a> { - cols: &'a Vec<&'a str>, - index: usize, -} - -impl<'a> Iterator for ColIter<'a> { - type Item = &'a String; - - fn next(&mut self) -> Option { - if let Some(v) = self.cols.get(self.index) { - self.index += 1; - Some(v) - } else { - None - } - } -} - -impl<'a> Iterator for OwnedColIter<'a> { - type Item = &'a str; - - fn next(&mut self) -> Option { - if let Some(v) = self.cols.get(self.index) { - self.index += 1; - Some(v) - } else { - None - } - } -} - -struct Key { - values: Vec, -} - -impl Key { - fn integer(integer: usize) -> Self { - Self { - values: vec![Value::Integer(integer as i64)], - } - } - - fn compound(keys: Vec) -> Self { - Self { values: keys } - } -} -impl Ord for Key { - fn cmp(&self, other: &Self) -> Ordering { - self.partial_cmp(other).unwrap() - } -} -impl Eq for Key {} - -impl PartialEq for Key { - fn eq(&self, other: &Self) -> bool { - if self.values.len() != other.values.len() { - false - } else { - for (l, r) in self.values.iter().zip(&other.values) { - if l != r { - return false; - } - } - true - } - } -} - -impl PartialOrd for Key { - fn partial_cmp(&self, other: &Self) -> Option { - let len = self.values.len().min(other.values.len()); - for i in 0..len { - let ord = self - .values - .get(i) - .unwrap() - .partial_cmp(other.values.get(i).unwrap()) - .unwrap(); - match ord { - Ordering::Less => { - return Some(Ordering::Less); - } - Ordering::Greater => { - return Some(Ordering::Greater); - } - _ => {} - } - } - Some(Ordering::Equal) - } -} +mod varint; +pub mod vm; diff --git a/src/main.rs b/src/main.rs index bc56ed2..94bb43f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,4 +1,7 @@ +use csv::table::Table; -fn main (){ - +fn main() { + let csv = include_str!("data/portfolios.csv"); + let table = Table::from_csv(csv, None); + table.order_by("name").select("*"); } diff --git a/src/order.rs b/src/order.rs index 8b7d3c0..5fc07ac 100644 --- a/src/order.rs +++ b/src/order.rs @@ -1,10 +1,12 @@ use std::collections::BTreeMap; -use crate::{Key, Table}; +use crate::table::{Key, Table}; impl Table { pub fn order_by(&self, expression: &str) -> Self { - let indexes = self.get_indexes(expression); + let indexes = self.get_column_indexes(expression); + if self.views.contains_key(expression) {} + let mut sorted_records = BTreeMap::new(); for record in self.iter() { let key = indexes.iter().map(|i| record.get(*i).clone()).collect(); diff --git a/src/page.rs b/src/page.rs new file mode 100644 index 0000000..80b2918 --- /dev/null +++ b/src/page.rs @@ -0,0 +1,71 @@ +use std::ops::Add; + +use crate::value::{Value, NULL}; + +const PAGE_SIZE: usize = 4096; + +pub enum PageType { + Root, + Interior, + Leaf, +} + +pub struct Page { + pagetype: PageType, + data: Vec, + index_pos: u16, + data_pos: u16, + key: usize, + children: Vec, +} + +impl Page { + pub fn new(pagetype: PageType) -> Self { + Self { + pagetype, + data: vec![0; PAGE_SIZE], + index_pos: 0, + data_pos: (PAGE_SIZE - 1) as u16, + key: 0, + children: vec![], + } + } + + pub fn add_record(&mut self, record: Record) {} +} + +#[derive(Debug, Clone)] +pub struct Record { + values: Vec, +} + +impl Record { + pub fn string_len(&self) -> usize { + self.values.iter().map(Value::string_len).sum() + } + + pub fn add_value(&mut self, value: impl Into) { + self.values.push(value.into()); + } + + pub fn get(&self, index: usize) -> Value { + self.values.get(index).map(|v| v.clone()).unwrap_or(NULL) + } +} + +impl Add for &Record { + type Output = Record; + + fn add(self, rhs: Self) -> Self::Output { + let mut sum = Record::default(); + sum.values.append(&mut self.values.clone()); + sum.values.append(&mut rhs.values.clone()); // use refs? + sum + } +} + +impl Default for Record { + fn default() -> Self { + Self { values: vec![] } + } +} diff --git a/src/print.rs b/src/print.rs index d542b3d..9b7c9de 100644 --- a/src/print.rs +++ b/src/print.rs @@ -1,6 +1,6 @@ use std::collections::HashMap; -use crate::Table; +use crate::table::Table; impl Table { /// prints the table contents in nice columns on the command line @@ -70,7 +70,7 @@ impl Table { for col in self.iter_colums() { let e = widths.get_mut(&col).unwrap(); let index = self.get_index(col); - *e = (*e).max(record.get(index).len()); + *e = (*e).max(record.get(index).string_len()); } } widths @@ -96,7 +96,7 @@ impl Table { for col in self.select_columns(columns) { let e = widths.get_mut(&col).unwrap(); let index = self.get_index(&col); - *e = (*e).max(record.get(index).len()); + *e = (*e).max(record.get(index).string_len()); } } widths diff --git a/src/read.rs b/src/read.rs index d084ecd..0614df9 100644 --- a/src/read.rs +++ b/src/read.rs @@ -1,16 +1,32 @@ -use crate::{Record, Table}; +use crate::table::{Record, Table}; +use std::fs; + +const EOL: &str = "\n"; impl Table { - pub fn from_csv(csv: &str, separator: &str) -> Self { - let mut table = Table::new("test"); - for (index, row) in csv.split("\n").enumerate() { + pub fn from_csv_file(name: &str, separator: Option<&str>) -> anyhow::Result { + let csv = fs::read_to_string(name)?; + + Ok(Table::from_csv(csv, separator)) + } + + pub fn from_csv(csv: impl Into, separator: Option<&str>) -> Self { + let csv = csv.into(); + let separator = separator.unwrap_or( + guess_separator(&csv) + .expect("You did not give me a separator and I could not guess it from the data"), + ); + let mut table = Table::new(""); + for (index, row) in csv.split(EOL).enumerate() { if index == 0 { for col in row.split(separator) { table.add_column(col, true); } } else if row.len() > 0 { + // skip empty lines let mut record = Record::default(); for value in row.split(separator) { + //TODO quoted values record.add_value(value); } table.add_record(record); @@ -19,3 +35,36 @@ impl Table { table } } + +fn guess_separator(csv: &String) -> Option<&'static str> { + let mut tabs = 0; + let mut semis = 0; + let mut commas = 0; + let mut pipes = 0; + for c in csv.chars() { + match c { + '\t' => tabs += 1, + ';' => semis += 1, + ',' => commas += 1, + '|' => pipes += 1, + _ => {} + } + } + let values = vec![(tabs, 0), (semis, 1), (commas, 2), (pipes, 3)]; + values.iter().max().map(|m| match m.1 { + 0 => "\t", + 1 => ";", + 2 => ",", + 3 => "|", + _ => "\0", //? + }) +} + +#[cfg(test)] +mod test { + use super::*; + #[test] + fn test() { + guess_separator(&"a,b,c|d".to_string()); + } +} diff --git a/src/sql/scanner.rs b/src/sql/scanner.rs index 59fbf24..6f7b6ee 100644 --- a/src/sql/scanner.rs +++ b/src/sql/scanner.rs @@ -2,7 +2,7 @@ use std::collections::HashMap; use anyhow::anyhow; -use crate::value::Value; +use crate::value::{Value, NULL}; use super::tokens::{Token, TokenType}; @@ -154,7 +154,7 @@ impl Scanner { fn add_token(&mut self, tokentype: TokenType) { let text = self.source[self.start..self.current].to_string(); - self.tokens.push(Token::new(tokentype, text, Value::NULL)); + self.tokens.push(Token::new(tokentype, text, NULL)); } fn add_literal(&mut self, tokentype: TokenType, literal: Value) { diff --git a/src/table.rs b/src/table.rs new file mode 100644 index 0000000..0b8293e --- /dev/null +++ b/src/table.rs @@ -0,0 +1,273 @@ +use std::{ + cmp::Ordering, + collections::{BTreeMap, HashMap}, + iter::Map, + ops::Add, +}; + +use crate::value::Value; + +pub struct View { + records: BTreeMap, +} + +pub struct Table { + name: String, + cols_by_name: HashMap, + pub(crate) cols: Vec, + pub(crate) records: BTreeMap, + pub views: HashMap, +} + +impl Table { + pub fn new(name: impl Into) -> Self { + Self { + name: name.into(), + cols_by_name: HashMap::new(), + cols: vec![], + records: BTreeMap::new(), + views: HashMap::new(), + } + } + + /// Creates a new table with the same name and columns as self, + /// but without data + // Note to self: be careful, might be dangerous to use once tables can be altered. + // That is not yet implemented. May need full copies + pub fn empty_copy(&self) -> Self { + let mut result = Table::new(self.name.clone()); + result.cols_by_name = self.cols_by_name.clone(); + result.cols = self.cols.clone(); + result + } + + pub fn add_record(&mut self, record: Record) { + let index = self.records.len(); + self.records.insert(Key::integer(index), record); + } + + pub fn has_column(&self, name: impl Into) -> bool { + self.cols_by_name.contains_key(&name.into()) + } + + pub fn add_column(&mut self, name: impl Into, allow_duplicates: bool) { + let col_index = self.cols.len(); + let orig_name: String = name.into(); + + let name = if allow_duplicates { + // append an index when there are duplicate column names + let mut col_name = orig_name.to_string(); + let mut index = 2; + + while self.has_column(&col_name) { + col_name = orig_name.to_string(); + col_name.push_str(format!("{}", index).as_str()); + index += 1; + } + col_name + } else { + orig_name + }; + + self.cols_by_name.insert(name.clone(), col_index); + self.cols.push(name); + } + + pub fn get_column_indexes(&self, expression: &str) -> Vec { + expression + .split(",") + .map(|c| self.get_index(c.trim())) + .collect::>() + } + + pub fn get_index(&self, col_name: &str) -> usize { + *self.cols_by_name.get(col_name).unwrap() + } + + pub fn iter(&self) -> TableIter { + self.iter_records() + } + + pub fn iter_records(&self) -> TableIter { + TableIter { + table_iter: self.records.iter(), + } + } + + pub fn select_columns<'a>(&'a self, columns: &'a Vec<&'a str>) -> OwnedColIter<'a> { + OwnedColIter { + cols: columns, + index: 0, + } + } + + pub fn iter_colums(&self) -> ColIter { + ColIter { + cols: &self.cols, + index: 0, + } + } + + pub fn where_clause(&self, colindex: usize, value: &Value) -> Option<&Record> { + for record in self.iter_records() { + let r = record.get(colindex); + if r == value { + return Some(record); + } + } + None + } +} + +#[derive(Debug, Clone)] +pub struct Record { + values: Vec, +} + +impl Record { + pub fn string_len(&self) -> usize { + self.values.iter().map(Value::string_len).sum() + } + + pub fn add_value(&mut self, value: impl Into) { + self.values.push(value.into()); + } + + pub fn get(&self, index: usize) -> &Value { + self.values.get(index).unwrap() //TODO + } +} + +impl Add for &Record { + type Output = Record; + + fn add(self, rhs: Self) -> Self::Output { + let mut sum = Record::default(); + sum.values.append(&mut self.values.clone()); + sum.values.append(&mut rhs.values.clone()); // use refs? + sum + } +} + +impl Default for Record { + fn default() -> Self { + Self { values: vec![] } + } +} + +pub struct TableIter<'a> { + table_iter: std::collections::btree_map::Iter<'a, Key, Record>, +} + +pub struct ViewIter<'a> { + iter: Map< + std::collections::btree_map::Iter<'a, Key, Key>, + Box Option<&'a Record>>, + >, +} + +impl<'a> Iterator for TableIter<'a> { + type Item = &'a Record; + + fn next(&mut self) -> Option { + self.table_iter.next().map(|e| e.1) + } +} + +pub struct ColIter<'a> { + cols: &'a Vec, + index: usize, +} + +pub struct OwnedColIter<'a> { + cols: &'a Vec<&'a str>, + index: usize, +} + +impl<'a> Iterator for ColIter<'a> { + type Item = &'a String; + + fn next(&mut self) -> Option { + if let Some(v) = self.cols.get(self.index) { + self.index += 1; + Some(v) + } else { + None + } + } +} + +impl<'a> Iterator for OwnedColIter<'a> { + type Item = &'a str; + + fn next(&mut self) -> Option { + if let Some(v) = self.cols.get(self.index) { + self.index += 1; + Some(v) + } else { + None + } + } +} + +#[derive(Debug)] +pub struct Key { + values: Vec, +} + +impl Key { + pub fn integer(integer: usize) -> Self { + Self { + values: vec![integer.into()], + } + } + + pub fn compound(keys: Vec) -> Self { + Self { values: keys } + } +} +impl Ord for Key { + fn cmp(&self, other: &Self) -> Ordering { + self.partial_cmp(other).unwrap_or(Ordering::Equal) + } +} +impl Eq for Key {} + +impl PartialEq for Key { + fn eq(&self, other: &Self) -> bool { + if self.values.len() != other.values.len() { + false + } else { + for (l, r) in self.values.iter().zip(&other.values) { + if l != r { + return false; + } + } + true + } + } +} + +impl PartialOrd for Key { + fn partial_cmp(&self, other: &Self) -> Option { + let len = self.values.len().min(other.values.len()); + for i in 0..len { + let ord = self + .values + .get(i) + .unwrap() + .partial_cmp(other.values.get(i).unwrap()); + + match ord { + Some(Ordering::Less) => { + return Some(Ordering::Less); + } + Some(Ordering::Greater) => { + return Some(Ordering::Greater); + } + _ => {} + } + } + None + } +} diff --git a/src/value.rs b/src/value.rs index d0eda9e..5e550f1 100644 --- a/src/value.rs +++ b/src/value.rs @@ -1,44 +1,180 @@ -use std::fmt::Display; +use std::{cmp::Ordering, fmt::Display}; -#[derive(Debug, PartialEq, PartialOrd, Clone)] -pub enum Value { - Text(String), - Float(f64), - Integer(i64), - NULL, +use anyhow::anyhow; +use byteorder::{BigEndian, ByteOrder}; + +pub const NULL: Value = Value::null(); + +#[derive(Debug, Clone, PartialEq, Eq, Ord)] +pub struct Value { + datatype: u64, + data: Vec, +} + +impl PartialOrd for Value { + fn partial_cmp(&self, other: &Self) -> Option { + match self.datatype { + 13.. if self.datatype % 2 == 1 => Some(self.to_string().cmp(&other.to_string())), + 12.. if self.datatype % 2 == 0 => None, // can't use blob as key + 8..=9 => integer_cmp(self, other), + 7 => { + let l: anyhow::Result = self.into(); + let r: anyhow::Result = other.into(); + if let Ok(l) = l { + if let Ok(r) = r { + l.partial_cmp(&r) + } else { + None + } + } else { + None + } + } + 1..=6 => integer_cmp(self, other), + 0 => None, + _ => None, + } + } +} + +fn integer_cmp(l: &Value, r: &Value) -> Option { + let l: anyhow::Result = l.into(); + let r: anyhow::Result = r.into(); + if let Ok(l) = l { + if let Ok(r) = r { + l.partial_cmp(&r) + } else { + None + } + } else { + None + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum Datatype { + Text, + Blob, + Integer, + Float, + Null, } impl Value { - pub fn len(&self) -> usize { - match self { - Value::Text(text) => text.len(), - Value::Float(float) => format!("{}", float).len(), - Value::Integer(integer) => format!("{}", integer).len(), - Value::NULL => 0, + pub const fn null() -> Self { + // NULL + Self { + data: vec![], + datatype: 0, } } + + pub fn from_f64(value: f64) -> Self { + let mut buf = vec![0; 8]; + BigEndian::write_f64(&mut buf, value); + Self { + datatype: 7, + data: buf, + } + } + + pub fn from_i64(value: i64) -> Self { + let (datatype, data) = match value { + 0 => (8, vec![]), + 1 => (9, vec![]), + _ => { + let data = as_bytes(value); + (int_datatype(data.len()), data) + } + }; + Self { datatype, data } + } + + pub fn from_text(value: impl Into) -> Self { + let value: String = value.into(); + let datatype = (13 + value.len() * 2) as u64; + let data = value.as_bytes().to_vec(); + Self { datatype, data } + } + + pub fn datatype(&self) -> anyhow::Result { + match self.datatype { + 13.. if self.datatype % 2 == 1 => Ok(Datatype::Text), + 12.. if self.datatype % 2 == 0 => Ok(Datatype::Blob), + 8..=9 => Ok(Datatype::Integer), + 7 => Ok(Datatype::Float), + 1..=6 => Ok(Datatype::Integer), + 0 => Ok(Datatype::Null), + _ => Err(anyhow!("Illegal type '{}'", self.datatype)), + } + } + + pub fn string_len(&self) -> usize { + match self.datatype { + 13.. if self.datatype % 2 == 1 => ((self.datatype - 13) >> 1) as usize, + 12.. if self.datatype % 2 == 0 => ((self.datatype - 12) >> 1) as usize, + 8..=9 => 1, + 7 => { + let f = BigEndian::read_f64(&self.data); + format!("{}", f).len() + } + 1..=6 => { + let f = BigEndian::read_i64(&self.data); + format!("{}", f).len() + } + 0 => 4, // NULL + _ => 0, // should be Err + } + } +} + +fn int_datatype(encoded_len: usize) -> u64 { + match encoded_len { + ..5 => encoded_len as u64, + ..7 => 5, + _ => 6, + } +} + +fn as_bytes(v: i64) -> Vec { + encode(v, encoding_len(v)) +} + +fn encode(v: i64, len: usize) -> Vec { + let mut buf = Vec::with_capacity(len); + for i in 0..len { + buf.push((v >> ((len - i - 1) * 8)) as u8); + } + buf +} + +fn encoding_len(v: i64) -> usize { + let u = if v < 0 { !v } else { v }; + match u { + ..128 => 1, + ..32768 => 2, + ..8388607 => 3, + ..2147483648 => 4, + ..140737488355327 => 6, + _ => 8, + } } impl Display for Value { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let text = match self { - Value::Float(float) => format!("{}", float), - Value::Integer(integer) => format!("{}", integer), - Value::Text(text) => format!("\"{}\"", text), - Value::NULL => "NULL".to_string(), - }; - write!(f, "{}", text) + let s: String = self.into(); + write!(f, "{}", s) } } impl Into for &str { fn into(self) -> Value { if let Ok(f) = self.parse::() { - Value::Float(f) + Value::from_f64(f) } else if let Ok(i) = self.parse::() { - Value::Integer(i) + Value::from_i64(i) } else { - Value::Text(strip_quotes(self)) + Value::from_text(strip_quotes(self)) } } } @@ -46,24 +182,102 @@ impl Into for &str { impl Into for String { fn into(self) -> Value { if let Ok(f) = self.parse::() { - Value::Float(f) + Value::from_f64(f) } else if let Ok(i) = self.parse::() { - Value::Integer(i) + Value::from_i64(i) } else { - Value::Text(strip_quotes(self)) + Value::from_text(strip_quotes(self)) } } } impl Into for f64 { fn into(self) -> Value { - Value::Float(self) + Value::from_f64(self) } } impl Into for i64 { fn into(self) -> Value { - Value::Integer(self) + Value::from_i64(self) + } +} + +impl Into for usize { + fn into(self) -> Value { + Value::from_i64(self as i64) + } +} + +impl Into for i32 { + fn into(self) -> Value { + Value::from_i64(self as i64) + } +} + +impl Into for Value { + fn into(self) -> String { + (&self).into() + } +} + +impl Into for &Value { + fn into(self) -> String { + match self.datatype { + 13.. if self.datatype % 2 == 1 => String::from_utf8_lossy(&self.data).into_owned(), // valid? + 12.. if self.datatype % 2 == 0 => String::from_utf8_lossy(&self.data).into_owned(), + 8 => "0".to_string(), + 9 => "1".to_string(), + 7 => { + let f: anyhow::Result = self.into(); + format!("{}", f.unwrap()) + } + 1..=6 => { + let i: anyhow::Result = self.into(); + format!("{}", i.unwrap()) + } + 0 => "NULL".to_string(), // NULL + _ => format!("Illegal type '{}'", self.datatype), // should be Err + } + } +} + +impl Into> for Value { + fn into(self) -> anyhow::Result { + (&self).into() + } +} + +impl Into> for &Value { + fn into(self) -> anyhow::Result { + if self.datatype == 7 { + Ok(BigEndian::read_f64(&self.data)) + } else { + Err(anyhow!("not a float")) + } + } +} + +impl Into> for Value { + fn into(self) -> anyhow::Result { + (&self).into() + } +} + +impl Into> for &Value { + fn into(self) -> anyhow::Result { + match self.datatype { + 0 => Err(anyhow!("value is NULL")), + 1 => Ok(BigEndian::read_int(&self.data, 1) as i64), + 2 => Ok(BigEndian::read_int(&self.data, 2) as i64), + 3 => Ok(BigEndian::read_int(&self.data, 3) as i64), + 4 => Ok(BigEndian::read_int(&self.data, 4) as i64), + 5 => Ok(BigEndian::read_int(&self.data, 6) as i64), + 6 => Ok(BigEndian::read_int(&self.data, 8) as i64), + 8 => Ok(0), + 9 => Ok(1), + _ => Err(anyhow!("not an integer")), + } } } @@ -74,3 +288,81 @@ fn strip_quotes(text: impl Into) -> String { } text } + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_int0() { + let i: Value = 0.into(); + assert_eq!(i.datatype, 8); + assert_eq!(i.data, vec![]); + assert_eq!(i.to_string(), "0"); + assert_eq!(i.string_len(), 1); + assert_eq!(i.datatype().unwrap(), Datatype::Integer); + } + + #[test] + fn test_int1() { + let i: Value = 1.into(); + assert_eq!(i.datatype, 9); + assert_eq!(i.data, vec![]); + assert_eq!(i.to_string(), "1"); + assert_eq!(i.string_len(), 1); + assert_eq!(i.datatype().unwrap(), Datatype::Integer); + } + + #[test] + fn test_int50000() { + let i: Value = 50000.into(); + assert_eq!(i.datatype, 3); + assert_eq!(i.data, vec![0, 195, 80]); + assert_eq!(i.to_string(), "50000"); + // assert_eq!(i.string_len(), 5); + assert_eq!(i.datatype().unwrap(), Datatype::Integer); + } + + #[test] + fn test_float0() { + let i: Value = 0.0.into(); + assert_eq!(i.datatype, 7); + assert_eq!(i.data, vec![0; 8]); + assert_eq!(i.to_string(), "0"); + assert_eq!(i.string_len(), 1); + assert_eq!(i.datatype().unwrap(), Datatype::Float); + } + + #[test] + fn test_float1() { + let i: Value = 1.0.into(); + assert_eq!(i.datatype, 7); + assert_eq!(i.data, vec![63, 240, 0, 0, 0, 0, 0, 0]); + assert_eq!(i.to_string(), "1"); + assert_eq!(i.string_len(), 1); + assert_eq!(i.datatype().unwrap(), Datatype::Float); + } + + #[test] + fn test_float50000() { + let i: Value = 50000.2.into(); + assert_eq!(i.datatype, 7); + assert_eq!(i.data, vec![64, 232, 106, 6, 102, 102, 102, 102]); + assert_eq!(i.to_string(), "50000.2"); + assert_eq!(i.string_len(), 7); + assert_eq!(i.datatype().unwrap(), Datatype::Float); + } + + #[test] + fn test_string() { + let i: Value = "hello world".into(); + assert_eq!(i.datatype, ("hello world".len() * 2 + 13) as u64); + assert_eq!( + i.data, + vec![104, 101, 108, 108, 111, 32, 119, 111, 114, 108, 100] + ); + assert_eq!(i.to_string(), "hello world"); + assert_eq!(i.string_len(), 11); + assert_eq!(i.datatype().unwrap(), Datatype::Text); + } +} diff --git a/src/varint.rs b/src/varint.rs new file mode 100644 index 0000000..9a36c59 --- /dev/null +++ b/src/varint.rs @@ -0,0 +1,132 @@ +const SLOT_2_0: u64 = 0x001fc07f; +const SLOT_4_2_0: u64 = 0xf01fc07f; + +/// varints as implemented in `SQLite` + +pub fn write(value: i64) -> Vec { + let mut v = value; + if (v & ((0xff00_0000) << 32)) == 0 { + if v == 0 { + return vec![0]; + } + let mut result = Vec::new(); + while v != 0 { + result.push(((v & 0x7f) | 0x80) as u8); + v >>= 7; + } + result[0] &= 0x7f; + + result.reverse(); + result + } else { + let mut result = vec![0_u8; 9]; + result[8] = v as u8; + v >>= 8; + for i in (0..=7).rev() { + result[i] = ((v & 0x7f) | 0x80) as u8; + v >>= 7; + } + result + } +} + +pub fn read(data: Vec) -> u64 { + let mut a = data[0] as u64; + if (data[0] as i8) >= 0 { + return a; + } + + let mut b = data[1] as u64; + if (b & 0x80) == 0 { + return ((a & 0x7f) << 7) | b; + } + + a = (a << 14) | data[2] as u64; + if (a & 0x80) == 0 { + a &= SLOT_2_0; + b = (b & 0x7f) << 7; + a |= b; + return a; + } + + a &= SLOT_2_0; + b = b << 14; + b |= data[3] as u64; + if (b & 0x80) == 0 { + b &= SLOT_2_0; + a = (a << 7) | b; + return a; + } + + b &= SLOT_2_0; + let mut s = a; + a = a << 14; + let m = data[4] as u64; + a |= m; + if (a & 0x80) == 0 { + b = b << 7; + a |= b; + s = s >> 18; + return (s << 32) | a; + } + + s = (s << 7) | b; + b = (b << 14) | data[5] as u64; + if (b & 0x80) == 0 { + a &= SLOT_2_0; + a = (a << 7) | b; + s = s >> 18; + return (s << 32) | a; + } + + a = a << 14; + a |= data[6] as u64; + if (a & 0x80) == 0 { + a &= SLOT_4_2_0; + b &= SLOT_2_0; + b = b << 7; + a |= b; + s = s >> 11; + return (s << 32) | a; + } + + a &= SLOT_2_0; + b = (b << 14) | data[7] as u64; + if (b & 0x80) == 0 { + b &= SLOT_4_2_0; + a = (a << 7) | b; + s = s >> 14; + return (s << 32) | a; + } + + a = a << 15; + a |= data[8] as u64; + b &= SLOT_2_0; + b = b << 8; + a |= b; + s = s << 14; + b = m; + b &= 0x7f; + b = b >> 3; + s |= b; + (s << 32) | a +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_0() { + assert_eq!(0, read(write(0))); + } + + #[test] + fn test_127() { + assert_eq!(127, read(write(127))); + } + #[test] + fn test_m127() { + assert_eq!(398639861, read(write(398639861))); + } +} diff --git a/src/vm/mod.rs b/src/vm/mod.rs new file mode 100644 index 0000000..e294de2 --- /dev/null +++ b/src/vm/mod.rs @@ -0,0 +1,40 @@ +use std::collections::HashMap; + +use crate::table::Table; +use crate::value::Value; + +struct Vm { + tables: HashMap, + stack: Vec, + code: Vec, + table_register: String, + ip: usize, +} + +enum Opcode { + LoadTable(String), + ApplyIndex(String), + FetchRow, + FilterRow, + IncRowPointer, +} + +impl Vm { + fn run(&mut self) { + for op in &self.code { + // match op { + // Opcode::LoadTable(name) => { + // if !self.tables.contains_key(name) { + // let table = self.load_table(name).unwrap(); + // self.tables.insert(name.clone(), table); + // } + // self.table_register = name.clone(); + // } + // } + } + } + + fn load_table(&self, name: &String) -> anyhow::Result
{ + Ok(Table::from_csv_file(name, None)?) + } +}