diff --git a/examples/join/main.rs b/examples/join/main.rs index 0a77ba7..42a8389 100644 --- a/examples/join/main.rs +++ b/examples/join/main.rs @@ -1,18 +1,18 @@ use csv::table::Table; -fn main() { - let left = Table::from_csv(include_str!("data/left.csv"), Some("\t")); - let right = Table::from_csv(include_str!("data/right.csv"), Some("\t")); - println!("left:"); - left.select("*"); - println!("\nright:"); - right.select("*"); - println!("\njoin on name:"); - left.left_join(&right, "name", "name", false) - .select("name, cowdung, value"); - println!("\nleft join on name:"); - left.left_join(&right, "name", "name", true).select("*"); - println!("\nright join on name:"); - left.right_join(&right, "name", "name", true) - .select("name,cowdung,value"); -} +// fn main() { +// let left = Table::from_csv(include_str!("data/left.csv"), Some("\t")); +// let right = Table::from_csv(include_str!("data/right.csv"), Some("\t")); +// println!("left:"); +// left.select("*"); +// println!("\nright:"); +// right.select("*"); +// println!("\njoin on name:"); +// left.left_join(&right, "name", "name", false) +// .select("name, cowdung, value"); +// println!("\nleft join on name:"); +// left.left_join(&right, "name", "name", true).select("*"); +// println!("\nright join on name:"); +// left.right_join(&right, "name", "name", true) +// .select("name,cowdung,value"); +// } diff --git a/examples/orderby/main.rs b/examples/orderby/main.rs index 24ab143..2e7cc58 100644 --- a/examples/orderby/main.rs +++ b/examples/orderby/main.rs @@ -1,12 +1,12 @@ use csv::table::Table; fn main() { - let table = Table::from_csv(include_str!("data/table.csv"), Some("\t")); - println!("not ordered:"); - table.select("*"); - - println!("order by name ascending:"); - table.order_by("name").select("*"); - - println!("\nTODO descending"); + // let table = Table::from_csv(include_str!("data/table.csv"), Some("\t")); + // println!("not ordered:"); + // table.select("*"); + // + // println!("order by name ascending:"); + // table.order_by("name").select("*"); + // + // println!("\nTODO descending"); } diff --git a/src/data/test.csv b/src/data/test.csv new file mode 100644 index 0000000..6734900 --- /dev/null +++ b/src/data/test.csv @@ -0,0 +1,3 @@ +name value +a 3 +b 4 diff --git a/src/id_sequence.rs b/src/id_sequence.rs new file mode 100644 index 0000000..4647534 --- /dev/null +++ b/src/id_sequence.rs @@ -0,0 +1,18 @@ +use std::sync::atomic::{AtomicUsize, Ordering}; + +#[derive(Debug)] +pub struct ThreadSafeIdGenerator { + counter: AtomicUsize, +} + +impl ThreadSafeIdGenerator { + pub fn new(start: usize) -> Self { + Self { + counter: AtomicUsize::new(start), + } + } + + pub fn next(&self) -> usize { + self.counter.fetch_add(1, Ordering::SeqCst) + } +} diff --git a/src/join.rs b/src/join.rs index c85be63..0e74dd3 100644 --- a/src/join.rs +++ b/src/join.rs @@ -1,29 +1,29 @@ use crate::table::Table; impl Table { - pub fn left_join(&self, right: &Table, left_col: &str, right_col: &str, outer: bool) -> Table { - join(self, right, left_col, right_col, outer) - } - - pub fn right_join(&self, right: &Table, left_col: &str, right_col: &str, outer: bool) -> Table { - join(right, self, right_col, left_col, outer) - } + // pub fn left_join(&self, right: &Table, left_col: &str, right_col: &str, outer: bool) -> Table { + // join(self, right, left_col, right_col, outer) + // } + // + // pub fn right_join(&self, right: &Table, left_col: &str, right_col: &str, outer: bool) -> Table { + // join(right, self, right_col, left_col, outer) + // } } -pub fn join(left: &Table, right: &Table, left_col: &str, right_col: &str, outer: bool) -> Table { - let mut joined = Table::new("join"); - left.cols.iter().for_each(|c| joined.add_column(c, true)); - right.cols.iter().for_each(|c| joined.add_column(c, true)); - let left_col_index = left.get_index(left_col); - let right_col_index = right.get_index(right_col); - - for record in left.iter_records() { - let lv = record.get(left_col_index); - if let Some(right_record) = right.where_clause(right_col_index, lv) { - joined.add_record(record + right_record); - } else if outer { - joined.add_record(record.clone()); - } - } - joined -} +// pub fn join(left: &Table, right: &Table, left_col: &str, right_col: &str, outer: bool) -> Table { + // let mut joined = Table::new("join"); + // left.cols.iter().for_each(|c| joined.add_column(c, true)); + // right.cols.iter().for_each(|c| joined.add_column(c, true)); + // let left_col_index = left.get_index(left_col); + // let right_col_index = right.get_index(right_col); + // + // for record in left.iter_records() { + // let lv = record.get(left_col_index); + // if let Some(right_record) = right.where_clause(right_col_index, lv) { + // joined.add_record(record + right_record); + // } else if outer { + // joined.add_record(record.clone()); + // } + // } + // joined +// } diff --git a/src/lib.rs b/src/lib.rs index 595d069..32a8f32 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,3 +9,5 @@ pub mod table; pub mod value; mod varint; pub mod vm; +mod id_sequence; +mod record; diff --git a/src/main.rs b/src/main.rs index 94bb43f..c75f2fe 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,7 +1,8 @@ use csv::table::Table; fn main() { - let csv = include_str!("data/portfolios.csv"); + let csv = include_str!("data/test.csv"); let table = Table::from_csv(csv, None); - table.order_by("name").select("*"); + println!("{:?}",table); + table.select("*"); } diff --git a/src/order.rs b/src/order.rs index 5fc07ac..4adb818 100644 --- a/src/order.rs +++ b/src/order.rs @@ -3,17 +3,17 @@ use std::collections::BTreeMap; use crate::table::{Key, Table}; impl Table { - pub fn order_by(&self, expression: &str) -> Self { - let indexes = self.get_column_indexes(expression); - if self.views.contains_key(expression) {} - - let mut sorted_records = BTreeMap::new(); - for record in self.iter() { - let key = indexes.iter().map(|i| record.get(*i).clone()).collect(); - sorted_records.insert(Key::compound(key), record.clone()); - } - let mut ordered = Table::empty_copy(self); - ordered.records = sorted_records; - ordered - } + // pub fn order_by(&self, expression: &str) -> Self { + // let indexes = self.get_column_indexes(expression); + // if self.views.contains_key(expression) {} + // + // let mut sorted_records = BTreeMap::new(); + // for record in self.iter() { + // let key = indexes.iter().map(|i| record.get(*i).clone()).collect(); + // sorted_records.insert(Key::compound(key), record.clone()); + // } + // let mut ordered = Table::empty_copy(self); + // ordered.records = sorted_records; + // ordered + // } } diff --git a/src/page.rs b/src/page.rs index 80b2918..56d21f2 100644 --- a/src/page.rs +++ b/src/page.rs @@ -1,71 +1,86 @@ -use std::ops::Add; - -use crate::value::{Value, NULL}; +use crate::record::Record; +use crate::value::Value; +use crate::varint; +use byteorder::{BigEndian, ByteOrder}; const PAGE_SIZE: usize = 4096; +#[derive(Debug)] pub enum PageType { Root, Interior, Leaf, } +#[derive(Debug)] pub struct Page { pagetype: PageType, + id: usize, + start: Value, + end: Value, data: Vec, index_pos: u16, data_pos: u16, key: usize, children: Vec, + n_records: usize, } impl Page { - pub fn new(pagetype: PageType) -> Self { + pub fn new(pagetype: PageType, id: usize) -> Self { Self { pagetype, + id, + start: Value::null(), + end: Value::null(), data: vec![0; PAGE_SIZE], index_pos: 0, data_pos: (PAGE_SIZE - 1) as u16, key: 0, children: vec![], + n_records: 0, } } - pub fn add_record(&mut self, record: Record) {} -} - -#[derive(Debug, Clone)] -pub struct Record { - values: Vec, -} - -impl Record { - pub fn string_len(&self) -> usize { - self.values.iter().map(Value::string_len).sum() + pub fn insert(&mut self, record: Record) { + let bytes: Vec = record.into(); + self.insert_data(bytes); + self.insert_index(self.data_pos); + self.n_records += 1; } - pub fn add_value(&mut self, value: impl Into) { - self.values.push(value.into()); + fn insert_data(&mut self, bytes: Vec) { + let end = self.data_pos as usize; + self.data_pos -= bytes.len() as u16; + self.data.splice(self.data_pos as usize..end, bytes); } - pub fn get(&self, index: usize) -> Value { - self.values.get(index).map(|v| v.clone()).unwrap_or(NULL) + fn insert_index(&mut self, value: u16) { + let bytes = u16_to_bytes(value); + let start = self.index_pos as usize; + self.index_pos += bytes.len() as u16; + self.data.splice(start..self.index_pos as usize, bytes); + } + + pub fn get(&self, index: usize) -> Option { + if index < self.n_records { + let index = BigEndian::read_u16(&self.data[index * 2..=index * 2 + 1]); + let (nbytes, len) = varint::read(&self.data[index as usize..]); + Some( + ( + len, + &self.data[nbytes + index as usize..nbytes + index as usize + len as usize], + ) + .into(), + ) + } else { + None + } } } -impl Add for &Record { - type Output = Record; - - fn add(self, rhs: Self) -> Self::Output { - let mut sum = Record::default(); - sum.values.append(&mut self.values.clone()); - sum.values.append(&mut rhs.values.clone()); // use refs? - sum - } -} - -impl Default for Record { - fn default() -> Self { - Self { values: vec![] } - } +fn u16_to_bytes(value: u16) -> Vec { + let mut buf = vec![0; 2]; + BigEndian::write_u16(&mut buf, value); + buf } diff --git a/src/print.rs b/src/print.rs index 9b7c9de..7a63f49 100644 --- a/src/print.rs +++ b/src/print.rs @@ -27,7 +27,7 @@ impl Table { print!("| {:, +} + +impl Record { + pub fn string_len(&self) -> usize { + self.values.iter().map(Value::string_len).sum() + } + + pub fn bytes_len(&self) -> u16 { + let record_length: u16 = self.values.iter().map(Value::bytes_len).sum(); + record_length + 1 + } + + pub fn add_value(&mut self, value: impl Into) { + self.values.push(value.into()); + } + + pub fn get(&self, index: usize) -> &Value { + self.values.get(index).unwrap() //TODO + } +} + +impl Add for &Record { + type Output = Record; + + fn add(self, rhs: Self) -> Self::Output { + let mut sum = Record::default(); + sum.values.append(&mut self.values.clone()); + sum.values.append(&mut rhs.values.clone()); // use refs? + sum + } +} + +impl From for Vec { + fn from(mut record: Record) -> Vec { + let record_length = record.bytes_len(); + let mut length_bytes = varint::write(u64::from(record_length)); + let mut rowid_bytes = varint::write(record.rowid); + + let mut buffer = + Vec::with_capacity(length_bytes.len() + rowid_bytes.len() + record_length as usize); + buffer.append(&mut length_bytes); + buffer.append(&mut rowid_bytes); + + // 'The initial portion of the payload that does not spill to overflow pages.' + let length_of_encoded_column_types: usize = + record.values.iter().map(|v| v.datatype_bytes.len()).sum(); + buffer.append(&mut varint::write( + (length_of_encoded_column_types + 1) as u64, + )); + + //write all types + for v in &mut record.values { + buffer.append(&mut v.datatype_bytes); + } + + // write all values + for v in &mut record.values { + buffer.append(&mut v.data); + } + buffer + } +} + +impl Into for (u64, &[u8]) { + fn into(self) -> Record { + let (len, data) = self; + let len = len as usize; //meh + let (mut offset, rowid) = varint::read(data); + + let mut datatypes = vec![]; + + //read n of fields + while (offset < len) { + let (inc, datatype) = varint::read(&data[offset..]); + datatypes.push(datatype); + offset += inc; + } + + let mut values: Vec = vec![]; + for dt in datatypes { + match dt { + 13.. if dt % 2 == 0 => { + let len = ((dt >> 1) - 13) as usize; + if let Ok(text) = String::from_utf8(data[offset..len].to_vec()) { + values.push(text.into()); + } + offset += len; + } + 12.. if dt % 2 == 0 => { + let len = ((dt >> 1) - 12) as usize; + // no blobs yet + offset += len; + } + 9 => values.push(1.into()), + 8 => values.push(0.into()), + 7 => { + values.push(BigEndian::read_f64(&data[offset..offset + 8]).into()); + offset += 8; + } + 1..=6 => { + let (inc, v) = read_int(&data[offset..], dt); + values.push(v.into()); + offset += inc; + } + 0 => { + values.push(Value::null()); + } + _ => panic!("unknown datatype"), + } + } + + Record { rowid, values } + } +} + +fn read_int(buf: &[u8], datatype: u64) -> (usize, i64) { + let nb = match datatype { + 6 => 8, + 5 => 6, + _ => datatype as usize, + }; + (nb, BigEndian::read_i64(&buf[..nb])) +} + +impl Default for Record { + fn default() -> Self { + Self { + rowid: 0, + values: vec![], + } + } +} diff --git a/src/sql/scanner.rs b/src/sql/scanner.rs index 6f7b6ee..4040269 100644 --- a/src/sql/scanner.rs +++ b/src/sql/scanner.rs @@ -2,7 +2,7 @@ use std::collections::HashMap; use anyhow::anyhow; -use crate::value::{Value, NULL}; +use crate::value::Value; use super::tokens::{Token, TokenType}; @@ -154,7 +154,7 @@ impl Scanner { fn add_token(&mut self, tokentype: TokenType) { let text = self.source[self.start..self.current].to_string(); - self.tokens.push(Token::new(tokentype, text, NULL)); + self.tokens.push(Token::new(tokentype, text, Value::null())); } fn add_literal(&mut self, tokentype: TokenType, literal: Value) { diff --git a/src/table.rs b/src/table.rs index 0b8293e..2b470ff 100644 --- a/src/table.rs +++ b/src/table.rs @@ -1,32 +1,43 @@ +use crate::id_sequence::ThreadSafeIdGenerator; +use crate::page::{Page, PageType}; +use crate::record::Record; +use crate::value::Value; +use std::cell::RefCell; +use std::rc::Rc; use std::{ cmp::Ordering, - collections::{BTreeMap, HashMap}, - iter::Map, - ops::Add, + collections::{BTreeMap, HashMap} }; -use crate::value::Value; - +#[derive(Debug)] pub struct View { records: BTreeMap, } +#[derive(Debug)] pub struct Table { name: String, cols_by_name: HashMap, pub(crate) cols: Vec, - pub(crate) records: BTreeMap, + pub(crate) root: Rc>, pub views: HashMap, + page_ids: ThreadSafeIdGenerator, + row_ids: ThreadSafeIdGenerator, + current_page: Rc>, } impl Table { pub fn new(name: impl Into) -> Self { + let root = Rc::new(RefCell::new(Page::new(PageType::Root, 0))); Self { name: name.into(), cols_by_name: HashMap::new(), cols: vec![], - records: BTreeMap::new(), + root: Rc::clone(&root), views: HashMap::new(), + page_ids: ThreadSafeIdGenerator::new(1), + row_ids: ThreadSafeIdGenerator::new(0), + current_page: root, } } @@ -42,8 +53,7 @@ impl Table { } pub fn add_record(&mut self, record: Record) { - let index = self.records.len(); - self.records.insert(Key::integer(index), record); + self.current_page.borrow_mut().insert(record); } pub fn has_column(&self, name: impl Into) -> bool { @@ -85,12 +95,9 @@ impl Table { } pub fn iter(&self) -> TableIter { - self.iter_records() - } - - pub fn iter_records(&self) -> TableIter { TableIter { - table_iter: self.records.iter(), + rootPage: Rc::clone(&self.root), + index: 0, } } @@ -108,69 +115,27 @@ impl Table { } } - pub fn where_clause(&self, colindex: usize, value: &Value) -> Option<&Record> { - for record in self.iter_records() { - let r = record.get(colindex); - if r == value { - return Some(record); - } - } - None - } + // pub fn where_clause(&self, colindex: usize, value: &Value) -> Option<&Record> { + // for record in self.iter() { + // let r = record.get(colindex); + // if r == value { + // return Some(record); + // } + // } + // None + // } } -#[derive(Debug, Clone)] -pub struct Record { - values: Vec, +pub struct TableIter { + rootPage: Rc>, + index: usize, } -impl Record { - pub fn string_len(&self) -> usize { - self.values.iter().map(Value::string_len).sum() - } - - pub fn add_value(&mut self, value: impl Into) { - self.values.push(value.into()); - } - - pub fn get(&self, index: usize) -> &Value { - self.values.get(index).unwrap() //TODO - } -} - -impl Add for &Record { - type Output = Record; - - fn add(self, rhs: Self) -> Self::Output { - let mut sum = Record::default(); - sum.values.append(&mut self.values.clone()); - sum.values.append(&mut rhs.values.clone()); // use refs? - sum - } -} - -impl Default for Record { - fn default() -> Self { - Self { values: vec![] } - } -} - -pub struct TableIter<'a> { - table_iter: std::collections::btree_map::Iter<'a, Key, Record>, -} - -pub struct ViewIter<'a> { - iter: Map< - std::collections::btree_map::Iter<'a, Key, Key>, - Box Option<&'a Record>>, - >, -} - -impl<'a> Iterator for TableIter<'a> { - type Item = &'a Record; +impl Iterator for TableIter { + type Item = Record; fn next(&mut self) -> Option { - self.table_iter.next().map(|e| e.1) + self.rootPage.borrow().get(self.index) } } diff --git a/src/value.rs b/src/value.rs index 5e550f1..61f7bfd 100644 --- a/src/value.rs +++ b/src/value.rs @@ -2,13 +2,13 @@ use std::{cmp::Ordering, fmt::Display}; use anyhow::anyhow; use byteorder::{BigEndian, ByteOrder}; - -pub const NULL: Value = Value::null(); +use crate::varint; #[derive(Debug, Clone, PartialEq, Eq, Ord)] pub struct Value { - datatype: u64, - data: Vec, + pub(crate) datatype: u64, + pub(crate) datatype_bytes: Vec, + pub(crate) data: Vec, } impl PartialOrd for Value { @@ -61,21 +61,26 @@ pub enum Datatype { } impl Value { - pub const fn null() -> Self { - // NULL + fn new(datatype: u64, data: Vec) -> Self { Self { - data: vec![], - datatype: 0, + datatype, + data, + datatype_bytes: varint::write(datatype), } } + pub fn bytes_len(&self) -> u16 { + (self.datatype_bytes.len() + self.data.len()) as u16 + } + + pub fn null() -> Self { + Self::new(0 ,vec![]) + } + pub fn from_f64(value: f64) -> Self { let mut buf = vec![0; 8]; BigEndian::write_f64(&mut buf, value); - Self { - datatype: 7, - data: buf, - } + Self::new(7, buf) } pub fn from_i64(value: i64) -> Self { @@ -87,14 +92,14 @@ impl Value { (int_datatype(data.len()), data) } }; - Self { datatype, data } + Self::new(datatype,data) } pub fn from_text(value: impl Into) -> Self { let value: String = value.into(); let datatype = (13 + value.len() * 2) as u64; let data = value.as_bytes().to_vec(); - Self { datatype, data } + Self::new(datatype,data) } pub fn datatype(&self) -> anyhow::Result { diff --git a/src/varint.rs b/src/varint.rs index 9a36c59..6685f7e 100644 --- a/src/varint.rs +++ b/src/varint.rs @@ -3,7 +3,7 @@ const SLOT_4_2_0: u64 = 0xf01fc07f; /// varints as implemented in `SQLite` -pub fn write(value: i64) -> Vec { +pub fn write(value: u64) -> Vec { let mut v = value; if (v & ((0xff00_0000) << 32)) == 0 { if v == 0 { @@ -30,15 +30,15 @@ pub fn write(value: i64) -> Vec { } } -pub fn read(data: Vec) -> u64 { +pub fn read(data: &[u8]) -> (usize, u64) { let mut a = data[0] as u64; if (data[0] as i8) >= 0 { - return a; + return (1,a); } let mut b = data[1] as u64; if (b & 0x80) == 0 { - return ((a & 0x7f) << 7) | b; + return (2,((a & 0x7f) << 7) | b); } a = (a << 14) | data[2] as u64; @@ -46,7 +46,7 @@ pub fn read(data: Vec) -> u64 { a &= SLOT_2_0; b = (b & 0x7f) << 7; a |= b; - return a; + return (3,a); } a &= SLOT_2_0; @@ -55,7 +55,7 @@ pub fn read(data: Vec) -> u64 { if (b & 0x80) == 0 { b &= SLOT_2_0; a = (a << 7) | b; - return a; + return (4,a); } b &= SLOT_2_0; @@ -67,7 +67,7 @@ pub fn read(data: Vec) -> u64 { b = b << 7; a |= b; s = s >> 18; - return (s << 32) | a; + return (5,(s << 32) | a); } s = (s << 7) | b; @@ -76,7 +76,7 @@ pub fn read(data: Vec) -> u64 { a &= SLOT_2_0; a = (a << 7) | b; s = s >> 18; - return (s << 32) | a; + return (6,(s << 32) | a); } a = a << 14; @@ -87,7 +87,7 @@ pub fn read(data: Vec) -> u64 { b = b << 7; a |= b; s = s >> 11; - return (s << 32) | a; + return (7,(s << 32) | a); } a &= SLOT_2_0; @@ -96,7 +96,7 @@ pub fn read(data: Vec) -> u64 { b &= SLOT_4_2_0; a = (a << 7) | b; s = s >> 14; - return (s << 32) | a; + return (8,(s << 32) | a); } a = a << 15; @@ -109,7 +109,7 @@ pub fn read(data: Vec) -> u64 { b &= 0x7f; b = b >> 3; s |= b; - (s << 32) | a + (9,(s << 32) | a) } #[cfg(test)] @@ -118,15 +118,15 @@ mod test { #[test] fn test_0() { - assert_eq!(0, read(write(0))); + assert_eq!(0, read(&write(0)).1); } #[test] fn test_127() { - assert_eq!(127, read(write(127))); + assert_eq!(127, read(&write(127)).1); } #[test] fn test_m127() { - assert_eq!(398639861, read(write(398639861))); + assert_eq!(398639861, read(&write(398639861)).1); } }