From eb9b62e47c93e0483718ac8005832e2dee526962 Mon Sep 17 00:00:00 2001
From: Sander Hautvast
Date: Fri, 14 Feb 2025 17:46:52 +0100
Subject: [PATCH] added scanner

---
Reviewer note: scanner.rs and tokens.rs were revised during review, so the
blob ids on their `index` lines are those of the original submission.

 src/lib.rs         |   1 +
 src/main.rs        |   4 +
 src/sql/mod.rs     |   2 +
 src/sql/scanner.rs | 297 +++++++++++++++++++++++++++++++++++++++++++++
 src/sql/tokens.rs  | 120 ++++++++++++++++++
 5 files changed, 424 insertions(+)
 create mode 100644 src/main.rs
 create mode 100644 src/sql/mod.rs
 create mode 100644 src/sql/scanner.rs
 create mode 100644 src/sql/tokens.rs

diff --git a/src/lib.rs b/src/lib.rs
index dc55045..a0faeb5 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -3,6 +3,7 @@ pub mod join;
 pub mod order;
 pub mod print;
 pub mod read;
+pub mod sql;
 pub mod value;
 
 use std::{
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..bc56ed2
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,4 @@
+
+fn main (){
+
+}
diff --git a/src/sql/mod.rs b/src/sql/mod.rs
new file mode 100644
index 0000000..0e9acf6
--- /dev/null
+++ b/src/sql/mod.rs
@@ -0,0 +1,2 @@
+pub mod scanner;
+pub mod tokens;
diff --git a/src/sql/scanner.rs b/src/sql/scanner.rs
new file mode 100644
index 0000000..59fbf24
--- /dev/null
+++ b/src/sql/scanner.rs
@@ -0,0 +1,297 @@
+//! Lexical scanner that turns a SQL string into a flat list of tokens.
+
+use std::collections::HashMap;
+
+use anyhow::anyhow;
+
+use crate::value::Value;
+
+use super::tokens::{Token, TokenType};
+
+/// Scans `sql` and returns its tokens in source order.
+///
+/// Whitespace and `--` line comments produce no tokens.
+///
+/// # Errors
+///
+/// Fails on a character that cannot start a token and on a string literal
+/// that is missing its closing quote.
+pub fn parse(sql: &str) -> anyhow::Result<Vec<Token>> {
+    let mut scanner = Scanner::new(sql);
+    scanner.scan_tokens()?;
+    Ok(scanner.tokens)
+}
+
+/// Scanner state. `start` and `current` are indices into `source_chars`
+/// (characters, not bytes), so lexemes are always cut on character boundaries.
+struct Scanner {
+    source_chars: Vec<char>,
+    tokens: Vec<Token>,
+    start: usize,
+    current: usize,
+    keywords: HashMap<String, TokenType>,
+}
+
+impl Scanner {
+    fn new(sql: &str) -> Self {
+        let mut keywords = HashMap::new();
+        crate::sql::tokens::add_keywords(&mut keywords);
+        Self {
+            source_chars: sql.chars().collect(),
+            tokens: vec![],
+            start: 0,
+            current: 0,
+            keywords,
+        }
+    }
+
+    /// Scans until the input is exhausted.
+    fn scan_tokens(&mut self) -> anyhow::Result<()> {
+        while !self.is_at_end() {
+            self.start = self.current;
+            self.scan_token()?;
+        }
+        Ok(())
+    }
+
+    /// Consumes one token, comment or whitespace character starting at `start`.
+    fn scan_token(&mut self) -> anyhow::Result<()> {
+        let c = self.advance();
+        match c {
+            '(' => self.add_token(TokenType::LeftParen),
+            ')' => self.add_token(TokenType::RightParen),
+            ',' => self.add_token(TokenType::Comma),
+            '.' => self.add_token(TokenType::Dot),
+            '-' => {
+                if self.match_token('-') {
+                    // `--` starts a comment that runs to the end of the line.
+                    while self.peek() != '\n' && !self.is_at_end() {
+                        self.advance();
+                    }
+                } else {
+                    self.add_token(TokenType::Minus);
+                }
+            }
+            '+' => self.add_token(TokenType::Plus),
+            ';' => self.add_token(TokenType::Semicolon),
+            ':' => self.add_token(TokenType::Colon),
+            '*' => self.add_token(TokenType::Star),
+            '=' => self.add_token(TokenType::Equals),
+            '!' => {
+                let token = if self.match_token('=') {
+                    TokenType::BangEquals
+                } else {
+                    TokenType::Bang
+                };
+                self.add_token(token)
+            }
+            '<' => {
+                let token = if self.match_token('=') {
+                    TokenType::LessEqual
+                } else if self.match_token('>') {
+                    TokenType::Unequal
+                } else {
+                    TokenType::Less
+                };
+                self.add_token(token)
+            }
+            '>' => {
+                let token = if self.match_token('=') {
+                    TokenType::GreaterEqual
+                } else {
+                    TokenType::Greater
+                };
+                self.add_token(token)
+            }
+            ' ' | '\t' | '\r' | '\n' => {}
+            '\'' => self.string()?,
+            _ => {
+                if is_digit(c) {
+                    self.number();
+                } else if is_alpha(c) {
+                    self.identifier();
+                } else {
+                    return Err(anyhow!("Unexpected character '{}'", c));
+                }
+            }
+        }
+        Ok(())
+    }
+
+    /// Scans an identifier; keywords are recognised case-insensitively.
+    fn identifier(&mut self) {
+        while is_alphanumeric(self.peek()) {
+            self.advance();
+        }
+        let text = self.lexeme().to_lowercase();
+        let tokentype = self
+            .keywords
+            .get(&text)
+            .copied()
+            .unwrap_or(TokenType::Identifier);
+        self.add_token(tokentype);
+    }
+
+    /// Scans a number: digits, optionally followed by a `.` or `,` decimal
+    /// separator and one or more further digits.
+    fn number(&mut self) {
+        while is_digit(self.peek()) {
+            self.advance();
+        }
+        // NOTE(review): accepting `,` here makes `1,2` a single number, which
+        // collides with `,` as the SQL list separator; confirm this is wanted.
+        if (self.peek() == '.' || self.peek() == ',') && is_digit(self.peek_next()) {
+            // Consume the separator and then the whole fractional part.
+            self.advance();
+            while is_digit(self.peek()) {
+                self.advance();
+            }
+        }
+
+        let literal: Value = self.lexeme().as_str().into();
+        self.add_literal(TokenType::Num, literal);
+    }
+
+    /// Scans a single-quoted string literal whose opening quote is consumed.
+    fn string(&mut self) -> anyhow::Result<()> {
+        while self.peek() != '\'' && !self.is_at_end() {
+            self.advance();
+        }
+
+        if self.is_at_end() {
+            return Err(anyhow!("Unterminated string value"));
+        }
+
+        // Consume the closing quote.
+        self.advance();
+
+        // The literal is the text between the quotes, without the quotes.
+        let string: String = self.source_chars[self.start + 1..self.current - 1]
+            .iter()
+            .collect();
+        self.add_literal(TokenType::Str, string.into());
+        Ok(())
+    }
+
+    /// Returns the current character without consuming it, `'\0'` at the end.
+    fn peek(&self) -> char {
+        self.source_chars.get(self.current).copied().unwrap_or('\0')
+    }
+
+    /// Returns the character after the current one, `'\0'` if there is none.
+    fn peek_next(&self) -> char {
+        self.source_chars
+            .get(self.current + 1)
+            .copied()
+            .unwrap_or('\0')
+    }
+
+    /// Returns the source text of the token being scanned.
+    fn lexeme(&self) -> String {
+        self.source_chars[self.start..self.current].iter().collect()
+    }
+
+    fn add_token(&mut self, tokentype: TokenType) {
+        self.add_literal(tokentype, Value::NULL);
+    }
+
+    fn add_literal(&mut self, tokentype: TokenType, literal: Value) {
+        let text = self.lexeme();
+        self.tokens.push(Token::new(tokentype, text, literal));
+    }
+
+    /// Consumes and returns the current character.
+    fn advance(&mut self) -> char {
+        self.current += 1;
+        self.source_chars[self.current - 1]
+    }
+
+    /// Consumes the current character only if it equals `expected`.
+    fn match_token(&mut self, expected: char) -> bool {
+        if self.is_at_end() || self.source_chars[self.current] != expected {
+            false
+        } else {
+            self.current += 1;
+            true
+        }
+    }
+
+    fn is_at_end(&self) -> bool {
+        self.current >= self.source_chars.len()
+    }
+}
+
+fn is_digit(c: char) -> bool {
+    c.is_ascii_digit()
+}
+
+fn is_alpha(c: char) -> bool {
+    c.is_alphabetic() || c == '_'
+}
+
+fn is_alphanumeric(c: char) -> bool {
+    is_alpha(c) || is_digit(c)
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    fn types(sql: &str) -> Vec<TokenType> {
+        let tokens = parse(sql).unwrap();
+        tokens.iter().map(Token::tokentype).collect()
+    }
+
+    #[test]
+    fn test_parse() {
+        assert_eq!(
+            types("select name from employee;"),
+            [
+                TokenType::Select,
+                TokenType::Identifier,
+                TokenType::From,
+                TokenType::Identifier,
+                TokenType::Semicolon,
+            ]
+        );
+    }
+
+    #[test]
+    fn test_comparison_operators() {
+        assert_eq!(
+            types("= != <> <= >= !"),
+            [
+                TokenType::Equals,
+                TokenType::BangEquals,
+                TokenType::Unequal,
+                TokenType::LessEqual,
+                TokenType::GreaterEqual,
+                TokenType::Bang,
+            ]
+        );
+    }
+
+    #[test]
+    fn test_number_with_fraction() {
+        let tokens = parse("1.25").unwrap();
+        assert_eq!(tokens.len(), 1);
+        assert_eq!(tokens[0].lexeme(), "1.25");
+    }
+
+    #[test]
+    fn test_number_followed_by_dot_at_end() {
+        assert_eq!(types("1."), [TokenType::Num, TokenType::Dot]);
+    }
+
+    #[test]
+    fn test_non_ascii_string() {
+        let tokens = parse("'\u{e9}' x").unwrap();
+        assert_eq!(tokens[0].lexeme(), "'\u{e9}'");
+        assert_eq!(tokens[1].lexeme(), "x");
+    }
+
+    #[test]
+    fn test_unterminated_string() {
+        assert!(parse("'abc").is_err());
+    }
+}
diff --git a/src/sql/tokens.rs b/src/sql/tokens.rs
new file mode 100644
index 0000000..1508082
--- /dev/null
+++ b/src/sql/tokens.rs
@@ -0,0 +1,120 @@
+//! Token and keyword definitions shared by the SQL scanner.
+
+use std::collections::HashMap;
+
+use crate::value::Value;
+
+/// A single lexical token produced by the scanner.
+#[derive(Debug)]
+pub struct Token {
+    tokentype: TokenType,
+    lexeme: String,
+    literal: Value,
+}
+
+impl Token {
+    /// Creates a token of kind `tokentype` from its source text and value.
+    pub fn new(tokentype: TokenType, lexeme: impl Into<String>, literal: Value) -> Self {
+        Self {
+            tokentype,
+            lexeme: lexeme.into(),
+            literal,
+        }
+    }
+
+    /// The kind of token.
+    pub fn tokentype(&self) -> TokenType {
+        self.tokentype
+    }
+
+    /// The text given as `lexeme` to [`Token::new`].
+    pub fn lexeme(&self) -> &str {
+        &self.lexeme
+    }
+
+    /// The value given as `literal` to [`Token::new`].
+    pub fn literal(&self) -> &Value {
+        &self.literal
+    }
+}
+
+/// Registers every keyword under its lowercase spelling.
+///
+/// The scanner lowercases an identifier before looking it up, so a key that
+/// contains uppercase letters could never match.
+pub(crate) fn add_keywords(keywords: &mut HashMap<String, TokenType>) {
+    let entries = [
+        ("and", TokenType::And),
+        ("else", TokenType::Else),
+        ("false", TokenType::False),
+        ("nil", TokenType::Nil),
+        ("or", TokenType::Or),
+        ("true", TokenType::True),
+        ("select", TokenType::Select),
+        ("from", TokenType::From),
+        ("where", TokenType::Where),
+        ("union", TokenType::Union),
+        ("update", TokenType::Update),
+        ("insert", TokenType::Insert),
+        ("group", TokenType::Group),
+        ("order", TokenType::Order),
+        ("by", TokenType::By),
+        ("having", TokenType::Having),
+        ("sum", TokenType::Sum),
+        ("max", TokenType::Max),
+        ("min", TokenType::Min),
+        ("delete", TokenType::Delete),
+        ("commit", TokenType::Commit),
+        ("describe", TokenType::Describe),
+    ];
+    for (word, tokentype) in entries {
+        keywords.insert(word.to_string(), tokentype);
+    }
+}
+
+/// The kind of a [`Token`].
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum TokenType {
+    LeftParen,
+    RightParen,
+    Comma,
+    Dot,
+    Minus,
+    Plus,
+    Star,
+    Semicolon,
+    Colon,
+    Bang, // !
+    Equals,
+    Less,
+    LessEqual,
+    Greater,
+    GreaterEqual,
+    BangEquals, // !=
+    Unequal,    // <>
+    Str,
+    Num,
+    Identifier,
+    And,
+    Else,
+    False,
+    Nil,
+    Or,
+    True,
+    Select,
+    From,
+    Where,
+    Union,
+    Update,
+    Insert,
+    Group,
+    Order,
+    By,
+    Having,
+    Sum,
+    Max,
+    Min,
+    Delete,
+    Commit,
+    Describe,
+}