From eb9b62e47c93e0483718ac8005832e2dee526962 Mon Sep 17 00:00:00 2001
From: Sander Hautvast <sander.hautvast@ing.com>
Date: Fri, 14 Feb 2025 17:46:52 +0100
Subject: [PATCH] added scanner

---
 src/lib.rs         |   1 +
 src/main.rs        |   4 +
 src/sql/mod.rs     |   2 +
 src/sql/scanner.rs | 205 +++++++++++++++++++++++++++++++++++++++++++++
 src/sql/tokens.rs  |  91 ++++++++++++++++++++
 5 files changed, 303 insertions(+)
 create mode 100644 src/main.rs
 create mode 100644 src/sql/mod.rs
 create mode 100644 src/sql/scanner.rs
 create mode 100644 src/sql/tokens.rs
diff --git a/src/lib.rs b/src/lib.rs
index dc55045..a0faeb5 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -3,6 +3,7 @@ pub mod join;
 pub mod order;
 pub mod print;
 pub mod read;
+pub mod sql;
 pub mod value;
 
 use std::{
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..bc56ed2
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,4 @@
+
+// Binary entry point; it currently does nothing.
+fn main() {
+}
diff --git a/src/sql/mod.rs b/src/sql/mod.rs
new file mode 100644
index 0000000..0e9acf6
--- /dev/null
+++ b/src/sql/mod.rs
@@ -0,0 +1,2 @@
+pub mod scanner;
+pub mod tokens;
diff --git a/src/sql/scanner.rs b/src/sql/scanner.rs
new file mode 100644
index 0000000..59fbf24
--- /dev/null
+++ b/src/sql/scanner.rs
@@ -0,0 +1,205 @@
+use std::collections::HashMap;
+
+use anyhow::anyhow;
+
+use crate::value::Value;
+
+use super::tokens::{Token, TokenType};
+
+/// Scans `sql` into tokens; fails on an unknown character or unterminated string.
+pub fn parse(sql: &str) -> anyhow::Result<Vec<Token>> {
+    let mut lexer = Scanner::new(sql);
+    lexer.scan_tokens().map(|()| lexer.tokens)
+}
+
+struct Scanner { // scanning state for one SQL input
+    source: String, // the input text as passed to `new`
+    source_chars: Vec<char>, // the same text as chars, indexed by `current`
+    tokens: Vec<Token>, // tokens emitted so far, in source order
+    start: usize, // position where the lexeme being scanned begins
+    current: usize, // position of the next char to consume
+    keywords: HashMap<String, TokenType>, // keyword text -> token type, filled in `new`
+}
+
+impl Scanner {
+    /// Creates a scanner positioned at the start of `sql`, with the keyword table loaded.
+    fn new(sql: &str) -> Self {
+        let mut keywords = HashMap::new();
+        crate::sql::tokens::add_keywords(&mut keywords);
+        Self {
+            source: sql.to_string(),
+            source_chars: sql.chars().collect(),
+            tokens: vec![],
+            start: 0,
+            current: 0,
+            keywords,
+        }
+    }
+
+    /// Scans lexemes until the input is exhausted, returning the first error met.
+    fn scan_tokens(&mut self) -> anyhow::Result<()> {
+        while !self.is_at_end() {
+            self.start = self.current;
+            self.scan_token()?;
+        }
+        Ok(())
+    }
+
+    /// Scans the single lexeme that begins at `self.current`.
+    ///
+    /// Pushes at most one token; whitespace and `--` line comments push none.
+    ///
+    /// # Errors
+    ///
+    /// Fails on an unterminated string or on a character that starts no lexeme.
+    fn scan_token(&mut self) -> anyhow::Result<()> {
+        let c = self.advance();
+        match c {
+            // Single-character tokens.
+            '(' => self.add_token(TokenType::LeftParen),
+            ')' => self.add_token(TokenType::RightParen),
+            ',' => self.add_token(TokenType::Comma),
+            '.' => self.add_token(TokenType::Dot),
+            '+' => self.add_token(TokenType::Plus),
+            ';' => self.add_token(TokenType::Semicolon),
+            ':' => self.add_token(TokenType::Colon),
+            '*' => self.add_token(TokenType::Star),
+            '=' => self.add_token(TokenType::Equals),
+
+            // One- or two-character tokens. `match_token` consumes the second
+            // character only when it matches, so a failed guard has no effect.
+            '-' if self.match_token('-') => {
+                // Line comment: skip up to, but not past, the newline.
+                while self.peek() != '\n' && !self.is_at_end() {
+                    self.advance();
+                }
+            }
+            '-' => self.add_token(TokenType::Minus),
+            '!' if self.match_token('=') => self.add_token(TokenType::BangEquals),
+            '!' => self.add_token(TokenType::Bang),
+            '<' if self.match_token('=') => self.add_token(TokenType::LessEqual),
+            '<' if self.match_token('>') => self.add_token(TokenType::Unequal),
+            '<' => self.add_token(TokenType::Less),
+            '>' if self.match_token('=') => self.add_token(TokenType::GreaterEqual),
+            '>' => self.add_token(TokenType::Greater),
+
+            // Whitespace separates lexemes and produces no token.
+            ' ' | '\t' | '\r' | '\n' => {}
+
+            // Literals, identifiers and keywords.
+            '\'' => self.string()?,
+            _ if is_digit(c) => self.number(),
+            _ if is_alpha(c) => self.identifier(),
+            _ => return Err(anyhow!("Unexpected character '{}'", c)),
+        }
+        Ok(())
+    }
+
+    /// Consumes the rest of an identifier and emits the matching keyword token, or
+    /// `Identifier` when the lowercased text is not in the keyword table.
+    fn identifier(&mut self) {
+        while is_alphanumeric(self.peek()) {
+            self.advance();
+        }
+        // `start`/`current` count chars, so read the text from the char buffer;
+        // byte-slicing `source` with them goes wrong once the input is not ASCII.
+        let text: String = self.source_chars[self.start..self.current].iter().collect();
+        let tokentype = self.keywords.get(&text.to_lowercase()).copied();
+
+        self.add_token(tokentype.unwrap_or(TokenType::Identifier));
+    }
+
+    /// Scans digits plus an optional `.`/`,` fraction and emits a `Num` literal.
+    fn number(&mut self) {
+        let digits = |s: &[char]| s.iter().take_while(|c| is_digit(**c)).count();
+        self.current += digits(&self.source_chars[self.current..]);
+        if matches!(self.peek(), '.' | ',') && is_digit(self.peek_next()) {
+            self.current += 1 + digits(&self.source_chars[self.current + 1..]);
+        }
+        let text: String = self.source_chars[self.start..self.current].iter().collect();
+        self.add_literal(TokenType::Num, text.as_str().into());
+    }
+
+    /// Consumes a single-quoted string (no escape handling) and emits a `Str` literal.
+    fn string(&mut self) -> anyhow::Result<()> {
+        while self.peek() != '\'' && !self.is_at_end() {
+            self.advance();
+        }
+        if self.is_at_end() {
+            return Err(anyhow!("Unterminated string value"));
+        }
+        self.advance(); // consume the closing quote
+        // Cut the text between the quotes by char index; `source` is byte-indexed.
+        let body = &self.source_chars[self.start + 1..self.current - 1];
+        let string: String = body.iter().collect();
+        self.add_literal(TokenType::Str, string.into());
+        Ok(())
+    }
+
+    /// Returns the current char without consuming it, or `'\0'` at end of input.
+    fn peek(&self) -> char {
+        match self.source_chars.get(self.current) {
+            Some(&ch) => ch,
+            None => '\0',
+        }
+    }
+
+    /// Returns the char after the current one, or `'\0'` when there is none.
+    ///
+    /// Uses a checked lookup: a hand-written `current + 1 > len` test misses the
+    /// `current + 1 == len` case and indexes one past the end.
+    fn peek_next(&self) -> char {
+        self.source_chars.get(self.current + 1).copied().unwrap_or('\0')
+    }
+
+    /// Pushes a token of `tokentype` for the current lexeme, with a `NULL` literal.
+    fn add_token(&mut self, tokentype: TokenType) {
+        self.add_literal(tokentype, Value::NULL);
+    }
+
+    fn add_literal(&mut self, tokentype: TokenType, literal: Value) {
+        let text: String = self.source_chars[self.start..self.current].iter().collect();
+        self.tokens.push(Token::new(tokentype, text, literal)); // text: char-indexed lexeme
+    }
+
+    fn advance(&mut self) -> char { // consumes one char and returns it
+        self.current += 1;
+        self.source_chars[self.current - 1] // indexing panics if called at end of input
+    }
+
+    /// Consumes the current char and returns `true` only if it equals `expected`.
+    fn match_token(&mut self, expected: char) -> bool {
+        let hit = self.source_chars.get(self.current) == Some(&expected);
+        if hit {
+            self.current += 1;
+        }
+        hit
+    }
+
+    fn is_at_end(&self) -> bool {
+        self.source_chars.len() <= self.current // true once every char is consumed
+    }
+}
+
+fn is_digit(c: char) -> bool {
+    c.is_ascii_digit() // same set as `is_digit(10)`: '0'..='9' only
+}
+
+fn is_alpha(c: char) -> bool {
+    matches!(c, '_') || c.is_alphabetic() // underscore or a letter of any script
+}
+
+fn is_alphanumeric(c: char) -> bool {
+    is_digit(c) || is_alpha(c) // chars accepted after an identifier's first char
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn test_parse() {
+        let tokens = parse("select name from employee;").expect("statement should scan");
+        assert_eq!(tokens.len(), 5, "select, name, from, employee and `;`");
+    }
+}
diff --git a/src/sql/tokens.rs b/src/sql/tokens.rs
new file mode 100644
index 0000000..1508082
--- /dev/null
+++ b/src/sql/tokens.rs
@@ -0,0 +1,91 @@
+use std::collections::HashMap;
+
+use crate::value::Value;
+
+#[derive(Debug)]
+pub struct Token { // one scanned lexeme
+    tokentype: TokenType, // kind of token
+    lexeme: String, // source text the token was cut from
+    literal: Value, // literal value; `Value::NULL` when pushed through `add_token`
+}
+
+impl Token {
+    /// Builds a token from its type, the source text it covers and its literal value.
+    ///
+    /// `lexeme` may be anything that converts into a `String`.
+    pub fn new(tokentype: TokenType, lexeme: impl Into<String>, literal: Value) -> Self {
+        let lexeme = lexeme.into();
+        Self { tokentype, lexeme, literal }
+    }
+}
+
+pub(crate) fn add_keywords(keywords: &mut HashMap<String, TokenType>) { // fills the table
+    keywords.insert("and".to_string(), TokenType::And);
+    keywords.insert("else".to_string(), TokenType::Else);
+    keywords.insert("false".to_string(), TokenType::False);
+    keywords.insert("nil".to_string(), TokenType::Nil); // lowercase: lookups lowercase the text
+    keywords.insert("or".to_string(), TokenType::Or);
+    keywords.insert("true".to_string(), TokenType::True);
+    keywords.insert("select".to_string(), TokenType::Select);
+    keywords.insert("from".to_string(), TokenType::From);
+    keywords.insert("where".to_string(), TokenType::Where);
+    keywords.insert("union".to_string(), TokenType::Union);
+    keywords.insert("update".to_string(), TokenType::Update);
+    keywords.insert("insert".to_string(), TokenType::Insert);
+    keywords.insert("group".to_string(), TokenType::Group);
+    keywords.insert("order".to_string(), TokenType::Order);
+    keywords.insert("by".to_string(), TokenType::By);
+    keywords.insert("having".to_string(), TokenType::Having);
+    keywords.insert("sum".to_string(), TokenType::Sum);
+    keywords.insert("max".to_string(), TokenType::Max);
+    keywords.insert("min".to_string(), TokenType::Min);
+    keywords.insert("delete".to_string(), TokenType::Delete);
+    keywords.insert("commit".to_string(), TokenType::Commit);
+    keywords.insert("describe".to_string(), TokenType::Describe);
+}
+
+#[derive(Debug, Clone, Copy)]
+pub enum TokenType {
+    LeftParen, // (
+    RightParen, // )
+    Comma, // ,
+    Dot, // .
+    Minus, // -
+    Plus, // +
+    Star, // *
+    Semicolon, // ;
+    Colon,
+    Bang, // !
+    Equals,
+    Less, // <
+    LessEqual, // <=
+    Greater, // >
+    GreaterEqual, // >=
+    BangEquals, // !=
+    Unequal,    // <>
+    Str, // single-quoted string literal
+    Num, // numeric literal
+    Identifier, // word that is not in the keyword table
+    And, // from here on: keywords, spelled as registered in `add_keywords`
+    Else,
+    False,
+    Nil,
+    Or,
+    True,
+    Select,
+    From,
+    Where,
+    Union,
+    Update,
+    Insert,
+    Group,
+    Order,
+    By,
+    Having,
+    Sum,
+    Max,
+    Min,
+    Delete,
+    Commit,
+    Describe,
+}