tipi-lang/src/scanner.rs
2025-11-09 21:57:13 +01:00

345 lines
11 KiB
Rust

use crate::errors::CompilerError::{IllegalCharLength, UnexpectedIdentifier, Unterminated};
use crate::errors::{CompilerError, CompilerErrorAtLine};
use crate::tokens::TokenType::{BitXor, FloatingPoint, Integer, Question, U32, U64};
use crate::{
keywords,
tokens::{
Token,
TokenType::{self},
},
};
/// Tokenizes `source` into a flat list of tokens.
///
/// Scanning starts on line 1 in the "start of line" state, so leading
/// whitespace on the very first line is reported as an `Indent`.
///
/// # Errors
/// Returns the first [`CompilerErrorAtLine`] the scanner raises.
pub fn scan(source: &str) -> Result<Vec<Token>, CompilerErrorAtLine> {
    Scanner {
        chars: source.chars().collect(),
        current: 0,
        start: 0,
        line: 1,
        tokens: Vec::new(),
        new_line: true,
    }
    .scan()
}
impl Scanner {
    /// Scans the whole input and returns the resulting token stream.
    ///
    /// A final `Eol` and `Eof` token are always appended, so the last line is
    /// terminated even when the source has no trailing newline.
    ///
    /// # Errors
    /// Returns the first error raised while scanning a token.
    fn scan(mut self) -> Result<Vec<Token>, CompilerErrorAtLine> {
        while !self.is_at_end() {
            self.start = self.current;
            self.scan_token()?;
        }
        self.add_token(TokenType::Eol);
        self.add_token(TokenType::Eof);
        Ok(self.tokens)
    }

    /// Consumes one lexeme beginning at `self.start` and emits its token, if any.
    ///
    /// Whitespace, `//` comments and `;` emit nothing. A space or tab that is the
    /// first character of a line emits a single `Indent`.
    ///
    /// # Errors
    /// Propagates literal errors from the helper scanners and raises
    /// `UnexpectedIdentifier` for a character that starts no known lexeme.
    fn scan_token(&mut self) -> Result<(), CompilerErrorAtLine> {
        let c = self.advance();
        if self.new_line && matches!(c, ' ' | '\t') {
            // Only the first whitespace character of a line becomes an `Indent`;
            // the rest of the run is skipped by the whitespace arm below.
            self.add_token(TokenType::Indent);
            self.new_line = false;
            return Ok(());
        }
        if c != '\n' {
            self.new_line = false;
        }
        match c {
            '(' => self.add_token(TokenType::LeftParen),
            ')' => self.add_token(TokenType::RightParen),
            '{' => self.add_token(TokenType::LeftBrace),
            '}' => self.add_token(TokenType::RightBrace),
            '[' => self.add_token(TokenType::LeftBracket),
            ']' => self.add_token(TokenType::RightBracket),
            ',' => self.add_token(TokenType::Comma),
            '.' => self.add_token(TokenType::Dot),
            '-' => self.add_token_if_next('>', TokenType::SingleRightArrow, TokenType::Minus),
            '#' => self.add_token(TokenType::Hash),
            '+' => self.add_token(TokenType::Plus),
            ':' => self.add_token(TokenType::Colon),
            ';' => println!("Warning: Ignoring semicolon at line {}", self.line),
            '*' => self.add_token(TokenType::Star),
            '!' => self.add_token_if_next('=', TokenType::BangEqual, TokenType::Bang),
            '=' => self.add_token_if_next('=', TokenType::EqualEqual, TokenType::Equal),
            '<' => {
                let tokentype = if self.match_next('=') {
                    TokenType::LessEqual
                } else if self.match_next('<') {
                    TokenType::LessLess
                } else {
                    TokenType::Less
                };
                self.add_token(tokentype);
            }
            '>' => {
                let tokentype = if self.match_next('=') {
                    TokenType::GreaterEqual
                } else if self.match_next('>') {
                    TokenType::GreaterGreater
                } else {
                    TokenType::Greater
                };
                self.add_token(tokentype);
            }
            '/' => {
                if self.match_next('/') {
                    // todo make distinction between comment and doc
                    // The newline itself is left in place so it still yields `Eol`.
                    while self.peek() != '\n' && !self.is_at_end() {
                        self.advance();
                    }
                } else {
                    self.add_token(TokenType::Slash);
                }
            }
            '\'' => self.char()?,
            '"' => self.string()?,
            't' => {
                // `t"..."` is a datetime literal; any other `t` starts an ordinary
                // identifier or keyword and must not be dropped.
                if self.match_next('"') {
                    self.datetime()?;
                } else {
                    self.identifier();
                }
            }
            '\r' | '\t' | ' ' => {}
            '\n' => {
                self.line += 1;
                self.new_line = true;
                self.add_token(TokenType::Eol);
            }
            '&' => self.add_token_if_next('&', TokenType::LogicalAnd, TokenType::BitAnd),
            '|' => self.add_token_if_next('|', TokenType::LogicalOr, TokenType::Pipe),
            '^' => self.add_token(BitXor),
            '?' => self.add_token(Question),
            _ if c == '0' && self.peek() == 'x' => self.hex_number()?,
            _ if is_digit(c) => self.number(),
            _ if is_alpha(c) => self.identifier(),
            _ => return Err(self.raise(UnexpectedIdentifier)),
        }
        Ok(())
    }

    /// Scans the rest of an identifier and emits it, or the keyword token it
    /// spells according to `keywords::get_keyword`.
    fn identifier(&mut self) {
        while is_alphanumeric(self.peek()) {
            self.advance();
        }
        let value = self.lexeme(0, 0);
        let tokentype = keywords::get_keyword(&value).unwrap_or(TokenType::Identifier);
        self.add_token_with_value(tokentype, value);
    }

    /// Scans a `0x…` literal. The leading `0` is already consumed and the `x`
    /// is the next character.
    ///
    /// The emitted value keeps the `0x` prefix.
    ///
    /// # Errors
    /// Raises `UnexpectedIdentifier` when nothing follows the prefix and
    /// `CompilerError::Overflow` when the literal is too long.
    fn hex_number(&mut self) -> Result<(), CompilerErrorAtLine> {
        // Consume only the `x`; whatever follows is examined before it is taken.
        self.advance();
        while is_alphanumeric(self.peek()) {
            self.advance();
        }
        let value = self.lexeme(0, 0);
        // NOTE(review): these limits count the `0x` prefix, so `U32` covers at
        // most 2 hex digits and `U64` at most 6 — confirm this is intended.
        let tokentype = match value.len() {
            0..=2 => return Err(self.raise(UnexpectedIdentifier)),
            3..=4 => U32,
            5..=8 => U64,
            _ => return Err(self.raise(CompilerError::Overflow)),
        };
        self.add_token_with_value(tokentype, value);
        Ok(())
    }

    /// Scans the rest of a decimal literal whose first digit is already consumed.
    ///
    /// Emits `FloatingPoint` when a `.` followed by a digit is present and
    /// `Integer` otherwise. Trailing digits and `e`/`E` are included in the lexeme.
    fn number(&mut self) {
        while is_digit(self.peek()) {
            self.advance();
        }
        // A dot only belongs to the number when a digit follows it, so `1.` is
        // an integer followed by a `Dot`.
        let has_dot = self.peek() == '.' && is_digit(self.peek_next());
        if has_dot {
            self.advance();
        }
        while is_digit_or_scientific(self.peek()) {
            self.advance();
        }
        let value = self.lexeme(0, 0);
        let tokentype = if has_dot { FloatingPoint } else { Integer };
        self.add_token_with_value(tokentype, value);
    }

    /// Scans a character literal whose opening `'` is already consumed.
    ///
    /// # Errors
    /// Raises `Unterminated("char")` when no closing quote exists and
    /// `IllegalCharLength` unless exactly one character sits between the quotes.
    fn char(&mut self) -> Result<(), CompilerErrorAtLine> {
        while self.peek() != '\'' && !self.is_at_end() {
            self.advance();
        }
        if self.is_at_end() {
            return Err(self.raise(Unterminated("char")));
        }
        self.advance();
        let value = self.lexeme(1, 1);
        // Count characters, not bytes, so a single non-ASCII character is valid.
        if value.chars().count() != 1 {
            return Err(self.raise(IllegalCharLength(value)));
        }
        self.add_token_with_value(TokenType::Char, value);
        Ok(())
    }

    /// Attaches the current line number to `error`.
    fn raise(&self, error: CompilerError) -> CompilerErrorAtLine {
        CompilerErrorAtLine::raise(error, self.line)
    }

    /// Scans a `t"…"` datetime literal whose `t"` prefix is already consumed.
    ///
    /// # Errors
    /// Raises `Unterminated("datetime")` when the closing quote is missing.
    fn datetime(&mut self) -> Result<(), CompilerErrorAtLine> {
        self.quoted(2, "datetime", TokenType::DateTime)
    }

    /// Scans a string literal whose opening `"` is already consumed.
    ///
    /// # Errors
    /// Raises `Unterminated("string")` when the closing quote is missing.
    fn string(&mut self) -> Result<(), CompilerErrorAtLine> {
        self.quoted(1, "string", TokenType::StringType)
    }

    /// Consumes up to and including the closing `"` and emits the enclosed text
    /// as `tokentype`.
    ///
    /// `prefix_len` is the number of characters before the content (the opening
    /// quote plus any literal prefix). Newlines inside the literal advance the
    /// line counter. `kind` names the literal in the `Unterminated` error.
    fn quoted(
        &mut self,
        prefix_len: usize,
        kind: &'static str,
        tokentype: TokenType,
    ) -> Result<(), CompilerErrorAtLine> {
        while self.peek() != '"' && !self.is_at_end() {
            if self.peek() == '\n' {
                self.line += 1;
            }
            self.advance();
        }
        if self.is_at_end() {
            return Err(self.raise(Unterminated(kind)));
        }
        self.advance();
        let value = self.lexeme(prefix_len, 1);
        self.add_token_with_value(tokentype, value);
        Ok(())
    }

    /// Returns the current lexeme with `skip_front` leading and `skip_back`
    /// trailing characters removed.
    fn lexeme(&self, skip_front: usize, skip_back: usize) -> String {
        self.chars[self.start + skip_front..self.current - skip_back]
            .iter()
            .collect()
    }

    /// Returns the character at `index`, or `'\0'` past the end of the input.
    fn char_at(&self, index: usize) -> char {
        self.chars.get(index).copied().unwrap_or('\0')
    }

    /// Returns the next unconsumed character, or `'\0'` at the end of the input.
    fn peek(&self) -> char {
        self.char_at(self.current)
    }

    /// Returns the character after the next one, or `'\0'` when there is none.
    fn peek_next(&self) -> char {
        self.char_at(self.current + 1)
    }

    /// Consumes the next character only when it equals `expected`; reports
    /// whether it did.
    fn match_next(&mut self, expected: char) -> bool {
        let matched = self.chars.get(self.current) == Some(&expected);
        if matched {
            self.current += 1;
        }
        matched
    }

    /// Emits `matched` when the next character is `expected` (consuming it),
    /// and `otherwise` when it is not.
    fn add_token_if_next(&mut self, expected: char, matched: TokenType, otherwise: TokenType) {
        let tokentype = if self.match_next(expected) {
            matched
        } else {
            otherwise
        };
        self.add_token(tokentype);
    }

    /// Emits a token with an empty lexeme on the current line.
    fn add_token(&mut self, tokentype: TokenType) {
        self.add_token_with_value(tokentype, String::new());
    }

    /// Emits a token carrying `value` on the current line.
    fn add_token_with_value(&mut self, tokentype: TokenType, value: String) {
        self.tokens.push(Token::new(tokentype, value, self.line));
    }

    /// Consumes and returns the next character.
    ///
    /// # Panics
    /// Panics when called at the end of the input; callers check
    /// `is_at_end`/`peek` first.
    fn advance(&mut self) -> char {
        let c = self.chars[self.current];
        self.current += 1;
        c
    }

    /// Reports whether every character has been consumed.
    fn is_at_end(&self) -> bool {
        self.current >= self.chars.len()
    }
}
/// Cursor state for one pass over the source text.
struct Scanner {
    /// The source as individual characters, so lexemes can be sliced by index.
    chars: Vec<char>,
    /// Index of the next character to consume.
    current: usize,
    /// Index of the first character of the lexeme being scanned.
    start: usize,
    /// Line number stamped onto emitted tokens and errors; starts at 1.
    line: usize,
    /// Tokens emitted so far, in source order.
    tokens: Vec<Token>,
    /// True while no character has been consumed on the current line yet.
    new_line: bool,
}
/// Reports whether `c` is an ASCII decimal digit (`0` through `9`).
fn is_digit(c: char) -> bool {
    c.is_ascii_digit()
}
/// Reports whether `c` may continue a numeric literal: a decimal digit or an
/// exponent marker (`e` / `E`).
fn is_digit_or_scientific(c: char) -> bool {
    matches!(c, 'e' | 'E') || is_digit(c)
}
/// Reports whether `c` may appear inside an identifier: anything `is_alpha`
/// accepts, or a decimal digit.
fn is_alphanumeric(c: char) -> bool {
    is_digit(c) || is_alpha(c)
}
/// Reports whether `c` may start an identifier: an ASCII letter, `_` or `$`.
fn is_alpha(c: char) -> bool {
    c.is_ascii_alphabetic() || matches!(c, '_' | '$')
}
#[cfg(test)]
mod test {
    use super::*;

    /// Smoke test: a small struct declaration scans without errors and its
    /// identifier text and the end-of-file marker appear in the token stream.
    #[test]
    fn test() {
        let tokens = scan(
            r#"struct Customer:
id: u32,
first_name: string,
last_name: string,
date_fetched: date,"#,
        )
        .expect("struct declaration should scan without errors");
        let tokenstring = format!("{:?}", tokens);
        println!("{}", tokenstring);
        // `scan` always appends `Eol` and `Eof`, so real input yields more than two tokens.
        assert!(tokens.len() > 2, "expected tokens besides Eol/Eof");
        assert!(tokenstring.contains("Customer"), "identifier lexeme missing");
        assert!(tokenstring.contains("Eof"), "token stream must end with Eof");
    }
}