From 24a472be22a820076f10ec4319af937babde2e9f Mon Sep 17 00:00:00 2001
From: MarvelousAnything <marvelousanything@gmail.com>
Date: Sat, 12 Nov 2022 23:28:33 -0500
Subject: [PATCH] Separated token types.
---
src/lex/keyword.rs | 25 -------------
src/lex/lexer.rs | 25 ++++++-------
src/lex/mod.rs | 8 ++---
src/lex/token.rs | 67 +++++++----------------------------
src/lex/types.rs | 87 ++++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 116 insertions(+), 96 deletions(-)
delete mode 100644 src/lex/keyword.rs
create mode 100644 src/lex/types.rs
diff --git a/src/lex/keyword.rs b/src/lex/keyword.rs
deleted file mode 100644
index 92214e6..0000000
--- a/src/lex/keyword.rs
+++ /dev/null
@@ -1,25 +0,0 @@
-#[derive(Debug, Eq, PartialEq, Clone)]
-pub enum Keyword {
- Var,
- Fun,
- If,
- Else,
- Until,
- Loop,
- Return,
-}
-
-impl Keyword {
- pub fn from_str(s: &str) -> Option<Self> {
- match s {
- "var" => Some(Keyword::Var),
- "fun" => Some(Keyword::Fun),
- "if" => Some(Keyword::If),
- "else" => Some(Keyword::Else),
- "until" => Some(Keyword::Until),
- "loop" => Some(Keyword::Loop),
- "return" => Some(Keyword::Return),
- _ => None,
- }
- }
-}
diff --git a/src/lex/lexer.rs b/src/lex/lexer.rs
index 83e0243..22cd3d0 100644
--- a/src/lex/lexer.rs
+++ b/src/lex/lexer.rs
@@ -3,7 +3,7 @@ use crate::lex::token::{Token, TokenType};
use anyhow::{bail, ensure, Result};
use log::{debug};
use thiserror::Error;
-use crate::lex::keyword::Keyword;
+use crate::lex::types::{KeywordToken, LiteralToken};
#[derive(Debug, Clone)]
pub struct Lexer {
@@ -39,7 +39,7 @@ pub enum LexerError {
#[derive(Debug)]
pub struct TokenStream {
- tokens: Vec<Token>,
+ pub tokens: Vec<Token>, // tokens held by the stream; this patch widens the field from private to `pub`
}
impl Display for TokenStream {
@@ -110,6 +110,10 @@ impl Lexer {
while self.has_next() {
tokens.push(self.get_next_token()?);
}
+ if !self.has_next() {
+ debug!("No more tokens to lex");
+ tokens.push(TokenType::Eof.at(self.index, self.line_no, self.col_no));
+ }
info!("Lexed {} tokens", tokens.len());
@@ -257,9 +261,6 @@ impl Lexer {
}
fn get_next_token(&mut self) -> Result<Token> {
- if !self.has_next() {
- return Ok(TokenType::Eof.at(self.index, self.line_no, self.col_no));
- }
let start = (self.index, self.line_no, self.col_no);
let token = match self.curr_char {
n if self.is_whitespace() => {
@@ -271,7 +272,7 @@ impl Lexer {
'\n' => {
trace!("Found newline at {}:{}[{}]", self.line_no, self.col_no, self.index);
// fold newlines into a single token
- self.col_no = 1;
+ self.col_no = 0;
self.line_no += 1;
self.advance()?;
TokenType::NL
@@ -280,31 +281,31 @@ impl Lexer {
// TODO: Look into this.
n if n.is_alphabetic() => {
let identifier = self.collect_identifier()?;
- if let Some(keyword) = Keyword::from_str(&identifier) {
+ if let Some(keyword) = KeywordToken::from_str(&identifier) {
debug!("Found keyword {:?} at {}:{}[{}]", keyword, self.line_no, self.col_no, self.index);
TokenType::Keyword(keyword)
} else {
debug!("Found identifier {:?} at {}:{}[{}]", identifier, self.line_no, self.col_no, self.index);
- TokenType::Identifier(identifier)
+ TokenType::IdentifierToken(identifier)
}
}
n if n.is_numeric() => {
let integer = self.collect_integer()?;
debug!("Collected integer: {} at {}:{}[{}] to {}:{}[{}]", integer, start.1, start.2, start.0, self.line_no, self.col_no, self.index);
- TokenType::IntegerLiteral(integer)
+ TokenType::Literal(LiteralToken::IntegerLiteral(integer))
}
'"' => {
self.advance()?;
let string = self.collect_string()?;
- debug!("Collected string: {} at {}:{}[{}] to {}:{}[{}]", string, start.1, start.2, start.0, self.line_no, self.col_no, self.index);
- TokenType::StringLiteral(string)
+ debug!("Collected string: \"{}\" at {}:{}[{}] to {}:{}[{}]", string, start.1, start.2, start.0, self.line_no, self.col_no, self.index);
+ TokenType::Literal(LiteralToken::StringLiteral(string))
}
'\'' => {
self.advance()?;
let character = self.consume()?;
ensure!(self.consume()? == '\'', LexerError::InvalidCharacterLiteral(self.line_no, self.col_no));
debug!("Collected character literal: {:?} at {}:{}[{}] to {}:{}[{}]", character, start.1, start.2, start.0, self.line_no, self.col_no, self.index);
- TokenType::CharacterLiteral(character)
+ TokenType::Literal(LiteralToken::CharacterLiteral(character))
}
'#' => {
debug!("Found comment at {}:{}[{}]", self.line_no, self.col_no, self.index);
diff --git a/src/lex/mod.rs b/src/lex/mod.rs
index 5a0774d..0707644 100644
--- a/src/lex/mod.rs
+++ b/src/lex/mod.rs
@@ -1,6 +1,6 @@
-mod consts;
-mod keyword;
-mod lexer;
-mod token;
+pub(crate) mod consts;
+pub(crate) mod types;
+pub(crate) mod lexer;
+pub(crate) mod token;
pub use lexer::Lexer;
\ No newline at end of file
diff --git a/src/lex/token.rs b/src/lex/token.rs
index 4635924..feff4a7 100644
--- a/src/lex/token.rs
+++ b/src/lex/token.rs
@@ -1,36 +1,11 @@
-use crate::lex::keyword::Keyword;
+use crate::lex::types::{KeywordToken, LiteralToken, SyntaxToken};
#[derive(Debug, Eq, PartialEq, Clone)]
pub enum TokenType {
- Keyword(Keyword),
- LBrace,
- RBrace,
- LParen,
- RParen,
- Assign,
- Comma,
- Dot,
- Minus,
- Not,
- Plus,
- Times,
- Slash,
- And,
- Or,
- Xor,
- Mod,
- Eq,
- Neq,
- Lt,
- Leq,
- Gt,
- Geq,
- LShift,
- RShift,
- CharacterLiteral(char),
- Identifier(String),
- IntegerLiteral(i64),
- StringLiteral(String),
+ Keyword(KeywordToken),
+ Syntax(SyntaxToken),
+ IdentifierToken(String),
+ Literal(LiteralToken),
Unknown(char),
Eof,
NL
@@ -47,27 +22,9 @@ impl TokenType {
}
pub fn from_char(c: char) -> Self {
- match c {
- '{' => TokenType::LBrace,
- '}' => TokenType::RBrace,
- '(' => TokenType::LParen,
- ')' => TokenType::RParen,
- ':' => TokenType::Assign,
- ',' => TokenType::Comma,
- '.' => TokenType::Dot,
- '-' => TokenType::Minus,
- '!' => TokenType::Not,
- '+' => TokenType::Plus,
- '*' => TokenType::Times,
- '/' => TokenType::Slash,
- '&' => TokenType::And,
- '|' => TokenType::Or,
- '^' => TokenType::Xor,
- '%' => TokenType::Mod,
- '<' => TokenType::Lt,
- '>' => TokenType::Gt,
- _ => TokenType::Unknown(c),
- }
+ SyntaxToken::from_char(c) // look the character up in SyntaxToken's table; yields an Option
+ .map(TokenType::Syntax) // recognised character: wrap the SyntaxToken in TokenType::Syntax
+ .unwrap_or_else(|| TokenType::Unknown(c)) // unrecognised character: keep it as TokenType::Unknown(c)
}
}
@@ -89,19 +46,19 @@ impl Token {
}
pub fn is_identifier(&self) -> bool {
- matches!(self.token_type, TokenType::Identifier(_))
+ matches!(self.token_type, TokenType::IdentifierToken(_)) // true for any IdentifierToken, whatever string it carries
}
pub fn is_integer_literal(&self) -> bool {
- matches!(self.token_type, TokenType::IntegerLiteral(_))
+ matches!(self.token_type, TokenType::Literal(LiteralToken::IntegerLiteral(_))) // true only for a Literal wrapping IntegerLiteral; the value is ignored
}
pub fn is_string_literal(&self) -> bool {
- matches!(self.token_type, TokenType::StringLiteral(_))
+ matches!(self.token_type, TokenType::Literal(LiteralToken::StringLiteral(_))) // true only for a Literal wrapping StringLiteral; the contents are ignored
}
pub fn is_character_literal(&self) -> bool {
- matches!(self.token_type, TokenType::CharacterLiteral(_))
+ matches!(self.token_type, TokenType::Literal(LiteralToken::CharacterLiteral(_))) // true only for a Literal wrapping CharacterLiteral; the char is ignored
}
pub fn index(&self) -> usize {
diff --git a/src/lex/types.rs b/src/lex/types.rs
new file mode 100644
index 0000000..f9f322f
--- /dev/null
+++ b/src/lex/types.rs
@@ -0,0 +1,87 @@
+#[derive(Debug, Eq, PartialEq, Clone)]
+pub enum KeywordToken { // keywords the lexer tells apart from plain identifiers; spellings live in from_str
+ Var, // spelled "var"
+ Fun, // spelled "fun"
+ If, // spelled "if"
+ Else, // spelled "else"
+ Until, // spelled "until"
+ Loop, // spelled "loop"
+ Return, // spelled "return"
+}
+
+impl KeywordToken { // NOTE(review): an inherent `from_str` shadows the std::str::FromStr naming convention (clippy::should_implement_trait) - consider renaming or implementing the trait
+ pub fn from_str(s: &str) -> Option<Self> { // exact, case-sensitive lookup of `s` among the keyword spellings
+ match s {
+ "var" => Some(KeywordToken::Var),
+ "fun" => Some(KeywordToken::Fun),
+ "if" => Some(KeywordToken::If),
+ "else" => Some(KeywordToken::Else),
+ "until" => Some(KeywordToken::Until),
+ "loop" => Some(KeywordToken::Loop),
+ "return" => Some(KeywordToken::Return),
+ _ => None, // not a keyword; the lexer then emits the text as an identifier token
+ }
+ }
+}
+
+#[derive(Debug, Eq, PartialEq, Clone)]
+pub enum SyntaxToken { // punctuation and operator tokens; single-character spellings are listed in from_char
+ LBrace, // '{'
+ RBrace, // '}'
+ LParen, // '('
+ RParen, // ')'
+ Assign, // ':' (assignment uses a colon, not '=')
+ Comma, // ','
+ Dot, // '.'
+ Minus, // '-'
+ Not, // '!'
+ Plus, // '+'
+ Times, // '*'
+ Slash, // '/'
+ And, // '&'
+ Or, // '|'
+ Xor, // '^'
+ Mod, // '%'
+ Eq, // '='
+ Neq, // no arm in from_char; presumably a multi-character operator built elsewhere - TODO confirm
+ Lt, // '<'
+ Leq, // no arm in from_char; presumably a multi-character operator built elsewhere - TODO confirm
+ Gt, // '>'
+ Geq, // no arm in from_char; presumably a multi-character operator built elsewhere - TODO confirm
+ LShift, // no arm in from_char; presumably a multi-character operator built elsewhere - TODO confirm
+ RShift, // no arm in from_char; presumably a multi-character operator built elsewhere - TODO confirm
+}
+
+impl SyntaxToken {
+ pub fn from_char(c: char) -> Option<Self> { // map one character to its single-character syntax token, or None
+ match c {
+ '{' => Some(SyntaxToken::LBrace),
+ '}' => Some(SyntaxToken::RBrace),
+ '(' => Some(SyntaxToken::LParen),
+ ')' => Some(SyntaxToken::RParen),
+ ':' => Some(SyntaxToken::Assign), // ':' is the assignment token here; '=' maps to Eq below
+ ',' => Some(SyntaxToken::Comma),
+ '.' => Some(SyntaxToken::Dot),
+ '-' => Some(SyntaxToken::Minus),
+ '!' => Some(SyntaxToken::Not),
+ '+' => Some(SyntaxToken::Plus),
+ '*' => Some(SyntaxToken::Times),
+ '/' => Some(SyntaxToken::Slash),
+ '&' => Some(SyntaxToken::And),
+ '|' => Some(SyntaxToken::Or),
+ '^' => Some(SyntaxToken::Xor),
+ '%' => Some(SyntaxToken::Mod),
+ '=' => Some(SyntaxToken::Eq), // NOTE(review): this arm has no counterpart in the removed TokenType::from_char table, where '=' fell through to Unknown
+ '<' => Some(SyntaxToken::Lt),
+ '>' => Some(SyntaxToken::Gt),
+ _ => None, // unrecognised character; TokenType::from_char converts this None into Unknown(c)
+ }
+ }
+}
+
+#[derive(Debug, Eq, PartialEq, Clone)]
+pub enum LiteralToken { // literal payloads carried by TokenType::Literal
+ CharacterLiteral(char), // the single char the lexer consumes between two ' delimiters
+ IntegerLiteral(i64), // the value returned by the lexer's collect_integer
+ StringLiteral(String), // the text returned by the lexer's collect_string after the opening "
+}
\ No newline at end of file
--
GitLab