From 24a472be22a820076f10ec4319af937babde2e9f Mon Sep 17 00:00:00 2001
From: MarvelousAnything <marvelousanything@gmail.com>
Date: Sat, 12 Nov 2022 23:28:33 -0500
Subject: [PATCH] Separate token types into keyword, syntax, and literal enums
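
Replace the standalone `Keyword` enum in src/lex/keyword.rs with a new
src/lex/types.rs module that groups token variants into three enums:
`KeywordToken`, `SyntaxToken`, and `LiteralToken`. `TokenType` now wraps
these instead of listing every punctuation and literal variant directly,
and `TokenType::from_char` delegates to `SyntaxToken::from_char`. The
lexer pushes the `Eof` token once after the main loop rather than inside
`get_next_token`, and `TokenStream::tokens` is now public.

A minimal sketch (illustrative only, not part of this patch; the
`describe` helper is hypothetical) of how downstream code might match on
the new nested variants:

    use crate::lex::token::TokenType;
    use crate::lex::types::{KeywordToken, LiteralToken, SyntaxToken};

    // Hypothetical helper, shown only to illustrate the nested token enums.
    fn describe(tt: &TokenType) -> String {
        match tt {
            TokenType::Keyword(KeywordToken::Return) => "keyword `return`".to_string(),
            TokenType::Syntax(SyntaxToken::Plus) => "operator `+`".to_string(),
            TokenType::Literal(LiteralToken::IntegerLiteral(n)) => format!("integer literal {n}"),
            TokenType::IdentifierToken(name) => format!("identifier `{name}`"),
            _ => "other token".to_string(),
        }
    }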

---
 src/lex/keyword.rs | 25 -------------
 src/lex/lexer.rs   | 25 ++++++-------
 src/lex/mod.rs     |  8 ++---
 src/lex/token.rs   | 67 +++++++----------------------------
 src/lex/types.rs   | 87 ++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 116 insertions(+), 96 deletions(-)
 delete mode 100644 src/lex/keyword.rs
 create mode 100644 src/lex/types.rs

diff --git a/src/lex/keyword.rs b/src/lex/keyword.rs
deleted file mode 100644
index 92214e6..0000000
--- a/src/lex/keyword.rs
+++ /dev/null
@@ -1,25 +0,0 @@
-#[derive(Debug, Eq, PartialEq, Clone)]
-pub enum Keyword {
-    Var,
-    Fun,
-    If,
-    Else,
-    Until,
-    Loop,
-    Return,
-}
-
-impl Keyword {
-    pub fn from_str(s: &str) -> Option<Self> {
-        match s {
-            "var" => Some(Keyword::Var),
-            "fun" => Some(Keyword::Fun),
-            "if" => Some(Keyword::If),
-            "else" => Some(Keyword::Else),
-            "until" => Some(Keyword::Until),
-            "loop" => Some(Keyword::Loop),
-            "return" => Some(Keyword::Return),
-            _ => None,
-        }
-    }
-}
diff --git a/src/lex/lexer.rs b/src/lex/lexer.rs
index 83e0243..22cd3d0 100644
--- a/src/lex/lexer.rs
+++ b/src/lex/lexer.rs
@@ -3,7 +3,7 @@ use crate::lex::token::{Token, TokenType};
 use anyhow::{bail, ensure, Result};
 use log::{debug};
 use thiserror::Error;
-use crate::lex::keyword::Keyword;
+use crate::lex::types::{KeywordToken, LiteralToken};
 
 #[derive(Debug, Clone)]
 pub struct Lexer {
@@ -39,7 +39,7 @@ pub enum LexerError {
 
 #[derive(Debug)]
 pub struct TokenStream {
-    tokens: Vec<Token>,
+    pub tokens: Vec<Token>,
 }
 
 impl Display for TokenStream {
@@ -110,6 +110,10 @@ impl Lexer {
         while self.has_next() {
             tokens.push(self.get_next_token()?);
         }
+        if !self.has_next() {
+            debug!("No more tokens to lex");
+            tokens.push(TokenType::Eof.at(self.index, self.line_no, self.col_no));
+        }
 
         info!("Lexed {} tokens", tokens.len());
 
@@ -257,9 +261,6 @@ impl Lexer {
     }
 
     fn get_next_token(&mut self) -> Result<Token> {
-        if !self.has_next() {
-            return Ok(TokenType::Eof.at(self.index, self.line_no, self.col_no));
-        }
         let start = (self.index, self.line_no, self.col_no);
         let token = match self.curr_char {
             n if self.is_whitespace() => {
@@ -271,7 +272,7 @@ impl Lexer {
             '\n' => {
                 trace!("Found newline at {}:{}[{}]", self.line_no, self.col_no, self.index);
                 // fold newlines into a single token
-                self.col_no = 1;
+                self.col_no = 0;
                 self.line_no += 1;
                 self.advance()?;
                 TokenType::NL
@@ -280,31 +281,31 @@ impl Lexer {
             // TODO: Look into this.
             n if n.is_alphabetic() => {
                 let identifier = self.collect_identifier()?;
-                if let Some(keyword) = Keyword::from_str(&identifier) {
+                if let Some(keyword) = KeywordToken::from_str(&identifier) {
                     debug!("Found keyword {:?} at {}:{}[{}]", keyword, self.line_no, self.col_no, self.index);
                     TokenType::Keyword(keyword)
                 } else {
                     debug!("Found identifier {:?} at {}:{}[{}]", identifier, self.line_no, self.col_no, self.index);
-                    TokenType::Identifier(identifier)
+                    TokenType::IdentifierToken(identifier)
                 }
             }
             n if n.is_numeric() => {
                 let integer = self.collect_integer()?;
                 debug!("Collected integer: {} at {}:{}[{}] to {}:{}[{}]", integer, start.1, start.2, start.0, self.line_no, self.col_no, self.index);
-                TokenType::IntegerLiteral(integer)
+                TokenType::Literal(LiteralToken::IntegerLiteral(integer))
             }
             '"' => {
                 self.advance()?;
                 let string = self.collect_string()?;
-                debug!("Collected string: {} at {}:{}[{}] to {}:{}[{}]", string, start.1, start.2, start.0, self.line_no, self.col_no, self.index);
-                TokenType::StringLiteral(string)
+                debug!("Collected string: \"{}\" at {}:{}[{}] to {}:{}[{}]", string, start.1, start.2, start.0, self.line_no, self.col_no, self.index);
+                TokenType::Literal(LiteralToken::StringLiteral(string))
             }
             '\'' => {
                 self.advance()?;
                 let character = self.consume()?;
                 ensure!(self.consume()? == '\'', LexerError::InvalidCharacterLiteral(self.line_no, self.col_no));
                 debug!("Collected character literal: {:?} at {}:{}[{}] to {}:{}[{}]", character, start.1, start.2, start.0, self.line_no, self.col_no, self.index);
-                TokenType::CharacterLiteral(character)
+                TokenType::Literal(LiteralToken::CharacterLiteral(character))
             }
             '#' => {
                 debug!("Found comment at {}:{}[{}]", self.line_no, self.col_no, self.index);
diff --git a/src/lex/mod.rs b/src/lex/mod.rs
index 5a0774d..0707644 100644
--- a/src/lex/mod.rs
+++ b/src/lex/mod.rs
@@ -1,6 +1,6 @@
-mod consts;
-mod keyword;
-mod lexer;
-mod token;
+pub(crate) mod consts;
+pub(crate) mod types;
+pub(crate) mod lexer;
+pub(crate) mod token;
 
 pub use lexer::Lexer;
\ No newline at end of file
diff --git a/src/lex/token.rs b/src/lex/token.rs
index 4635924..feff4a7 100644
--- a/src/lex/token.rs
+++ b/src/lex/token.rs
@@ -1,36 +1,11 @@
-use crate::lex::keyword::Keyword;
+use crate::lex::types::{KeywordToken, LiteralToken, SyntaxToken};
 
 #[derive(Debug, Eq, PartialEq, Clone)]
 pub enum TokenType {
-    Keyword(Keyword),
-    LBrace,
-    RBrace,
-    LParen,
-    RParen,
-    Assign,
-    Comma,
-    Dot,
-    Minus,
-    Not,
-    Plus,
-    Times,
-    Slash,
-    And,
-    Or,
-    Xor,
-    Mod,
-    Eq,
-    Neq,
-    Lt,
-    Leq,
-    Gt,
-    Geq,
-    LShift,
-    RShift,
-    CharacterLiteral(char),
-    Identifier(String),
-    IntegerLiteral(i64),
-    StringLiteral(String),
+    Keyword(KeywordToken),
+    Syntax(SyntaxToken),
+    IdentifierToken(String),
+    Literal(LiteralToken),
     Unknown(char),
     Eof,
     NL
@@ -47,27 +22,9 @@ impl TokenType {
     }
 
     pub fn from_char(c: char) -> Self {
-        match c {
-            '{' => TokenType::LBrace,
-            '}' => TokenType::RBrace,
-            '(' => TokenType::LParen,
-            ')' => TokenType::RParen,
-            ':' => TokenType::Assign,
-            ',' => TokenType::Comma,
-            '.' => TokenType::Dot,
-            '-' => TokenType::Minus,
-            '!' => TokenType::Not,
-            '+' => TokenType::Plus,
-            '*' => TokenType::Times,
-            '/' => TokenType::Slash,
-            '&' => TokenType::And,
-            '|' => TokenType::Or,
-            '^' => TokenType::Xor,
-            '%' => TokenType::Mod,
-            '<' => TokenType::Lt,
-            '>' => TokenType::Gt,
-            _ => TokenType::Unknown(c),
-        }
+        SyntaxToken::from_char(c)
+            .map(TokenType::Syntax)
+            .unwrap_or_else(|| TokenType::Unknown(c))
     }
 }
 
@@ -89,19 +46,19 @@ impl Token {
     }
 
     pub fn is_identifier(&self) -> bool {
-        matches!(self.token_type, TokenType::Identifier(_))
+        matches!(self.token_type, TokenType::IdentifierToken(_))
     }
 
     pub fn is_integer_literal(&self) -> bool {
-        matches!(self.token_type, TokenType::IntegerLiteral(_))
+        matches!(self.token_type, TokenType::Literal(LiteralToken::IntegerLiteral(_)))
     }
 
     pub fn is_string_literal(&self) -> bool {
-        matches!(self.token_type, TokenType::StringLiteral(_))
+        matches!(self.token_type, TokenType::Literal(LiteralToken::StringLiteral(_)))
     }
 
     pub fn is_character_literal(&self) -> bool {
-        matches!(self.token_type, TokenType::CharacterLiteral(_))
+        matches!(self.token_type, TokenType::Literal(LiteralToken::CharacterLiteral(_)))
     }
 
     pub fn index(&self) -> usize {
diff --git a/src/lex/types.rs b/src/lex/types.rs
new file mode 100644
index 0000000..f9f322f
--- /dev/null
+++ b/src/lex/types.rs
@@ -0,0 +1,87 @@
+#[derive(Debug, Eq, PartialEq, Clone)]
+pub enum KeywordToken {
+    Var,
+    Fun,
+    If,
+    Else,
+    Until,
+    Loop,
+    Return,
+}
+
+impl KeywordToken {
+    pub fn from_str(s: &str) -> Option<Self> {
+        match s {
+            "var" => Some(KeywordToken::Var),
+            "fun" => Some(KeywordToken::Fun),
+            "if" => Some(KeywordToken::If),
+            "else" => Some(KeywordToken::Else),
+            "until" => Some(KeywordToken::Until),
+            "loop" => Some(KeywordToken::Loop),
+            "return" => Some(KeywordToken::Return),
+            _ => None,
+        }
+    }
+}
+
+#[derive(Debug, Eq, PartialEq, Clone)]
+pub enum SyntaxToken {
+    LBrace,
+    RBrace,
+    LParen,
+    RParen,
+    Assign,
+    Comma,
+    Dot,
+    Minus,
+    Not,
+    Plus,
+    Times,
+    Slash,
+    And,
+    Or,
+    Xor,
+    Mod,
+    Eq,
+    Neq,
+    Lt,
+    Leq,
+    Gt,
+    Geq,
+    LShift,
+    RShift,
+}
+
+impl SyntaxToken {
+    pub fn from_char(c: char) -> Option<Self> {
+        match c {
+            '{' => Some(SyntaxToken::LBrace),
+            '}' => Some(SyntaxToken::RBrace),
+            '(' => Some(SyntaxToken::LParen),
+            ')' => Some(SyntaxToken::RParen),
+            ':' => Some(SyntaxToken::Assign),
+            ',' => Some(SyntaxToken::Comma),
+            '.' => Some(SyntaxToken::Dot),
+            '-' => Some(SyntaxToken::Minus),
+            '!' => Some(SyntaxToken::Not),
+            '+' => Some(SyntaxToken::Plus),
+            '*' => Some(SyntaxToken::Times),
+            '/' => Some(SyntaxToken::Slash),
+            '&' => Some(SyntaxToken::And),
+            '|' => Some(SyntaxToken::Or),
+            '^' => Some(SyntaxToken::Xor),
+            '%' => Some(SyntaxToken::Mod),
+            '=' => Some(SyntaxToken::Eq),
+            '<' => Some(SyntaxToken::Lt),
+            '>' => Some(SyntaxToken::Gt),
+            _ => None,
+        }
+    }
+}
+
+#[derive(Debug, Eq, PartialEq, Clone)]
+pub enum LiteralToken {
+    CharacterLiteral(char),
+    IntegerLiteral(i64),
+    StringLiteral(String),
+}
\ No newline at end of file
-- 
GitLab