From e017c010e4a8c7751d4653fd72143f9c60e12eea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=BE=AA?= Date: Wed, 22 Jan 2025 12:50:04 +1100 Subject: [PATCH] more work on readerv2 --- .../chester/reader/FilePathImplJVM.scala | 2 +- .../chester/integrity/IntegrityCheck.scala | 2 +- .../scala/chester/reader/parseAndCheck.scala | 4 +- .../chester/lsp/ChesterLanguageServer.scala | 4 +- .../main/scala/chester/reader/Parser.scala | 2 +- .../scala/chester/reader/ReaderREPL.scala | 2 +- .../main/scala/chester/readerv2/Lexer.scala | 41 ++ .../main/scala/chester/readerv2/Token.scala | 84 ++--- .../scala/chester/readerv2/Tokenizer.scala | 353 ++++++++---------- .../main/scala/chester/error/SourcePos.scala | 2 +- .../scala/chester/reader/SourceOffset.scala | 4 +- .../chester/syntax/IdentifierRules.scala | 2 +- 12 files changed, 237 insertions(+), 265 deletions(-) create mode 100644 reader/src/main/scala/chester/readerv2/Lexer.scala diff --git a/compatibility/jvm-native/src/main/scala/chester/reader/FilePathImplJVM.scala b/compatibility/jvm-native/src/main/scala/chester/reader/FilePathImplJVM.scala index 97ce48bf..d08e21a3 100644 --- a/compatibility/jvm-native/src/main/scala/chester/reader/FilePathImplJVM.scala +++ b/compatibility/jvm-native/src/main/scala/chester/reader/FilePathImplJVM.scala @@ -13,7 +13,7 @@ object FilePathImplJVM extends FilePathImpl { Right(content) case Failure(exception) => Left( - ParseError(s"Failed to read file: ${exception.getMessage}", Pos.Zero) + ParseError(s"Failed to read file: ${exception.getMessage}", Pos.zero) ) } } diff --git a/core/src/main/scala/chester/integrity/IntegrityCheck.scala b/core/src/main/scala/chester/integrity/IntegrityCheck.scala index a1b0821b..6b6ae1b7 100644 --- a/core/src/main/scala/chester/integrity/IntegrityCheck.scala +++ b/core/src/main/scala/chester/integrity/IntegrityCheck.scala @@ -42,7 +42,7 @@ object IntegrityCheck { .fold( error => fail( - s"Parsing failed for input: $input ${error.message} at index ${error.index}" + s"Parsing failed for input: $input ${error.message} at index ${error.pos}" ), value => assertEquals(value, expected, s"Failed for input: $input") ) diff --git a/core/src/test/scala/chester/reader/parseAndCheck.scala b/core/src/test/scala/chester/reader/parseAndCheck.scala index 95f29e08..d6d0a130 100644 --- a/core/src/test/scala/chester/reader/parseAndCheck.scala +++ b/core/src/test/scala/chester/reader/parseAndCheck.scala @@ -16,7 +16,7 @@ def parseAndCheck(input: String, expected: Expr): Unit = { .fold( error => fail( - s"Parsing failed for input: $input ${error.message} at index ${error.index}" + s"Parsing failed for input: $input ${error.message} at index ${error.pos}" ), { value => assertEquals(read[Expr](write[Expr](value)), value) @@ -46,7 +46,7 @@ def getParsed(input: String): Expr = { .fold( error => fail( - s"Parsing failed for input: $input ${error.message} at index ${error.index}" + s"Parsing failed for input: $input ${error.message} at index ${error.pos}" ), value => value ) diff --git a/lsp/src/main/scala/chester/lsp/ChesterLanguageServer.scala b/lsp/src/main/scala/chester/lsp/ChesterLanguageServer.scala index 2f01ff05..ecf3e06e 100644 --- a/lsp/src/main/scala/chester/lsp/ChesterLanguageServer.scala +++ b/lsp/src/main/scala/chester/lsp/ChesterLanguageServer.scala @@ -182,8 +182,8 @@ class ChesterLanguageServer extends LanguageServer with TextDocumentService with parseResult.fold( { parseError => val range = new Range( - new Position(parseError.index.line, parseError.index.column.utf16), - new Position(parseError.index.line, parseError.index.column.utf16) + new Position(parseError.pos.line, parseError.pos.column.utf16), + new Position(parseError.pos.line, parseError.pos.column.utf16) ) val diagnostic = new Diagnostic( range, diff --git a/reader/src/main/scala/chester/reader/Parser.scala b/reader/src/main/scala/chester/reader/Parser.scala index c85a495f..5fed3ff9 100644 --- a/reader/src/main/scala/chester/reader/Parser.scala +++ b/reader/src/main/scala/chester/reader/Parser.scala @@ -65,7 +65,7 @@ case class ReaderInternal( def simpleId: P[String] = P( (CharacterPred(isIdentifierFirst).rep(1) ~ CharacterPred( - isIdentifierMiddle + isIdentifierPart ).rep.? ~ CharacterPred(isIdentifierEnd).?).! ) diff --git a/reader/src/main/scala/chester/reader/ReaderREPL.scala b/reader/src/main/scala/chester/reader/ReaderREPL.scala index a1d7f07f..743489a2 100644 --- a/reader/src/main/scala/chester/reader/ReaderREPL.scala +++ b/reader/src/main/scala/chester/reader/ReaderREPL.scala @@ -79,7 +79,7 @@ object ReaderREPL { )(using p).exprEntrance ) match { case Parsed.Success(expr, _) => Right(expr) - case f: Parsed.Failure => Left(ParseError(f.msg, Pos.Zero)) + case f: Parsed.Failure => Left(ParseError(f.msg, Pos.zero)) } } } diff --git a/reader/src/main/scala/chester/readerv2/Lexer.scala b/reader/src/main/scala/chester/readerv2/Lexer.scala new file mode 100644 index 00000000..36d7cf42 --- /dev/null +++ b/reader/src/main/scala/chester/readerv2/Lexer.scala @@ -0,0 +1,41 @@ +package chester.readerv2 + +import chester.error.Pos +import chester.reader.ParseError +import Token._ + +case class LexerState( + tokens: TokenStream, + current: Token, + errors: Vector[ParseError] = Vector.empty +) + +class Lexer(tokens: TokenStream) { + def initialize: LexerState = { + tokens.headOption match { + case Some(Right(token)) => LexerState(tokens.tail, token) + case Some(Left(error)) => LexerState(tokens.tail, EOF(error.pos), Vector(error)) + case None => LexerState(LazyList.empty, EOF(Pos.zero)) + } + } + + def advance(state: LexerState): LexerState = { + state.tokens.headOption match { + case Some(Right(token)) => + state.copy(tokens = state.tokens.tail, current = token) + case Some(Left(error)) => + state.copy( + tokens = state.tokens.tail, + errors = state.errors :+ error + ) + case None => state + } + } + + def skipWhitespaceAndComments(state: LexerState): LexerState = { + state.current match { + case _: Whitespace | _: Comment => skipWhitespaceAndComments(advance(state)) + case _ => state + } + } +} \ No newline at end of file diff --git a/reader/src/main/scala/chester/readerv2/Token.scala b/reader/src/main/scala/chester/readerv2/Token.scala index ef83ae7b..2e150c24 100644 --- a/reader/src/main/scala/chester/readerv2/Token.scala +++ b/reader/src/main/scala/chester/readerv2/Token.scala @@ -1,88 +1,76 @@ package chester.readerv2 import chester.error.Pos +import chester.reader.ParseError sealed trait Token { def pos: Pos def text: String } -sealed trait TokenKind { - def text: String -} - -sealed trait Literal extends TokenKind -sealed trait Delimiter extends TokenKind -sealed trait Operator extends TokenKind - -object TokenKind { - sealed trait Name { - def parts: Vector[NamePart] - } - +object Token { sealed trait NamePart case class IdentifierPart(value: Vector[Char]) extends NamePart case class OperatorPart(value: Vector[Char]) extends NamePart - case class Identifier(parts: Vector[NamePart]) extends TokenKind with Name { + sealed trait StringSegment + case class StringChars(chars: Vector[Char]) extends StringSegment + case class StringEscape(char: Char) extends StringSegment + case class StringInterpolation(expr: Vector[Token]) extends StringSegment + + case class Identifier(parts: Vector[NamePart], pos: Pos) extends Token { def text: String = parts.map { case IdentifierPart(chars) => chars.mkString case OperatorPart(chars) => chars.mkString }.mkString } - case class IntegerLiteral(value: BigInt, radix: Int) extends TokenKind with Literal { + case class IntegerLiteral(value: BigInt, radix: Int, pos: Pos) extends Token { def text: String = if (radix == 10) value.toString else s"0x${value.toString(16)}" } - case class RationalLiteral(value: BigDecimal) extends TokenKind with Literal { + case class RationalLiteral(value: BigDecimal, pos: Pos) extends Token { def text: String = value.toString } - case class StringLiteral(segments: Vector[StringSegment]) extends TokenKind with Literal { - def text: String = s"\"${segments.map(_.text).mkString}\"" + case class StringLiteral(segments: Vector[StringSegment], pos: Pos) extends Token { + def text: String = { + val sb = new StringBuilder("\"") + segments.foreach { + case StringChars(chars) => sb.append(chars.mkString) + case StringEscape(c) => sb.append('\\').append(c) + case StringInterpolation(expr) => sb.append("${").append(expr.map(_.text).mkString).append("}") + } + sb.append("\"").toString + } } - sealed trait StringSegment { - def text: String - } - case class StringChars(chars: Vector[Char]) extends StringSegment { - def text: String = chars.mkString - } - case class StringEscape(char: Char) extends StringSegment { - def text: String = s"\\$char" - } - - case class SymbolLiteral(segments: Vector[StringSegment]) extends TokenKind with Literal { - def text: String = s"'${segments.map(_.text).mkString}" + case class SymbolLiteral(name: String, pos: Pos) extends Token { + def text: String = s"'$name" } // Delimiters - case object LParen extends TokenKind with Delimiter { def text = "(" } - case object RParen extends TokenKind with Delimiter { def text = ")" } - case object LBrace extends TokenKind with Delimiter { def text = "{" } - case object RBrace extends TokenKind with Delimiter { def text = "}" } - case object LBracket extends TokenKind with Delimiter { def text = "[" } - case object RBracket extends TokenKind with Delimiter { def text = "]" } - - // Operators - case object Comma extends TokenKind with Operator { def text = "," } - case object Dot extends TokenKind with Operator { def text = "." } - case object Equal extends TokenKind with Operator { def text = "=" } - case object Arrow extends TokenKind with Operator { def text = "->" } + case class LParen(pos: Pos) extends Token { def text = "(" } + case class RParen(pos: Pos) extends Token { def text = ")" } + case class LBrace(pos: Pos) extends Token { def text = "{" } + case class RBrace(pos: Pos) extends Token { def text = "}" } + case class LBracket(pos: Pos) extends Token { def text = "[" } + case class RBracket(pos: Pos) extends Token { def text = "]" } + case class Comma(pos: Pos) extends Token { def text = "," } + case class Dot(pos: Pos) extends Token { def text = "." } + case class Equal(pos: Pos) extends Token { def text = "=" } + case class Arrow(pos: Pos) extends Token { def text = "->" } // Comments and Whitespace - case class Comment(content: Vector[Char]) extends TokenKind { + case class Comment(content: Vector[Char], pos: Pos) extends Token { def text: String = s"//${content.mkString}" } - case class Whitespace(chars: Vector[Char]) extends TokenKind { + case class Whitespace(chars: Vector[Char], pos: Pos) extends Token { def text: String = chars.mkString } - case object EOF extends TokenKind { def text = "" } + case class EOF(pos: Pos) extends Token { def text = "" } } -case class TokenWithPos(kind: TokenKind, pos: Pos) extends Token { - def text: String = kind.text -} \ No newline at end of file +type TokenStream = LazyList[Either[ParseError, Token]] \ No newline at end of file diff --git a/reader/src/main/scala/chester/readerv2/Tokenizer.scala b/reader/src/main/scala/chester/readerv2/Tokenizer.scala index aac775f2..5c610e44 100644 --- a/reader/src/main/scala/chester/readerv2/Tokenizer.scala +++ b/reader/src/main/scala/chester/readerv2/Tokenizer.scala @@ -5,7 +5,8 @@ import chester.utils.WithUTF16 import chester.syntax.IdentifierRules._ import chester.reader.{ParseError, SourceOffset} import _root_.io.github.iltotore.iron._ -import _root_.io.github.iltotore.iron.constraint.all._ +import _root_.io.github.iltotore.iron.constraint.all.{Positive0 => IPositive0, _} +import Token._ class Tokenizer(sourceOffset: SourceOffset)(using reporter: Reporter[ParseError]) { private val content = sourceOffset.readContent match { @@ -15,265 +16,207 @@ class Tokenizer(sourceOffset: SourceOffset)(using reporter: Reporter[ParseError] "" } - private var index = 0 - private var line = sourceOffset.linesOffset - private var column = sourceOffset.posOffset - private var utf16Column = sourceOffset.posOffset.utf16 - private var hasError = false - - private def currentPos: Pos = Pos( - sourceOffset.posOffset + WithUTF16(index.refineUnsafe, utf16Column.refineUnsafe), - line, - column + private case class TokenizerState( + index: Int, + line: Int :| IPositive0, + column: WithUTF16, + utf16Column: Int :| IPositive0 ) - private def reportError(message: String): Unit = { - hasError = true - reporter(ParseError(message, currentPos)) - } + private def currentPos(state: TokenizerState): Pos = Pos( + sourceOffset.posOffset + WithUTF16(state.index.refineUnsafe, state.utf16Column), + state.line, + state.column + ) - private def currentChar: Option[Char] = - if (index >= content.length) None else Some(content(index)) - - private def peek: Option[Char] = - if (index + 1 >= content.length) None else Some(content(index + 1)) + private def peek(state: TokenizerState): Option[Char] = + if (state.index + 1 >= content.length) None else Some(content(state.index + 1)) - private def advance(): Unit = { - if (index < content.length) { - val c = content(index) + private def advance(state: TokenizerState): TokenizerState = { + if (state.index < content.length) { + val c = content(state.index) if (c == '\n') { - line = (line + 1).refineUnsafe[Positive0] - column = WithUTF16.Zero - utf16Column = 0 + state.copy( + index = state.index + 1, + line = (state.line + 1).refineUnsafe[IPositive0], + column = WithUTF16.Zero, + utf16Column = 0.refineUnsafe[IPositive0] + ) } else { val charWidth = if (c.isHighSurrogate) 2 else 1 - column = WithUTF16( - (column.i + 1).refineUnsafe, - (column.utf16 + charWidth).refineUnsafe + state.copy( + index = state.index + 1, + column = WithUTF16( + (state.column.i + 1).refineUnsafe, + (state.column.utf16 + charWidth).refineUnsafe + ), + utf16Column = (state.utf16Column + charWidth).refineUnsafe ) - utf16Column = (utf16Column+charWidth).refineUnsafe } - index += 1 - } + } else state } - private def singleCharToken(kind: TokenKind): Token = { - val pos = currentPos - advance() - TokenWithPos(kind, pos) + private def singleCharToken(state: TokenizerState, c: Char): (TokenizerState, Either[ParseError, Token]) = { + val pos = currentPos(state) + val nextState = advance(state) + val token = c match { + case '(' => LParen(pos) + case ')' => RParen(pos) + case '{' => LBrace(pos) + case '}' => RBrace(pos) + case '[' => LBracket(pos) + case ']' => RBracket(pos) + case ',' => Comma(pos) + case '.' => Dot(pos) + case '=' => Equal(pos) + case '-' if peek(state) == Some('>') => + Arrow(pos) + case c => + reporter(ParseError(s"Unexpected character: $c", pos)) + EOF(pos) + } + (nextState, Right(token)) } - private def scanWhitespace(): Token = { - val startPos = currentPos + private def scanWhitespace(state: TokenizerState): (TokenizerState, Either[ParseError, Token]) = { + val startPos = currentPos(state) val chars = new scala.collection.mutable.ArrayBuffer[Char]() + var current = state - while (currentChar.exists(_.isWhitespace)) { - chars += currentChar.get - advance() + while (current.index < content.length && content(current.index).isWhitespace) { + chars += content(current.index) + current = advance(current) } - TokenWithPos(TokenKind.Whitespace(chars.toVector), startPos) + (current, Right(Token.Whitespace(chars.toVector, startPos))) } - private def scanComment(): Token = { - val startPos = currentPos + private def scanComment(state: TokenizerState): (TokenizerState, Either[ParseError, Token]) = { + val startPos = currentPos(state) val chars = new scala.collection.mutable.ArrayBuffer[Char]() + var current = advance(advance(state)) // Skip // - // Skip the two forward slashes - advance(); advance() - - while (currentChar.exists(_ != '\n')) { - chars += currentChar.get - advance() + while (current.index < content.length && content(current.index) != '\n') { + chars += content(current.index) + current = advance(current) } - TokenWithPos(TokenKind.Comment(chars.toVector), startPos) + (current, Right(Comment(chars.toVector, startPos))) } - private def scanIdentifier(): Token = { - val startPos = currentPos - val parts = new scala.collection.mutable.ArrayBuffer[TokenKind.NamePart]() - var currentPart = new scala.collection.mutable.ArrayBuffer[Char]() + private def scanIdentifier(state: TokenizerState): (TokenizerState, Either[ParseError, Token]) = { + val startPos = currentPos(state) + val chars = new scala.collection.mutable.ArrayBuffer[Char]() + var current = state - def flushPart(isOperator: Boolean): Unit = { - if (currentPart.nonEmpty) { - parts += (if (isOperator) TokenKind.OperatorPart(currentPart.toVector) - else TokenKind.IdentifierPart(currentPart.toVector)) - currentPart.clear() - } + while (current.index < content.length && isIdentifierPart(content(current.index))) { + chars += content(current.index) + current = advance(current) } - while (currentChar.exists(c => isIdentifierMiddle(c) || isOperatorSymbol(c))) { - val c = currentChar.get - if (isOperatorSymbol(c)) { - flushPart(false) - currentPart += c - flushPart(true) - } else { - currentPart += c - } - advance() - } - flushPart(false) - - TokenWithPos(TokenKind.Identifier(parts.toVector), startPos) + (current, Right(Identifier(Vector(IdentifierPart(chars.toVector)), startPos))) } - private def scanNumber(): Token = { - val startPos = currentPos - val numBuilder = new StringBuilder() + private def isHexDigit(c: Char): Boolean = + c.isDigit || ('a' to 'f').contains(c.toLower) + + private def scanNumber(state: TokenizerState): (TokenizerState, Either[ParseError, Token]) = { + val startPos = currentPos(state) + val chars = new scala.collection.mutable.ArrayBuffer[Char]() + var current = state var isHex = false - var isBinary = false - // Check for hex/binary prefix - if (currentChar == Some('0') && peek.exists(p => p == 'x' || p == 'b')) { - numBuilder += '0' - advance() - val prefix = currentChar.get - numBuilder += prefix - isHex = prefix == 'x' - isBinary = prefix == 'b' - advance() + if (content(current.index) == '0' && current.index + 1 < content.length && content(current.index + 1).toLower == 'x') { + isHex = true + chars += '0' += 'x' + current = advance(advance(current)) } - // Scan digits - while (currentChar.exists(c => - if (isHex) c.isDigit || ('a' to 'f').contains(c.toLower) - else if (isBinary) c == '0' || c == '1' - else c.isDigit || c == '.' || c == 'e' || c == 'E' || c == '-' - )) { - numBuilder += currentChar.get - advance() + while (current.index < content.length && + (content(current.index).isDigit || + (isHex && isHexDigit(content(current.index))))) { + chars += content(current.index) + current = advance(current) } - val numStr = numBuilder.toString - if (numStr.exists(c => c == '.' || c == 'e' || c == 'E')) { - TokenWithPos(TokenKind.RationalLiteral(BigDecimal(numStr)), startPos) + if (current.index < content.length && content(current.index) == '.' && !isHex) { + chars += '.' + current = advance(current) + while (current.index < content.length && content(current.index).isDigit) { + chars += content(current.index) + current = advance(current) + } + val value = BigDecimal(chars.mkString) + (current, Right(RationalLiteral(value, startPos))) } else { - val radix = if (isHex) 16 else if (isBinary) 2 else 10 - val value = if (isHex || isBinary) - BigInt(numStr.substring(2), radix) + val value = if (isHex) + BigInt(chars.drop(2).mkString, 16) else - BigInt(numStr, radix) - TokenWithPos(TokenKind.IntegerLiteral(value, radix), startPos) + BigInt(chars.mkString) + (current, Right(IntegerLiteral(value, if (isHex) 16 else 10, startPos))) } } - private def scanString(): Token = { - val startPos = currentPos - advance() // skip opening quote - val segments = new scala.collection.mutable.ArrayBuffer[TokenKind.StringSegment]() - var currentChars = new scala.collection.mutable.ArrayBuffer[Char]() + private def scanString(state: TokenizerState): (TokenizerState, Either[ParseError, Token]) = { + val startPos = currentPos(state) + val chars = new scala.collection.mutable.ArrayBuffer[Char]() + var current = advance(state) // Skip opening quote - def flushChars(): Unit = { - if (currentChars.nonEmpty) { - segments += TokenKind.StringChars(currentChars.toVector) - currentChars.clear() + while (current.index < content.length && content(current.index) != '"') { + if (content(current.index) == '\\' && current.index + 1 < content.length) { + current = advance(current) + chars += content(current.index) + } else { + chars += content(current.index) } + current = advance(current) } - while (currentChar.exists(_ != '"')) { - currentChar match { - case Some('\\') => - flushChars() - advance() - currentChar match { - case Some(c @ ('n'|'r'|'t'|'\\'|'"'|'\'')) => - segments += TokenKind.StringEscape(c) - advance() - case Some(c) => - reportError(s"Invalid escape sequence: \\$c") - advance() - case None => - reportError("Unterminated string literal") - return TokenWithPos(TokenKind.StringLiteral(segments.toVector), startPos) - } - case Some(c) => - currentChars += c - advance() - case None => - reportError("Unterminated string literal") - return TokenWithPos(TokenKind.StringLiteral(segments.toVector), startPos) - } + if (current.index >= content.length) { + (current, Left(ParseError("Unterminated string literal", startPos))) + } else { + current = advance(current) // Skip closing quote + (current, Right(StringLiteral(Vector(StringChars(chars.toVector)), startPos))) } - - flushChars() - advance() // skip closing quote - TokenWithPos(TokenKind.StringLiteral(segments.toVector), startPos) } - private def scanSymbol(): Token = { - val startPos = currentPos - advance() // skip opening quote - val segments = new scala.collection.mutable.ArrayBuffer[TokenKind.StringSegment]() - var currentChars = new scala.collection.mutable.ArrayBuffer[Char]() + private def scanSymbol(state: TokenizerState): (TokenizerState, Either[ParseError, Token]) = { + val startPos = currentPos(state) + val chars = new scala.collection.mutable.ArrayBuffer[Char]() + var current = advance(state) // Skip opening quote - def flushChars(): Unit = { - if (currentChars.nonEmpty) { - segments += TokenKind.StringChars(currentChars.toVector) - currentChars.clear() - } + while (current.index < content.length && content(current.index) != '\'' && content(current.index) != '\n') { + chars += content(current.index) + current = advance(current) } - while (currentChar.exists(c => isIdentifierMiddle(c) || isOperatorSymbol(c))) { - currentChar match { - case Some('\\') => - flushChars() - advance() - currentChar match { - case Some(c @ ('n'|'r'|'t'|'\\'|'"'|'\'')) => - segments += TokenKind.StringEscape(c) - advance() - case Some(c) => - reportError(s"Invalid escape sequence: \\$c") - advance() - case None => - reportError("Unterminated symbol literal") - return TokenWithPos(TokenKind.SymbolLiteral(segments.toVector), startPos) - } - case Some(c) => - currentChars += c - advance() - case None => - reportError("Unterminated symbol literal") - return TokenWithPos(TokenKind.SymbolLiteral(segments.toVector), startPos) - } + if (current.index >= content.length || content(current.index) == '\n') { + (current, Left(ParseError("Unterminated symbol literal", startPos))) + } else { + current = advance(current) // Skip closing quote + (current, Right(SymbolLiteral(chars.mkString, startPos))) } - - flushChars() - TokenWithPos(TokenKind.SymbolLiteral(segments.toVector), startPos) } - def nextToken(): Token = { - if (hasError) return TokenWithPos(TokenKind.EOF, currentPos) - - currentChar match { - case None => TokenWithPos(TokenKind.EOF, currentPos) - case Some(c) => c match { - case c if c.isWhitespace => scanWhitespace() - case '/' if peek == Some('/') => scanComment() - case c if isIdentifierFirst(c) => scanIdentifier() - case c if c.isDigit => scanNumber() - case '"' => scanString() - case '\'' => scanSymbol() - case '(' => singleCharToken(TokenKind.LParen) - case ')' => singleCharToken(TokenKind.RParen) - case '{' => singleCharToken(TokenKind.LBrace) - case '}' => singleCharToken(TokenKind.RBrace) - case '[' => singleCharToken(TokenKind.LBracket) - case ']' => singleCharToken(TokenKind.RBracket) - case ',' => singleCharToken(TokenKind.Comma) - case '.' => singleCharToken(TokenKind.Dot) - case '=' => singleCharToken(TokenKind.Equal) - case '-' if peek == Some('>') => - val startPos = currentPos - advance(); advance() - TokenWithPos(TokenKind.Arrow, startPos) - case c => - reportError(s"Unexpected character: $c") - advance() - nextToken() + def tokenize: TokenStream = { + def loop(state: TokenizerState): LazyList[Either[ParseError, Token]] = { + if (state.index >= content.length) { + LazyList(Right(EOF(currentPos(state)))) + } else { + val c = content(state.index) + val (nextState, result) = c match { + case c if c.isWhitespace => scanWhitespace(state) + case '/' if peek(state) == Some('/') => scanComment(state) + case c if isIdentifierFirst(c) => scanIdentifier(state) + case c if c.isDigit => scanNumber(state) + case '"' => scanString(state) + case '\'' => scanSymbol(state) + case c => singleCharToken(state, c) + } + result #:: loop(nextState) } } + + loop(TokenizerState(0, sourceOffset.linesOffset, sourceOffset.posOffset, sourceOffset.posOffset.utf16)) } } \ No newline at end of file diff --git a/syntax/shared/src/main/scala/chester/error/SourcePos.scala b/syntax/shared/src/main/scala/chester/error/SourcePos.scala index 0bd6203a..c061eeec 100644 --- a/syntax/shared/src/main/scala/chester/error/SourcePos.scala +++ b/syntax/shared/src/main/scala/chester/error/SourcePos.scala @@ -14,7 +14,7 @@ import scala.annotation.tailrec case class Pos(index: WithUTF16, line: Int :| Positive0, column: WithUTF16) derives ReadWriter object Pos { - val Zero: Pos = Pos(WithUTF16.Zero, 0, WithUTF16.Zero) + val zero: Pos = Pos(WithUTF16.Zero, 0, WithUTF16.Zero) } /** start <= i < end */ diff --git a/syntax/shared/src/main/scala/chester/reader/SourceOffset.scala b/syntax/shared/src/main/scala/chester/reader/SourceOffset.scala index 11580d81..3e270f0d 100644 --- a/syntax/shared/src/main/scala/chester/reader/SourceOffset.scala +++ b/syntax/shared/src/main/scala/chester/reader/SourceOffset.scala @@ -8,7 +8,7 @@ import _root_.io.github.iltotore.iron.constraint.all.* import _root_.io.github.iltotore.iron.upickle.given import chester.utils.doc.{Doc, PrettierOptions} -case class ParseError(message: String, index: Pos) extends Problem { +case class ParseError(message: String, pos: Pos) extends Problem { override def severity: Problem.Severity = Problem.Severity.Error override def stage: Problem.Stage = Problem.Stage.PARSE @@ -44,7 +44,7 @@ object FilePath { case class FilePath private (fileName: String) extends ParserSource { private[chester] var impl: FilePathImpl = null override lazy val readContent: Either[ParseError, String] = { - if (impl == null) Left(ParseError("No FilePathImpl provided", Pos.Zero)) + if (impl == null) Left(ParseError("No FilePathImpl provided", Pos.zero)) else impl.readContent(fileName) } } diff --git a/syntax/shared/src/main/scala/chester/syntax/IdentifierRules.scala b/syntax/shared/src/main/scala/chester/syntax/IdentifierRules.scala index 1a0deac9..c88b7748 100644 --- a/syntax/shared/src/main/scala/chester/syntax/IdentifierRules.scala +++ b/syntax/shared/src/main/scala/chester/syntax/IdentifierRules.scala @@ -26,7 +26,7 @@ object IdentifierRules { def isIdentifierFirst(x: Character): Boolean = isWording(x) || isWordingSymbol(x) - def isIdentifierMiddle(x: Character): Boolean = + def isIdentifierPart(x: Character): Boolean = isIdentifierFirst(x) || isDigit(x) || isMiddleWordingSymbol(x) def isIdentifierEnd(x: Character): Boolean = isIdentifierFirst(x) || isDigit(x)