Begin parser rewrite, overhaul language to make it more functional

2024-04-05 06:40:47 +01:00 · 2024-04-05 06:40:47 +01:00 · 9b2c5b4870
parent 6a41028640
commit 9b2c5b4870
3 changed files with 238 additions and 321 deletions
--- a/Script/colour.luau
+++ b/Script/colour.luau
@ -15,35 +15,35 @@ local reset = style "reset"

 local Colour = {}

-function Colour.blue(str: string)
+function Colour.blue(str: string | number)
 	return blue .. str .. reset
 end

-function Colour.green(str: string)
+function Colour.green(str: string | number)
 	return green .. str .. reset
 end

-function Colour.purple(str: string)
+function Colour.purple(str: string | number)
 	return purple .. str .. reset
 end

-function Colour.red(str: string)
+function Colour.red(str: string | number)
 	return red .. str .. reset
 end

-function Colour.yellow(str: string)
+function Colour.yellow(str: string | number)
 	return yellow .. str .. reset
 end

-function Colour.cyan(str: string)
+function Colour.cyan(str: string | number)
 	return cyan .. str .. reset
 end

-function Colour.bold(str: string)
+function Colour.bold(str: string | number)
 	return bold .. str .. reset
 end

-function Colour.dim(str: string)
+function Colour.dim(str: string | number)
 	return dim .. str .. reset
 end

--- a/Script/main.luau
+++ b/Script/main.luau
@ -6,6 +6,7 @@ local colour = require "colour"

 type TokenKind =
 	"INDENT"
+	| "DEDENT"
 	| "SPACE"
 	| "NEWLINE"
 	| "IDENTIFIER"
@ -15,19 +16,23 @@ type TokenKind =
 	| "TEXTOPERATOR"
 	| "EQUALS"
 	| "PLUS"
-	| "PLUSPLUS"
-	| "PLUSEQUALS"
 	| "MINUS"
-	| "MINUSMINUS"
-	| "MINUSEQUALS"
 	| "TIMES"
 	| "DIVIDE"
 	| "MODULO"
+	| "EXPONENT"
+	| "COLON"
+	| "SEMICOLON"
+	| "LPAREN"
+	| "RPAREN"
+	| "LBRACE"
+	| "RBRACE"
+	| "LBRACKET"
+	| "RBRACKET"

 type ExprKind =
 	"block"
 	| "if"
-	| "elseif"
 	| "else"
 	| "binop"
 	| "postfix"
@ -38,7 +43,6 @@ type ExprKind =

 local keywords = {
 	["if"] = true,
-	["elseif"] = true,
 	["else"] = true,
 }

@ -56,19 +60,12 @@ local binaryOperators = {
 	["not"] = true,
 	["="] = true,
 	["+"] = true,
-	["+="] = true,
 	["-"] = true,
-	["-="] = true,
 	["*"] = true,
 	["/"] = true,
 	["%"] = true,
 }

-local postfixOperators = {
-	["++"] = true,
-	["--"] = true,
-}
-
 type Token = {
 	kind: TokenKind,
 	value: string,
@ -93,57 +90,24 @@ local function BlockExpr(startToken: Token, expressions: { Expr }): BlockExpr
 	}
 end

-type ElseExpr = Expr & {
-	block: Expr,
-}
-
-local function ElseExpr(startToken: Token, block: BlockExpr): ElseExpr
-	return {
-		startToken = startToken,
-		kind = "else" :: ExprKind,
-		block = block,
-	}
-end
-
-type ElseIfExpr = Expr & {
-	condition: Expr,
-	block: BlockExpr,
-	next: (ElseIfExpr | ElseExpr)?,
-}
-
-local function ElseIfExpr(
-	startToken: Token,
-	condition: Expr,
-	block: BlockExpr,
-	next: (ElseIfExpr | ElseExpr)?
-): ElseIfExpr
-	return {
-		startToken = startToken,
-		kind = "elseif" :: ExprKind,
-		condition = condition,
-		block = block,
-		next = next,
-	}
-end
-
 type IfExpr = Expr & {
 	condition: Expr,
-	block: BlockExpr,
-	next: (ElseIfExpr | ElseExpr)?,
+	ifBlock: BlockExpr,
+	elseBlock: BlockExpr,
 }

 local function IfExpr(
 	startToken: Token,
 	condition: Expr,
-	block: BlockExpr,
-	next: (ElseIfExpr | ElseExpr)?
+	ifBlock: BlockExpr,
+	elseBlock: BlockExpr
 ): IfExpr
 	return {
 		startToken = startToken,
 		kind = "if" :: ExprKind,
 		condition = condition,
-		block = block,
-		next = next,
+		ifBlock = ifBlock,
+		elseBlock = elseBlock,
 	}
 end

@ -315,9 +279,12 @@ local function generate(program: { Expr }): string
 			block ..= generate { ifexpr.condition }
 			block ..= " then\n"

-			block ..= indent(generate { ifexpr.block }, 1)
+			block ..= indent(generate { ifexpr.ifBlock }, 1)
 			block ..= "\n"

+			block ..= "else\n"
+			block ..= indent(generate { ifexpr.elseBlock }, 1)
+
 			block ..= "end"

 			output ..= indent(block, 1)
@ -335,19 +302,6 @@ local function generate(program: { Expr }): string
 			end
 			output ..= "return "
 			output ..= generate { block.expressions[b + 1] }
-		elseif kind == "postfix" then
-			local postfix = expr :: PostfixOpExpr
-			output ..= generate { postfix.expr }
-
-			local value = postfix.operator.value
-
-			if value == "++" then
-				output ..= " += 1\n"
-			elseif value == "--" then
-				output ..= " -= 1\n"
-			else
-				error(`unknown postfix operator {value}`)
-			end
 		else
 			error(`unknown expr kind {kind}`)
 		end
@ -356,6 +310,45 @@ local function generate(program: { Expr }): string
 	return output
 end

+local printIndent = 0
+
+local function printToken(token: Token)
+	local pos = `{token.line}:{token.column}`
+	while #pos < 5 do
+		pos ..= " "
+	end
+
+	local kind = token.kind
+	while #kind < 13 do
+		kind ..= " "
+	end
+
+	local value = token.value
+	if token.kind == "STRING" then
+		value = colour.green(`"{value}"`)
+	elseif token.kind == "NUMBER" then
+		value = colour.yellow(value)
+	elseif token.kind == "IDENTIFIER" then
+		value = colour.cyan(value)
+	elseif token.kind == "KEYWORD" then
+		value = colour.red(value)
+	elseif token.kind == "INDENT" then
+		value = "{"
+		printIndent += 1
+	elseif token.kind == "DEDENT" then
+		value = "}"
+		printIndent -= 1
+	elseif token.kind == "NEWLINE" or token.kind == "SPACE" then
+		value = ""
+	end
+
+	for _ = 1, printIndent - if token.kind == "INDENT" then 1 else 0 do
+		value = "  " .. value
+	end
+
+	print(pos, colour.blue(kind), colour.bold(value))
+end
+
 local function parse(tokens: { Token }): { Expr }
 	local program: { Expr } = {}

@ -363,228 +356,116 @@ local function parse(tokens: { Token }): { Expr }
 		error(colour.red "no tokens to parse")
 	end

-	local function addExpr(expr: Expr)
-		table.insert(program, expr)
+	-- remove SPACE tokens (NEWLINE tokens are kept)
+	for i, token in tokens do
+		if token.kind == "SPACE" then
+			table.remove(tokens, i)
+		end
 	end

-	local i = 0
-	local len = #tokens
-	while i < len do
-		i += 1
-		local token = tokens[i]
-		local currentIndent = 0
+	-- A program is a list of expressions

-		local function getBlock(): { Token }
-			-- get tokens until the end of the block (which is the same indent level as the if statement)
-			local blockTokens: { Token } = {}
-			local blockIndent = 0
+	local function next(): Token
+		return tokens[1]
+	end

-			if #tokens == 0 then
-				error(colour.red "tried to get empty block")
+	local function get(): Token
+		local token = next()
+		table.remove(tokens, 1)
+		return token
+	end
+
+	local function eat(kind: TokenKind): Token
+		local token = get()
+		if token.kind ~= kind then
+			print(
+				colour.red "expected",
+				colour.yellow(kind),
+				colour.red "got",
+				colour.yellow(token.kind)
+			)
+			exit(1)
+		end
+		return token
+	end
+
+	local function canEndAnExpression(token: Token): boolean
+		local kind: TokenKind = token.kind
+		return kind == "IDENTIFIER"
+			or kind == "NUMBER"
+			or kind == "STRING"
+			or kind == "RPAREN"
+			or kind == "RBRACE"
+			or kind == "RBRACKET"
+	end
+
+	local function getIfExprCond(): { Token }
+		local tokens: { Token } = {}
+
+		local depth = 0
+		while true do
+			local token = get()
+			print("got token", token)
+			if token.kind == "COLON" and depth == 0 then
+				break
+			elseif token.kind == "KEYWORD" and token.value == "if" then
+				-- keywords that require a colon
+				depth += 1
 			end
-
-			while i < len do -- todo figure out if the + 1 breaks something or not its 5:57 am idck
-				if tokens[i].kind == "NEWLINE" then
-					blockIndent = 0
-					-- chock next few tokens to see if they're indented
-
-					local j = i + 1
-					while j < len and tokens[j].kind == "INDENT" do
-						blockIndent += 1
-						j += 1
-					end
-					if blockIndent <= currentIndent then
-						print "block finished"
-						break
-					end
-				end
-
-				table.insert(blockTokens, tokens[i])
-				i += 1
-			end
-
-			if i >= len then
-				print "welp"
-			end
-
-			if #blockTokens == 0 then
-				error(colour.red "empty block")
-			end
-
-			print(blockTokens)
-
-			return blockTokens
+			table.insert(tokens, token)
 		end

-		local function getCond(): { Token }
-			local condTokens: { Token } = {}
+		return tokens
+	end

-			-- get all tokens until the end of the line
-			while i < len and tokens[i + 1].kind ~= "NEWLINE" do
-				i += 1
-				table.insert(condTokens, tokens[i])
+	local function getUntilEndOfExpression(): { Token }
+		local tokens: { Token } = {}
+
+		-- just because a token can end an expression doesn't mean it does
+		local startToken = next()
+
+		if startToken.kind == "KEYWORD" then
+			if startToken.value == "if" then
+				-- skip the if keyword
+				get()
+				-- first get the condition
+				local conditionTokens = getIfExprCond()
+				print("Got tokns", conditionTokens)
 			end
-
-			return condTokens
+		else
+			print(
+				colour.red "unimplemented token",
+				colour.yellow(startToken.kind)
+			)
+			exit(1)
 		end

-		local function nextNonSpace(): (Token, number)
-			local j = i
-			while j < len and tokens[j].kind == "SPACE" do
-				j += 1
+		return tokens
+	end
+
+	while #tokens > 0 do
+		local token = get()
+
+		printToken(token)
+
+		if token.kind == "IDENTIFIER" then
+			local nextToken = get()
+
+			if binaryOperators[nextToken.value] then
+				-- binary operator
+				local left = IdentifierExpr(token)
+				local operator = nextToken
+
+				local rightTokens = getUntilEndOfExpression()
+				local right = parse(rightTokens)[1]
+
+				print(operator)
+
+				table.insert(program, BinOpExpr(token, left, right, operator))
 			end
-			return tokens[j], j
-		end
-
-		local function parseCond(condTokens: { Token }): Expr
-			print("parsing cond", condTokens)
-			error("bruh")
-
-			local cond = parse(condTokens)
-
-			if #cond > 1 then
-				error(colour.red "too many exprs in cond")
-			elseif #cond < 1 then
-				error(colour.red "not enough exprs in cond")
-			end
-
-			return cond[1]
-		end
-
-		if token.kind == "INDENT" then
-			currentIndent += 1
-		elseif token.kind == "NEWLINE" then
-			currentIndent = 0
-		elseif token.kind == "KEYWORD" then
-			if token.value == "if" then
-				print(i)
-				local cond = getCond()
-				print(i)
-
-				local block = getBlock()
-
-				local expr = IfExpr(
-					token,
-					parseCond(cond),
-					BlockExpr(token, parse(block))
-				)
-				addExpr(expr)
-				-- elseif token.value == "elseif" then
-				-- 	local cond = getCond()
-				-- 	-- skip the newline
-				-- 	i += 1
-				-- 	local block = getBlock()
-				-- 	addExpr(
-				-- 		ElseIfExpr(
-				-- 			token,
-				-- 			parseCond(cond),
-				-- 			BlockExpr(token, parse(block))
-				-- 		)
-				-- 	)
-				-- elseif token.value == "else" then
-				-- 	-- skip newline
-				-- 	i += 1
-
-				-- 	local block = getBlock()
-				-- 	addExpr(ElseExpr(token, BlockExpr(token, parse(block))))
-				-- else
-				-- 	print(token)
-				-- 	error(colour.red "unknown token value " .. token.value)
-			end
-		elseif token.kind == "IDENTIFIER" then
-			-- identifier is at the start of an expression, it could be:
-			-- 1: a binop (next token is a text operator or operator
-			-- 3: a postfix op (next token is ++ or --)
-			-- 4: a function call
-			-- 5: standalone
-			-- after one 2am philosophical compiler thinking session, I've concluded that yes, an assignment is indeed a binop
-
-			-- skip the identifier
-			i += 1
-			local nextToken, advance = nextNonSpace()
-
-			if not nextToken then
-				-- standalone
-				i = advance
-				addExpr(IdentifierExpr(token))
-			elseif binaryOperators[nextToken.value] then
-				-- binop
-				i = advance
-				local cond = getCond()
-				addExpr(BinOpExpr(
-					token,
-					IdentifierExpr(token),
-					-- get condition tokens as rhs
-					parseCond(cond),
-					nextToken
-				))
-			elseif postfixOperators[nextToken.value] then
-				-- postfix
-				i = advance
-				addExpr(PostfixOpExpr(token, IdentifierExpr(token), nextToken))
-			else
-				i -= 1 -- getCond skips the identifier
-				local cond = getCond()
-				addExpr(FunctionCallExpr(token, token, parseCond(cond)))
-			end
-		elseif token.kind == "NUMBER" then
-			-- number is at the start of an expression, it could be:
-			-- 1: a binop (next token is a text operator or operator
-			-- 2: standalone
-
-			-- skip the number
-			i += 1
-			local nextToken, advance = nextNonSpace()
-
-			local function standalone()
-				i = advance
-				addExpr(NumberExpr(token))
-			end
-
-			if not nextToken or not binaryOperators[nextToken.value] then
-				standalone()
-			else
-				-- binop
-				i = advance
-				local cond = getCond()
-				addExpr(BinOpExpr(
-					token,
-					NumberExpr(token),
-					-- get condition tokens as rhs
-					parseCond(cond),
-					nextToken
-				))
-			end
-		elseif token.kind == "STRING" then
-			-- string is at the start of an expression, it could be:
-			-- 1: a binop (next token is a text operator or operator
-			-- 2: standalone
-
-			-- skip the string
-			i += 1
-			local nextToken, advance = nextNonSpace()
-
-			local function standalone()
-				i = advance
-				addExpr(StringExpr(token))
-			end
-
-			if not nextToken or not binaryOperators[nextToken.value] then
-				standalone()
-			else
-				-- binop
-				i = advance
-				addExpr(BinOpExpr(
-					token,
-					StringExpr(token),
-					-- get condition tokens as rhs
-					parseCond(getCond()),
-					nextToken
-				))
-			end
-		elseif token.kind ~= "SPACE" then
-			print(token)
-			error(colour.red "unknown token kind " .. token.kind)
+		else
+			print(colour.red "unexpected token", colour.yellow(token.kind))
+			exit(1)
 		end
 	end

@ -598,6 +479,7 @@ local function lex(source: { string }): { Token }
 		return tokens[#tokens - (n - 1)]
 	end
 	local line, column = 1, 0
+	local indent = 0

 	local function addToken(
 		kind: TokenKind,
@ -631,23 +513,28 @@ local function lex(source: { string }): { Token }
 			addToken("SPACE", " ")
 		elseif char == "\t" then
 			-- a tab is indentation only if the last token is a newline or an indent
-			if last(1).kind == "NEWLINE" or last(1).kind == "INDENT" then
-				addToken("INDENT", "\t")
-				column += 3
-			else
+			if last(1).kind ~= "NEWLINE" and last(1).kind ~= "INDENT" then
 				addToken("SPACE", "\t")
+				continue
 			end
-		elseif char == ";" then
-			-- parse till end of line
-			i += 1 -- skip the semicolon
-			while i < len and source[i] ~= "\n" do
-				column += 1
+
+			-- count how many tabs there are
+			local tabs = 1
+			while source[i + tabs] == "\t" do
+				tabs += 1
 				i += 1
 			end
-			column -= 1
-			i -= 1

-			-- I used to do something with it here but nah
+			local diff = tabs - indent
+			for _ = 1, math.abs(diff) do
+				if diff > 0 then
+					addToken("INDENT", "\t")
+					indent += 1
+				else
+					addToken("DEDENT", "\t")
+					indent -= 1
+				end
+			end
 		elseif char == '"' then
 			local startLine, startColumn = line, column

@ -668,37 +555,33 @@ local function lex(source: { string }): { Token }

 			addToken("STRING", stringLiteral, startLine, startColumn)
 		elseif char == "+" then
-			-- check if it's a ++ or a += or just a +
-			if i + 1 < len and source[i + 1] == "+" then
-				addToken("PLUSPLUS", "++")
-				i += 1
-				column += 1
-			elseif i + 1 < len and source[i + 1] == "=" then
-				addToken("PLUSEQUALS", "+=")
-				i += 1
-				column += 1
-			else
-				addToken("PLUS", "+")
-			end
+			addToken("PLUS", "+")
 		elseif char == "-" then
-			-- check if it's a -- or a -= or just a -
-			if i + 1 < len and source[i + 1] == "-" then
-				addToken("MINUSMINUS", "--")
-				i += 1
-				column += 1
-			elseif i + 1 < len and source[i + 1] == "=" then
-				addToken("MINUSEQUALS", "-=")
-				i += 1
-				column += 1
-			else
-				addToken("MINUS", "-")
-			end
+			addToken("MINUS", "-")
 		elseif char == "*" then
 			addToken("TIMES", "*")
 		elseif char == "/" then
 			addToken("DIVIDE", "/")
 		elseif char == "%" then
 			addToken("MODULO", "%")
+		elseif char == "^" then
+			addToken("EXPONENT", "^")
+		elseif char == ":" then
+			addToken("COLON", ":")
+		elseif char == ";" then
+			addToken("SEMICOLON", ";")
+		elseif char == "(" then
+			addToken("LPAREN", "(")
+		elseif char == ")" then
+			addToken("RPAREN", ")")
+		elseif char == "{" then
+			addToken("LBRACE", "{")
+		elseif char == "}" then
+			addToken("RBRACE", "}")
+		elseif char == "[" then
+			addToken("LBRACKET", "[")
+		elseif char == "]" then
+			addToken("RBRACKET", "]")
 		elseif char ~= " " then
 			if char >= "0" and char <= "9" then
 				local startLine, startColumn = line, column
@ -776,6 +659,36 @@ local function lex(source: { string }): { Token }
 		end
 	end

+	-- postprocessing
+
+	-- remove leading and trailing newlines
+	while tokens[1].kind == "NEWLINE" do
+		table.remove(tokens, 1)
+	end
+	while tokens[#tokens].kind == "NEWLINE" do
+		table.remove(tokens, #tokens)
+	end
+
+	-- check if the number of indents and dedents are the same, if not add the remaining
+	local indents, dedents = 0, 0
+	for _, token in tokens do
+		if token.kind == "INDENT" then
+			indents += 1
+		elseif token.kind == "DEDENT" then
+			dedents += 1
+		end
+	end
+	if dedents > indents then
+		-- huh?????
+		print(colour.red "too many dedents")
+		exit(1)
+	end
+
+	while indents > dedents do
+		addToken("DEDENT", "\t")
+		dedents += 1
+	end
+
 	return tokens
 end

--- a/Script/test.melt
+++ b/Script/test.melt
@ -1,2 +1,6 @@
-y = if x is 6
+y = if x is 6:
 	print "yes"
+	print "test"
+else
+	print "no"
+	print "test"