melt/Luau/binding/scanner.c

442 lines
8.5 KiB
C

#include <tree_sitter/parser.h>
#include <wctype.h>
// External token kinds produced by this scanner. The values are used as
// indices into `valid_symbols` and are stored in `lexer->result_symbol`.
// NOTE(review): the numbering presumably has to mirror the order of the
// grammar's `externals` list — confirm before reordering.
enum TokenType
{
    // comments
    COMMENT_START = 0,
    COMMENT_CONTENT = 1,
    COMMENT_END = 2,
    // short and long strings
    STRING_START = 3,
    STRING_CONTENT = 4,
    STRING_END = 5,
    // backtick (interpolated) strings
    INTERP_START = 6,
    INTERP_CONTENT = 7,
    INTERP_BRACE_OPEN = 8,
    INTERP_BRACE_CLOSE = 9,
    INTERP_END = 10
};
// Advances the lexer by one character, passing skip=false so the character
// is not flagged as whitespace.
static void consume(TSLexer *lexer)
{
    const bool treat_as_whitespace = false;
    lexer->advance(lexer, treat_as_whitespace);
}
// Advances the lexer by one character, passing skip=true so the character
// is flagged as whitespace.
static void skip(TSLexer *lexer)
{
    const bool treat_as_whitespace = true;
    lexer->advance(lexer, treat_as_whitespace);
}
// Consumes the lookahead character only when it equals `character`.
// Returns true when a character was consumed, false when the lexer was left
// untouched.
static bool consume_if(TSLexer *lexer, const int32_t character)
{
    if (lexer->lookahead != character)
    {
        return false;
    }

    consume(lexer);
    return true;
}
// Skips the lookahead character when it is whitespace (per iswspace) or a
// carriage return. Returns true when a character was skipped.
static bool skipwspace(TSLexer *lexer)
{
    const bool at_whitespace = iswspace(lexer->lookahead) || lexer->lookahead == '\r';
    if (!at_whitespace)
    {
        return false;
    }

    skip(lexer);
    return true;
}
// Delimiter characters of the three quoted string forms.
// These are `static` on purpose: without it they have external linkage and
// clash at link time with any other object file that defines the same names
// (for example another grammar's scanner linked into the same binary).
static const char SQ_STRING_DELIMITER = '\''; // short single-quoted string
static const char DQ_STRING_DELIMITER = '"';  // short double-quoted string
static const char TICK_DELIMITER = '`';       // backtick (interpolated) string
// The construct the scanner is currently inside of. Zero (no enumerator)
// means that nothing has been started.
enum StartedToken
{
    SHORT_COMMENT = 1,     // -- ...
    SHORT_SQ_STRING = 2,   // ' ... '
    SHORT_DQ_STRING = 3,   // " ... "
    LONG_COMMENT = 4,      // --[=*[ ... ]=*]
    LONG_STRING = 5,       // [=*[ ... ]=*]
    TICK_STRING = 6,       // ` ... `
    INTERP_EXPRESSION = 7, // { ... } inside a backtick string
};
// Scanner state that persists between scan calls and is saved/restored by
// the serialize/deserialize functions.
struct ScannerState
{
    // construct currently being scanned; 0 when none
    enum StartedToken started;
    // number of '=' characters in the opening bracket of the current long
    // comment/string
    unsigned depth;
    // number of interpolation expressions (`{`) currently open
    unsigned idepth;
};
// Allocates the scanner state with every field zeroed (nothing started,
// depth 0, interpolation depth 0).
// Returns the new state, or NULL when the allocation fails. calloc is used
// so that the fields are initialized without writing through the pointer,
// which would dereference NULL on allocation failure.
void *tree_sitter_luau_external_scanner_create(void)
{
    struct ScannerState *state = calloc(1, sizeof *state);
    return state;
}
// Releases the scanner state referenced by `payload`.
void tree_sitter_luau_external_scanner_destroy(void *payload)
{
    struct ScannerState *state = payload;
    // free(NULL) is a no-op, so no guard is needed
    free(state);
}
unsigned int tree_sitter_luau_external_scanner_serialize(void *payload, char *buffer)
{
struct ScannerState *state = payload;
buffer[0] = state->started;
buffer[1] = state->depth;
buffer[2] = state->idepth;
return 3;
}
// Restores the scanner state from a buffer written by the serialize function
// (3 bytes: started, depth, idepth).
// Any other length - in particular 0, which carries no saved state - resets
// the scanner to its initial state, so that state left over from an earlier
// scan or parse cannot leak into this one.
void tree_sitter_luau_external_scanner_deserialize(void *payload, const char *buffer, unsigned int length)
{
    struct ScannerState *state = payload;

    if (length != 3)
    {
        state->started = 0;
        state->depth = 0;
        state->idepth = 0;
        return;
    }

    // plain char may be signed: go through unsigned char so that stored
    // values 128..255 are not sign-extended into huge unsigned numbers
    state->started = (unsigned char)buffer[0];
    state->depth = (unsigned char)buffer[1];
    state->idepth = (unsigned char)buffer[2];
}
// Consumes a run of consecutive '=' characters and returns how many were
// consumed (0 when the lookahead is not '=').
static unsigned int get_depth(TSLexer *lexer)
{
    unsigned int equals_seen = 0;
    for (; consume_if(lexer, '='); ++equals_seen)
    {
    }
    return equals_seen;
}
// Tries to consume exactly `remaining_depth` '=' characters.
// Returns true when all of them were found; on false, the '=' characters
// matched before the mismatch have already been consumed.
static bool scan_depth(TSLexer *lexer, unsigned int remaining_depth)
{
    for (; remaining_depth != 0; --remaining_depth)
    {
        if (!consume_if(lexer, '='))
        {
            return false;
        }
    }
    return true;
}
// Handles a backslash escape at the current position, if there is one.
//
// Returns true when the handler moved past the whole escape and the
// lookahead is now a fresh, unexamined character that the caller must
// re-check instead of consuming blindly. That is the case after:
//   - a line continuation (backslash followed by \r, \n or \r\n), and
//   - `\z` followed by any amount of whitespace.
// Returns false when there is no backslash, when the input ends right after
// the backslash (or right after `\z`), or when only the backslash was
// consumed and the lookahead is the escaped character itself, which the
// caller is expected to consume as content.
//
// Reporting the line continuation as true keeps callers from swallowing the
// character after the new line (possibly the closing delimiter or `{`).
static bool escape_handler(TSLexer *lexer)
{
    if (!consume_if(lexer, '\\') || lexer->eof(lexer))
    {
        return false;
    }
    if (lexer->lookahead == '\r')
    {
        skip(lexer);
        // treat \r\n as a single line break
        if (!lexer->eof(lexer) && lexer->lookahead == '\n')
        {
            skip(lexer);
        }
        return true;
    }
    if (lexer->lookahead == '\n')
    {
        skip(lexer);
        return true;
    }
    if (consume_if(lexer, 'z') && !lexer->eof(lexer))
    {
        // \z skips all following whitespace, new lines included
        while (skipwspace(lexer) && !lexer->eof(lexer))
            ;
        return true;
    }
    return false;
}
// Scans the next external token.
//
// The scanner is a small state machine driven by `state->started`: once a
// start token has been produced, following calls produce the content and end
// tokens of that construct. When a construct ends, the scanner returns to
// INTERP_EXPRESSION if an interpolation expression is still open
// (`idepth > 0`) and to the idle state (0) otherwise.
//
// payload:       the struct ScannerState created by the create function
// lexer:         tree-sitter lexer; `result_symbol` is set on success
// valid_symbols: indexed by enum TokenType; true for tokens accepted here
//
// Returns true when a token was recognized, false otherwise.
bool tree_sitter_luau_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols)
{
    struct ScannerState *state = payload;
    switch (state->started)
    {
    case SHORT_COMMENT:
    {
        // try to match the short comment's end (new line or eof); the end
        // token itself consumes nothing
        if (lexer->lookahead == '\n' || lexer->eof(lexer))
        {
            if (valid_symbols[COMMENT_END])
            {
                state->started = state->idepth > 0 ? INTERP_EXPRESSION : 0;
                lexer->result_symbol = COMMENT_END;
                return true;
            }
        }
        else if (valid_symbols[COMMENT_CONTENT])
        {
            // consume all characters till a short comment's end
            do
            {
                consume(lexer);
            } while (lexer->lookahead != '\n' && !lexer->eof(lexer));
            lexer->result_symbol = COMMENT_CONTENT;
            return true;
        }
        break;
    }
    case SHORT_SQ_STRING:
    case SHORT_DQ_STRING:
    {
        // define the short string's delimiter
        const char delimiter = state->started == SHORT_SQ_STRING ? SQ_STRING_DELIMITER : DQ_STRING_DELIMITER;
        // try to match the short string's end (' or ")
        if (consume_if(lexer, delimiter))
        {
            if (valid_symbols[STRING_END])
            {
                state->started = state->idepth > 0 ? INTERP_EXPRESSION : 0;
                lexer->result_symbol = STRING_END;
                return true;
            }
        }
        else if (lexer->lookahead != '\n' && !lexer->eof(lexer))
        {
            if (valid_symbols[STRING_CONTENT])
            {
                // consume any character till a short string's end, new line or eof
                do
                {
                    // when the handler reports true it has moved past the
                    // whole escape: the lookahead is an unexamined character
                    // (possibly the closing delimiter), so re-check the loop
                    // condition instead of consuming it as content
                    if (escape_handler(lexer))
                    {
                        continue;
                    }
                    consume(lexer);
                } while (lexer->lookahead != delimiter && lexer->lookahead != '\n' && !lexer->eof(lexer));
                lexer->result_symbol = STRING_CONTENT;
                return true;
            }
        }
        break;
    }
    case TICK_STRING:
    {
        const char delimiter = TICK_DELIMITER;
        // try to match the interpolated string's end (`)
        if (consume_if(lexer, delimiter))
        {
            if (valid_symbols[INTERP_END])
            {
                state->started = state->idepth > 0 ? INTERP_EXPRESSION : 0;
                lexer->result_symbol = INTERP_END;
                return true;
            }
        }
        // an opening brace starts an interpolation expression
        else if (consume_if(lexer, '{'))
        {
            state->idepth++;
            state->started = INTERP_EXPRESSION;
            lexer->result_symbol = INTERP_BRACE_OPEN;
            return true;
        }
        else if (lexer->lookahead != '\n' && !lexer->eof(lexer))
        {
            if (valid_symbols[INTERP_CONTENT])
            {
                // consume any character till an opening brace, the string's
                // end, new line or eof
                do
                {
                    if (lexer->lookahead == '{')
                    {
                        break;
                    }
                    // see the short string case: re-check the lookahead
                    // after an escape the handler has fully moved past
                    if (escape_handler(lexer))
                    {
                        continue;
                    }
                    consume(lexer);
                } while (lexer->lookahead != delimiter && lexer->lookahead != '\n' && !lexer->eof(lexer));
                lexer->result_symbol = INTERP_CONTENT;
                return true;
            }
        }
        break;
    }
    case LONG_COMMENT:
    case LONG_STRING:
    {
        const bool is_inside_a_comment = state->started == LONG_COMMENT;
        bool some_characters_were_consumed = false;
        if (is_inside_a_comment ? valid_symbols[COMMENT_END] : valid_symbols[STRING_END])
        {
            // try to match the long comment's/string's end (]=*])
            if (consume_if(lexer, ']'))
            {
                if (scan_depth(lexer, state->depth) && consume_if(lexer, ']'))
                {
                    state->started = state->idepth > 0 ? INTERP_EXPRESSION : 0;
                    state->depth = 0;
                    lexer->result_symbol = is_inside_a_comment ? COMMENT_END : STRING_END;
                    return true;
                }
                // not an end after all: what was consumed becomes content
                some_characters_were_consumed = true;
            }
        }
        if (is_inside_a_comment ? valid_symbols[COMMENT_CONTENT] : valid_symbols[STRING_CONTENT])
        {
            if (!some_characters_were_consumed)
            {
                if (lexer->eof(lexer))
                {
                    break;
                }
                // consume the next character as it can't start a long comment's/string's end (])
                consume(lexer);
            }
            // consume any character till a long comment's/string's end or eof
            while (true)
            {
                // the content ends here if what follows is the closing bracket
                lexer->mark_end(lexer);
                if (consume_if(lexer, ']'))
                {
                    if (scan_depth(lexer, state->depth))
                    {
                        if (consume_if(lexer, ']'))
                        {
                            break;
                        }
                    }
                    else
                    {
                        // the mismatching character may itself be a ']'
                        continue;
                    }
                }
                if (lexer->eof(lexer))
                {
                    break;
                }
                consume(lexer);
            }
            lexer->result_symbol = is_inside_a_comment ? COMMENT_CONTENT : STRING_CONTENT;
            return true;
        }
        break;
    }
    case INTERP_EXPRESSION:
    {
        while (skipwspace(lexer))
            ;
        // try to match the interpolation expression's end (})
        if (valid_symbols[INTERP_BRACE_CLOSE])
        {
            if (consume_if(lexer, '}'))
            {
                state->idepth--;
                state->started = TICK_STRING;
                lexer->result_symbol = INTERP_BRACE_CLOSE;
                return true;
            }
        }
    }
    // comments and strings may start inside an interpolation expression
    /* fallthrough */
    default:
    {
        // ignore all whitespace
        while (skipwspace(lexer))
            ;
        state->started = 0;
        if (valid_symbols[COMMENT_START])
        {
            // try to match a short comment's start (--)
            if (consume_if(lexer, '-'))
            {
                if (consume_if(lexer, '-'))
                {
                    state->started = SHORT_COMMENT;
                    // try to match a long comment's start (--[=*[)
                    lexer->mark_end(lexer);
                    if (consume_if(lexer, '['))
                    {
                        unsigned int possible_depth = get_depth(lexer);
                        if (consume_if(lexer, '['))
                        {
                            state->started = LONG_COMMENT;
                            state->depth = possible_depth;
                            lexer->mark_end(lexer);
                        }
                    }
                    lexer->result_symbol = COMMENT_START;
                    return true;
                }
                // a lone '-' is not ours; leave the state exactly as the
                // regular no-match exit at the bottom does
                state->started = state->idepth > 0 ? INTERP_EXPRESSION : 0;
                break;
            }
        }
        if (valid_symbols[STRING_START])
        {
            // try to match a short single-quoted string's start (')
            if (consume_if(lexer, SQ_STRING_DELIMITER))
            {
                state->started = SHORT_SQ_STRING;
            }
            // try to match a short double-quoted string's start (")
            else if (consume_if(lexer, DQ_STRING_DELIMITER))
            {
                state->started = SHORT_DQ_STRING;
            }
            // try to match a long string's start ([=*[)
            else if (consume_if(lexer, '['))
            {
                unsigned int possible_depth = get_depth(lexer);
                if (consume_if(lexer, '['))
                {
                    state->started = LONG_STRING;
                    state->depth = possible_depth;
                }
            }
            if (state->started)
            {
                lexer->result_symbol = STRING_START;
                return true;
            }
        }
        if (valid_symbols[INTERP_START])
        {
            // try to match an interpolated string's start (`)
            if (consume_if(lexer, TICK_DELIMITER))
            {
                state->started = TICK_STRING;
                lexer->result_symbol = INTERP_START;
                return true;
            }
        }
        // nothing matched: stay inside the interpolation expression, if any
        state->started = state->idepth > 0 ? INTERP_EXPRESSION : 0;
        break;
    }
    }
    return false;
}