From 368511f666317fefeffca51777f062e436696682 Mon Sep 17 00:00:00 2001 From: John Alanbrook Date: Mon, 9 Feb 2026 11:56:09 -0600 Subject: [PATCH 1/2] parse.ce and tokenize.ce --- internal/bootstrap.cm | 15 +- parse.ce | 2373 +++++++++++++++++++++++++++++++++++++++++ source/cell.c | 16 +- source/quickjs.h | 3 + source/runtime.c | 45 +- tokenize.ce | 569 ++++++++++ 6 files changed, 3002 insertions(+), 19 deletions(-) create mode 100644 parse.ce create mode 100644 tokenize.ce diff --git a/internal/bootstrap.cm b/internal/bootstrap.cm index 3b53d01d..002a8519 100644 --- a/internal/bootstrap.cm +++ b/internal/bootstrap.cm @@ -1,14 +1,17 @@ -// Hidden vars (os, program) come from env +// Hidden vars (os, args) come from env +// args[0] = script filename, args[1..] = user args var load_internal = os.load_internal function use_embed(name) { return load_internal("js_" + name + "_use") } var fd = use_embed('fd') +var json = use_embed('json') var use_cache = {} use_cache['fd'] = fd use_cache['os'] = os +use_cache['json'] = json function use(path) { if (use_cache[path]) @@ -34,7 +37,15 @@ function use(path) { } // Load and run the user's program +var program = args[0] + +var user_args = [] +var _i = 1 +while (_i < length(args)) { + push(user_args, args[_i]) + _i = _i + 1 +} var blob = fd.slurp(program) stone(blob) var script = text(blob) -mach_eval(program, script, {use: use}) +mach_eval(program, script, {use: use, args: user_args, json: json}) diff --git a/parse.ce b/parse.ce new file mode 100644 index 00000000..70d0637d --- /dev/null +++ b/parse.ce @@ -0,0 +1,2373 @@ +// ============================================================ +// Section 1: Inline Tokenizer (from tokenize.ce) +// ============================================================ + +var src = args[0] +var filename = length(args) > 1 ? 
args[1] : "" + +// Convert to codepoint array +var _src_len = length(src) +var cp = [] +var _i = 0 +while (_i < _src_len) { + push(cp, codepoint(src[_i])) + _i = _i + 1 +} + +var pos = 0 +var row = 0 +var col = 0 +var tokens = [] + +// Codepoint constants +def CP_LF = 10 +def CP_CR = 13 +def CP_TAB = 9 +def CP_SPACE = 32 +def CP_BANG = 33 +def CP_DQUOTE = 34 +def CP_HASH = 35 +def CP_DOLLAR = 36 +def CP_PERCENT = 37 +def CP_AMP = 38 +def CP_SQUOTE = 39 +def CP_LPAREN = 40 +def CP_RPAREN = 41 +def CP_STAR = 42 +def CP_PLUS = 43 +def CP_COMMA = 44 +def CP_MINUS = 45 +def CP_DOT = 46 +def CP_SLASH = 47 +def CP_0 = 48 +def CP_1 = 49 +def CP_7 = 55 +def CP_9 = 57 +def CP_COLON = 58 +def CP_SEMI = 59 +def CP_LT = 60 +def CP_EQ = 61 +def CP_GT = 62 +def CP_QMARK = 63 +def CP_AT = 64 +def CP_A = 65 +def CP_B = 66 +def CP_E = 69 +def CP_F = 70 +def CP_O = 79 +def CP_X = 88 +def CP_Z = 90 +def CP_LBRACKET = 91 +def CP_BSLASH = 92 +def CP_RBRACKET = 93 +def CP_CARET = 94 +def CP_UNDERSCORE = 95 +def CP_BACKTICK = 96 +def CP_a = 97 +def CP_b = 98 +def CP_e = 101 +def CP_f = 102 +def CP_n = 110 +def CP_o = 111 +def CP_r = 114 +def CP_t = 116 +def CP_x = 120 +def CP_z = 122 +def CP_LBRACE = 123 +def CP_PIPE = 124 +def CP_RBRACE = 125 +def CP_TILDE = 126 + +var keywords = { + if: "if", in: "in", do: "do", go: "go", + var: "var", def: "def", for: "for", + else: "else", this: "this", null: "null", true: "true", + false: "false", while: "while", break: "break", + return: "return", delete: "delete", + disrupt: "disrupt", function: "function", continue: "continue", + disruption: "disruption" +} + +function pk() { + if (pos >= _src_len) return -1 + return cp[pos] +} + +function pk_at(n) { + var idx = pos + n + if (idx >= _src_len) return -1 + return cp[idx] +} + +function adv() { + var c = cp[pos] + pos = pos + 1 + if (c == CP_LF) { + row = row + 1 + col = 0 + } else { + col = col + 1 + } + return c +} + +function is_digit(c) { + return c >= CP_0 && c <= CP_9 +} + +function is_hex(c) { 
+ return (c >= CP_0 && c <= CP_9) || (c >= CP_a && c <= CP_f) || (c >= CP_A && c <= CP_F) +} + +function is_alpha(c) { + return (c >= CP_a && c <= CP_z) || (c >= CP_A && c <= CP_Z) +} + +function is_alnum(c) { + return is_alpha(c) || is_digit(c) +} + +function is_ident_start(c) { + return is_alpha(c) || c == CP_UNDERSCORE || c == CP_DOLLAR +} + +function is_ident_char(c) { + return is_alnum(c) || c == CP_UNDERSCORE || c == CP_DOLLAR || c == CP_QMARK || c == CP_BANG +} + +function substr(start, end) { + var s = "" + var i = start + while (i < end) { + s = s + character(cp[i]) + i = i + 1 + } + return s +} + +function read_string(quote_cp) { + var start = pos + var start_row = row + var start_col = col + var value = "" + var esc = 0 + adv() + while (pos < _src_len && pk() != quote_cp) { + if (pk() == CP_BSLASH && pos + 1 < _src_len) { // a backslash that ends the input is kept literally (same guard as read_template) instead of reading past the end + adv() + esc = adv() + if (esc == CP_n) { value = value + "\n" } + else if (esc == CP_t) { value = value + "\t" } + else if (esc == CP_r) { value = value + "\r" } + else if (esc == CP_BSLASH) { value = value + "\\" } + else if (esc == CP_SQUOTE) { value = value + "'" } + else if (esc == CP_DQUOTE) { value = value + "\"" } + else if (esc == CP_0) { value = value + character(0) } + else if (esc == CP_BACKTICK) { value = value + "`" } + else { value = value + character(esc) } + } else { + value = value + character(adv()) + } + } + if (pos < _src_len) adv() + push(tokens, { + kind: "text", at: start, + from_row: start_row, from_column: start_col, + to_row: row, to_column: col, + value: value + }) +} + +function read_template() { + var start = pos + var start_row = row + var start_col = col + var value = "" + var esc = 0 + var depth = 0 + var tc = 0 + var q = 0 + adv() + while (pos < _src_len && pk() != CP_BACKTICK) { + if (pk() == CP_BSLASH && pos + 1 < _src_len) { + adv() + esc = adv() + if (esc == CP_n) { value = value + "\n" } + else if (esc == CP_t) { value = value + "\t" } + else if (esc == CP_r) { value = value + "\r" } + else if (esc == CP_BSLASH) { 
value = value + "\\" } + else if (esc == CP_BACKTICK) { value = value + "`" } + else if (esc == CP_DOLLAR) { value = value + "$" } + else if (esc == CP_0) { value = value + character(0) } + else { value = value + character(esc) } + } else if (pk() == CP_DOLLAR && pos + 1 < _src_len && pk_at(1) == CP_LBRACE) { + adv() + adv() + depth = 1 + while (pos < _src_len && depth > 0) { + tc = pk() + if (tc == CP_LBRACE) { depth = depth + 1; adv() } + else if (tc == CP_RBRACE) { depth = depth - 1; adv() } + else if (tc == CP_SQUOTE || tc == CP_DQUOTE || tc == CP_BACKTICK) { + q = adv() + while (pos < _src_len && pk() != q) { + if (pk() == CP_BSLASH && pos + 1 < _src_len) adv() + adv() + } + if (pos < _src_len) adv() + } else { adv() } + } + } else { + value = value + character(adv()) + } + } + if (pos < _src_len) adv() + push(tokens, { + kind: "text", at: start, + from_row: start_row, from_column: start_col, + to_row: row, to_column: col, + value: value + }) +} + +function read_number() { + var start = pos + var start_row = row + var start_col = col + var raw = "" + if (pk() == CP_0 && (pk_at(1) == CP_x || pk_at(1) == CP_X)) { + adv(); adv() + while (pos < _src_len && (is_hex(pk()) || pk() == CP_UNDERSCORE)) adv() + } else if (pk() == CP_0 && (pk_at(1) == CP_b || pk_at(1) == CP_B)) { + adv(); adv() + while (pos < _src_len && (pk() == CP_0 || pk() == CP_1 || pk() == CP_UNDERSCORE)) adv() + } else if (pk() == CP_0 && (pk_at(1) == CP_o || pk_at(1) == CP_O)) { + adv(); adv() + while (pos < _src_len && pk() >= CP_0 && pk() <= CP_7) adv() + } else { + while (pos < _src_len && (is_digit(pk()) || pk() == CP_UNDERSCORE)) adv() + if (pos < _src_len && pk() == CP_DOT) { + adv() + while (pos < _src_len && (is_digit(pk()) || pk() == CP_UNDERSCORE)) adv() + } + if (pos < _src_len && (pk() == CP_e || pk() == CP_E)) { + adv() + if (pos < _src_len && (pk() == CP_PLUS || pk() == CP_MINUS)) adv() + while (pos < _src_len && is_digit(pk())) adv() + } + } + raw = substr(start, pos) + push(tokens, 
{ + kind: "number", at: start, + from_row: start_row, from_column: start_col, + to_row: row, to_column: col, + value: raw, number: number(raw) + }) +} + +function read_name() { + var start = pos + var start_row = row + var start_col = col + var name = "" + var kw = null + while (pos < _src_len && is_ident_char(pk())) adv() + name = substr(start, pos) + kw = keywords[name] + if (kw != null) { + push(tokens, { + kind: kw, at: start, + from_row: start_row, from_column: start_col, + to_row: row, to_column: col + }) + } else { + push(tokens, { + kind: "name", at: start, + from_row: start_row, from_column: start_col, + to_row: row, to_column: col, + value: name + }) + } +} + +function read_comment() { + var start = pos + var start_row = row + var start_col = col + var raw = "" + if (pk_at(1) == CP_SLASH) { + while (pos < _src_len && pk() != CP_LF && pk() != CP_CR) adv() + } else { + adv(); adv() + while (pos < _src_len) { + if (pk() == CP_STAR && pk_at(1) == CP_SLASH) { + adv(); adv() + break + } + adv() + } + } + raw = substr(start, pos) + push(tokens, { + kind: "comment", at: start, + from_row: start_row, from_column: start_col, + to_row: row, to_column: col, + value: raw + }) +} + +function emit_op(kind, count) { + var start = pos + var start_row = row + var start_col = col + var i = 0 + while (i < count) { adv(); i = i + 1 } + push(tokens, { + kind: kind, at: start, + from_row: start_row, from_column: start_col, + to_row: row, to_column: col + }) +} + +function emit_ident(count) { + var start = pos + var start_row = row + var start_col = col + var val = "" + var i = 0 + while (i < count) { val = val + character(adv()); i = i + 1 } + push(tokens, { + kind: "name", at: start, + from_row: start_row, from_column: start_col, + to_row: row, to_column: col, + value: val + }) +} + +function tokenize_one() { + var c = pk() + var start = 0 + var start_row = 0 + var start_col = 0 + var raw = "" + if (c == -1) return false + + if (c == CP_LF) { + start = pos; start_row = row; 
start_col = col + adv() + push(tokens, { kind: "newline", at: start, from_row: start_row, from_column: start_col, to_row: row, to_column: col, value: "\n" }) + return true + } + if (c == CP_CR) { + start = pos; start_row = row; start_col = col + adv() + if (pos < _src_len && pk() == CP_LF) adv() + push(tokens, { kind: "newline", at: start, from_row: start_row, from_column: start_col, to_row: row, to_column: col, value: "\n" }) + return true + } + if (c == CP_SPACE || c == CP_TAB) { + start = pos; start_row = row; start_col = col + while (pos < _src_len && (pk() == CP_SPACE || pk() == CP_TAB)) adv() + raw = substr(start, pos) + push(tokens, { kind: "space", at: start, from_row: start_row, from_column: start_col, to_row: row, to_column: col, value: raw }) + return true + } + if (c == CP_SQUOTE || c == CP_DQUOTE) { read_string(c); return true } + if (c == CP_BACKTICK) { read_template(); return true } + if (is_digit(c)) { read_number(); return true } + if (c == CP_DOT && is_digit(pk_at(1))) { read_number(); return true } + if (is_ident_start(c)) { read_name(); return true } + if (c == CP_SLASH) { + if (pk_at(1) == CP_SLASH || pk_at(1) == CP_STAR) { read_comment(); return true } + if (pk_at(1) == CP_EQ) { emit_op("/=", 2); return true } + if (pk_at(1) == CP_BANG) { emit_ident(2); return true } + emit_op("/", 1); return true + } + if (c == CP_STAR) { + if (pk_at(1) == CP_STAR) { + if (pk_at(2) == CP_BANG) { emit_ident(3); return true } + if (pk_at(2) == CP_EQ) { emit_op("**=", 3); return true } + emit_op("**", 2); return true + } + if (pk_at(1) == CP_EQ) { emit_op("*=", 2); return true } + if (pk_at(1) == CP_BANG) { emit_ident(2); return true } + emit_op("*", 1); return true + } + if (c == CP_PERCENT) { + if (pk_at(1) == CP_EQ) { emit_op("%=", 2); return true } + if (pk_at(1) == CP_BANG) { emit_ident(2); return true } + emit_op("%", 1); return true + } + if (c == CP_PLUS) { + if (pk_at(1) == CP_EQ) { emit_op("+=", 2); return true } + if (pk_at(1) == CP_PLUS) { 
emit_op("++", 2); return true } + if (pk_at(1) == CP_BANG) { emit_ident(2); return true } + emit_op("+", 1); return true + } + if (c == CP_MINUS) { + if (pk_at(1) == CP_EQ) { emit_op("-=", 2); return true } + if (pk_at(1) == CP_MINUS) { emit_op("--", 2); return true } + if (pk_at(1) == CP_BANG) { emit_ident(2); return true } + emit_op("-", 1); return true + } + if (c == CP_LT) { + if (pk_at(1) == CP_EQ && pk_at(2) == CP_BANG) { emit_ident(3); return true } + if (pk_at(1) == CP_EQ) { emit_op("<=", 2); return true } + if (pk_at(1) == CP_LT) { + if (pk_at(2) == CP_BANG) { emit_ident(3); return true } + if (pk_at(2) == CP_EQ) { emit_op("<<=", 3); return true } + emit_op("<<", 2); return true + } + if (pk_at(1) == CP_BANG) { emit_ident(2); return true } + emit_op("<", 1); return true + } + if (c == CP_GT) { + if (pk_at(1) == CP_EQ && pk_at(2) == CP_BANG) { emit_ident(3); return true } + if (pk_at(1) == CP_EQ) { emit_op(">=", 2); return true } + if (pk_at(1) == CP_GT) { + if (pk_at(2) == CP_GT) { + if (pk_at(3) == CP_BANG) { emit_ident(4); return true } + if (pk_at(3) == CP_EQ) { emit_op(">>>=", 4); return true } + emit_op(">>>", 3); return true + } + if (pk_at(2) == CP_BANG) { emit_ident(3); return true } + if (pk_at(2) == CP_EQ) { emit_op(">>=", 3); return true } + emit_op(">>", 2); return true + } + if (pk_at(1) == CP_BANG) { emit_ident(2); return true } + emit_op(">", 1); return true + } + if (c == CP_EQ) { + if (pk_at(1) == CP_EQ) { + if (pk_at(2) == CP_EQ) { emit_op("===", 3); return true } + emit_op("==", 2); return true + } + if (pk_at(1) == CP_GT) { emit_op("=>", 2); return true } + if (pk_at(1) == CP_BANG) { emit_ident(2); return true } + emit_op("=", 1); return true + } + if (c == CP_BANG) { + if (pk_at(1) == CP_EQ) { + if (pk_at(2) == CP_BANG) { emit_ident(3); return true } + if (pk_at(2) == CP_EQ) { emit_op("!==", 3); return true } + emit_op("!=", 2); return true + } + emit_op("!", 1); return true + } + if (c == CP_AMP) { + if (pk_at(1) == CP_AMP) { + if 
(pk_at(2) == CP_BANG) { emit_ident(3); return true } + if (pk_at(2) == CP_EQ) { emit_op("&&=", 3); return true } + emit_op("&&", 2); return true + } + if (pk_at(1) == CP_EQ) { emit_op("&=", 2); return true } + if (pk_at(1) == CP_BANG) { emit_ident(2); return true } + emit_op("&", 1); return true + } + if (c == CP_PIPE) { + if (pk_at(1) == CP_PIPE) { + if (pk_at(2) == CP_BANG) { emit_ident(3); return true } + if (pk_at(2) == CP_EQ) { emit_op("||=", 3); return true } + emit_op("||", 2); return true + } + if (pk_at(1) == CP_EQ) { emit_op("|=", 2); return true } + if (pk_at(1) == CP_BANG) { emit_ident(2); return true } + emit_op("|", 1); return true + } + if (c == CP_CARET) { + if (pk_at(1) == CP_EQ) { emit_op("^=", 2); return true } + if (pk_at(1) == CP_BANG) { emit_ident(2); return true } + emit_op("^", 1); return true + } + if (c == CP_LBRACKET) { + if (pk_at(1) == CP_RBRACKET && pk_at(2) == CP_BANG) { emit_ident(3); return true } + emit_op("[", 1); return true + } + if (c == CP_TILDE) { + if (pk_at(1) == CP_BANG) { emit_ident(2); return true } + emit_op("~", 1); return true + } + emit_op(character(c), 1) + return true +} + +// Tokenize +while (pos < _src_len) { + tokenize_one() +} +push(tokens, { kind: "eof", at: pos, from_row: row, from_column: col, to_row: row, to_column: col }) + +// ============================================================ +// Section 2: Parser Cursor +// ============================================================ + +var cursor = 0 +var tok = null +var got_lf = false +var prev_tok = null + +function advance() { + var t = null + var k = null + prev_tok = tok + cursor = cursor + 1 + got_lf = false + while (cursor < length(tokens)) { + t = tokens[cursor] + k = t.kind + if (k == "space" || k == "comment") { + cursor = cursor + 1 + continue + } + if (k == "newline") { + got_lf = true + cursor = cursor + 1 + continue + } + tok = t + return null + } + tok = tokens[length(tokens) - 1] +} + +function peek_ahead(n) { + var c = cursor + 1 + var count 
= 0 + var t = null + var k = null + while (c < length(tokens)) { + t = tokens[c] + k = t.kind + if (k != "space" && k != "comment" && k != "newline") { + count = count + 1 + if (count == n) return t + } + c = c + 1 + } + return tokens[length(tokens) - 1] +} + +function init_cursor() { + cursor = -1 + advance() +} + +// ============================================================ +// Section 3: AST Helpers +// ============================================================ + +var errors = [] +var error_count = 0 +var function_nr = 1 + +function ast_node(kind, token) { + return { + kind: kind, + at: token.at, + from_row: token.from_row, + from_column: token.from_column + } +} + +function ast_node_end(node) { + node.to_row = prev_tok.to_row + node.to_column = prev_tok.to_column + return node +} + +function parse_error(token, msg) { + if (error_count >= 5) return null + error_count = error_count + 1 + push(errors, { + message: msg, + line: token.from_row + 1, + column: token.from_column + 1, + offset: token.at + }) +} + +function is_keyword(kind) { + return kind == "if" || kind == "in" || kind == "do" || kind == "go" || + kind == "var" || kind == "def" || kind == "for" || + kind == "else" || kind == "this" || kind == "null" || kind == "true" || + kind == "false" || kind == "while" || kind == "break" || + kind == "return" || kind == "delete" || + kind == "disrupt" || kind == "function" || kind == "continue" || + kind == "disruption" +} + +// ============================================================ +// Section 4: Expression Parsing +// ============================================================ + +// Forward declarations via var +var parse_expr = null +var parse_assign_expr = null +var parse_assign = null +var parse_statement = null +var parse_block_statements = null +var parse_function_inner = null +var parse_arrow_function = null + +function is_arrow_function() { + // Check if ( ... 
) => pattern + if (tok.kind != "(") return false + var c = cursor + 1 + var depth = 1 + var k = null + while (c < length(tokens) && depth > 0) { + k = tokens[c].kind + if (k == "(") { depth = depth + 1 } + else if (k == ")") { depth = depth - 1 } + else if (k == "text" || k == "number") { null } + c = c + 1 + } + // Skip whitespace/newline/comment tokens + while (c < length(tokens)) { + k = tokens[c].kind + if (k != "space" && k != "newline" && k != "comment") break + c = c + 1 + } + if (c >= length(tokens)) return false + return tokens[c].kind == "=>" +} + +function parse_primary() { + var start = tok + var node = null + var k = tok.kind + var list = null + var pair = null + var left = null + var right = null + var is_ident = false + var is_kw = false + var p1 = null + var elem = null + var fn_start = null + var fn = null + var name_item = null + var params = null + var param = null + var rpos = 0 + var pattern_str = "" + var flags = "" + + if (k == "number") { + node = ast_node("number", start) + node.value = tok.value + node.number = tok.number + advance() + ast_node_end(node) + return node + } + if (k == "text") { + node = ast_node("text", start) + node.value = tok.value + advance() + ast_node_end(node) + return node + } + if (k == "name") { + // Check for single-param arrow: name => + p1 = peek_ahead(1) + if (p1.kind == "=>") { + return parse_arrow_function() + } + node = ast_node("name", start) + node.name = tok.value + advance() + ast_node_end(node) + return node + } + if (k == "null") { + node = ast_node("null", start) + advance() + ast_node_end(node) + return node + } + if (k == "true") { + node = ast_node("true", start) + advance() + ast_node_end(node) + return node + } + if (k == "false") { + node = ast_node("false", start) + advance() + ast_node_end(node) + return node + } + if (k == "this") { + node = ast_node("this", start) + advance() + ast_node_end(node) + return node + } + if (k == "[") { + node = ast_node("array", start) + list = [] + node.list = 
list + advance() + while (tok.kind != "]" && tok.kind != "eof") { + elem = parse_assign_expr() + if (elem != null) push(list, elem) + if (tok.kind == ",") advance() + else break + } + ast_node_end(node) + if (tok.kind == "]") advance() + else if (tok.kind == "eof") parse_error(tok, "unterminated array literal, expected ']'") + return node + } + if (k == "{") { + node = ast_node("record", start) + list = [] + node.list = list + advance() + while (tok.kind != "}" && tok.kind != "eof") { + pair = {} + is_ident = (tok.kind == "name") + is_kw = is_keyword(tok.kind) + if (is_ident || is_kw || tok.kind == "text" || tok.kind == "number") { + if (is_kw) { + left = ast_node("name", tok) + left.name = tok.kind + advance() + ast_node_end(left) + } else { + left = parse_primary() + } + pair.left = left + } else if (tok.kind == "[") { + advance() + left = parse_assign_expr() + pair.left = left + if (tok.kind == "]") advance() + else parse_error(tok, "expected ']' after computed property") + } else { + parse_error(tok, "expected property name in object literal") + break + } + if (tok.kind == ":") { + advance() + right = parse_assign_expr() + pair.right = right + } else if (tok.kind == "(") { + // Method shorthand + fn_start = tok + fn = ast_node("function", fn_start) + name_item = pair.left + if (name_item != null && name_item.name != null) { + fn.name = name_item.name + } + params = [] + fn.list = params + advance() + while (tok.kind != ")" && tok.kind != "eof") { + if (tok.kind == "name") { + param = ast_node("name", tok) + param.name = tok.value + advance() + ast_node_end(param) + if (tok.kind == "=" || tok.kind == "|") { + advance() + param.expression = parse_assign_expr() // a single assignment expression, so ',' still separates parameters (matches parse_function_inner) + } + push(params, param) + } else { + parse_error(tok, "expected parameter name") + break + } + if (tok.kind == ",") advance() + else break + } + if (tok.kind == ")") advance() + else if (tok.kind == "eof") parse_error(tok, "unterminated method parameter list") + if (length(params) > 4) parse_error(tok, 
"functions cannot have more than 4 parameters") + if (tok.kind == "{") { + advance() + fn.statements = parse_block_statements() + if (tok.kind == "}") advance() + else if (tok.kind == "eof") parse_error(tok, "unterminated method body") + } else { + parse_error(tok, "expected '{' for method body") + } + fn.function_nr = function_nr + function_nr = function_nr + 1 + ast_node_end(fn) + pair.right = fn + } else if (!(is_ident && (tok.kind == "," || tok.kind == "}"))) { + parse_error(tok, "expected ':' after property name") + } + push(list, pair) + if (tok.kind == ",") advance() + else break + } + ast_node_end(node) + if (tok.kind == "}") advance() + else if (tok.kind == "eof") parse_error(tok, "unterminated object literal, expected '}'") + return node + } + if (k == "(") { + if (is_arrow_function()) { + return parse_arrow_function() + } + advance() + node = parse_expr() + if (tok.kind == ")") advance() + else if (tok.kind == "eof") parse_error(tok, "unterminated parenthesized expression, expected ')'") + else parse_error(tok, "expected ')' after expression") + return node + } + if (k == "function") { + return parse_function_inner() + } + if (k == "/") { + // Regex literal: re-scanned from the raw source, starting just after the '/' token + node = ast_node("regexp", start) + rpos = tok.at + 1 + pattern_str = "" + flags = "" + while (rpos < _src_len && cp[rpos] != CP_SLASH) { + if (cp[rpos] == CP_BSLASH && rpos + 1 < _src_len) { + pattern_str = pattern_str + character(cp[rpos]) + character(cp[rpos + 1]) + rpos = rpos + 2 + } else { + pattern_str = pattern_str + character(cp[rpos]) + rpos = rpos + 1 + } + } + if (rpos < _src_len) rpos = rpos + 1 + while (rpos < _src_len && is_alpha(cp[rpos])) { + flags = flags + character(cp[rpos]) + rpos = rpos + 1 + } + node.pattern = pattern_str + if (length(flags) > 0) node.flags = flags + while (cursor + 1 < length(tokens) && tokens[cursor + 1].at < rpos) { cursor = cursor + 1; tok = tokens[cursor] } // skip the tokens already produced for the regex text so parsing resumes after it + advance() + ast_node_end(node) + return node + } + + // Error + if (k == "eof") { + parse_error(start, "unexpected end of input") + } else { + parse_error(start, 
"unexpected token where expression expected") + } + advance() + return null +} + +function parse_postfix() { + var node = parse_primary() + var start = null + var new_node = null + var index = null + var arg = null + var args_list = null + if (node == null) return null + while (true) { + start = tok + if (tok.kind == ".") { + advance() + new_node = ast_node(".", start) + new_node.left = node + if (tok.kind == "name" || is_keyword(tok.kind)) { + if (tok.kind == "name") { + new_node.right = tok.value + } else { + new_node.right = tok.kind + } + advance() + } else { + parse_error(tok, "expected property name after '.'") + } + ast_node_end(new_node) + node = new_node + } else if (tok.kind == "[") { + advance() + new_node = ast_node("[", start) + new_node.left = node + if (tok.kind == "]") { + advance() + } else { + index = parse_assign_expr() + new_node.right = index + if (tok.kind == "]") advance() + else parse_error(tok, "expected ']'") + } + ast_node_end(new_node) + node = new_node + } else if (tok.kind == "(") { + advance() + new_node = ast_node("(", start) + new_node.expression = node + args_list = [] + new_node.list = args_list + while (tok.kind != ")" && tok.kind != "eof") { + arg = parse_assign_expr() + if (arg != null) push(args_list, arg) + if (tok.kind == ",") advance() + else break + } + if (tok.kind == ")") advance() + else parse_error(tok, "unterminated argument list, expected ')'") + ast_node_end(new_node) + node = new_node + } else if (tok.kind == "++") { + new_node = ast_node("++", start) + new_node.expression = node + new_node.postfix = true + advance() + ast_node_end(new_node) + node = new_node + } else if (tok.kind == "--") { + new_node = ast_node("--", start) + new_node.expression = node + new_node.postfix = true + advance() + ast_node_end(new_node) + node = new_node + } else { + break + } + } + return node +} + +function parse_unary() { + var start = tok + var node = null + var expr = null + var k = tok.kind + if (k == "!") { + advance() + node = 
ast_node("!", start) + node.expression = parse_unary() + ast_node_end(node) + return node + } + if (k == "~") { + advance() + node = ast_node("~", start) + node.expression = parse_unary() + ast_node_end(node) + return node + } + if (k == "+") { + advance() + node = ast_node("+unary", start) + node.expression = parse_unary() + ast_node_end(node) + return node + } + if (k == "-") { + advance() + node = ast_node("-unary", start) + node.expression = parse_unary() + ast_node_end(node) + return node + } + if (k == "++") { + advance() + node = ast_node("++", start) + node.expression = parse_unary() + node.postfix = false + ast_node_end(node) + return node + } + if (k == "--") { + advance() + node = ast_node("--", start) + node.expression = parse_unary() + node.postfix = false + ast_node_end(node) + return node + } + if (k == "delete") { + advance() + node = ast_node("delete", start) + node.expression = parse_unary() + ast_node_end(node) + return node + } + return parse_postfix() +} + +// Binary operator precedence +var binop_prec = { + "**": 14, + "*": 13, "/": 13, "%": 13, + "+": 12, "-": 12, + "<<": 11, ">>": 11, ">>>": 11, + "<": 10, ">": 10, "<=": 10, ">=": 10, in: 10, + "==": 9, "!=": 9, "===": 9, "!==": 9, + "&": 8, "^": 7, "|": 6, + "&&": 5, "||": 4 +} + +function parse_binary(min_prec) { + var left_node = parse_unary() + var start = null + var op = null + var prec = null + var next_prec = 0 + var right_node = null + var node = null + if (left_node == null) return null + while (true) { + start = tok + op = tok.kind + prec = binop_prec[op] + if (prec == null || prec < min_prec) break + advance() + next_prec = prec + 1 + if (prec == 14) next_prec = prec // right-assoc for ** + right_node = parse_binary(next_prec) + node = ast_node(op, start) + node.left = left_node + node.right = right_node + ast_node_end(node) + left_node = node + } + return left_node +} + +function parse_ternary() { + var cond = parse_binary(1) + var start = null + var then_expr = null + var 
else_expr = null + var node = null + if (cond == null) return null + if (tok.kind == "?") { + start = tok + advance() + then_expr = parse_expr() + if (tok.kind == ":") advance() + else parse_error(tok, "expected ':' in ternary expression") + else_expr = parse_expr() + node = ast_node("then", start) + node.expression = cond + node.then = then_expr + node.else = else_expr + ast_node_end(node) + return node + } + return cond +} + +// Assign operators +var assign_ops = { + "=": "assign", "+=": "+=", "-=": "-=", "*=": "*=", "/=": "/=", "%=": "%=", + "<<=": "<<=", ">>=": ">>=", ">>>=": ">>>=", + "&=": "&=", "^=": "^=", "|=": "|=", "**=": "**=", + "&&=": "&&=", "||=": "||=" +} + +parse_assign = function(unused) { + var left_node = parse_ternary() + var start = null + var kind = null + var right_node = null + var node = null + var left_kind = null + var right_kind = null + if (left_node == null) return null + start = tok + kind = assign_ops[tok.kind] + if (kind == null) return left_node + + // Validate assignment target + left_kind = left_node.kind + if (left_kind != "name" && left_kind != "." 
&& left_kind != "[") { + parse_error(start, "invalid assignment left-hand side") + } + + advance() + right_node = parse_assign() + node = ast_node(kind, start) + node.left = left_node + node.right = right_node + + // Check push/pop bracket syntax + if (left_node.kind == "[" && left_node.right == null) node.push = true + if (right_node != null && right_node.kind == "[" && right_node.right == null) node.pop = true + + ast_node_end(node) + return node +} + +parse_assign_expr = function(unused) { + return parse_assign() +} + +parse_expr = function(unused) { + var left_node = parse_assign() + var start = null + var right_node = null + var node = null + if (left_node == null) return null + while (tok.kind == ",") { + start = tok + advance() + right_node = parse_assign() + node = ast_node(",", start) + node.left = left_node + node.right = right_node + ast_node_end(node) + left_node = node + } + return left_node +} + +// ============================================================ +// Section 5: Statement Parsing +// ============================================================ + +var in_disruption = 0 + +function expect_semi() { + if (tok.kind == ";") { advance(); return null } + if (tok.kind == "eof" || tok.kind == "}" || got_lf || tok.kind == "else") return null + parse_error(tok, "expecting ';'") +} + +function sync_to_statement() { + var k = null + while (tok.kind != "eof") { + k = tok.kind + if (k == ";") { advance(); return null } + if (k == "}") return null + if (k == "var" || k == "def" || k == "if" || k == "while" || + k == "for" || k == "return" || k == "disrupt" || + k == "function" || k == "break" || k == "continue" || k == "do") return null + advance() + } +} + +parse_block_statements = function(unused) { + var stmts = [] + var before = null + var stmt = null + while (tok.kind != "}" && tok.kind != "eof") { + before = cursor + stmt = parse_statement() + if (stmt != null) { + push(stmts, stmt) + } else if (cursor == before) { + sync_to_statement() + } + } + 
return stmts +} + +parse_function_inner = function(unused) { + var start = tok + var node = ast_node("function", start) + var params = [] + var stmts = null + var param = null + var prev_names = null + var pname = null + var dup = false + var j = 0 + var old_dis = 0 + + if (in_disruption) { + parse_error(tok, "cannot define function inside disruption clause") + } + + advance() // skip 'function' + + // Optional name + if (tok.kind == "name") { + node.name = tok.value + advance() + } + + // Parameters + node.list = params + if (tok.kind == "(") { + advance() + prev_names = [] + while (tok.kind != ")" && tok.kind != "eof") { + if (tok.kind == "name") { + param = ast_node("name", tok) + param.name = tok.value + // Check duplicate + pname = tok.value + dup = false + j = 0 + while (j < length(prev_names)) { + if (prev_names[j] == pname) { dup = true; break } + j = j + 1 + } + if (dup) parse_error(tok, "duplicate parameter name '" + pname + "'") + push(prev_names, pname) + advance() + ast_node_end(param) + if (tok.kind == "=" || tok.kind == "|") { + advance() + param.expression = parse_assign_expr() + } + push(params, param) + } else { + parse_error(tok, "expected parameter name") + break + } + if (tok.kind == ",") advance() + else break + } + if (tok.kind == ")") advance() + else if (tok.kind == "eof") parse_error(tok, "unterminated function parameter list, expected ')'") + } else { + parse_error(tok, "expected '(' after function name") + } + + if (length(params) > 4) parse_error(tok, "functions cannot have more than 4 parameters") + + // Body + if (tok.kind == "{") { + advance() + stmts = parse_block_statements() + node.statements = stmts + if (tok.kind == "}") advance() + else if (tok.kind == "eof") parse_error(tok, "unterminated function body, expected '}'") + } else { + parse_error(tok, "expected '{' for function body") + } + + // Disruption clause + if (tok.kind == "disruption") { + advance() + if (tok.kind == "{") { + advance() + old_dis = in_disruption + 
in_disruption = 1 + node.disruption = parse_block_statements() + in_disruption = old_dis + if (tok.kind == "}") advance() + else if (tok.kind == "eof") parse_error(tok, "unterminated disruption clause, expected '}'") + } else { + parse_error(tok, "expected '{' after disruption") + } + } + + node.function_nr = function_nr + function_nr = function_nr + 1 + ast_node_end(node) + return node +} + +parse_arrow_function = function(unused) { + var start = tok + var node = ast_node("function", start) + var params = [] + var param = null + var stmts = null + var ret = null + var expr = null + var prev_names = null + var pname = null + var dup = false + var j = 0 + node.arrow = true + + if (in_disruption) { + parse_error(tok, "cannot define function inside disruption clause") + } + + node.list = params + + if (tok.kind == "name") { + // Single param without parens + param = ast_node("name", tok) + param.name = tok.value + advance() + ast_node_end(param) + push(params, param) + } else if (tok.kind == "(") { + advance() + prev_names = [] + while (tok.kind != ")" && tok.kind != "eof") { + if (tok.kind == "name") { + param = ast_node("name", tok) + param.name = tok.value + pname = tok.value + dup = false + j = 0 + while (j < length(prev_names)) { + if (prev_names[j] == pname) { dup = true; break } + j = j + 1 + } + if (dup) parse_error(tok, "duplicate parameter name '" + pname + "'") + push(prev_names, pname) + advance() + ast_node_end(param) + if (tok.kind == "=" || tok.kind == "|") { + advance() + param.expression = parse_assign_expr() + } + push(params, param) + } else { + parse_error(tok, "expected parameter name") + break + } + if (tok.kind == ",") advance() + else break + } + if (tok.kind == ")") advance() + } + + if (length(params) > 4) parse_error(tok, "functions cannot have more than 4 parameters") + + // Arrow token + if (tok.kind != "=>") { + parse_error(tok, "expected '=>' in arrow function") + } else { + advance() + } + + // Body + if (tok.kind == "{") { + advance() 
+ stmts = parse_block_statements() + node.statements = stmts + if (tok.kind == "}") advance() + } else { + // Expression body + stmts = [] + ret = ast_node("return", tok) + expr = parse_assign_expr() + ret.expression = expr + ast_node_end(ret) + push(stmts, ret) + node.statements = stmts + } + + node.function_nr = function_nr + function_nr = function_nr + 1 + ast_node_end(node) + return node +} + +parse_statement = function(unused) { + var start = tok + var node = null + var k = tok.kind + var stmts = null + var cond = null + var then_stmts = null + var else_stmts = null + var else_ifs = null + var body = null + var expr = null + var init = null + var test = null + var update = null + var left_node = null + var right_node = null + var kind_name = null + var is_def = false + var decls = null + var decl_count = 0 + var var_name = null + var right_kind = null + var elif = null + var p1_tok = null + var labeled_stmt = null + + if (k == "{") { + node = ast_node("block", start) + advance() + stmts = parse_block_statements() + node.statements = stmts + if (tok.kind == "}") advance() + ast_node_end(node) + return node + } + + if (k == "var" || k == "def") { + kind_name = k + is_def = (k == "def") + advance() + if (tok.kind != "name") { + parse_error(tok, "expected identifier after '" + kind_name + "'") + return null + } + decls = [] + decl_count = 0 + while (tok.kind == "name") { + node = ast_node(kind_name, start) + left_node = ast_node("name", tok) + left_node.name = tok.value + var_name = tok.value + advance() + ast_node_end(left_node) + node.left = left_node + if (tok.kind == "=") { + advance() + right_node = parse_assign_expr() + node.right = right_node + if (right_node != null && right_node.kind == "[" && right_node.right == null) { + node.pop = true + } + } else if (is_def) { + parse_error(start, "missing initializer for constant '" + var_name + "'") + } + ast_node_end(node) + push(decls, node) + decl_count = decl_count + 1 + if (tok.kind == ",") advance() + else 
break + } + expect_semi() + if (decl_count == 1) { + return decls[0] + } + node = ast_node("var_list", start) + node.list = decls + ast_node_end(node) + return node + } + + if (k == "if") { + node = ast_node("if", start) + advance() + if (tok.kind == "(") advance() + else parse_error(tok, "expected '(' before condition") + cond = parse_expr() + node.expression = cond + if (tok.kind == ")") advance() + else parse_error(tok, "expected ')' after if condition") + then_stmts = [] + node.then = then_stmts + body = parse_statement() + if (body != null) push(then_stmts, body) + else_ifs = [] + node.list = else_ifs + if (tok.kind == "else") { + advance() + if (tok.kind == "if") { + elif = parse_statement() + if (elif != null) push(else_ifs, elif) + } else { + else_stmts = [] + node.else = else_stmts + body = parse_statement() + if (body != null) push(else_stmts, body) + } + } + ast_node_end(node) + return node + } + + if (k == "while") { + node = ast_node("while", start) + advance() + if (tok.kind == "(") advance() + else parse_error(tok, "expected '(' before condition") + cond = parse_expr() + node.expression = cond + if (tok.kind == ")") advance() + else parse_error(tok, "expected ')' after while condition") + stmts = [] + node.statements = stmts + body = parse_statement() + if (body != null) push(stmts, body) + ast_node_end(node) + return node + } + + if (k == "do") { + node = ast_node("do", start) + advance() + stmts = [] + node.statements = stmts + body = parse_statement() + if (body != null) push(stmts, body) + if (tok.kind == "while") advance() + else parse_error(tok, "expected 'while' after do body") + if (tok.kind == "(") advance() + else parse_error(tok, "expected '(' before condition") + cond = parse_expr() + node.expression = cond + if (tok.kind == ")") advance() + else parse_error(tok, "expected ')' after do-while condition") + expect_semi() + ast_node_end(node) + return node + } + + if (k == "for") { + node = ast_node("for", start) + advance() + if (tok.kind 
== "(") advance() + else parse_error(tok, "expected '(' after for") + if (tok.kind != ";") { + if (tok.kind == "var" || tok.kind == "def") { + init = parse_statement() + node.init = init + } else { + init = parse_expr() + node.init = init + if (tok.kind == ";") advance() + } + } else { + advance() + } + if (tok.kind != ";") { + test = parse_expr() + node.test = test + } + if (tok.kind == ";") advance() + if (tok.kind != ")") { + update = parse_expr() + node.update = update + } + if (tok.kind == ")") advance() + else parse_error(tok, "expected ')' after for clauses") + stmts = [] + node.statements = stmts + body = parse_statement() + if (body != null) push(stmts, body) + ast_node_end(node) + return node + } + + if (k == "return") { + node = ast_node("return", start) + advance() + if (tok.kind != ";" && tok.kind != "}" && !got_lf) { + expr = parse_expr() + node.expression = expr + } + expect_semi() + ast_node_end(node) + return node + } + + if (k == "go") { + node = ast_node("go", start) + advance() + if (tok.kind != ";" && tok.kind != "}" && !got_lf) { + expr = parse_expr() + node.expression = expr + } + expect_semi() + ast_node_end(node) + return node + } + + if (k == "disrupt") { + node = ast_node("disrupt", start) + advance() + expect_semi() + ast_node_end(node) + return node + } + + if (k == "break") { + node = ast_node("break", start) + advance() + if (tok.kind == "name" && !got_lf) { + node.name = tok.value + advance() + } + expect_semi() + ast_node_end(node) + return node + } + + if (k == "continue") { + node = ast_node("continue", start) + advance() + if (tok.kind == "name" && !got_lf) { + node.name = tok.value + advance() + } + expect_semi() + ast_node_end(node) + return node + } + + if (k == "function") { + return parse_function_inner() + } + + if (k == ";") { + advance() + return null + } + + if (k == "name") { + // Check for labeled statement + p1_tok = peek_ahead(1) + if (p1_tok.kind == ":") { + node = ast_node("label", start) + node.name = tok.value + 
advance() // skip identifier + advance() // skip colon + labeled_stmt = parse_statement() + node.statement = labeled_stmt + ast_node_end(node) + return node + } + } + + // Expression statement + expr = parse_expr() + if (expr != null) { + node = ast_node("call", start) + node.expression = expr + ast_node_end(node) + expect_semi() + return node + } + parse_error(start, "unexpected token at start of statement") + return null +} + +// ============================================================ +// Section 6: Program +// ============================================================ + +function parse_program() { + var root = {kind: "program", filename: filename} + var functions = [] + var statements = [] + var before = 0 + var stmt = null + root.functions = functions + root.statements = statements + + while (tok.kind != "eof") { + before = cursor + stmt = parse_statement() + if (stmt != null) { + if (stmt.kind == "function") { + push(functions, stmt) + } else { + push(statements, stmt) + } + } else if (cursor == before) { + sync_to_statement() + } + } + return root +} + +// ============================================================ +// Section 7: Semantic Analysis +// ============================================================ + +var sem_errors = [] +var scopes_array = [] +var intrinsics = [] +var block_var_counter = 0 + +function sem_error(node, msg) { + var err = {message: msg} + if (node.from_row != null) err.line = node.from_row + 1 + if (node.from_column != null) err.column = node.from_column + 1 + push(sem_errors, err) +} + +function make_scope(parent, fn_nr, opts) { + return { + parent: parent, + vars: [], + in_loop: opts.in_loop == true, + function_nr: fn_nr, + is_function_scope: opts.is_func == true, + block_depth: opts.bdepth != null ? 
opts.bdepth : 0
+  }
+}
+
+function sem_add_var(scope, name, make_opts) { // Appends a fresh entry for `name` to scope.vars; is_const, make and fn_nr are read from make_opts.
+  push(scope.vars, {
+    name: name,
+    scope_name: null, // set afterwards by sem_check_stmt for declarations made at block_depth > 0
+    is_const: make_opts.is_const == true, // an absent is_const option is stored as false
+    make: make_opts.make,
+    function_nr: make_opts.fn_nr,
+    nr_uses: 0,
+    closure: 0
+  })
+}
+
+function sem_lookup_var(scope, name) { // Searches scope, then each parent, for `name`; returns {v, level, def_function_nr} with v == null when nothing matches.
+  var result = {v: null, level: 0, def_function_nr: -1}
+  var cur_fn = scope.function_nr
+  var s = scope
+  var i = 0
+  while (s != null) {
+    i = 0
+    while (i < length(s.vars)) {
+      if (s.vars[i].name == name) {
+        result.v = s.vars[i]
+        result.def_function_nr = s.vars[i].function_nr
+        return result
+      }
+      i = i + 1
+    }
+    if (s.parent != null && s.parent.function_nr != cur_fn) { // stepping into a scope with a different function_nr: one more level
+      result.level = result.level + 1
+      cur_fn = s.parent.function_nr
+    }
+    s = s.parent
+  }
+  return result
+}
+
+function sem_find_var(scope, name) { // Same search as sem_lookup_var, returning only the variable entry (or null).
+  var r = sem_lookup_var(scope, name)
+  return r.v
+}
+
+function sem_in_loop(scope) { // True when scope, or an enclosing scope inside the same function, was created with in_loop.
+  var s = scope
+  while (s != null) {
+    if (s.in_loop) return true
+    s = s.is_function_scope ? null : s.parent // stop at the function scope: a loop outside the function is not a valid break/continue target
+  }
+  return false
+}
+
+function sem_add_intrinsic(name) { // Records `name` in intrinsics unless it is already present.
+  var i = 0
+  while (i < length(intrinsics)) {
+    if (intrinsics[i] == name) return null
+    i = i + 1
+  }
+  push(intrinsics, name)
+}
+
+var functino_names = { // names that sem_check_expr tags with make = "functino" instead of resolving as variables
+  "+!": true, "-!": true, "*!": true, "/!": true, "%!": true, "**!": true,
+  "<!": true, ">!": true, "<=!": true, ">=!": true, "=!": true, "!=!": true,
+  "&!": true, "|!": true, "^!": true, "<<!": true, ">>!": true, ">>>!": true,
+  "&&!": true, "||!": true, "~!": true, "[]!": true
+}
+
+function is_functino_name(name) { // True only for keys present in functino_names.
+  return functino_names[name] == true
+}
+
+function sem_propagate_block_vars(parent, block) { // Copies every variable of a finished block scope into parent.vars.
+  var i = 0
+  var v = null
+  var sn = null
+  while (i < length(block.vars)) {
+    v = block.vars[i]
+    sn = v.scope_name
+    if (sn == null) sn = v.name // no block-unique name was assigned, keep the declared name
+    push(parent.vars, {
+      name: sn,
+      scope_name: null,
+      is_const: v.is_const,
+      make: v.make,
+      function_nr: v.function_nr,
+      nr_uses: v.nr_uses,
+      closure: v.closure
+    })
+    i = i + 1
+  }
+}
+
+function sem_build_scope_record(scope) {
+  var rec =
{function_nr: scope.function_nr} + var slots = 0 + var close_slots = 0 + var i = 0 + var v = null + while (i < length(scope.vars)) { + v = scope.vars[i] + rec[v.name] = { + make: v.make, + function_nr: v.function_nr, + nr_uses: v.nr_uses, + closure: v.closure == 1, + level: 0 + } + slots = slots + 1 + if (v.closure) close_slots = close_slots + 1 + i = i + 1 + } + return {rec: rec, nr_slots: slots, nr_close: close_slots} +} + +// Forward declarations +var sem_check_expr = null +var sem_check_stmt = null + +function sem_predeclare_vars(scope, stmts) { + var i = 0 + var stmt = null + var kind = null + var name = null + var item = null + var ik = null + var j = 0 + while (i < length(stmts)) { + stmt = stmts[i] + kind = stmt.kind + if (kind == "function") { + name = stmt.name + if (name != null && sem_find_var(scope, name) == null) { + sem_add_var(scope, name, {make: "function", fn_nr: scope.function_nr}) + } + } else if (kind == "var") { + name = stmt.left.name + if (name != null && sem_find_var(scope, name) == null) { + sem_add_var(scope, name, {make: "var", fn_nr: scope.function_nr}) + } + } else if (kind == "var_list") { + j = 0 + while (j < length(stmt.list)) { + item = stmt.list[j] + ik = item.kind + if (ik == "var") { + name = item.left.name + if (name != null && sem_find_var(scope, name) == null) { + sem_add_var(scope, name, {make: "var", fn_nr: scope.function_nr}) + } + } + j = j + 1 + } + } + i = i + 1 + } +} + +function sem_check_assign_target(scope, left_node) { + if (left_node == null) return null + var kind = left_node.kind + var name = null + var v = null + var r = null + var obj_expr = null + + if (kind == "name") { + name = left_node.name + if (name == null) return null + v = sem_find_var(scope, name) + if (v == null) { + sem_error(left_node, "cannot assign to unbound variable '" + name + "'") + } else if (v.is_const) { + sem_error(left_node, "cannot assign to constant '" + name + "'") + } + r = sem_lookup_var(scope, name) + if (r.v != null) { + 
left_node.level = r.level
+      left_node.function_nr = r.def_function_nr
+      if (r.v.scope_name != null) left_node.scope_name = r.v.scope_name
+    } else {
+      left_node.level = -1
+    }
+  } else if (kind == "." || kind == "[") {
+    obj_expr = left_node.left
+    sem_check_expr(scope, obj_expr)
+    if (kind == "[" && left_node.right != null) {
+      sem_check_expr(scope, left_node.right)
+    }
+  }
+}
+
+sem_check_expr = function(scope, expr) { // Walks one expression node: resolves names against scope, annotates nodes, records sem errors; always returns null.
+  if (expr == null) return null
+  var kind = expr.kind
+  if (kind == null) return null // values without a kind field are ignored
+  var name = null
+  var r = null
+  var i = 0
+  var operand = null
+  var v = null
+  var prop = null
+  var val = null
+  var fn_nr_val = null
+  var fn_scope = null
+  var pname = null
+  var def_val = null
+  var sr = null
+
+  // Assignment operators
+  if (kind == "assign" || kind == "+=" || kind == "-=" || kind == "*=" ||
+      kind == "/=" || kind == "%=" || kind == "<<=" || kind == ">>=" ||
+      kind == ">>>=" || kind == "&=" || kind == "^=" || kind == "|=" ||
+      kind == "**=" || kind == "&&=" || kind == "||=") {
+    sem_check_assign_target(scope, expr.left)
+    sem_check_expr(scope, expr.right)
+    return null
+  }
+
+  // Increment/decrement
+  if (kind == "++" || kind == "--") {
+    operand = expr.expression
+    if (operand != null && operand.kind == "name") {
+      name = operand.name
+      if (name != null) {
+        v = sem_find_var(scope, name)
+        if (v == null) {
+          sem_error(expr, "cannot assign to unbound variable '" + name + "'")
+        } else if (v.is_const) {
+          sem_error(expr, "cannot assign to constant '" + name + "'")
+        }
+        r = sem_lookup_var(scope, name)
+        if (r.v != null) {
+          operand.level = r.level
+          operand.function_nr = r.def_function_nr
+          if (r.v.scope_name != null) operand.scope_name = r.v.scope_name
+        } else {
+          operand.level = -1
+        }
+      }
+    } else sem_check_expr(scope, operand) // non-name operand (e.g. a[i]++): visit its sub-expressions, as sem_check_assign_target does for "." and "[" targets
+    return null
+  }
+
+  // Binary ops
+  if (kind == "," || kind == "+" || kind == "-" || kind == "*" ||
+      kind == "/" || kind == "%" || kind == "==" || kind == "!=" ||
+      kind == "<" || kind == ">" || kind == "<=" || kind == ">=" ||
+      kind == "&&"
|| kind == "||" || kind == "&" || + kind == "|" || kind == "^" || kind == "<<" || kind == ">>" || + kind == ">>>" || kind == "**" || kind == "in" || + kind == "." || kind == "[") { + sem_check_expr(scope, expr.left) + sem_check_expr(scope, expr.right) + return null + } + + // Ternary + if (kind == "then") { + sem_check_expr(scope, expr.expression) + sem_check_expr(scope, expr.then) + sem_check_expr(scope, expr.else) + return null + } + + // Call + if (kind == "(") { + sem_check_expr(scope, expr.expression) + i = 0 + while (i < length(expr.list)) { + sem_check_expr(scope, expr.list[i]) + i = i + 1 + } + return null + } + + // Unary ops + if (kind == "!" || kind == "~" || kind == "delete" || + kind == "-unary" || kind == "+unary") { + sem_check_expr(scope, expr.expression) + return null + } + + // Array literal + if (kind == "array") { + i = 0 + while (i < length(expr.list)) { + sem_check_expr(scope, expr.list[i]) + i = i + 1 + } + return null + } + + // Record literal + if (kind == "record") { + i = 0 + while (i < length(expr.list)) { + prop = expr.list[i] + val = prop.right + sem_check_expr(scope, val) + i = i + 1 + } + return null + } + + // Function expression + if (kind == "function") { + fn_nr_val = expr.function_nr + if (fn_nr_val == null) fn_nr_val = scope.function_nr + fn_scope = make_scope(scope, fn_nr_val, {is_func: true}) + expr.outer = scope.function_nr + // Add params + i = 0 + while (i < length(expr.list)) { + pname = expr.list[i].name + if (pname != null) sem_add_var(fn_scope, pname, {is_const: true, make: "input", fn_nr: fn_nr_val}) + def_val = expr.list[i].expression + if (def_val != null) sem_check_expr(fn_scope, def_val) + i = i + 1 + } + // Pre-register declarations + if (expr.statements != null) { + sem_predeclare_vars(fn_scope, expr.statements) + i = 0 + while (i < length(expr.statements)) { + sem_check_stmt(fn_scope, expr.statements[i]) + i = i + 1 + } + } + // Disruption + if (expr.disruption != null) { + i = 0 + while (i < 
length(expr.disruption)) { + sem_check_stmt(fn_scope, expr.disruption[i]) + i = i + 1 + } + } + // Build scope record + sr = sem_build_scope_record(fn_scope) + push(scopes_array, sr.rec) + expr.nr_slots = sr.nr_slots + expr.nr_close_slots = sr.nr_close + return null + } + + // Template literal + if (kind == "text literal") { + i = 0 + while (i < length(expr.list)) { + sem_check_expr(scope, expr.list[i]) + i = i + 1 + } + return null + } + + // Name + if (kind == "name") { + name = expr.name + if (name != null) { + if (is_functino_name(name)) { + expr.make = "functino" + expr.level = -1 + return null + } + r = sem_lookup_var(scope, name) + if (r.v != null) { + expr.level = r.level + expr.function_nr = r.def_function_nr + r.v.nr_uses = r.v.nr_uses + 1 + if (r.level > 0) r.v.closure = 1 + if (r.v.scope_name != null) expr.scope_name = r.v.scope_name + } else { + expr.level = -1 + sem_add_intrinsic(name) + } + } + return null + } + + // Leaf nodes: number, text, regexp, null, true, false, this +} + +sem_check_stmt = function(scope, stmt) { + if (stmt == null) return null + var kind = stmt.kind + if (kind == null) return null + var name = null + var existing = null + var i = 0 + var sn = null + var then_scope = null + var list_scope = null + var else_scope = null + var loop_scope = null + var do_scope = null + var for_scope = null + var init_kind = null + var blk_scope = null + var fn_nr_val = null + var fn_scope = null + var pname = null + var def_val = null + var sr = null + + if (kind == "var_list") { + i = 0 + while (i < length(stmt.list)) { + sem_check_stmt(scope, stmt.list[i]) + i = i + 1 + } + return null + } + + if (kind == "var") { + name = stmt.left.name + if (name != null) { + existing = sem_find_var(scope, name) + if (existing != null && existing.is_const) { + sem_error(stmt.left, "cannot redeclare constant '" + name + "'") + } + if (existing == null || existing.function_nr != scope.function_nr || scope.block_depth > 0) { + sem_add_var(scope, name, {make: 
"var", fn_nr: scope.function_nr}) + } + if (scope.block_depth > 0) { + sn = "_" + name + "_" + text(block_var_counter) + block_var_counter = block_var_counter + 1 + scope.vars[length(scope.vars) - 1].scope_name = sn + stmt.left.scope_name = sn + } + } + sem_check_expr(scope, stmt.right) + return null + } + + if (kind == "def") { + name = stmt.left.name + if (name != null) { + existing = sem_find_var(scope, name) + if (existing != null && existing.is_const) { + sem_error(stmt.left, "cannot redeclare constant '" + name + "'") + } else if (existing != null && !existing.is_const && existing.function_nr == scope.function_nr) { + existing.is_const = 1 + existing.make = "def" + } else { + sem_add_var(scope, name, {is_const: true, make: "def", fn_nr: scope.function_nr}) + if (scope.block_depth > 0) { + sn = "_" + name + "_" + text(block_var_counter) + block_var_counter = block_var_counter + 1 + scope.vars[length(scope.vars) - 1].scope_name = sn + stmt.left.scope_name = sn + } + } + } + sem_check_expr(scope, stmt.right) + return null + } + + if (kind == "call") { + sem_check_expr(scope, stmt.expression) + return null + } + + if (kind == "if") { + sem_check_expr(scope, stmt.expression) + // then + then_scope = make_scope(scope, scope.function_nr, {bdepth: scope.block_depth + 1}) + i = 0 + while (i < length(stmt.then)) { + sem_check_stmt(then_scope, stmt.then[i]) + i = i + 1 + } + sem_propagate_block_vars(scope, then_scope) + // else-if list + list_scope = make_scope(scope, scope.function_nr, {bdepth: scope.block_depth + 1}) + i = 0 + while (i < length(stmt.list)) { + sem_check_stmt(list_scope, stmt.list[i]) + i = i + 1 + } + sem_propagate_block_vars(scope, list_scope) + // else + if (stmt.else != null) { + else_scope = make_scope(scope, scope.function_nr, {bdepth: scope.block_depth + 1}) + i = 0 + while (i < length(stmt.else)) { + sem_check_stmt(else_scope, stmt.else[i]) + i = i + 1 + } + sem_propagate_block_vars(scope, else_scope) + } + return null + } + + if (kind == 
"while") {
+    sem_check_expr(scope, stmt.expression) // the condition is checked in the enclosing scope, before the body scope exists
+    loop_scope = make_scope(scope, scope.function_nr, {in_loop: true, bdepth: scope.block_depth + 1})
+    i = 0
+    while (i < length(stmt.statements)) {
+      sem_check_stmt(loop_scope, stmt.statements[i])
+      i = i + 1
+    }
+    sem_propagate_block_vars(scope, loop_scope)
+    return null
+  }
+
+  if (kind == "do") { // body first in its own loop scope, then the condition in the enclosing scope
+    do_scope = make_scope(scope, scope.function_nr, {in_loop: true, bdepth: scope.block_depth + 1})
+    i = 0
+    while (i < length(stmt.statements)) {
+      sem_check_stmt(do_scope, stmt.statements[i])
+      i = i + 1
+    }
+    sem_propagate_block_vars(scope, do_scope)
+    sem_check_expr(scope, stmt.expression)
+    return null
+  }
+
+  if (kind == "for") { // init, test, update and body all share one loop scope
+    for_scope = make_scope(scope, scope.function_nr, {in_loop: true, bdepth: scope.block_depth + 1})
+    if (stmt.init != null) {
+      init_kind = stmt.init.kind
+      if (init_kind == "var" || init_kind == "def" || init_kind == "var_list") { // parse_statement yields var_list for `var a = 0, b = 1`; it must be declared like a single var/def
+        sem_check_stmt(for_scope, stmt.init)
+      } else {
+        sem_check_expr(for_scope, stmt.init)
+      }
+    }
+    sem_check_expr(for_scope, stmt.test)
+    sem_check_expr(for_scope, stmt.update)
+    i = 0
+    while (i < length(stmt.statements)) {
+      sem_check_stmt(for_scope, stmt.statements[i])
+      i = i + 1
+    }
+    sem_propagate_block_vars(scope, for_scope)
+    return null
+  }
+
+  if (kind == "return" || kind == "go") {
+    sem_check_expr(scope, stmt.expression)
+    return null
+  }
+
+  if (kind == "disrupt") { // nothing to resolve
+    return null
+  }
+
+  if (kind == "break") {
+    if (!sem_in_loop(scope)) {
+      sem_error(stmt, "'break' used outside of loop")
+    }
+    return null
+  }
+
+  if (kind == "continue") {
+    if (!sem_in_loop(scope)) {
+      sem_error(stmt, "'continue' used outside of loop")
+    }
+    return null
+  }
+
+  if (kind == "block") { // a bare block gets its own scope one block_depth deeper; its variables are copied to the parent afterwards
+    blk_scope = make_scope(scope, scope.function_nr, {bdepth: scope.block_depth + 1})
+    i = 0
+    while (i < length(stmt.statements)) {
+      sem_check_stmt(blk_scope, stmt.statements[i])
+      i = i + 1
+    }
+    sem_propagate_block_vars(scope, blk_scope)
+    return null
+  }
+
+  if (kind == "label") {
+    sem_check_stmt(scope,
stmt.statement) + return null + } + + if (kind == "function") { + name = stmt.name + if (name != null) sem_add_var(scope, name, {make: "function", fn_nr: scope.function_nr}) + fn_nr_val = stmt.function_nr + if (fn_nr_val == null) fn_nr_val = scope.function_nr + fn_scope = make_scope(scope, fn_nr_val, {is_func: true}) + stmt.outer = scope.function_nr + i = 0 + while (i < length(stmt.list)) { + pname = stmt.list[i].name + if (pname != null) sem_add_var(fn_scope, pname, {is_const: true, make: "input", fn_nr: fn_nr_val}) + def_val = stmt.list[i].expression + if (def_val != null) sem_check_expr(fn_scope, def_val) + i = i + 1 + } + sem_predeclare_vars(fn_scope, stmt.statements) + i = 0 + while (i < length(stmt.statements)) { + sem_check_stmt(fn_scope, stmt.statements[i]) + i = i + 1 + } + if (stmt.disruption != null) { + i = 0 + while (i < length(stmt.disruption)) { + sem_check_stmt(fn_scope, stmt.disruption[i]) + i = i + 1 + } + } + sr = sem_build_scope_record(fn_scope) + push(scopes_array, sr.rec) + stmt.nr_slots = sr.nr_slots + stmt.nr_close_slots = sr.nr_close + return null + } +} + +function semantic_check(ast) { + var global_scope = make_scope(null, 0, {is_func: true}) + var i = 0 + var stmt = null + var name = null + + // Pre-register top-level function names + i = 0 + while (i < length(ast.functions)) { + name = ast.functions[i].name + if (name != null) sem_add_var(global_scope, name, {make: "function", fn_nr: 0}) + i = i + 1 + } + + // Check all statements + i = 0 + while (i < length(ast.statements)) { + sem_check_stmt(global_scope, ast.statements[i]) + i = i + 1 + } + + // Check function bodies + i = 0 + while (i < length(ast.functions)) { + sem_check_stmt(global_scope, ast.functions[i]) + i = i + 1 + } + + // Build program scope record and prepend + var sr = sem_build_scope_record(global_scope) + var new_scopes = [sr.rec] + i = 0 + while (i < length(scopes_array)) { + push(new_scopes, scopes_array[i]) + i = i + 1 + } + scopes_array = new_scopes + + // Attach 
to AST + ast.scopes = scopes_array + ast.intrinsics = intrinsics + if (length(sem_errors) > 0) { + ast.errors = sem_errors + } +} + +// ============================================================ +// Section 8: Main +// ============================================================ + +init_cursor() +var ast = parse_program() + +if (error_count == 0) { + semantic_check(ast) +} + +// Merge parse errors +var _mi = 0 +if (length(errors) > 0) { + if (ast.errors != null) { + _mi = 0 + while (_mi < length(errors)) { + push(ast.errors, errors[_mi]) + _mi = _mi + 1 + } + } else { + ast.errors = errors + } +} + +print(json.encode(ast)) diff --git a/source/cell.c b/source/cell.c index 56367f2b..5c4ad392 100644 --- a/source/cell.c +++ b/source/cell.c @@ -727,7 +727,6 @@ int cell_init(int argc, char **argv) /* Check for --mach-run flag to compile and run through MACH VM */ if (argc >= 3 && strcmp(argv[1], "--mach-run") == 0) { - const char *filename = argv[2]; if (!find_cell_shop()) return 1; size_t boot_size; @@ -755,7 +754,7 @@ int cell_init(int argc, char **argv) cJSON_Delete(boot_ast); return 1; } - JSContext *ctx = JS_NewContextWithHeapSize(rt, 256 * 1024); + JSContext *ctx = JS_NewContextWithHeapSize(rt, 16 * 1024 * 1024); if (!ctx) { printf("Failed to create JS context\n"); cJSON_Delete(boot_ast); JS_FreeRuntime(rt); @@ -766,7 +765,12 @@ int cell_init(int argc, char **argv) JSValue hidden_env = JS_NewObject(ctx); JS_SetPropertyStr(ctx, hidden_env, "os", js_os_use(ctx)); - JS_SetPropertyStr(ctx, hidden_env, "program", JS_NewString(ctx, filename)); + JSValue args_arr = JS_NewArray(ctx); + for (int i = 2; i < argc; i++) { + JSValue str = JS_NewString(ctx, argv[i]); + JS_ArrayPush(ctx, &args_arr, str); + } + JS_SetPropertyStr(ctx, hidden_env, "args", args_arr); hidden_env = JS_Stone(ctx, hidden_env); JSValue result = JS_RunMachTree(ctx, boot_ast, hidden_env); @@ -775,7 +779,9 @@ int cell_init(int argc, char **argv) int exit_code = 0; if (JS_IsException(result)) { JSValue exc 
= JS_GetException(ctx); - const char *err_str = JS_ToCString(ctx, exc); +const char *err_str = NULL; +JSValue msg = JS_GetPropertyStr(ctx, exc, "message"); +err_str = JS_ToCString(ctx, msg); if (err_str) { printf("Error: %s\n", err_str); JS_FreeCString(ctx, err_str); @@ -921,4 +927,4 @@ int uncaught_exception(JSContext *js, JSValue v) JS_FreeValue(js, exp); JS_FreeValue(js, v); return 0; -} \ No newline at end of file +} diff --git a/source/quickjs.h b/source/quickjs.h index d00ce026..c4a78d69 100644 --- a/source/quickjs.h +++ b/source/quickjs.h @@ -697,6 +697,9 @@ JSValue JS_GetProperty (JSContext *ctx, JSValue this_obj, JSValue prop); // For records JSValue JS_GetPropertyStr (JSContext *ctx, JSValue this_obj, const char *prop); int JS_SetPropertyStr (JSContext *ctx, JSValue this_obj, const char *prop, JSValue val); + +// Set property on the global object +int JS_SetGlobalStr (JSContext *ctx, const char *prop, JSValue val); int JS_SetProperty (JSContext *ctx, JSValue this_obj, JSValue prop, JSValue val); JSValue JS_GetPrototype (JSContext *ctx, JSValue val); diff --git a/source/runtime.c b/source/runtime.c index 324df6ce..780a2a1d 100644 --- a/source/runtime.c +++ b/source/runtime.c @@ -6204,6 +6204,13 @@ static int js_json_to_str (JSContext *ctx, JSONStringifyContext *jsc, JSValue ho goto exception; } + /* Heap strings are JS_TAG_PTR but must be quoted, not iterated as objects */ + if (JS_IsText (val_ref.val) && !MIST_IsImmediateASCII (val_ref.val)) { + val_ref.val = JS_ToQuotedString (ctx, val_ref.val); + if (JS_IsException (val_ref.val)) goto exception; + goto concat_value; + } + if (JS_IsObject ( val_ref.val)) { /* includes arrays (OBJ_ARRAY) since they have JS_TAG_PTR */ v = js_array_includes (ctx, jsc->stack, 1, &val_ref.val); @@ -9013,17 +9020,28 @@ static JSValue js_cell_array (JSContext *ctx, JSValue this_val, int argc, JSValu if (argc < 2 || JS_IsNull (argv[1])) { /* Split into characters */ - JSValue result = JS_NewArrayLen (ctx, len); - if 
(JS_IsException (result)) { return result; } - JSArray *out = JS_VALUE_GET_ARRAY (result); + JSGCRef arr_ref, str_ref; + JS_PushGCRef (ctx, &arr_ref); + JS_PushGCRef (ctx, &str_ref); + str_ref.val = arg; + arr_ref.val = JS_NewArray (ctx); + if (JS_IsException (arr_ref.val)) { + JS_PopGCRef (ctx, &str_ref); + JS_PopGCRef (ctx, &arr_ref); + return JS_EXCEPTION; + } for (int i = 0; i < len; i++) { - JSValue ch = js_sub_string_val (ctx, arg, i, i + 1); + JSValue ch = js_sub_string_val (ctx, str_ref.val, i, i + 1); if (JS_IsException (ch)) { + JS_PopGCRef (ctx, &str_ref); + JS_PopGCRef (ctx, &arr_ref); return JS_EXCEPTION; } - out->values[i] = ch; + JS_ArrayPush (ctx, &arr_ref.val, ch); } - out->len = len; + JSValue result = arr_ref.val; + JS_PopGCRef (ctx, &str_ref); + JS_PopGCRef (ctx, &arr_ref); return result; } @@ -11404,11 +11422,11 @@ static JSValue js_cell_length (JSContext *ctx, JSValue this_val, int argc, JSVal int tag = JS_VALUE_GET_TAG (val); /* Strings return codepoint count */ - if (tag == JS_TAG_STRING_IMM) { + if (MIST_IsImmediateASCII (val)) { return JS_NewInt32 (ctx, MIST_GetImmediateASCIILen (val)); } - if (tag == JS_TAG_STRING) { - JSText *p = JS_VALUE_GET_STRING (val); + if (JS_IsPtr (val) && objhdr_type (*chase (val)) == OBJ_TEXT) { + JSText *p = (JSText *)chase (val); return JS_NewInt32 (ctx, (int)JSText_len (p)); } @@ -11582,8 +11600,7 @@ static JSValue js_cell_is_stone (JSContext *ctx, JSValue this_val, int argc, JSV /* is_text(val) */ static JSValue js_cell_is_text (JSContext *ctx, JSValue this_val, int argc, JSValue *argv) { if (argc < 1) return JS_FALSE; - int tag = JS_VALUE_GET_TAG (argv[0]); - return JS_NewBool (ctx, tag == JS_TAG_STRING || tag == JS_TAG_STRING_IMM); + return JS_NewBool (ctx, JS_IsText (argv[0])); } /* is_proto(val, master) - check if val has master in prototype chain */ @@ -11737,6 +11754,10 @@ static JSValue js_cell_some(JSContext *ctx, JSValue this_val, int argc, JSValue /* GC-SAFE: Helper to set a global function. 
Creates function first, then reads ctx->global_obj to ensure it's not stale if GC ran during function creation. */ +int JS_SetGlobalStr (JSContext *ctx, const char *prop, JSValue val) { + return JS_SetPropertyStr(ctx, ctx->global_obj, prop, val); +} + static void js_set_global_cfunc(JSContext *ctx, const char *name, JSCFunction *func, int length) { JSGCRef ref; JS_PushGCRef(ctx, &ref); @@ -11799,7 +11820,7 @@ static void JS_AddIntrinsicBaseObjects (JSContext *ctx) { /* Core functions - using GC-safe helper */ js_set_global_cfunc(ctx, "eval", js_cell_eval, 2); - js_set_global_cfunc(ctx, "mach_eval", js_mach_eval, 2); + js_set_global_cfunc(ctx, "mach_eval", js_mach_eval, 3); js_set_global_cfunc(ctx, "stone", js_cell_stone, 1); js_set_global_cfunc(ctx, "length", js_cell_length, 1); js_set_global_cfunc(ctx, "call", js_cell_call, 3); diff --git a/tokenize.ce b/tokenize.ce new file mode 100644 index 00000000..f3d2abde --- /dev/null +++ b/tokenize.ce @@ -0,0 +1,569 @@ +var src = args[0] +var filename = length(args) > 1 ? 
args[1] : "" + +// Convert to codepoint array - integers are GC-safe immediate values +var len = length(src) +var cp = [] +var _i = 0 +while (_i < len) { + push(cp, codepoint(src[_i])) + _i = _i + 1 +} + +var pos = 0 +var row = 0 +var col = 0 +var tokens = [] + +// Codepoint constants +def CP_LF = 10 +def CP_CR = 13 +def CP_TAB = 9 +def CP_SPACE = 32 +def CP_BANG = 33 +def CP_DQUOTE = 34 +def CP_HASH = 35 +def CP_DOLLAR = 36 +def CP_PERCENT = 37 +def CP_AMP = 38 +def CP_SQUOTE = 39 +def CP_LPAREN = 40 +def CP_RPAREN = 41 +def CP_STAR = 42 +def CP_PLUS = 43 +def CP_COMMA = 44 +def CP_MINUS = 45 +def CP_DOT = 46 +def CP_SLASH = 47 +def CP_0 = 48 +def CP_1 = 49 +def CP_7 = 55 +def CP_9 = 57 +def CP_COLON = 58 +def CP_SEMI = 59 +def CP_LT = 60 +def CP_EQ = 61 +def CP_GT = 62 +def CP_QMARK = 63 +def CP_AT = 64 +def CP_A = 65 +def CP_B = 66 +def CP_E = 69 +def CP_F = 70 +def CP_O = 79 +def CP_X = 88 +def CP_Z = 90 +def CP_LBRACKET = 91 +def CP_BSLASH = 92 +def CP_RBRACKET = 93 +def CP_CARET = 94 +def CP_UNDERSCORE = 95 +def CP_BACKTICK = 96 +def CP_a = 97 +def CP_b = 98 +def CP_e = 101 +def CP_f = 102 +def CP_n = 110 +def CP_o = 111 +def CP_r = 114 +def CP_t = 116 +def CP_x = 120 +def CP_z = 122 +def CP_LBRACE = 123 +def CP_PIPE = 124 +def CP_RBRACE = 125 +def CP_TILDE = 126 + +// Keywords lookup +var keywords = { + if: "if", in: "in", do: "do", go: "go", + var: "var", def: "def", for: "for", + else: "else", this: "this", null: "null", true: "true", + false: "false", while: "while", break: "break", + return: "return", delete: "delete", + disrupt: "disrupt", function: "function", continue: "continue", + disruption: "disruption" +} + +function pk() { + if (pos >= len) return -1 + return cp[pos] +} + +function pk_at(n) { + var idx = pos + n + if (idx >= len) return -1 + return cp[idx] +} + +function adv() { + var c = cp[pos] + pos = pos + 1 + if (c == CP_LF) { + row = row + 1 + col = 0 + } else { + col = col + 1 + } + return c +} + +function is_digit(c) { + return c >= CP_0 
&& c <= CP_9 +} + +function is_hex(c) { + return (c >= CP_0 && c <= CP_9) || (c >= CP_a && c <= CP_f) || (c >= CP_A && c <= CP_F) +} + +function is_alpha(c) { + return (c >= CP_a && c <= CP_z) || (c >= CP_A && c <= CP_Z) +} + +function is_alnum(c) { + return is_alpha(c) || is_digit(c) +} + +function is_ident_start(c) { + return is_alpha(c) || c == CP_UNDERSCORE || c == CP_DOLLAR +} + +function is_ident_char(c) { + return is_alnum(c) || c == CP_UNDERSCORE || c == CP_DOLLAR || c == CP_QMARK || c == CP_BANG +} + +function substr(start, end) { + var s = "" + var i = start + while (i < end) { + s = s + character(cp[i]) + i = i + 1 + } + return s +} + +function read_string(quote_cp) { + var start = pos + var start_row = row + var start_col = col + var value = "" + var esc = 0 + adv() // skip opening quote + while (pos < len && pk() != quote_cp) { + if (pk() == CP_BSLASH) { + adv() + esc = adv() + if (esc == CP_n) { value = value + "\n" } + else if (esc == CP_t) { value = value + "\t" } + else if (esc == CP_r) { value = value + "\r" } + else if (esc == CP_BSLASH) { value = value + "\\" } + else if (esc == CP_SQUOTE) { value = value + "'" } + else if (esc == CP_DQUOTE) { value = value + "\"" } + else if (esc == CP_0) { value = value + character(0) } + else if (esc == CP_BACKTICK) { value = value + "`" } + else { value = value + character(esc) } + } else { + value = value + character(adv()) + } + } + if (pos < len) adv() // skip closing quote + push(tokens, { + kind: "text", at: start, + from_row: start_row, from_column: start_col, + to_row: row, to_column: col, + value: value + }) +} + +function read_template() { + var start = pos + var start_row = row + var start_col = col + var value = "" + var esc = 0 + var depth = 0 + var tc = 0 + var q = 0 + adv() // skip opening backtick + while (pos < len && pk() != CP_BACKTICK) { + if (pk() == CP_BSLASH && pos + 1 < len) { + adv() + esc = adv() + if (esc == CP_n) { value = value + "\n" } + else if (esc == CP_t) { value = value + 
"\t" } + else if (esc == CP_r) { value = value + "\r" } + else if (esc == CP_BSLASH) { value = value + "\\" } + else if (esc == CP_BACKTICK) { value = value + "`" } + else if (esc == CP_DOLLAR) { value = value + "$" } + else if (esc == CP_0) { value = value + character(0) } + else { value = value + character(esc) } + } else if (pk() == CP_DOLLAR && pos + 1 < len && pk_at(1) == CP_LBRACE) { + adv() // $ + adv() // { + depth = 1 + while (pos < len && depth > 0) { + tc = pk() + if (tc == CP_LBRACE) { depth = depth + 1; adv() } + else if (tc == CP_RBRACE) { depth = depth - 1; adv() } + else if (tc == CP_SQUOTE || tc == CP_DQUOTE || tc == CP_BACKTICK) { + q = adv() + while (pos < len && pk() != q) { + if (pk() == CP_BSLASH && pos + 1 < len) adv() + adv() + } + if (pos < len) adv() + } else { adv() } + } + } else { + value = value + character(adv()) + } + } + if (pos < len) adv() // skip closing backtick + push(tokens, { + kind: "text", at: start, + from_row: start_row, from_column: start_col, + to_row: row, to_column: col, + value: value + }) +} + +function read_number() { + var start = pos + var start_row = row + var start_col = col + if (pk() == CP_0 && (pk_at(1) == CP_x || pk_at(1) == CP_X)) { + adv(); adv() + while (pos < len && (is_hex(pk()) || pk() == CP_UNDERSCORE)) adv() + } else if (pk() == CP_0 && (pk_at(1) == CP_b || pk_at(1) == CP_B)) { + adv(); adv() + while (pos < len && (pk() == CP_0 || pk() == CP_1 || pk() == CP_UNDERSCORE)) adv() + } else if (pk() == CP_0 && (pk_at(1) == CP_o || pk_at(1) == CP_O)) { + adv(); adv() + while (pos < len && pk() >= CP_0 && pk() <= CP_7) adv() + } else { + while (pos < len && (is_digit(pk()) || pk() == CP_UNDERSCORE)) adv() + if (pos < len && pk() == CP_DOT) { + adv() + while (pos < len && (is_digit(pk()) || pk() == CP_UNDERSCORE)) adv() + } + if (pos < len && (pk() == CP_e || pk() == CP_E)) { + adv() + if (pos < len && (pk() == CP_PLUS || pk() == CP_MINUS)) adv() + while (pos < len && is_digit(pk())) adv() + } + } + var raw 
= substr(start, pos) + push(tokens, { + kind: "number", at: start, + from_row: start_row, from_column: start_col, + to_row: row, to_column: col, + value: raw, number: number(raw) + }) +} + +function read_name() { + var start = pos + var start_row = row + var start_col = col + while (pos < len && is_ident_char(pk())) adv() + var name = substr(start, pos) + var kw = keywords[name] + if (kw != null) { + push(tokens, { + kind: kw, at: start, + from_row: start_row, from_column: start_col, + to_row: row, to_column: col + }) + } else { + push(tokens, { + kind: "name", at: start, + from_row: start_row, from_column: start_col, + to_row: row, to_column: col, + value: name + }) + } +} + +function read_comment() { + var start = pos + var start_row = row + var start_col = col + if (pk_at(1) == CP_SLASH) { + while (pos < len && pk() != CP_LF && pk() != CP_CR) adv() + } else { + adv(); adv() // skip /* + while (pos < len) { + if (pk() == CP_STAR && pk_at(1) == CP_SLASH) { + adv(); adv() + break + } + adv() + } + } + var raw = substr(start, pos) + push(tokens, { + kind: "comment", at: start, + from_row: start_row, from_column: start_col, + to_row: row, to_column: col, + value: raw + }) +} + +function emit_op(kind, count) { + var start = pos + var start_row = row + var start_col = col + var i = 0 + while (i < count) { adv(); i = i + 1 } + push(tokens, { + kind: kind, at: start, + from_row: start_row, from_column: start_col, + to_row: row, to_column: col + }) +} + +function emit_ident(count) { + var start = pos + var start_row = row + var start_col = col + var val = "" + var i = 0 + while (i < count) { val = val + character(adv()); i = i + 1 } + push(tokens, { + kind: "name", at: start, + from_row: start_row, from_column: start_col, + to_row: row, to_column: col, + value: val + }) +} + +function tokenize_one() { + var c = pk() + var start = 0 + var start_row = 0 + var start_col = 0 + var raw = "" + if (c == -1) return false + + // Newline + if (c == CP_LF) { + start = pos + 
start_row = row + start_col = col + adv() + push(tokens, { + kind: "newline", at: start, + from_row: start_row, from_column: start_col, + to_row: row, to_column: col, + value: "\n" + }) + return true + } + + if (c == CP_CR) { + start = pos + start_row = row + start_col = col + adv() + if (pos < len && pk() == CP_LF) adv() + push(tokens, { + kind: "newline", at: start, + from_row: start_row, from_column: start_col, + to_row: row, to_column: col, + value: "\n" + }) + return true + } + + // Whitespace + if (c == CP_SPACE || c == CP_TAB) { + start = pos + start_row = row + start_col = col + while (pos < len && (pk() == CP_SPACE || pk() == CP_TAB)) adv() + raw = substr(start, pos) + push(tokens, { + kind: "space", at: start, + from_row: start_row, from_column: start_col, + to_row: row, to_column: col, + value: raw + }) + return true + } + + // Strings + if (c == CP_SQUOTE || c == CP_DQUOTE) { + read_string(c) + return true + } + + // Template + if (c == CP_BACKTICK) { + read_template() + return true + } + + // Numbers + if (is_digit(c)) { + read_number() + return true + } + if (c == CP_DOT && is_digit(pk_at(1))) { + read_number() + return true + } + + // Identifiers and keywords + if (is_ident_start(c)) { + read_name() + return true + } + + // Comments and / + if (c == CP_SLASH) { + if (pk_at(1) == CP_SLASH || pk_at(1) == CP_STAR) { + read_comment() + return true + } + if (pk_at(1) == CP_EQ) { emit_op("/=", 2); return true } + if (pk_at(1) == CP_BANG) { emit_ident(2); return true } + emit_op("/", 1) + return true + } + + // Operators + if (c == CP_STAR) { + if (pk_at(1) == CP_STAR) { + if (pk_at(2) == CP_BANG) { emit_ident(3); return true } + if (pk_at(2) == CP_EQ) { emit_op("**=", 3); return true } + emit_op("**", 2); return true + } + if (pk_at(1) == CP_EQ) { emit_op("*=", 2); return true } + if (pk_at(1) == CP_BANG) { emit_ident(2); return true } + emit_op("*", 1); return true + } + + if (c == CP_PERCENT) { + if (pk_at(1) == CP_EQ) { emit_op("%=", 2); return true } + 
if (pk_at(1) == CP_BANG) { emit_ident(2); return true } + emit_op("%", 1); return true + } + + if (c == CP_PLUS) { + if (pk_at(1) == CP_EQ) { emit_op("+=", 2); return true } + if (pk_at(1) == CP_PLUS) { emit_op("++", 2); return true } + if (pk_at(1) == CP_BANG) { emit_ident(2); return true } + emit_op("+", 1); return true + } + + if (c == CP_MINUS) { + if (pk_at(1) == CP_EQ) { emit_op("-=", 2); return true } + if (pk_at(1) == CP_MINUS) { emit_op("--", 2); return true } + if (pk_at(1) == CP_BANG) { emit_ident(2); return true } + emit_op("-", 1); return true + } + + if (c == CP_LT) { + if (pk_at(1) == CP_EQ && pk_at(2) == CP_BANG) { emit_ident(3); return true } + if (pk_at(1) == CP_EQ) { emit_op("<=", 2); return true } + if (pk_at(1) == CP_LT) { + if (pk_at(2) == CP_BANG) { emit_ident(3); return true } + if (pk_at(2) == CP_EQ) { emit_op("<<=", 3); return true } + emit_op("<<", 2); return true + } + if (pk_at(1) == CP_BANG) { emit_ident(2); return true } + emit_op("<", 1); return true + } + + if (c == CP_GT) { + if (pk_at(1) == CP_EQ && pk_at(2) == CP_BANG) { emit_ident(3); return true } + if (pk_at(1) == CP_EQ) { emit_op(">=", 2); return true } + if (pk_at(1) == CP_GT) { + if (pk_at(2) == CP_GT) { + if (pk_at(3) == CP_BANG) { emit_ident(4); return true } + if (pk_at(3) == CP_EQ) { emit_op(">>>=", 4); return true } + emit_op(">>>", 3); return true + } + if (pk_at(2) == CP_BANG) { emit_ident(3); return true } + if (pk_at(2) == CP_EQ) { emit_op(">>=", 3); return true } + emit_op(">>", 2); return true + } + if (pk_at(1) == CP_BANG) { emit_ident(2); return true } + emit_op(">", 1); return true + } + + if (c == CP_EQ) { + if (pk_at(1) == CP_EQ) { + if (pk_at(2) == CP_EQ) { emit_op("===", 3); return true } + emit_op("==", 2); return true + } + if (pk_at(1) == CP_GT) { emit_op("=>", 2); return true } + if (pk_at(1) == CP_BANG) { emit_ident(2); return true } + emit_op("=", 1); return true + } + + if (c == CP_BANG) { + if (pk_at(1) == CP_EQ) { + if (pk_at(2) == CP_BANG) { 
emit_ident(3); return true } + if (pk_at(2) == CP_EQ) { emit_op("!==", 3); return true } + emit_op("!=", 2); return true + } + emit_op("!", 1); return true + } + + if (c == CP_AMP) { + if (pk_at(1) == CP_AMP) { + if (pk_at(2) == CP_BANG) { emit_ident(3); return true } + if (pk_at(2) == CP_EQ) { emit_op("&&=", 3); return true } + emit_op("&&", 2); return true + } + if (pk_at(1) == CP_EQ) { emit_op("&=", 2); return true } + if (pk_at(1) == CP_BANG) { emit_ident(2); return true } + emit_op("&", 1); return true + } + + if (c == CP_PIPE) { + if (pk_at(1) == CP_PIPE) { + if (pk_at(2) == CP_BANG) { emit_ident(3); return true } + if (pk_at(2) == CP_EQ) { emit_op("||=", 3); return true } + emit_op("||", 2); return true + } + if (pk_at(1) == CP_EQ) { emit_op("|=", 2); return true } + if (pk_at(1) == CP_BANG) { emit_ident(2); return true } + emit_op("|", 1); return true + } + + if (c == CP_CARET) { + if (pk_at(1) == CP_EQ) { emit_op("^=", 2); return true } + if (pk_at(1) == CP_BANG) { emit_ident(2); return true } + emit_op("^", 1); return true + } + + if (c == CP_LBRACKET) { + if (pk_at(1) == CP_RBRACKET && pk_at(2) == CP_BANG) { emit_ident(3); return true } + emit_op("[", 1); return true + } + + if (c == CP_TILDE) { + if (pk_at(1) == CP_BANG) { emit_ident(2); return true } + emit_op("~", 1); return true + } + + // Single character tokens + emit_op(character(c), 1) + return true +} + +// Main loop +while (pos < len) { + tokenize_one() +} + +// EOF token +push(tokens, { + kind: "eof", at: pos, + from_row: row, from_column: col, + to_row: row, to_column: col +}) + +print(json.encode({filename: filename, tokens: tokens})) From 8fc9bfe01304c8e51e40aefd44ce224b1f2f3a28 Mon Sep 17 00:00:00 2001 From: John Alanbrook Date: Mon, 9 Feb 2026 12:19:05 -0600 Subject: [PATCH 2/2] parse and tokenize modules --- internal/bootstrap.cm | 7 +- parse.ce | 2374 +---------------------------------------- parse.cm | 1850 ++++++++++++++++++++++++++++++++ tokenize.ce | 570 +--------- tokenize.cm | 499 
+++++++++ 5 files changed, 2359 insertions(+), 2941 deletions(-) create mode 100644 parse.cm create mode 100644 tokenize.cm diff --git a/internal/bootstrap.cm b/internal/bootstrap.cm index 002a8519..a217dd48 100644 --- a/internal/bootstrap.cm +++ b/internal/bootstrap.cm @@ -24,10 +24,9 @@ function use(path) { if (fd.is_file(file_path)) { script = text(fd.slurp(file_path)) - exports = {} - mach_eval(path, script, {use: use, exports: exports}) - use_cache[path] = exports - return exports + result = mach_eval(path, script, {use: use}) + use_cache[path] = result + return result } // Try embedded C module diff --git a/parse.ce b/parse.ce index 70d0637d..da795b37 100644 --- a/parse.ce +++ b/parse.ce @@ -1,2373 +1,7 @@ -// ============================================================ -// Section 1: Inline Tokenizer (from tokenize.ce) -// ============================================================ - +var tokenize = use("tokenize") +var parse = use("parse") var src = args[0] var filename = length(args) > 1 ? 
args[1] : "" - -// Convert to codepoint array -var _src_len = length(src) -var cp = [] -var _i = 0 -while (_i < _src_len) { - push(cp, codepoint(src[_i])) - _i = _i + 1 -} - -var pos = 0 -var row = 0 -var col = 0 -var tokens = [] - -// Codepoint constants -def CP_LF = 10 -def CP_CR = 13 -def CP_TAB = 9 -def CP_SPACE = 32 -def CP_BANG = 33 -def CP_DQUOTE = 34 -def CP_HASH = 35 -def CP_DOLLAR = 36 -def CP_PERCENT = 37 -def CP_AMP = 38 -def CP_SQUOTE = 39 -def CP_LPAREN = 40 -def CP_RPAREN = 41 -def CP_STAR = 42 -def CP_PLUS = 43 -def CP_COMMA = 44 -def CP_MINUS = 45 -def CP_DOT = 46 -def CP_SLASH = 47 -def CP_0 = 48 -def CP_1 = 49 -def CP_7 = 55 -def CP_9 = 57 -def CP_COLON = 58 -def CP_SEMI = 59 -def CP_LT = 60 -def CP_EQ = 61 -def CP_GT = 62 -def CP_QMARK = 63 -def CP_AT = 64 -def CP_A = 65 -def CP_B = 66 -def CP_E = 69 -def CP_F = 70 -def CP_O = 79 -def CP_X = 88 -def CP_Z = 90 -def CP_LBRACKET = 91 -def CP_BSLASH = 92 -def CP_RBRACKET = 93 -def CP_CARET = 94 -def CP_UNDERSCORE = 95 -def CP_BACKTICK = 96 -def CP_a = 97 -def CP_b = 98 -def CP_e = 101 -def CP_f = 102 -def CP_n = 110 -def CP_o = 111 -def CP_r = 114 -def CP_t = 116 -def CP_x = 120 -def CP_z = 122 -def CP_LBRACE = 123 -def CP_PIPE = 124 -def CP_RBRACE = 125 -def CP_TILDE = 126 - -var keywords = { - if: "if", in: "in", do: "do", go: "go", - var: "var", def: "def", for: "for", - else: "else", this: "this", null: "null", true: "true", - false: "false", while: "while", break: "break", - return: "return", delete: "delete", - disrupt: "disrupt", function: "function", continue: "continue", - disruption: "disruption" -} - -function pk() { - if (pos >= _src_len) return -1 - return cp[pos] -} - -function pk_at(n) { - var idx = pos + n - if (idx >= _src_len) return -1 - return cp[idx] -} - -function adv() { - var c = cp[pos] - pos = pos + 1 - if (c == CP_LF) { - row = row + 1 - col = 0 - } else { - col = col + 1 - } - return c -} - -function is_digit(c) { - return c >= CP_0 && c <= CP_9 -} - -function is_hex(c) { 
- return (c >= CP_0 && c <= CP_9) || (c >= CP_a && c <= CP_f) || (c >= CP_A && c <= CP_F) -} - -function is_alpha(c) { - return (c >= CP_a && c <= CP_z) || (c >= CP_A && c <= CP_Z) -} - -function is_alnum(c) { - return is_alpha(c) || is_digit(c) -} - -function is_ident_start(c) { - return is_alpha(c) || c == CP_UNDERSCORE || c == CP_DOLLAR -} - -function is_ident_char(c) { - return is_alnum(c) || c == CP_UNDERSCORE || c == CP_DOLLAR || c == CP_QMARK || c == CP_BANG -} - -function substr(start, end) { - var s = "" - var i = start - while (i < end) { - s = s + character(cp[i]) - i = i + 1 - } - return s -} - -function read_string(quote_cp) { - var start = pos - var start_row = row - var start_col = col - var value = "" - var esc = 0 - adv() - while (pos < _src_len && pk() != quote_cp) { - if (pk() == CP_BSLASH) { - adv() - esc = adv() - if (esc == CP_n) { value = value + "\n" } - else if (esc == CP_t) { value = value + "\t" } - else if (esc == CP_r) { value = value + "\r" } - else if (esc == CP_BSLASH) { value = value + "\\" } - else if (esc == CP_SQUOTE) { value = value + "'" } - else if (esc == CP_DQUOTE) { value = value + "\"" } - else if (esc == CP_0) { value = value + character(0) } - else if (esc == CP_BACKTICK) { value = value + "`" } - else { value = value + character(esc) } - } else { - value = value + character(adv()) - } - } - if (pos < _src_len) adv() - push(tokens, { - kind: "text", at: start, - from_row: start_row, from_column: start_col, - to_row: row, to_column: col, - value: value - }) -} - -function read_template() { - var start = pos - var start_row = row - var start_col = col - var value = "" - var esc = 0 - var depth = 0 - var tc = 0 - var q = 0 - adv() - while (pos < _src_len && pk() != CP_BACKTICK) { - if (pk() == CP_BSLASH && pos + 1 < _src_len) { - adv() - esc = adv() - if (esc == CP_n) { value = value + "\n" } - else if (esc == CP_t) { value = value + "\t" } - else if (esc == CP_r) { value = value + "\r" } - else if (esc == CP_BSLASH) { 
value = value + "\\" } - else if (esc == CP_BACKTICK) { value = value + "`" } - else if (esc == CP_DOLLAR) { value = value + "$" } - else if (esc == CP_0) { value = value + character(0) } - else { value = value + character(esc) } - } else if (pk() == CP_DOLLAR && pos + 1 < _src_len && pk_at(1) == CP_LBRACE) { - adv() - adv() - depth = 1 - while (pos < _src_len && depth > 0) { - tc = pk() - if (tc == CP_LBRACE) { depth = depth + 1; adv() } - else if (tc == CP_RBRACE) { depth = depth - 1; adv() } - else if (tc == CP_SQUOTE || tc == CP_DQUOTE || tc == CP_BACKTICK) { - q = adv() - while (pos < _src_len && pk() != q) { - if (pk() == CP_BSLASH && pos + 1 < _src_len) adv() - adv() - } - if (pos < _src_len) adv() - } else { adv() } - } - } else { - value = value + character(adv()) - } - } - if (pos < _src_len) adv() - push(tokens, { - kind: "text", at: start, - from_row: start_row, from_column: start_col, - to_row: row, to_column: col, - value: value - }) -} - -function read_number() { - var start = pos - var start_row = row - var start_col = col - var raw = "" - if (pk() == CP_0 && (pk_at(1) == CP_x || pk_at(1) == CP_X)) { - adv(); adv() - while (pos < _src_len && (is_hex(pk()) || pk() == CP_UNDERSCORE)) adv() - } else if (pk() == CP_0 && (pk_at(1) == CP_b || pk_at(1) == CP_B)) { - adv(); adv() - while (pos < _src_len && (pk() == CP_0 || pk() == CP_1 || pk() == CP_UNDERSCORE)) adv() - } else if (pk() == CP_0 && (pk_at(1) == CP_o || pk_at(1) == CP_O)) { - adv(); adv() - while (pos < _src_len && pk() >= CP_0 && pk() <= CP_7) adv() - } else { - while (pos < _src_len && (is_digit(pk()) || pk() == CP_UNDERSCORE)) adv() - if (pos < _src_len && pk() == CP_DOT) { - adv() - while (pos < _src_len && (is_digit(pk()) || pk() == CP_UNDERSCORE)) adv() - } - if (pos < _src_len && (pk() == CP_e || pk() == CP_E)) { - adv() - if (pos < _src_len && (pk() == CP_PLUS || pk() == CP_MINUS)) adv() - while (pos < _src_len && is_digit(pk())) adv() - } - } - raw = substr(start, pos) - push(tokens, 
{ - kind: "number", at: start, - from_row: start_row, from_column: start_col, - to_row: row, to_column: col, - value: raw, number: number(raw) - }) -} - -function read_name() { - var start = pos - var start_row = row - var start_col = col - var name = "" - var kw = null - while (pos < _src_len && is_ident_char(pk())) adv() - name = substr(start, pos) - kw = keywords[name] - if (kw != null) { - push(tokens, { - kind: kw, at: start, - from_row: start_row, from_column: start_col, - to_row: row, to_column: col - }) - } else { - push(tokens, { - kind: "name", at: start, - from_row: start_row, from_column: start_col, - to_row: row, to_column: col, - value: name - }) - } -} - -function read_comment() { - var start = pos - var start_row = row - var start_col = col - var raw = "" - if (pk_at(1) == CP_SLASH) { - while (pos < _src_len && pk() != CP_LF && pk() != CP_CR) adv() - } else { - adv(); adv() - while (pos < _src_len) { - if (pk() == CP_STAR && pk_at(1) == CP_SLASH) { - adv(); adv() - break - } - adv() - } - } - raw = substr(start, pos) - push(tokens, { - kind: "comment", at: start, - from_row: start_row, from_column: start_col, - to_row: row, to_column: col, - value: raw - }) -} - -function emit_op(kind, count) { - var start = pos - var start_row = row - var start_col = col - var i = 0 - while (i < count) { adv(); i = i + 1 } - push(tokens, { - kind: kind, at: start, - from_row: start_row, from_column: start_col, - to_row: row, to_column: col - }) -} - -function emit_ident(count) { - var start = pos - var start_row = row - var start_col = col - var val = "" - var i = 0 - while (i < count) { val = val + character(adv()); i = i + 1 } - push(tokens, { - kind: "name", at: start, - from_row: start_row, from_column: start_col, - to_row: row, to_column: col, - value: val - }) -} - -function tokenize_one() { - var c = pk() - var start = 0 - var start_row = 0 - var start_col = 0 - var raw = "" - if (c == -1) return false - - if (c == CP_LF) { - start = pos; start_row = row; 
start_col = col - adv() - push(tokens, { kind: "newline", at: start, from_row: start_row, from_column: start_col, to_row: row, to_column: col, value: "\n" }) - return true - } - if (c == CP_CR) { - start = pos; start_row = row; start_col = col - adv() - if (pos < _src_len && pk() == CP_LF) adv() - push(tokens, { kind: "newline", at: start, from_row: start_row, from_column: start_col, to_row: row, to_column: col, value: "\n" }) - return true - } - if (c == CP_SPACE || c == CP_TAB) { - start = pos; start_row = row; start_col = col - while (pos < _src_len && (pk() == CP_SPACE || pk() == CP_TAB)) adv() - raw = substr(start, pos) - push(tokens, { kind: "space", at: start, from_row: start_row, from_column: start_col, to_row: row, to_column: col, value: raw }) - return true - } - if (c == CP_SQUOTE || c == CP_DQUOTE) { read_string(c); return true } - if (c == CP_BACKTICK) { read_template(); return true } - if (is_digit(c)) { read_number(); return true } - if (c == CP_DOT && is_digit(pk_at(1))) { read_number(); return true } - if (is_ident_start(c)) { read_name(); return true } - if (c == CP_SLASH) { - if (pk_at(1) == CP_SLASH || pk_at(1) == CP_STAR) { read_comment(); return true } - if (pk_at(1) == CP_EQ) { emit_op("/=", 2); return true } - if (pk_at(1) == CP_BANG) { emit_ident(2); return true } - emit_op("/", 1); return true - } - if (c == CP_STAR) { - if (pk_at(1) == CP_STAR) { - if (pk_at(2) == CP_BANG) { emit_ident(3); return true } - if (pk_at(2) == CP_EQ) { emit_op("**=", 3); return true } - emit_op("**", 2); return true - } - if (pk_at(1) == CP_EQ) { emit_op("*=", 2); return true } - if (pk_at(1) == CP_BANG) { emit_ident(2); return true } - emit_op("*", 1); return true - } - if (c == CP_PERCENT) { - if (pk_at(1) == CP_EQ) { emit_op("%=", 2); return true } - if (pk_at(1) == CP_BANG) { emit_ident(2); return true } - emit_op("%", 1); return true - } - if (c == CP_PLUS) { - if (pk_at(1) == CP_EQ) { emit_op("+=", 2); return true } - if (pk_at(1) == CP_PLUS) { 
emit_op("++", 2); return true } - if (pk_at(1) == CP_BANG) { emit_ident(2); return true } - emit_op("+", 1); return true - } - if (c == CP_MINUS) { - if (pk_at(1) == CP_EQ) { emit_op("-=", 2); return true } - if (pk_at(1) == CP_MINUS) { emit_op("--", 2); return true } - if (pk_at(1) == CP_BANG) { emit_ident(2); return true } - emit_op("-", 1); return true - } - if (c == CP_LT) { - if (pk_at(1) == CP_EQ && pk_at(2) == CP_BANG) { emit_ident(3); return true } - if (pk_at(1) == CP_EQ) { emit_op("<=", 2); return true } - if (pk_at(1) == CP_LT) { - if (pk_at(2) == CP_BANG) { emit_ident(3); return true } - if (pk_at(2) == CP_EQ) { emit_op("<<=", 3); return true } - emit_op("<<", 2); return true - } - if (pk_at(1) == CP_BANG) { emit_ident(2); return true } - emit_op("<", 1); return true - } - if (c == CP_GT) { - if (pk_at(1) == CP_EQ && pk_at(2) == CP_BANG) { emit_ident(3); return true } - if (pk_at(1) == CP_EQ) { emit_op(">=", 2); return true } - if (pk_at(1) == CP_GT) { - if (pk_at(2) == CP_GT) { - if (pk_at(3) == CP_BANG) { emit_ident(4); return true } - if (pk_at(3) == CP_EQ) { emit_op(">>>=", 4); return true } - emit_op(">>>", 3); return true - } - if (pk_at(2) == CP_BANG) { emit_ident(3); return true } - if (pk_at(2) == CP_EQ) { emit_op(">>=", 3); return true } - emit_op(">>", 2); return true - } - if (pk_at(1) == CP_BANG) { emit_ident(2); return true } - emit_op(">", 1); return true - } - if (c == CP_EQ) { - if (pk_at(1) == CP_EQ) { - if (pk_at(2) == CP_EQ) { emit_op("===", 3); return true } - emit_op("==", 2); return true - } - if (pk_at(1) == CP_GT) { emit_op("=>", 2); return true } - if (pk_at(1) == CP_BANG) { emit_ident(2); return true } - emit_op("=", 1); return true - } - if (c == CP_BANG) { - if (pk_at(1) == CP_EQ) { - if (pk_at(2) == CP_BANG) { emit_ident(3); return true } - if (pk_at(2) == CP_EQ) { emit_op("!==", 3); return true } - emit_op("!=", 2); return true - } - emit_op("!", 1); return true - } - if (c == CP_AMP) { - if (pk_at(1) == CP_AMP) { - if 
(pk_at(2) == CP_BANG) { emit_ident(3); return true } - if (pk_at(2) == CP_EQ) { emit_op("&&=", 3); return true } - emit_op("&&", 2); return true - } - if (pk_at(1) == CP_EQ) { emit_op("&=", 2); return true } - if (pk_at(1) == CP_BANG) { emit_ident(2); return true } - emit_op("&", 1); return true - } - if (c == CP_PIPE) { - if (pk_at(1) == CP_PIPE) { - if (pk_at(2) == CP_BANG) { emit_ident(3); return true } - if (pk_at(2) == CP_EQ) { emit_op("||=", 3); return true } - emit_op("||", 2); return true - } - if (pk_at(1) == CP_EQ) { emit_op("|=", 2); return true } - if (pk_at(1) == CP_BANG) { emit_ident(2); return true } - emit_op("|", 1); return true - } - if (c == CP_CARET) { - if (pk_at(1) == CP_EQ) { emit_op("^=", 2); return true } - if (pk_at(1) == CP_BANG) { emit_ident(2); return true } - emit_op("^", 1); return true - } - if (c == CP_LBRACKET) { - if (pk_at(1) == CP_RBRACKET && pk_at(2) == CP_BANG) { emit_ident(3); return true } - emit_op("[", 1); return true - } - if (c == CP_TILDE) { - if (pk_at(1) == CP_BANG) { emit_ident(2); return true } - emit_op("~", 1); return true - } - emit_op(character(c), 1) - return true -} - -// Tokenize -while (pos < _src_len) { - tokenize_one() -} -push(tokens, { kind: "eof", at: pos, from_row: row, from_column: col, to_row: row, to_column: col }) - -// ============================================================ -// Section 2: Parser Cursor -// ============================================================ - -var cursor = 0 -var tok = null -var got_lf = false -var prev_tok = null - -function advance() { - var t = null - var k = null - prev_tok = tok - cursor = cursor + 1 - got_lf = false - while (cursor < length(tokens)) { - t = tokens[cursor] - k = t.kind - if (k == "space" || k == "comment") { - cursor = cursor + 1 - continue - } - if (k == "newline") { - got_lf = true - cursor = cursor + 1 - continue - } - tok = t - return null - } - tok = tokens[length(tokens) - 1] -} - -function peek_ahead(n) { - var c = cursor + 1 - var count 
= 0 - var t = null - var k = null - while (c < length(tokens)) { - t = tokens[c] - k = t.kind - if (k != "space" && k != "comment" && k != "newline") { - count = count + 1 - if (count == n) return t - } - c = c + 1 - } - return tokens[length(tokens) - 1] -} - -function init_cursor() { - cursor = -1 - advance() -} - -// ============================================================ -// Section 3: AST Helpers -// ============================================================ - -var errors = [] -var error_count = 0 -var function_nr = 1 - -function ast_node(kind, token) { - return { - kind: kind, - at: token.at, - from_row: token.from_row, - from_column: token.from_column - } -} - -function ast_node_end(node) { - node.to_row = prev_tok.to_row - node.to_column = prev_tok.to_column - return node -} - -function parse_error(token, msg) { - if (error_count >= 5) return null - error_count = error_count + 1 - push(errors, { - message: msg, - line: token.from_row + 1, - column: token.from_column + 1, - offset: token.at - }) -} - -function is_keyword(kind) { - return kind == "if" || kind == "in" || kind == "do" || kind == "go" || - kind == "var" || kind == "def" || kind == "for" || - kind == "else" || kind == "this" || kind == "null" || kind == "true" || - kind == "false" || kind == "while" || kind == "break" || - kind == "return" || kind == "delete" || - kind == "disrupt" || kind == "function" || kind == "continue" || - kind == "disruption" -} - -// ============================================================ -// Section 4: Expression Parsing -// ============================================================ - -// Forward declarations via var -var parse_expr = null -var parse_assign_expr = null -var parse_assign = null -var parse_statement = null -var parse_block_statements = null -var parse_function_inner = null -var parse_arrow_function = null - -function is_arrow_function() { - // Check if ( ... 
) => pattern - if (tok.kind != "(") return false - var c = cursor + 1 - var depth = 1 - var k = null - while (c < length(tokens) && depth > 0) { - k = tokens[c].kind - if (k == "(") { depth = depth + 1 } - else if (k == ")") { depth = depth - 1 } - else if (k == "text" || k == "number") { null } - c = c + 1 - } - // Skip whitespace/newline/comment tokens - while (c < length(tokens)) { - k = tokens[c].kind - if (k != "space" && k != "newline" && k != "comment") break - c = c + 1 - } - if (c >= length(tokens)) return false - return tokens[c].kind == "=>" -} - -function parse_primary() { - var start = tok - var node = null - var k = tok.kind - var list = null - var pair = null - var left = null - var right = null - var is_ident = false - var is_kw = false - var p1 = null - var elem = null - var fn_start = null - var fn = null - var name_item = null - var params = null - var param = null - var rpos = 0 - var pattern_str = "" - var flags = "" - - if (k == "number") { - node = ast_node("number", start) - node.value = tok.value - node.number = tok.number - advance() - ast_node_end(node) - return node - } - if (k == "text") { - node = ast_node("text", start) - node.value = tok.value - advance() - ast_node_end(node) - return node - } - if (k == "name") { - // Check for single-param arrow: name => - p1 = peek_ahead(1) - if (p1.kind == "=>") { - return parse_arrow_function() - } - node = ast_node("name", start) - node.name = tok.value - advance() - ast_node_end(node) - return node - } - if (k == "null") { - node = ast_node("null", start) - advance() - ast_node_end(node) - return node - } - if (k == "true") { - node = ast_node("true", start) - advance() - ast_node_end(node) - return node - } - if (k == "false") { - node = ast_node("false", start) - advance() - ast_node_end(node) - return node - } - if (k == "this") { - node = ast_node("this", start) - advance() - ast_node_end(node) - return node - } - if (k == "[") { - node = ast_node("array", start) - list = [] - node.list = 
list
    advance()
    // Elements are assignment-level expressions so ',' only separates items.
    while (tok.kind != "]" && tok.kind != "eof") {
      elem = parse_assign_expr()
      if (elem != null) push(list, elem)
      if (tok.kind == ",") advance()
      else break
    }
    ast_node_end(node)
    if (tok.kind == "]") advance()
    else if (tok.kind == "eof") parse_error(tok, "unterminated array literal, expected ']'")
    return node
  }
  if (k == "{") {
    // Record literal: node.list holds {left: key, right: value} pairs.
    node = ast_node("record", start)
    list = []
    node.list = list
    advance()
    while (tok.kind != "}" && tok.kind != "eof") {
      pair = {}
      is_ident = (tok.kind == "name")
      is_kw = is_keyword(tok.kind)
      if (is_ident || is_kw || tok.kind == "text" || tok.kind == "number") {
        if (is_kw) {
          // A keyword used as a key becomes a plain "name" node.
          left = ast_node("name", tok)
          left.name = tok.kind
          advance()
          ast_node_end(left)
        } else {
          left = parse_primary()
        }
        pair.left = left
      } else if (tok.kind == "[") {
        // Computed key: [expr]
        advance()
        left = parse_assign_expr()
        pair.left = left
        if (tok.kind == "]") advance()
        else parse_error(tok, "expected ']' after computed property")
      } else {
        parse_error(tok, "expected property name in object literal")
        break
      }
      if (tok.kind == ":") {
        advance()
        right = parse_assign_expr()
        pair.right = right
      } else if (tok.kind == "(") {
        // Method shorthand
        fn_start = tok
        fn = ast_node("function", fn_start)
        name_item = pair.left
        if (name_item != null && name_item.name != null) {
          fn.name = name_item.name
        }
        params = []
        fn.list = params
        advance()
        while (tok.kind != ")" && tok.kind != "eof") {
          if (tok.kind == "name") {
            param = ast_node("name", tok)
            param.name = tok.value
            advance()
            ast_node_end(param)
            if (tok.kind == "=" || tok.kind == "|") {
              advance()
              // Defaults are assignment-level expressions, as in
              // parse_function_inner and parse_arrow_function; a full
              // comma expression here would swallow the ',' that separates
              // this parameter from the next one.
              param.expression = parse_assign_expr()
            }
            push(params, param)
          } else {
            parse_error(tok, "expected parameter name")
            break
          }
          if (tok.kind == ",") advance()
          else break
        }
        if (tok.kind == ")") advance()
        else if (tok.kind == "eof") parse_error(tok, "unterminated method parameter list")
        if (length(params) > 4) parse_error(tok, "functions cannot have more than 4 parameters")
        if (tok.kind == "{") {
          advance()
          fn.statements = parse_block_statements()
          if (tok.kind == "}") advance()
          else if (tok.kind == "eof") parse_error(tok, "unterminated method body")
        } else {
          parse_error(tok, "expected '{' for method body")
        }
        fn.function_nr = function_nr
        function_nr = function_nr + 1
        ast_node_end(fn)
        pair.right = fn
      } else if (!(is_ident && (tok.kind == "," || tok.kind == "}"))) {
        // A bare identifier followed by ',' or '}' is accepted as a
        // shorthand property (pair.right stays unset); anything else needs ':'.
        parse_error(tok, "expected ':' after property name")
      }
      push(list, pair)
      if (tok.kind == ",") advance()
      else break
    }
    ast_node_end(node)
    if (tok.kind == "}") advance()
    else if (tok.kind == "eof") parse_error(tok, "unterminated object literal, expected '}'")
    return node
  }
  if (k == "(") {
    if (is_arrow_function()) {
      return parse_arrow_function()
    }
    // Parenthesized expression: the inner node is returned directly.
    advance()
    node = parse_expr()
    if (tok.kind == ")") advance()
    else if (tok.kind == "eof") parse_error(tok, "unterminated parenthesized expression, expected ')'")
    else parse_error(tok, "expected ')' after expression")
    return node
  }
  if (k == "function") {
    return parse_function_inner()
  }
  if (k == "/") {
    // Regex literal
    node = ast_node("regexp", start)
    // Re-scan from token position to parse regex
    rpos = tok.at + 1
    pattern_str = ""
    flags = ""
    while (rpos < _src_len && cp[rpos] != CP_SLASH) {
      if (cp[rpos] == CP_BSLASH && rpos + 1 < _src_len) {
        // Keep the backslash and the escaped character together so an
        // escaped '/' does not terminate the pattern.
        pattern_str = pattern_str + character(cp[rpos]) + character(cp[rpos + 1])
        rpos = rpos + 2
      } else {
        pattern_str = pattern_str + character(cp[rpos])
        rpos = rpos + 1
      }
    }
    // Step over the closing '/' if one was found.
    if (rpos < _src_len) rpos = rpos + 1
    while (rpos < _src_len && is_alpha(cp[rpos])) {
      flags = flags + character(cp[rpos])
      rpos = rpos + 1
    }
    node.pattern = pattern_str
    if (length(flags) > 0) node.flags = flags
    advance()
    ast_node_end(node)
    return node
  }

  // Error
  if (k == "eof") {
    parse_error(start, "unexpected end of input")
  } else {
    parse_error(start, "unexpected token where expression expected")
  }
  advance()
  return null
}

// Parses a primary expression followed by any chain of postfix forms:
//   .name      -> "." node (left = object, right = property name text)
//   [expr]     -> "[" node (left = object, right = index; right is left
//                 unset for the empty form `[]`)
//   (args)     -> "(" node (expression = callee, list = arguments)
//   ++ / --    -> "++" / "--" node with postfix = true
// Returns null when no primary expression could be parsed.
function parse_postfix() {
  var node = parse_primary()
  var start = null
  var new_node = null
  var index = null
  var arg = null
  var args_list = null
  if (node == null) return null
  while (true) {
    start = tok
    if (tok.kind == ".") {
      advance()
      new_node = ast_node(".", start)
      new_node.left = node
      if (tok.kind == "name" || is_keyword(tok.kind)) {
        // Keywords are allowed as property names; their text is the kind.
        if (tok.kind == "name") {
          new_node.right = tok.value
        } else {
          new_node.right = tok.kind
        }
        advance()
      } else {
        parse_error(tok, "expected property name after '.'")
      }
      ast_node_end(new_node)
      node = new_node
    } else if (tok.kind == "[") {
      advance()
      new_node = ast_node("[", start)
      new_node.left = node
      if (tok.kind == "]") {
        advance()
      } else {
        index = parse_assign_expr()
        new_node.right = index
        if (tok.kind == "]") advance()
        else parse_error(tok, "expected ']'")
      }
      ast_node_end(new_node)
      node = new_node
    } else if (tok.kind == "(") {
      advance()
      new_node = ast_node("(", start)
      new_node.expression = node
      args_list = []
      new_node.list = args_list
      while (tok.kind != ")" && tok.kind != "eof") {
        arg = parse_assign_expr()
        if (arg != null) push(args_list, arg)
        if (tok.kind == ",") advance()
        else break
      }
      if (tok.kind == ")") advance()
      else parse_error(tok, "unterminated argument list, expected ')'")
      ast_node_end(new_node)
      node = new_node
    } else if (tok.kind == "++") {
      new_node = ast_node("++", start)
      new_node.expression = node
      new_node.postfix = true
      advance()
      ast_node_end(new_node)
      node = new_node
    } else if (tok.kind == "--") {
      new_node = ast_node("--", start)
      new_node.expression = node
      new_node.postfix = true
      advance()
      ast_node_end(new_node)
      node = new_node
    } else {
      break
    }
  }
  return node
}

// Parses prefix operators: `!`, `~`, unary `+` / `-` (node kinds "+unary"
// and "-unary"), prefix `++` / `--` (postfix = false) and `delete`.
// Each operator recurses into parse_unary for its operand; anything else
// is handed to parse_postfix.
function parse_unary() {
  var start = tok
  var node = null
  var k = tok.kind
  if (k == "!") {
    advance()
    node = ast_node("!", start)
    node.expression = parse_unary()
    ast_node_end(node)
    return node
  }
  if (k == "~") {
    advance()
    node = ast_node("~", start)
    node.expression = parse_unary()
    ast_node_end(node)
    return node
  }
  if (k == "+") {
    advance()
    node = ast_node("+unary", start)
    node.expression = parse_unary()
    ast_node_end(node)
    return node
  }
  if (k == "-") {
    advance()
    node = ast_node("-unary", start)
    node.expression = parse_unary()
    ast_node_end(node)
    return node
  }
  if (k == "++") {
    advance()
    node = ast_node("++", start)
    node.expression = parse_unary()
    node.postfix = false
    ast_node_end(node)
    return node
  }
  if (k == "--") {
    advance()
    node = ast_node("--", start)
    node.expression = parse_unary()
    node.postfix = false
    ast_node_end(node)
    return node
  }
  if (k == "delete") {
    advance()
    node = ast_node("delete", start)
    node.expression = parse_unary()
    ast_node_end(node)
    return node
  }
  return parse_postfix()
}

// Binary operator precedence
// Higher numbers bind tighter; token kinds missing from the table are not
// binary operators and end the loop in parse_binary.
var binop_prec = {
  "**": 14,
  "*": 13, "/": 13, "%": 13,
  "+": 12, "-": 12,
  "<<": 11, ">>": 11, ">>>": 11,
  "<": 10, ">": 10, "<=": 10, ">=": 10, in: 10,
  "==": 9, "!=": 9, "===": 9, "!==": 9,
  "&": 8, "^": 7, "|": 6,
  "&&": 5, "||": 4
}

// Precedence-climbing parser for binary operators.
// min_prec: lowest precedence this call may consume.
// Returns the resulting node (kind = operator token kind, with left/right),
// or null when the leading operand could not be parsed.
function parse_binary(min_prec) {
  var left_node = parse_unary()
  var start = null
  var op = null
  var prec = null
  var next_prec = 0
  var right_node = null
  var node = null
  if (left_node == null) return null
  while (true) {
    start = tok
    op = tok.kind
    prec = binop_prec[op]
    if (prec == null || prec < min_prec) break
    advance()
    // Left-associative operators require a strictly tighter right operand.
    next_prec = prec + 1
    if (prec == 14) next_prec = prec // right-assoc for **
    right_node = parse_binary(next_prec)
    node = ast_node(op, start)
    node.left = left_node
    node.right = right_node
    ast_node_end(node)
    left_node = node
  }
  return left_node
}

// Parses a conditional expression `cond ? a : b`, producing a "then" node;
// when no '?' follows the condition, the binary expression is returned as is.
function parse_ternary() {
  var cond = parse_binary(1)
  var start = null
  var then_expr = null
  var
else_expr = null
  var node = null
  if (cond == null) return null
  if (tok.kind == "?") {
    start = tok
    advance()
    then_expr = parse_expr()
    if (tok.kind == ":") advance()
    else parse_error(tok, "expected ':' in ternary expression")
    // The else branch is parsed without the comma operator so that a
    // conditional used as a call argument, array element or record value
    // does not swallow the separator and the items that follow it.
    // Chained conditionals (a ? b : c ? d : e) still nest to the right
    // because parse_assign_expr re-enters parse_ternary.
    else_expr = parse_assign_expr()
    node = ast_node("then", start)
    node.expression = cond
    node.then = then_expr
    node.else = else_expr
    ast_node_end(node)
    return node
  }
  return cond
}

// Assign operators
// Maps an assignment token kind to the AST node kind it produces; plain
// '=' becomes "assign", compound operators keep their spelling.
var assign_ops = {
  "=": "assign", "+=": "+=", "-=": "-=", "*=": "*=", "/=": "/=", "%=": "%=",
  "<<=": "<<=", ">>=": ">>=", ">>>=": ">>>=",
  "&=": "&=", "^=": "^=", "|=": "|=", "**=": "**=",
  "&&=": "&&=", "||=": "||="
}

// Parses an assignment expression (right-associative).
// Returns the ternary-level expression unchanged when no assignment
// operator follows, or null when nothing could be parsed.
// The parameter is not used.
parse_assign = function(unused) {
  var left_node = parse_ternary()
  var start = null
  var kind = null
  var right_node = null
  var node = null
  var left_kind = null
  if (left_node == null) return null
  start = tok
  kind = assign_ops[tok.kind]
  if (kind == null) return left_node

  // Validate assignment target
  // An invalid target is reported, but parsing continues and the node is
  // still built.
  left_kind = left_node.kind
  if (left_kind != "name" && left_kind != "." && left_kind != "[") {
    parse_error(start, "invalid assignment left-hand side")
  }

  advance()
  right_node = parse_assign()
  node = ast_node(kind, start)
  node.left = left_node
  node.right = right_node

  // Check push/pop bracket syntax
  // An index node without an index expression (`x[]`) marks a push when it
  // is the target and a pop when it is the value.
  if (left_node.kind == "[" && left_node.right == null) node.push = true
  if (right_node != null && right_node.kind == "[" && right_node.right == null) node.pop = true

  ast_node_end(node)
  return node
}

// Parses one expression without the comma operator; used wherever ','
// is a list separator. The parameter is not used.
parse_assign_expr = function(unused) {
  return parse_assign()
}

// Parses a full expression, folding `a, b, c` into left-nested "," nodes.
// Returns null when the first operand could not be parsed.
// The parameter is not used.
parse_expr = function(unused) {
  var left_node = parse_assign()
  var start = null
  var right_node = null
  var node = null
  if (left_node == null) return null
  while (tok.kind == ",") {
    start = tok
    advance()
    right_node = parse_assign()
    node = ast_node(",", start)
    node.left = left_node
    node.right = right_node
    ast_node_end(node)
    left_node = node
  }
  return left_node
}

// ============================================================
// Section 5: Statement Parsing
// ============================================================

// Non-zero while the statements of a disruption clause are being parsed.
var in_disruption = 0

// Consumes a ';' if present. A missing ';' is tolerated before end of
// input, '}' or 'else', and when got_lf is set; otherwise an error is
// reported.
function expect_semi() {
  if (tok.kind == ";") { advance(); return null }
  if (tok.kind == "eof" || tok.kind == "}" || got_lf || tok.kind == "else") return null
  parse_error(tok, "expecting ';'")
}

// Error recovery: skips tokens until just past a ';', or until the current
// token is '}', a statement-starting keyword, or end of input.
function sync_to_statement() {
  var k = null
  while (tok.kind != "eof") {
    k = tok.kind
    if (k == ";") { advance(); return null }
    if (k == "}") return null
    if (k == "var" || k == "def" || k == "if" || k == "while" ||
        k == "for" || k == "return" || k == "disrupt" ||
        k == "function" || k == "break" || k == "continue" || k == "do") return null
    advance()
  }
}

// Parses statements up to (not including) the closing '}' or end of input
// and returns them as an array. The parameter is not used.
parse_block_statements = function(unused) {
  var stmts = []
  var before = null
  var stmt = null
  while (tok.kind != "}" && tok.kind != "eof") {
    before = cursor
    stmt = parse_statement()
    if (stmt != null) {
      push(stmts, stmt)
    } else if (cursor == before) {
      // No statement and no progress: resynchronise so the loop cannot spin.
      sync_to_statement()
    }
  }
return stmts -} - -parse_function_inner = function(unused) { - var start = tok - var node = ast_node("function", start) - var params = [] - var stmts = null - var param = null - var prev_names = null - var pname = null - var dup = false - var j = 0 - var old_dis = 0 - - if (in_disruption) { - parse_error(tok, "cannot define function inside disruption clause") - } - - advance() // skip 'function' - - // Optional name - if (tok.kind == "name") { - node.name = tok.value - advance() - } - - // Parameters - node.list = params - if (tok.kind == "(") { - advance() - prev_names = [] - while (tok.kind != ")" && tok.kind != "eof") { - if (tok.kind == "name") { - param = ast_node("name", tok) - param.name = tok.value - // Check duplicate - pname = tok.value - dup = false - j = 0 - while (j < length(prev_names)) { - if (prev_names[j] == pname) { dup = true; break } - j = j + 1 - } - if (dup) parse_error(tok, "duplicate parameter name '" + pname + "'") - push(prev_names, pname) - advance() - ast_node_end(param) - if (tok.kind == "=" || tok.kind == "|") { - advance() - param.expression = parse_assign_expr() - } - push(params, param) - } else { - parse_error(tok, "expected parameter name") - break - } - if (tok.kind == ",") advance() - else break - } - if (tok.kind == ")") advance() - else if (tok.kind == "eof") parse_error(tok, "unterminated function parameter list, expected ')'") - } else { - parse_error(tok, "expected '(' after function name") - } - - if (length(params) > 4) parse_error(tok, "functions cannot have more than 4 parameters") - - // Body - if (tok.kind == "{") { - advance() - stmts = parse_block_statements() - node.statements = stmts - if (tok.kind == "}") advance() - else if (tok.kind == "eof") parse_error(tok, "unterminated function body, expected '}'") - } else { - parse_error(tok, "expected '{' for function body") - } - - // Disruption clause - if (tok.kind == "disruption") { - advance() - if (tok.kind == "{") { - advance() - old_dis = in_disruption - 
in_disruption = 1 - node.disruption = parse_block_statements() - in_disruption = old_dis - if (tok.kind == "}") advance() - else if (tok.kind == "eof") parse_error(tok, "unterminated disruption clause, expected '}'") - } else { - parse_error(tok, "expected '{' after disruption") - } - } - - node.function_nr = function_nr - function_nr = function_nr + 1 - ast_node_end(node) - return node -} - -parse_arrow_function = function(unused) { - var start = tok - var node = ast_node("function", start) - var params = [] - var param = null - var stmts = null - var ret = null - var expr = null - var prev_names = null - var pname = null - var dup = false - var j = 0 - node.arrow = true - - if (in_disruption) { - parse_error(tok, "cannot define function inside disruption clause") - } - - node.list = params - - if (tok.kind == "name") { - // Single param without parens - param = ast_node("name", tok) - param.name = tok.value - advance() - ast_node_end(param) - push(params, param) - } else if (tok.kind == "(") { - advance() - prev_names = [] - while (tok.kind != ")" && tok.kind != "eof") { - if (tok.kind == "name") { - param = ast_node("name", tok) - param.name = tok.value - pname = tok.value - dup = false - j = 0 - while (j < length(prev_names)) { - if (prev_names[j] == pname) { dup = true; break } - j = j + 1 - } - if (dup) parse_error(tok, "duplicate parameter name '" + pname + "'") - push(prev_names, pname) - advance() - ast_node_end(param) - if (tok.kind == "=" || tok.kind == "|") { - advance() - param.expression = parse_assign_expr() - } - push(params, param) - } else { - parse_error(tok, "expected parameter name") - break - } - if (tok.kind == ",") advance() - else break - } - if (tok.kind == ")") advance() - } - - if (length(params) > 4) parse_error(tok, "functions cannot have more than 4 parameters") - - // Arrow token - if (tok.kind != "=>") { - parse_error(tok, "expected '=>' in arrow function") - } else { - advance() - } - - // Body - if (tok.kind == "{") { - advance() 
- stmts = parse_block_statements() - node.statements = stmts - if (tok.kind == "}") advance() - } else { - // Expression body - stmts = [] - ret = ast_node("return", tok) - expr = parse_assign_expr() - ret.expression = expr - ast_node_end(ret) - push(stmts, ret) - node.statements = stmts - } - - node.function_nr = function_nr - function_nr = function_nr + 1 - ast_node_end(node) - return node -} - -parse_statement = function(unused) { - var start = tok - var node = null - var k = tok.kind - var stmts = null - var cond = null - var then_stmts = null - var else_stmts = null - var else_ifs = null - var body = null - var expr = null - var init = null - var test = null - var update = null - var left_node = null - var right_node = null - var kind_name = null - var is_def = false - var decls = null - var decl_count = 0 - var var_name = null - var right_kind = null - var elif = null - var p1_tok = null - var labeled_stmt = null - - if (k == "{") { - node = ast_node("block", start) - advance() - stmts = parse_block_statements() - node.statements = stmts - if (tok.kind == "}") advance() - ast_node_end(node) - return node - } - - if (k == "var" || k == "def") { - kind_name = k - is_def = (k == "def") - advance() - if (tok.kind != "name") { - parse_error(tok, "expected identifier after '" + kind_name + "'") - return null - } - decls = [] - decl_count = 0 - while (tok.kind == "name") { - node = ast_node(kind_name, start) - left_node = ast_node("name", tok) - left_node.name = tok.value - var_name = tok.value - advance() - ast_node_end(left_node) - node.left = left_node - if (tok.kind == "=") { - advance() - right_node = parse_assign_expr() - node.right = right_node - if (right_node != null && right_node.kind == "[" && right_node.right == null) { - node.pop = true - } - } else if (is_def) { - parse_error(start, "missing initializer for constant '" + var_name + "'") - } - ast_node_end(node) - push(decls, node) - decl_count = decl_count + 1 - if (tok.kind == ",") advance() - else 
break - } - expect_semi() - if (decl_count == 1) { - return decls[0] - } - node = ast_node("var_list", start) - node.list = decls - ast_node_end(node) - return node - } - - if (k == "if") { - node = ast_node("if", start) - advance() - if (tok.kind == "(") advance() - else parse_error(tok, "expected '(' before condition") - cond = parse_expr() - node.expression = cond - if (tok.kind == ")") advance() - else parse_error(tok, "expected ')' after if condition") - then_stmts = [] - node.then = then_stmts - body = parse_statement() - if (body != null) push(then_stmts, body) - else_ifs = [] - node.list = else_ifs - if (tok.kind == "else") { - advance() - if (tok.kind == "if") { - elif = parse_statement() - if (elif != null) push(else_ifs, elif) - } else { - else_stmts = [] - node.else = else_stmts - body = parse_statement() - if (body != null) push(else_stmts, body) - } - } - ast_node_end(node) - return node - } - - if (k == "while") { - node = ast_node("while", start) - advance() - if (tok.kind == "(") advance() - else parse_error(tok, "expected '(' before condition") - cond = parse_expr() - node.expression = cond - if (tok.kind == ")") advance() - else parse_error(tok, "expected ')' after while condition") - stmts = [] - node.statements = stmts - body = parse_statement() - if (body != null) push(stmts, body) - ast_node_end(node) - return node - } - - if (k == "do") { - node = ast_node("do", start) - advance() - stmts = [] - node.statements = stmts - body = parse_statement() - if (body != null) push(stmts, body) - if (tok.kind == "while") advance() - else parse_error(tok, "expected 'while' after do body") - if (tok.kind == "(") advance() - else parse_error(tok, "expected '(' before condition") - cond = parse_expr() - node.expression = cond - if (tok.kind == ")") advance() - else parse_error(tok, "expected ')' after do-while condition") - expect_semi() - ast_node_end(node) - return node - } - - if (k == "for") { - node = ast_node("for", start) - advance() - if (tok.kind 
== "(") advance() - else parse_error(tok, "expected '(' after for") - if (tok.kind != ";") { - if (tok.kind == "var" || tok.kind == "def") { - init = parse_statement() - node.init = init - } else { - init = parse_expr() - node.init = init - if (tok.kind == ";") advance() - } - } else { - advance() - } - if (tok.kind != ";") { - test = parse_expr() - node.test = test - } - if (tok.kind == ";") advance() - if (tok.kind != ")") { - update = parse_expr() - node.update = update - } - if (tok.kind == ")") advance() - else parse_error(tok, "expected ')' after for clauses") - stmts = [] - node.statements = stmts - body = parse_statement() - if (body != null) push(stmts, body) - ast_node_end(node) - return node - } - - if (k == "return") { - node = ast_node("return", start) - advance() - if (tok.kind != ";" && tok.kind != "}" && !got_lf) { - expr = parse_expr() - node.expression = expr - } - expect_semi() - ast_node_end(node) - return node - } - - if (k == "go") { - node = ast_node("go", start) - advance() - if (tok.kind != ";" && tok.kind != "}" && !got_lf) { - expr = parse_expr() - node.expression = expr - } - expect_semi() - ast_node_end(node) - return node - } - - if (k == "disrupt") { - node = ast_node("disrupt", start) - advance() - expect_semi() - ast_node_end(node) - return node - } - - if (k == "break") { - node = ast_node("break", start) - advance() - if (tok.kind == "name" && !got_lf) { - node.name = tok.value - advance() - } - expect_semi() - ast_node_end(node) - return node - } - - if (k == "continue") { - node = ast_node("continue", start) - advance() - if (tok.kind == "name" && !got_lf) { - node.name = tok.value - advance() - } - expect_semi() - ast_node_end(node) - return node - } - - if (k == "function") { - return parse_function_inner() - } - - if (k == ";") { - advance() - return null - } - - if (k == "name") { - // Check for labeled statement - p1_tok = peek_ahead(1) - if (p1_tok.kind == ":") { - node = ast_node("label", start) - node.name = tok.value - 
advance() // skip identifier
      advance() // skip colon
      labeled_stmt = parse_statement()
      node.statement = labeled_stmt
      ast_node_end(node)
      return node
    }
  }

  // Expression statement
  // Wrapped in a "call" statement node whose expression is the parsed tree.
  expr = parse_expr()
  if (expr != null) {
    node = ast_node("call", start)
    node.expression = expr
    ast_node_end(node)
    expect_semi()
    return node
  }
  parse_error(start, "unexpected token at start of statement")
  return null
}

// ============================================================
// Section 6: Program
// ============================================================

// Parses the whole token stream into the root record
// {kind: "program", filename, functions, statements}. Top-level function
// statements go to `functions`, every other statement to `statements`.
function parse_program() {
  var root = {kind: "program", filename: filename}
  var functions = []
  var statements = []
  var before = 0
  var stmt = null
  root.functions = functions
  root.statements = statements

  while (tok.kind != "eof") {
    before = cursor
    stmt = parse_statement()
    if (stmt != null) {
      if (stmt.kind == "function") {
        push(functions, stmt)
      } else {
        push(statements, stmt)
      }
    } else if (cursor == before) {
      // No statement and no progress: resynchronise so the loop cannot spin.
      sync_to_statement()
    }
  }
  return root
}

// ============================================================
// Section 7: Semantic Analysis
// ============================================================

var sem_errors = []
var scopes_array = []
var intrinsics = []
var block_var_counter = 0

// Records a semantic error for `node`. Line and column are taken from
// node.from_row / node.from_column (each incremented by one) when present.
function sem_error(node, msg) {
  var err = {message: msg}
  if (node.from_row != null) err.line = node.from_row + 1
  if (node.from_column != null) err.column = node.from_column + 1
  push(sem_errors, err)
}

// Creates a scope record.
//   parent - enclosing scope, or null
//   fn_nr  - number of the function this scope belongs to
//   opts   - optional flags: in_loop, is_func, bdepth (block depth, default 0)
function make_scope(parent, fn_nr, opts) {
  return {
    parent: parent,
    vars: [],
    in_loop: opts.in_loop == true,
    function_nr: fn_nr,
    is_function_scope: opts.is_func == true,
    block_depth: opts.bdepth != null ? opts.bdepth : 0
  }
}

// Appends a variable entry to scope.vars.
//   make_opts - {is_const, make, fn_nr}; use and closure counters start at 0.
function sem_add_var(scope, name, make_opts) {
  push(scope.vars, {
    name: name,
    scope_name: null,
    is_const: make_opts.is_const == true,
    make: make_opts.make,
    function_nr: make_opts.fn_nr,
    nr_uses: 0,
    closure: 0
  })
}

// Looks `name` up through the scope chain, innermost scope first.
// Returns {v, level, def_function_nr}: v is the variable entry (null when
// unbound), level counts the function boundaries crossed before the match,
// def_function_nr is the defining function's number (-1 when unbound).
function sem_lookup_var(scope, name) {
  var result = {v: null, level: 0, def_function_nr: -1}
  var cur_fn = scope.function_nr
  var s = scope
  var i = 0
  while (s != null) {
    i = 0
    while (i < length(s.vars)) {
      if (s.vars[i].name == name) {
        result.v = s.vars[i]
        result.def_function_nr = s.vars[i].function_nr
        return result
      }
      i = i + 1
    }
    // Moving to a parent that belongs to a different function crosses a
    // function boundary.
    if (s.parent != null && s.parent.function_nr != cur_fn) {
      result.level = result.level + 1
      cur_fn = s.parent.function_nr
    }
    s = s.parent
  }
  return result
}

// Returns the variable entry for `name` visible from `scope`, or null.
function sem_find_var(scope, name) {
  var r = sem_lookup_var(scope, name)
  return r.v
}

// True when `scope` or any of its ancestors is flagged in_loop.
function sem_in_loop(scope) {
  var s = scope
  while (s != null) {
    if (s.in_loop) return true
    s = s.parent
  }
  return false
}

// Adds `name` to the intrinsics list unless it is already present.
function sem_add_intrinsic(name) {
  var i = 0
  while (i < length(intrinsics)) {
    if (intrinsics[i] == name) return null
    i = i + 1
  }
  push(intrinsics, name)
}

// Names of the operator functinos: an operator spelling followed by '!'.
// The "<!", ">!", "<<!" and ">>!" entries are restored here; the text
// between "<!" and the next ">" had been stripped, which left a bogus "!"
// key and a merged "<>!" key in their place.
var functino_names = {
  "+!": true, "-!": true, "*!": true, "/!": true, "%!": true, "**!": true,
  "<!": true, ">!": true, "<=!": true, ">=!": true, "=!": true, "!=!": true,
  "&!": true, "|!": true, "^!": true, "<<!": true, ">>!": true, ">>>!": true,
  "&&!": true, "||!": true, "~!": true, "[]!": true
}

// True when `name` is one of the functino names listed above.
function is_functino_name(name) {
  return functino_names[name] == true
}

// Copies every variable of `block` into `parent`. The copy is stored under
// the variable's scope_name when it has one, otherwise under its own name,
// and keeps the use and closure counters collected so far.
function sem_propagate_block_vars(parent, block) {
  var i = 0
  var v = null
  var sn = null
  while (i < length(block.vars)) {
    v = block.vars[i]
    sn = v.scope_name
    if (sn == null) sn = v.name
    push(parent.vars, {
      name: sn,
      scope_name: null,
      is_const: v.is_const,
      make: v.make,
      function_nr: v.function_nr,
      nr_uses: v.nr_uses,
      closure: v.closure
    })
    i = i + 1
  }
}

// Builds the per-function scope record from scope.vars.
// Returns {rec, nr_slots, nr_close}: rec maps each variable name to its
// metadata, nr_slots counts all variables, nr_close those with closure set.
function sem_build_scope_record(scope) {
  var rec = {function_nr: scope.function_nr}
  var slots = 0
  var close_slots = 0
  var i = 0
  var v = null
  while (i < length(scope.vars)) {
    v = scope.vars[i]
    rec[v.name] = {
      make: v.make,
      function_nr: v.function_nr,
      nr_uses: v.nr_uses,
      closure: v.closure == 1,
      level: 0
    }
    slots = slots + 1
    if (v.closure) close_slots = close_slots + 1
    i = i + 1
  }
  return {rec: rec, nr_slots: slots, nr_close: close_slots}
}

// Forward declarations
var sem_check_expr = null
var sem_check_stmt = null

// Registers the names declared directly in `stmts` (function statements,
// `var`, and the `var` items of a var_list) before the statements are
// checked. A name that is already visible from `scope` is not added again;
// `def` declarations are not pre-registered.
function sem_predeclare_vars(scope, stmts) {
  var i = 0
  var stmt = null
  var kind = null
  var name = null
  var item = null
  var ik = null
  var j = 0
  while (i < length(stmts)) {
    stmt = stmts[i]
    kind = stmt.kind
    if (kind == "function") {
      name = stmt.name
      if (name != null && sem_find_var(scope, name) == null) {
        sem_add_var(scope, name, {make: "function", fn_nr: scope.function_nr})
      }
    } else if (kind == "var") {
      name = stmt.left.name
      if (name != null && sem_find_var(scope, name) == null) {
        sem_add_var(scope, name, {make: "var", fn_nr: scope.function_nr})
      }
    } else if (kind == "var_list") {
      j = 0
      while (j < length(stmt.list)) {
        item = stmt.list[j]
        ik = item.kind
        if (ik == "var") {
          name = item.left.name
          if (name != null && sem_find_var(scope, name) == null) {
            sem_add_var(scope, name, {make: "var", fn_nr: scope.function_nr})
          }
        }
        j = j + 1
      }
    }
    i = i + 1
  }
}

// Checks the left-hand side of an assignment. A name target must be bound
// and must not be a constant; it is annotated with level, function_nr and
// scope_name. For "." and "[" targets the object expression (and the index,
// when present) is checked as an ordinary expression.
function sem_check_assign_target(scope, left_node) {
  if (left_node == null) return null
  var kind = left_node.kind
  var name = null
  var v = null
  var r = null
  var obj_expr = null

  if (kind == "name") {
    name = left_node.name
    if (name == null) return null
    v = sem_find_var(scope, name)
    if (v == null) {
      sem_error(left_node, "cannot assign to unbound variable '" + name + "'")
    } else if (v.is_const) {
      sem_error(left_node, "cannot assign to constant '" + name + "'")
    }
    r = sem_lookup_var(scope, name)
    if (r.v != null) {
left_node.level = r.level - left_node.function_nr = r.def_function_nr - if (r.v.scope_name != null) left_node.scope_name = r.v.scope_name - } else { - left_node.level = -1 - } - } else if (kind == "." || kind == "[") { - obj_expr = left_node.left - sem_check_expr(scope, obj_expr) - if (kind == "[" && left_node.right != null) { - sem_check_expr(scope, left_node.right) - } - } -} - -sem_check_expr = function(scope, expr) { - if (expr == null) return null - var kind = expr.kind - if (kind == null) return null - var name = null - var r = null - var i = 0 - var operand = null - var v = null - var prop = null - var val = null - var fn_nr_val = null - var fn_scope = null - var pname = null - var def_val = null - var sr = null - - // Assignment operators - if (kind == "assign" || kind == "+=" || kind == "-=" || kind == "*=" || - kind == "/=" || kind == "%=" || kind == "<<=" || kind == ">>=" || - kind == ">>>=" || kind == "&=" || kind == "^=" || kind == "|=" || - kind == "**=" || kind == "&&=" || kind == "||=") { - sem_check_assign_target(scope, expr.left) - sem_check_expr(scope, expr.right) - return null - } - - // Increment/decrement - if (kind == "++" || kind == "--") { - operand = expr.expression - if (operand != null && operand.kind == "name") { - name = operand.name - if (name != null) { - v = sem_find_var(scope, name) - if (v == null) { - sem_error(expr, "cannot assign to unbound variable '" + name + "'") - } else if (v.is_const) { - sem_error(expr, "cannot assign to constant '" + name + "'") - } - r = sem_lookup_var(scope, name) - if (r.v != null) { - operand.level = r.level - operand.function_nr = r.def_function_nr - if (r.v.scope_name != null) operand.scope_name = r.v.scope_name - } else { - operand.level = -1 - } - } - } - return null - } - - // Binary ops - if (kind == "," || kind == "+" || kind == "-" || kind == "*" || - kind == "/" || kind == "%" || kind == "==" || kind == "!=" || - kind == "<" || kind == ">" || kind == "<=" || kind == ">=" || - kind == "&&" 
|| kind == "||" || kind == "&" || - kind == "|" || kind == "^" || kind == "<<" || kind == ">>" || - kind == ">>>" || kind == "**" || kind == "in" || - kind == "." || kind == "[") { - sem_check_expr(scope, expr.left) - sem_check_expr(scope, expr.right) - return null - } - - // Ternary - if (kind == "then") { - sem_check_expr(scope, expr.expression) - sem_check_expr(scope, expr.then) - sem_check_expr(scope, expr.else) - return null - } - - // Call - if (kind == "(") { - sem_check_expr(scope, expr.expression) - i = 0 - while (i < length(expr.list)) { - sem_check_expr(scope, expr.list[i]) - i = i + 1 - } - return null - } - - // Unary ops - if (kind == "!" || kind == "~" || kind == "delete" || - kind == "-unary" || kind == "+unary") { - sem_check_expr(scope, expr.expression) - return null - } - - // Array literal - if (kind == "array") { - i = 0 - while (i < length(expr.list)) { - sem_check_expr(scope, expr.list[i]) - i = i + 1 - } - return null - } - - // Record literal - if (kind == "record") { - i = 0 - while (i < length(expr.list)) { - prop = expr.list[i] - val = prop.right - sem_check_expr(scope, val) - i = i + 1 - } - return null - } - - // Function expression - if (kind == "function") { - fn_nr_val = expr.function_nr - if (fn_nr_val == null) fn_nr_val = scope.function_nr - fn_scope = make_scope(scope, fn_nr_val, {is_func: true}) - expr.outer = scope.function_nr - // Add params - i = 0 - while (i < length(expr.list)) { - pname = expr.list[i].name - if (pname != null) sem_add_var(fn_scope, pname, {is_const: true, make: "input", fn_nr: fn_nr_val}) - def_val = expr.list[i].expression - if (def_val != null) sem_check_expr(fn_scope, def_val) - i = i + 1 - } - // Pre-register declarations - if (expr.statements != null) { - sem_predeclare_vars(fn_scope, expr.statements) - i = 0 - while (i < length(expr.statements)) { - sem_check_stmt(fn_scope, expr.statements[i]) - i = i + 1 - } - } - // Disruption - if (expr.disruption != null) { - i = 0 - while (i < 
length(expr.disruption)) { - sem_check_stmt(fn_scope, expr.disruption[i]) - i = i + 1 - } - } - // Build scope record - sr = sem_build_scope_record(fn_scope) - push(scopes_array, sr.rec) - expr.nr_slots = sr.nr_slots - expr.nr_close_slots = sr.nr_close - return null - } - - // Template literal - if (kind == "text literal") { - i = 0 - while (i < length(expr.list)) { - sem_check_expr(scope, expr.list[i]) - i = i + 1 - } - return null - } - - // Name - if (kind == "name") { - name = expr.name - if (name != null) { - if (is_functino_name(name)) { - expr.make = "functino" - expr.level = -1 - return null - } - r = sem_lookup_var(scope, name) - if (r.v != null) { - expr.level = r.level - expr.function_nr = r.def_function_nr - r.v.nr_uses = r.v.nr_uses + 1 - if (r.level > 0) r.v.closure = 1 - if (r.v.scope_name != null) expr.scope_name = r.v.scope_name - } else { - expr.level = -1 - sem_add_intrinsic(name) - } - } - return null - } - - // Leaf nodes: number, text, regexp, null, true, false, this -} - -sem_check_stmt = function(scope, stmt) { - if (stmt == null) return null - var kind = stmt.kind - if (kind == null) return null - var name = null - var existing = null - var i = 0 - var sn = null - var then_scope = null - var list_scope = null - var else_scope = null - var loop_scope = null - var do_scope = null - var for_scope = null - var init_kind = null - var blk_scope = null - var fn_nr_val = null - var fn_scope = null - var pname = null - var def_val = null - var sr = null - - if (kind == "var_list") { - i = 0 - while (i < length(stmt.list)) { - sem_check_stmt(scope, stmt.list[i]) - i = i + 1 - } - return null - } - - if (kind == "var") { - name = stmt.left.name - if (name != null) { - existing = sem_find_var(scope, name) - if (existing != null && existing.is_const) { - sem_error(stmt.left, "cannot redeclare constant '" + name + "'") - } - if (existing == null || existing.function_nr != scope.function_nr || scope.block_depth > 0) { - sem_add_var(scope, name, {make: 
"var", fn_nr: scope.function_nr}) - } - if (scope.block_depth > 0) { - sn = "_" + name + "_" + text(block_var_counter) - block_var_counter = block_var_counter + 1 - scope.vars[length(scope.vars) - 1].scope_name = sn - stmt.left.scope_name = sn - } - } - sem_check_expr(scope, stmt.right) - return null - } - - if (kind == "def") { - name = stmt.left.name - if (name != null) { - existing = sem_find_var(scope, name) - if (existing != null && existing.is_const) { - sem_error(stmt.left, "cannot redeclare constant '" + name + "'") - } else if (existing != null && !existing.is_const && existing.function_nr == scope.function_nr) { - existing.is_const = 1 - existing.make = "def" - } else { - sem_add_var(scope, name, {is_const: true, make: "def", fn_nr: scope.function_nr}) - if (scope.block_depth > 0) { - sn = "_" + name + "_" + text(block_var_counter) - block_var_counter = block_var_counter + 1 - scope.vars[length(scope.vars) - 1].scope_name = sn - stmt.left.scope_name = sn - } - } - } - sem_check_expr(scope, stmt.right) - return null - } - - if (kind == "call") { - sem_check_expr(scope, stmt.expression) - return null - } - - if (kind == "if") { - sem_check_expr(scope, stmt.expression) - // then - then_scope = make_scope(scope, scope.function_nr, {bdepth: scope.block_depth + 1}) - i = 0 - while (i < length(stmt.then)) { - sem_check_stmt(then_scope, stmt.then[i]) - i = i + 1 - } - sem_propagate_block_vars(scope, then_scope) - // else-if list - list_scope = make_scope(scope, scope.function_nr, {bdepth: scope.block_depth + 1}) - i = 0 - while (i < length(stmt.list)) { - sem_check_stmt(list_scope, stmt.list[i]) - i = i + 1 - } - sem_propagate_block_vars(scope, list_scope) - // else - if (stmt.else != null) { - else_scope = make_scope(scope, scope.function_nr, {bdepth: scope.block_depth + 1}) - i = 0 - while (i < length(stmt.else)) { - sem_check_stmt(else_scope, stmt.else[i]) - i = i + 1 - } - sem_propagate_block_vars(scope, else_scope) - } - return null - } - - if (kind == 
"while") { - sem_check_expr(scope, stmt.expression) - loop_scope = make_scope(scope, scope.function_nr, {in_loop: true, bdepth: scope.block_depth + 1}) - i = 0 - while (i < length(stmt.statements)) { - sem_check_stmt(loop_scope, stmt.statements[i]) - i = i + 1 - } - sem_propagate_block_vars(scope, loop_scope) - return null - } - - if (kind == "do") { - do_scope = make_scope(scope, scope.function_nr, {in_loop: true, bdepth: scope.block_depth + 1}) - i = 0 - while (i < length(stmt.statements)) { - sem_check_stmt(do_scope, stmt.statements[i]) - i = i + 1 - } - sem_propagate_block_vars(scope, do_scope) - sem_check_expr(scope, stmt.expression) - return null - } - - if (kind == "for") { - for_scope = make_scope(scope, scope.function_nr, {in_loop: true, bdepth: scope.block_depth + 1}) - if (stmt.init != null) { - init_kind = stmt.init.kind - if (init_kind == "var" || init_kind == "def") { - sem_check_stmt(for_scope, stmt.init) - } else { - sem_check_expr(for_scope, stmt.init) - } - } - sem_check_expr(for_scope, stmt.test) - sem_check_expr(for_scope, stmt.update) - i = 0 - while (i < length(stmt.statements)) { - sem_check_stmt(for_scope, stmt.statements[i]) - i = i + 1 - } - sem_propagate_block_vars(scope, for_scope) - return null - } - - if (kind == "return" || kind == "go") { - sem_check_expr(scope, stmt.expression) - return null - } - - if (kind == "disrupt") { - return null - } - - if (kind == "break") { - if (!sem_in_loop(scope)) { - sem_error(stmt, "'break' used outside of loop") - } - return null - } - - if (kind == "continue") { - if (!sem_in_loop(scope)) { - sem_error(stmt, "'continue' used outside of loop") - } - return null - } - - if (kind == "block") { - blk_scope = make_scope(scope, scope.function_nr, {bdepth: scope.block_depth + 1}) - i = 0 - while (i < length(stmt.statements)) { - sem_check_stmt(blk_scope, stmt.statements[i]) - i = i + 1 - } - sem_propagate_block_vars(scope, blk_scope) - return null - } - - if (kind == "label") { - sem_check_stmt(scope, 
stmt.statement) - return null - } - - if (kind == "function") { - name = stmt.name - if (name != null) sem_add_var(scope, name, {make: "function", fn_nr: scope.function_nr}) - fn_nr_val = stmt.function_nr - if (fn_nr_val == null) fn_nr_val = scope.function_nr - fn_scope = make_scope(scope, fn_nr_val, {is_func: true}) - stmt.outer = scope.function_nr - i = 0 - while (i < length(stmt.list)) { - pname = stmt.list[i].name - if (pname != null) sem_add_var(fn_scope, pname, {is_const: true, make: "input", fn_nr: fn_nr_val}) - def_val = stmt.list[i].expression - if (def_val != null) sem_check_expr(fn_scope, def_val) - i = i + 1 - } - sem_predeclare_vars(fn_scope, stmt.statements) - i = 0 - while (i < length(stmt.statements)) { - sem_check_stmt(fn_scope, stmt.statements[i]) - i = i + 1 - } - if (stmt.disruption != null) { - i = 0 - while (i < length(stmt.disruption)) { - sem_check_stmt(fn_scope, stmt.disruption[i]) - i = i + 1 - } - } - sr = sem_build_scope_record(fn_scope) - push(scopes_array, sr.rec) - stmt.nr_slots = sr.nr_slots - stmt.nr_close_slots = sr.nr_close - return null - } -} - -function semantic_check(ast) { - var global_scope = make_scope(null, 0, {is_func: true}) - var i = 0 - var stmt = null - var name = null - - // Pre-register top-level function names - i = 0 - while (i < length(ast.functions)) { - name = ast.functions[i].name - if (name != null) sem_add_var(global_scope, name, {make: "function", fn_nr: 0}) - i = i + 1 - } - - // Check all statements - i = 0 - while (i < length(ast.statements)) { - sem_check_stmt(global_scope, ast.statements[i]) - i = i + 1 - } - - // Check function bodies - i = 0 - while (i < length(ast.functions)) { - sem_check_stmt(global_scope, ast.functions[i]) - i = i + 1 - } - - // Build program scope record and prepend - var sr = sem_build_scope_record(global_scope) - var new_scopes = [sr.rec] - i = 0 - while (i < length(scopes_array)) { - push(new_scopes, scopes_array[i]) - i = i + 1 - } - scopes_array = new_scopes - - // Attach 
to AST - ast.scopes = scopes_array - ast.intrinsics = intrinsics - if (length(sem_errors) > 0) { - ast.errors = sem_errors - } -} - -// ============================================================ -// Section 8: Main -// ============================================================ - -init_cursor() -var ast = parse_program() - -if (error_count == 0) { - semantic_check(ast) -} - -// Merge parse errors -var _mi = 0 -if (length(errors) > 0) { - if (ast.errors != null) { - _mi = 0 - while (_mi < length(errors)) { - push(ast.errors, errors[_mi]) - _mi = _mi + 1 - } - } else { - ast.errors = errors - } -} - +var result = tokenize(src, filename) +var ast = parse(result.tokens, src, filename) print(json.encode(ast)) diff --git a/parse.cm b/parse.cm new file mode 100644 index 00000000..f3aced0f --- /dev/null +++ b/parse.cm @@ -0,0 +1,1850 @@ +def CP_SLASH = 47 +def CP_BSLASH = 92 + +var is_alpha = function(c) { + return (c >= 65 && c <= 90) || (c >= 97 && c <= 122) +} + +var parse = function(tokens, src, filename) { + var _src_len = length(src) + var cp = [] + var _i = 0 + while (_i < _src_len) { + push(cp, codepoint(src[_i])) + _i = _i + 1 + } + + // ============================================================ + // Parser Cursor + // ============================================================ + + var cursor = 0 + var tok = null + var got_lf = false + var prev_tok = null + + var advance = function() { + var t = null + var k = null + prev_tok = tok + cursor = cursor + 1 + got_lf = false + while (cursor < length(tokens)) { + t = tokens[cursor] + k = t.kind + if (k == "space" || k == "comment") { + cursor = cursor + 1 + continue + } + if (k == "newline") { + got_lf = true + cursor = cursor + 1 + continue + } + tok = t + return null + } + tok = tokens[length(tokens) - 1] + } + + var peek_ahead = function(n) { + var c = cursor + 1 + var count = 0 + var t = null + var k = null + while (c < length(tokens)) { + t = tokens[c] + k = t.kind + if (k != "space" && k != "comment" && k != 
"newline") { + count = count + 1 + if (count == n) return t + } + c = c + 1 + } + return tokens[length(tokens) - 1] + } + + var init_cursor = function() { + cursor = -1 + advance() + } + + // ============================================================ + // AST Helpers + // ============================================================ + + var errors = [] + var error_count = 0 + var function_nr = 1 + + var ast_node = function(kind, token) { + return { + kind: kind, + at: token.at, + from_row: token.from_row, + from_column: token.from_column + } + } + + var ast_node_end = function(node) { + node.to_row = prev_tok.to_row + node.to_column = prev_tok.to_column + return node + } + + var parse_error = function(token, msg) { + if (error_count >= 5) return null + error_count = error_count + 1 + push(errors, { + message: msg, + line: token.from_row + 1, + column: token.from_column + 1, + offset: token.at + }) + } + + var is_keyword = function(kind) { + return kind == "if" || kind == "in" || kind == "do" || kind == "go" || + kind == "var" || kind == "def" || kind == "for" || + kind == "else" || kind == "this" || kind == "null" || kind == "true" || + kind == "false" || kind == "while" || kind == "break" || + kind == "return" || kind == "delete" || + kind == "disrupt" || kind == "function" || kind == "continue" || + kind == "disruption" + } + + // ============================================================ + // Expression Parsing + // ============================================================ + + // Forward declarations via var + var parse_expr = null + var parse_assign_expr = null + var parse_assign = null + var parse_statement = null + var parse_block_statements = null + var parse_function_inner = null + var parse_arrow_function = null + + var is_arrow_function = function() { + if (tok.kind != "(") return false + var c = cursor + 1 + var depth = 1 + var k = null + while (c < length(tokens) && depth > 0) { + k = tokens[c].kind + if (k == "(") { depth = depth + 1 } + else if 
(k == ")") { depth = depth - 1 } + else if (k == "text" || k == "number") { null } + c = c + 1 + } + while (c < length(tokens)) { + k = tokens[c].kind + if (k != "space" && k != "newline" && k != "comment") break + c = c + 1 + } + if (c >= length(tokens)) return false + return tokens[c].kind == "=>" + } + + var parse_primary = function() { + var start = tok + var node = null + var k = tok.kind + var list = null + var pair = null + var left = null + var right = null + var is_ident = false + var is_kw = false + var p1 = null + var elem = null + var fn_start = null + var fn = null + var name_item = null + var params = null + var param = null + var rpos = 0 + var pattern_str = "" + var flags = "" + + if (k == "number") { + node = ast_node("number", start) + node.value = tok.value + node.number = tok.number + advance() + ast_node_end(node) + return node + } + if (k == "text") { + node = ast_node("text", start) + node.value = tok.value + advance() + ast_node_end(node) + return node + } + if (k == "name") { + p1 = peek_ahead(1) + if (p1.kind == "=>") { + return parse_arrow_function() + } + node = ast_node("name", start) + node.name = tok.value + advance() + ast_node_end(node) + return node + } + if (k == "null") { + node = ast_node("null", start) + advance() + ast_node_end(node) + return node + } + if (k == "true") { + node = ast_node("true", start) + advance() + ast_node_end(node) + return node + } + if (k == "false") { + node = ast_node("false", start) + advance() + ast_node_end(node) + return node + } + if (k == "this") { + node = ast_node("this", start) + advance() + ast_node_end(node) + return node + } + if (k == "[") { + node = ast_node("array", start) + list = [] + node.list = list + advance() + while (tok.kind != "]" && tok.kind != "eof") { + elem = parse_assign_expr() + if (elem != null) push(list, elem) + if (tok.kind == ",") advance() + else break + } + ast_node_end(node) + if (tok.kind == "]") advance() + else if (tok.kind == "eof") parse_error(tok, 
"unterminated array literal, expected ']'") + return node + } + if (k == "{") { + node = ast_node("record", start) + list = [] + node.list = list + advance() + while (tok.kind != "}" && tok.kind != "eof") { + pair = {} + is_ident = (tok.kind == "name") + is_kw = is_keyword(tok.kind) + if (is_ident || is_kw || tok.kind == "text" || tok.kind == "number") { + if (is_kw) { + left = ast_node("name", tok) + left.name = tok.kind + advance() + ast_node_end(left) + } else { + left = parse_primary() + } + pair.left = left + } else if (tok.kind == "[") { + advance() + left = parse_assign_expr() + pair.left = left + if (tok.kind == "]") advance() + else parse_error(tok, "expected ']' after computed property") + } else { + parse_error(tok, "expected property name in object literal") + break + } + if (tok.kind == ":") { + advance() + right = parse_assign_expr() + pair.right = right + } else if (tok.kind == "(") { + fn_start = tok + fn = ast_node("function", fn_start) + name_item = pair.left + if (name_item != null && name_item.name != null) { + fn.name = name_item.name + } + params = [] + fn.list = params + advance() + while (tok.kind != ")" && tok.kind != "eof") { + if (tok.kind == "name") { + param = ast_node("name", tok) + param.name = tok.value + advance() + ast_node_end(param) + if (tok.kind == "=" || tok.kind == "|") { + advance() + param.expression = parse_expr() + } + push(params, param) + } else { + parse_error(tok, "expected parameter name") + break + } + if (tok.kind == ",") advance() + else break + } + if (tok.kind == ")") advance() + else if (tok.kind == "eof") parse_error(tok, "unterminated method parameter list") + if (length(params) > 4) parse_error(tok, "functions cannot have more than 4 parameters") + if (tok.kind == "{") { + advance() + fn.statements = parse_block_statements() + if (tok.kind == "}") advance() + else if (tok.kind == "eof") parse_error(tok, "unterminated method body") + } else { + parse_error(tok, "expected '{' for method body") + } + 
fn.function_nr = function_nr + function_nr = function_nr + 1 + ast_node_end(fn) + pair.right = fn + } else if (!(is_ident && (tok.kind == "," || tok.kind == "}"))) { + parse_error(tok, "expected ':' after property name") + } + push(list, pair) + if (tok.kind == ",") advance() + else break + } + ast_node_end(node) + if (tok.kind == "}") advance() + else if (tok.kind == "eof") parse_error(tok, "unterminated object literal, expected '}'") + return node + } + if (k == "(") { + if (is_arrow_function()) { + return parse_arrow_function() + } + advance() + node = parse_expr() + if (tok.kind == ")") advance() + else if (tok.kind == "eof") parse_error(tok, "unterminated parenthesized expression, expected ')'") + else parse_error(tok, "expected ')' after expression") + return node + } + if (k == "function") { + return parse_function_inner() + } + if (k == "/") { + node = ast_node("regexp", start) + rpos = tok.at + 1 + pattern_str = "" + flags = "" + while (rpos < _src_len && cp[rpos] != CP_SLASH) { + if (cp[rpos] == CP_BSLASH && rpos + 1 < _src_len) { + pattern_str = pattern_str + character(cp[rpos]) + character(cp[rpos + 1]) + rpos = rpos + 2 + } else { + pattern_str = pattern_str + character(cp[rpos]) + rpos = rpos + 1 + } + } + if (rpos < _src_len) rpos = rpos + 1 + while (rpos < _src_len && is_alpha(cp[rpos])) { + flags = flags + character(cp[rpos]) + rpos = rpos + 1 + } + node.pattern = pattern_str + if (length(flags) > 0) node.flags = flags + advance() + ast_node_end(node) + return node + } + + if (k == "eof") { + parse_error(start, "unexpected end of input") + } else { + parse_error(start, "unexpected token where expression expected") + } + advance() + return null + } + + var parse_postfix = function() { + var node = parse_primary() + var start = null + var new_node = null + var index = null + var arg = null + var args_list = null + if (node == null) return null + while (true) { + start = tok + if (tok.kind == ".") { + advance() + new_node = ast_node(".", start) + 
new_node.left = node + if (tok.kind == "name" || is_keyword(tok.kind)) { + if (tok.kind == "name") { + new_node.right = tok.value + } else { + new_node.right = tok.kind + } + advance() + } else { + parse_error(tok, "expected property name after '.'") + } + ast_node_end(new_node) + node = new_node + } else if (tok.kind == "[") { + advance() + new_node = ast_node("[", start) + new_node.left = node + if (tok.kind == "]") { + advance() + } else { + index = parse_assign_expr() + new_node.right = index + if (tok.kind == "]") advance() + else parse_error(tok, "expected ']'") + } + ast_node_end(new_node) + node = new_node + } else if (tok.kind == "(") { + advance() + new_node = ast_node("(", start) + new_node.expression = node + args_list = [] + new_node.list = args_list + while (tok.kind != ")" && tok.kind != "eof") { + arg = parse_assign_expr() + if (arg != null) push(args_list, arg) + if (tok.kind == ",") advance() + else break + } + if (tok.kind == ")") advance() + else parse_error(tok, "unterminated argument list, expected ')'") + ast_node_end(new_node) + node = new_node + } else if (tok.kind == "++") { + new_node = ast_node("++", start) + new_node.expression = node + new_node.postfix = true + advance() + ast_node_end(new_node) + node = new_node + } else if (tok.kind == "--") { + new_node = ast_node("--", start) + new_node.expression = node + new_node.postfix = true + advance() + ast_node_end(new_node) + node = new_node + } else { + break + } + } + return node + } + + var parse_unary = function() { + var start = tok + var node = null + var expr = null + var k = tok.kind + if (k == "!") { + advance() + node = ast_node("!", start) + node.expression = parse_unary() + ast_node_end(node) + return node + } + if (k == "~") { + advance() + node = ast_node("~", start) + node.expression = parse_unary() + ast_node_end(node) + return node + } + if (k == "+") { + advance() + node = ast_node("+unary", start) + node.expression = parse_unary() + ast_node_end(node) + return node + } + 
if (k == "-") { + advance() + node = ast_node("-unary", start) + node.expression = parse_unary() + ast_node_end(node) + return node + } + if (k == "++") { + advance() + node = ast_node("++", start) + node.expression = parse_unary() + node.postfix = false + ast_node_end(node) + return node + } + if (k == "--") { + advance() + node = ast_node("--", start) + node.expression = parse_unary() + node.postfix = false + ast_node_end(node) + return node + } + if (k == "delete") { + advance() + node = ast_node("delete", start) + node.expression = parse_unary() + ast_node_end(node) + return node + } + return parse_postfix() + } + + // Binary operator precedence + var binop_prec = { + "**": 14, + "*": 13, "/": 13, "%": 13, + "+": 12, "-": 12, + "<<": 11, ">>": 11, ">>>": 11, + "<": 10, ">": 10, "<=": 10, ">=": 10, in: 10, + "==": 9, "!=": 9, "===": 9, "!==": 9, + "&": 8, "^": 7, "|": 6, + "&&": 5, "||": 4 + } + + var parse_binary = function(min_prec) { + var left_node = parse_unary() + var start = null + var op = null + var prec = null + var next_prec = 0 + var right_node = null + var node = null + if (left_node == null) return null + while (true) { + start = tok + op = tok.kind + prec = binop_prec[op] + if (prec == null || prec < min_prec) break + advance() + next_prec = prec + 1 + if (prec == 14) next_prec = prec // right-assoc for ** + right_node = parse_binary(next_prec) + node = ast_node(op, start) + node.left = left_node + node.right = right_node + ast_node_end(node) + left_node = node + } + return left_node + } + + var parse_ternary = function() { + var cond = parse_binary(1) + var start = null + var then_expr = null + var else_expr = null + var node = null + if (cond == null) return null + if (tok.kind == "?") { + start = tok + advance() + then_expr = parse_expr() + if (tok.kind == ":") advance() + else parse_error(tok, "expected ':' in ternary expression") + else_expr = parse_expr() + node = ast_node("then", start) + node.expression = cond + node.then = then_expr + 
node.else = else_expr + ast_node_end(node) + return node + } + return cond + } + + // Assign operators + var assign_ops = { + "=": "assign", "+=": "+=", "-=": "-=", "*=": "*=", "/=": "/=", "%=": "%=", + "<<=": "<<=", ">>=": ">>=", ">>>=": ">>>=", + "&=": "&=", "^=": "^=", "|=": "|=", "**=": "**=", + "&&=": "&&=", "||=": "||=" + } + + parse_assign = function(unused) { + var left_node = parse_ternary() + var start = null + var kind = null + var right_node = null + var node = null + var left_kind = null + var right_kind = null + if (left_node == null) return null + start = tok + kind = assign_ops[tok.kind] + if (kind == null) return left_node + + left_kind = left_node.kind + if (left_kind != "name" && left_kind != "." && left_kind != "[") { + parse_error(start, "invalid assignment left-hand side") + } + + advance() + right_node = parse_assign() + node = ast_node(kind, start) + node.left = left_node + node.right = right_node + + if (left_node.kind == "[" && left_node.right == null) node.push = true + if (right_node != null && right_node.kind == "[" && right_node.right == null) node.pop = true + + ast_node_end(node) + return node + } + + parse_assign_expr = function(unused) { + return parse_assign() + } + + parse_expr = function(unused) { + var left_node = parse_assign() + var start = null + var right_node = null + var node = null + if (left_node == null) return null + while (tok.kind == ",") { + start = tok + advance() + right_node = parse_assign() + node = ast_node(",", start) + node.left = left_node + node.right = right_node + ast_node_end(node) + left_node = node + } + return left_node + } + + // ============================================================ + // Statement Parsing + // ============================================================ + + var in_disruption = 0 + + var expect_semi = function() { + if (tok.kind == ";") { advance(); return null } + if (tok.kind == "eof" || tok.kind == "}" || got_lf || tok.kind == "else") return null + parse_error(tok, 
"expecting ';'") + } + + var sync_to_statement = function() { + var k = null + while (tok.kind != "eof") { + k = tok.kind + if (k == ";") { advance(); return null } + if (k == "}") return null + if (k == "var" || k == "def" || k == "if" || k == "while" || + k == "for" || k == "return" || k == "disrupt" || + k == "function" || k == "break" || k == "continue" || k == "do") return null + advance() + } + } + + parse_block_statements = function(unused) { + var stmts = [] + var before = null + var stmt = null + while (tok.kind != "}" && tok.kind != "eof") { + before = cursor + stmt = parse_statement() + if (stmt != null) { + push(stmts, stmt) + } else if (cursor == before) { + sync_to_statement() + } + } + return stmts + } + + parse_function_inner = function(unused) { + var start = tok + var node = ast_node("function", start) + var params = [] + var stmts = null + var param = null + var prev_names = null + var pname = null + var dup = false + var j = 0 + var old_dis = 0 + + if (in_disruption) { + parse_error(tok, "cannot define function inside disruption clause") + } + + advance() // skip 'function' + + if (tok.kind == "name") { + node.name = tok.value + advance() + } + + node.list = params + if (tok.kind == "(") { + advance() + prev_names = [] + while (tok.kind != ")" && tok.kind != "eof") { + if (tok.kind == "name") { + param = ast_node("name", tok) + param.name = tok.value + pname = tok.value + dup = false + j = 0 + while (j < length(prev_names)) { + if (prev_names[j] == pname) { dup = true; break } + j = j + 1 + } + if (dup) parse_error(tok, "duplicate parameter name '" + pname + "'") + push(prev_names, pname) + advance() + ast_node_end(param) + if (tok.kind == "=" || tok.kind == "|") { + advance() + param.expression = parse_assign_expr() + } + push(params, param) + } else { + parse_error(tok, "expected parameter name") + break + } + if (tok.kind == ",") advance() + else break + } + if (tok.kind == ")") advance() + else if (tok.kind == "eof") parse_error(tok, 
"unterminated function parameter list, expected ')'") + } else { + parse_error(tok, "expected '(' after function name") + } + + if (length(params) > 4) parse_error(tok, "functions cannot have more than 4 parameters") + + if (tok.kind == "{") { + advance() + stmts = parse_block_statements() + node.statements = stmts + if (tok.kind == "}") advance() + else if (tok.kind == "eof") parse_error(tok, "unterminated function body, expected '}'") + } else { + parse_error(tok, "expected '{' for function body") + } + + if (tok.kind == "disruption") { + advance() + if (tok.kind == "{") { + advance() + old_dis = in_disruption + in_disruption = 1 + node.disruption = parse_block_statements() + in_disruption = old_dis + if (tok.kind == "}") advance() + else if (tok.kind == "eof") parse_error(tok, "unterminated disruption clause, expected '}'") + } else { + parse_error(tok, "expected '{' after disruption") + } + } + + node.function_nr = function_nr + function_nr = function_nr + 1 + ast_node_end(node) + return node + } + + parse_arrow_function = function(unused) { + var start = tok + var node = ast_node("function", start) + var params = [] + var param = null + var stmts = null + var ret = null + var expr = null + var prev_names = null + var pname = null + var dup = false + var j = 0 + node.arrow = true + + if (in_disruption) { + parse_error(tok, "cannot define function inside disruption clause") + } + + node.list = params + + if (tok.kind == "name") { + param = ast_node("name", tok) + param.name = tok.value + advance() + ast_node_end(param) + push(params, param) + } else if (tok.kind == "(") { + advance() + prev_names = [] + while (tok.kind != ")" && tok.kind != "eof") { + if (tok.kind == "name") { + param = ast_node("name", tok) + param.name = tok.value + pname = tok.value + dup = false + j = 0 + while (j < length(prev_names)) { + if (prev_names[j] == pname) { dup = true; break } + j = j + 1 + } + if (dup) parse_error(tok, "duplicate parameter name '" + pname + "'") + 
push(prev_names, pname) + advance() + ast_node_end(param) + if (tok.kind == "=" || tok.kind == "|") { + advance() + param.expression = parse_assign_expr() + } + push(params, param) + } else { + parse_error(tok, "expected parameter name") + break + } + if (tok.kind == ",") advance() + else break + } + if (tok.kind == ")") advance() + } + + if (length(params) > 4) parse_error(tok, "functions cannot have more than 4 parameters") + + if (tok.kind != "=>") { + parse_error(tok, "expected '=>' in arrow function") + } else { + advance() + } + + if (tok.kind == "{") { + advance() + stmts = parse_block_statements() + node.statements = stmts + if (tok.kind == "}") advance() + } else { + stmts = [] + ret = ast_node("return", tok) + expr = parse_assign_expr() + ret.expression = expr + ast_node_end(ret) + push(stmts, ret) + node.statements = stmts + } + + node.function_nr = function_nr + function_nr = function_nr + 1 + ast_node_end(node) + return node + } + + parse_statement = function(unused) { + var start = tok + var node = null + var k = tok.kind + var stmts = null + var cond = null + var then_stmts = null + var else_stmts = null + var else_ifs = null + var body = null + var expr = null + var init = null + var test = null + var update = null + var left_node = null + var right_node = null + var kind_name = null + var is_def = false + var decls = null + var decl_count = 0 + var var_name = null + var right_kind = null + var elif = null + var p1_tok = null + var labeled_stmt = null + + if (k == "{") { + node = ast_node("block", start) + advance() + stmts = parse_block_statements() + node.statements = stmts + if (tok.kind == "}") advance() + ast_node_end(node) + return node + } + + if (k == "var" || k == "def") { + kind_name = k + is_def = (k == "def") + advance() + if (tok.kind != "name") { + parse_error(tok, "expected identifier after '" + kind_name + "'") + return null + } + decls = [] + decl_count = 0 + while (tok.kind == "name") { + node = ast_node(kind_name, start) + 
left_node = ast_node("name", tok) + left_node.name = tok.value + var_name = tok.value + advance() + ast_node_end(left_node) + node.left = left_node + if (tok.kind == "=") { + advance() + right_node = parse_assign_expr() + node.right = right_node + if (right_node != null && right_node.kind == "[" && right_node.right == null) { + node.pop = true + } + } else if (is_def) { + parse_error(start, "missing initializer for constant '" + var_name + "'") + } + ast_node_end(node) + push(decls, node) + decl_count = decl_count + 1 + if (tok.kind == ",") advance() + else break + } + expect_semi() + if (decl_count == 1) { + return decls[0] + } + node = ast_node("var_list", start) + node.list = decls + ast_node_end(node) + return node + } + + if (k == "if") { + node = ast_node("if", start) + advance() + if (tok.kind == "(") advance() + else parse_error(tok, "expected '(' before condition") + cond = parse_expr() + node.expression = cond + if (tok.kind == ")") advance() + else parse_error(tok, "expected ')' after if condition") + then_stmts = [] + node.then = then_stmts + body = parse_statement() + if (body != null) push(then_stmts, body) + else_ifs = [] + node.list = else_ifs + if (tok.kind == "else") { + advance() + if (tok.kind == "if") { + elif = parse_statement() + if (elif != null) push(else_ifs, elif) + } else { + else_stmts = [] + node.else = else_stmts + body = parse_statement() + if (body != null) push(else_stmts, body) + } + } + ast_node_end(node) + return node + } + + if (k == "while") { + node = ast_node("while", start) + advance() + if (tok.kind == "(") advance() + else parse_error(tok, "expected '(' before condition") + cond = parse_expr() + node.expression = cond + if (tok.kind == ")") advance() + else parse_error(tok, "expected ')' after while condition") + stmts = [] + node.statements = stmts + body = parse_statement() + if (body != null) push(stmts, body) + ast_node_end(node) + return node + } + + if (k == "do") { + node = ast_node("do", start) + advance() + stmts 
= [] + node.statements = stmts + body = parse_statement() + if (body != null) push(stmts, body) + if (tok.kind == "while") advance() + else parse_error(tok, "expected 'while' after do body") + if (tok.kind == "(") advance() + else parse_error(tok, "expected '(' before condition") + cond = parse_expr() + node.expression = cond + if (tok.kind == ")") advance() + else parse_error(tok, "expected ')' after do-while condition") + expect_semi() + ast_node_end(node) + return node + } + + if (k == "for") { + node = ast_node("for", start) + advance() + if (tok.kind == "(") advance() + else parse_error(tok, "expected '(' after for") + if (tok.kind != ";") { + if (tok.kind == "var" || tok.kind == "def") { + init = parse_statement() + node.init = init + } else { + init = parse_expr() + node.init = init + if (tok.kind == ";") advance() + } + } else { + advance() + } + if (tok.kind != ";") { + test = parse_expr() + node.test = test + } + if (tok.kind == ";") advance() + if (tok.kind != ")") { + update = parse_expr() + node.update = update + } + if (tok.kind == ")") advance() + else parse_error(tok, "expected ')' after for clauses") + stmts = [] + node.statements = stmts + body = parse_statement() + if (body != null) push(stmts, body) + ast_node_end(node) + return node + } + + if (k == "return") { + node = ast_node("return", start) + advance() + if (tok.kind != ";" && tok.kind != "}" && !got_lf) { + expr = parse_expr() + node.expression = expr + } + expect_semi() + ast_node_end(node) + return node + } + + if (k == "go") { + node = ast_node("go", start) + advance() + if (tok.kind != ";" && tok.kind != "}" && !got_lf) { + expr = parse_expr() + node.expression = expr + } + expect_semi() + ast_node_end(node) + return node + } + + if (k == "disrupt") { + node = ast_node("disrupt", start) + advance() + expect_semi() + ast_node_end(node) + return node + } + + if (k == "break") { + node = ast_node("break", start) + advance() + if (tok.kind == "name" && !got_lf) { + node.name = tok.value + 
advance() + } + expect_semi() + ast_node_end(node) + return node + } + + if (k == "continue") { + node = ast_node("continue", start) + advance() + if (tok.kind == "name" && !got_lf) { + node.name = tok.value + advance() + } + expect_semi() + ast_node_end(node) + return node + } + + if (k == "function") { + return parse_function_inner() + } + + if (k == ";") { + advance() + return null + } + + if (k == "name") { + p1_tok = peek_ahead(1) + if (p1_tok.kind == ":") { + node = ast_node("label", start) + node.name = tok.value + advance() // skip identifier + advance() // skip colon + labeled_stmt = parse_statement() + node.statement = labeled_stmt + ast_node_end(node) + return node + } + } + + expr = parse_expr() + if (expr != null) { + node = ast_node("call", start) + node.expression = expr + ast_node_end(node) + expect_semi() + return node + } + parse_error(start, "unexpected token at start of statement") + return null + } + + // ============================================================ + // Program + // ============================================================ + + var parse_program = function() { + var root = {kind: "program", filename: filename} + var functions = [] + var statements = [] + var before = 0 + var stmt = null + root.functions = functions + root.statements = statements + + while (tok.kind != "eof") { + before = cursor + stmt = parse_statement() + if (stmt != null) { + if (stmt.kind == "function") { + push(functions, stmt) + } else { + push(statements, stmt) + } + } else if (cursor == before) { + sync_to_statement() + } + } + return root + } + + // ============================================================ + // Semantic Analysis + // ============================================================ + + var sem_errors = [] + var scopes_array = [] + var intrinsics = [] + var block_var_counter = 0 + + var sem_error = function(node, msg) { + var err = {message: msg} + if (node.from_row != null) err.line = node.from_row + 1 + if (node.from_column != null) 
err.column = node.from_column + 1 + push(sem_errors, err) + } + + var make_scope = function(parent, fn_nr, opts) { + return { + parent: parent, + vars: [], + in_loop: opts.in_loop == true, + function_nr: fn_nr, + is_function_scope: opts.is_func == true, + block_depth: opts.bdepth != null ? opts.bdepth : 0 + } + } + + var sem_add_var = function(scope, name, make_opts) { + push(scope.vars, { + name: name, + scope_name: null, + is_const: make_opts.is_const == true, + make: make_opts.make, + function_nr: make_opts.fn_nr, + nr_uses: 0, + closure: 0 + }) + } + + var sem_lookup_var = function(scope, name) { + var result = {v: null, level: 0, def_function_nr: -1} + var cur_fn = scope.function_nr + var s = scope + var i = 0 + while (s != null) { + i = 0 + while (i < length(s.vars)) { + if (s.vars[i].name == name) { + result.v = s.vars[i] + result.def_function_nr = s.vars[i].function_nr + return result + } + i = i + 1 + } + if (s.parent != null && s.parent.function_nr != cur_fn) { + result.level = result.level + 1 + cur_fn = s.parent.function_nr + } + s = s.parent + } + return result + } + + var sem_find_var = function(scope, name) { + var r = sem_lookup_var(scope, name) + return r.v + } + + var sem_in_loop = function(scope) { + var s = scope + while (s != null) { + if (s.in_loop) return true + s = s.parent + } + return false + } + + var sem_add_intrinsic = function(name) { + var i = 0 + while (i < length(intrinsics)) { + if (intrinsics[i] == name) return null + i = i + 1 + } + push(intrinsics, name) + } + + // Operator "functino" names: an operator spelling followed by '!'. + // Must list every such spelling the tokenizer emits as a name token, + // otherwise sem_check_expr records it as an intrinsic instead. + var functino_names = { + "+!": true, "-!": true, "*!": true, "/!": true, "%!": true, "**!": true, + "<!": true, ">!": true, "<=!": true, ">=!": true, "=!": true, "!=!": true, + "&!": true, "|!": true, "^!": true, "<<!": true, ">>!": true, ">>>!": true, + "&&!": true, "||!": true, "~!": true, "[]!": true + } + + var is_functino_name = function(name) { + return functino_names[name] == true + } + + var sem_propagate_block_vars = function(parent, block) { + var i = 0 + var v = null + var sn = null +
while (i < length(block.vars)) { + v = block.vars[i] + sn = v.scope_name + if (sn == null) sn = v.name + push(parent.vars, { + name: sn, + scope_name: null, + is_const: v.is_const, + make: v.make, + function_nr: v.function_nr, + nr_uses: v.nr_uses, + closure: v.closure + }) + i = i + 1 + } + } + + var sem_build_scope_record = function(scope) { + var rec = {function_nr: scope.function_nr} + var slots = 0 + var close_slots = 0 + var i = 0 + var v = null + while (i < length(scope.vars)) { + v = scope.vars[i] + rec[v.name] = { + make: v.make, + function_nr: v.function_nr, + nr_uses: v.nr_uses, + closure: v.closure == 1, + level: 0 + } + slots = slots + 1 + if (v.closure) close_slots = close_slots + 1 + i = i + 1 + } + return {rec: rec, nr_slots: slots, nr_close: close_slots} + } + + // Forward declarations + var sem_check_expr = null + var sem_check_stmt = null + + var sem_predeclare_vars = function(scope, stmts) { + var i = 0 + var stmt = null + var kind = null + var name = null + var item = null + var ik = null + var j = 0 + while (i < length(stmts)) { + stmt = stmts[i] + kind = stmt.kind + if (kind == "function") { + name = stmt.name + if (name != null && sem_find_var(scope, name) == null) { + sem_add_var(scope, name, {make: "function", fn_nr: scope.function_nr}) + } + } else if (kind == "var") { + name = stmt.left.name + if (name != null && sem_find_var(scope, name) == null) { + sem_add_var(scope, name, {make: "var", fn_nr: scope.function_nr}) + } + } else if (kind == "var_list") { + j = 0 + while (j < length(stmt.list)) { + item = stmt.list[j] + ik = item.kind + if (ik == "var") { + name = item.left.name + if (name != null && sem_find_var(scope, name) == null) { + sem_add_var(scope, name, {make: "var", fn_nr: scope.function_nr}) + } + } + j = j + 1 + } + } + i = i + 1 + } + } + + var sem_check_assign_target = function(scope, left_node) { + if (left_node == null) return null + var kind = left_node.kind + var name = null + var v = null + var r = null + var obj_expr 
= null + + if (kind == "name") { + name = left_node.name + if (name == null) return null + v = sem_find_var(scope, name) + if (v == null) { + sem_error(left_node, "cannot assign to unbound variable '" + name + "'") + } else if (v.is_const) { + sem_error(left_node, "cannot assign to constant '" + name + "'") + } + r = sem_lookup_var(scope, name) + if (r.v != null) { + left_node.level = r.level + left_node.function_nr = r.def_function_nr + if (r.v.scope_name != null) left_node.scope_name = r.v.scope_name + } else { + left_node.level = -1 + } + } else if (kind == "." || kind == "[") { + obj_expr = left_node.left + sem_check_expr(scope, obj_expr) + if (kind == "[" && left_node.right != null) { + sem_check_expr(scope, left_node.right) + } + } + } + + sem_check_expr = function(scope, expr) { + if (expr == null) return null + var kind = expr.kind + if (kind == null) return null + var name = null + var r = null + var i = 0 + var operand = null + var v = null + var prop = null + var val = null + var fn_nr_val = null + var fn_scope = null + var pname = null + var def_val = null + var sr = null + + if (kind == "assign" || kind == "+=" || kind == "-=" || kind == "*=" || + kind == "/=" || kind == "%=" || kind == "<<=" || kind == ">>=" || + kind == ">>>=" || kind == "&=" || kind == "^=" || kind == "|=" || + kind == "**=" || kind == "&&=" || kind == "||=") { + sem_check_assign_target(scope, expr.left) + sem_check_expr(scope, expr.right) + return null + } + + if (kind == "++" || kind == "--") { + operand = expr.expression + if (operand != null && operand.kind == "name") { + name = operand.name + if (name != null) { + v = sem_find_var(scope, name) + if (v == null) { + sem_error(expr, "cannot assign to unbound variable '" + name + "'") + } else if (v.is_const) { + sem_error(expr, "cannot assign to constant '" + name + "'") + } + r = sem_lookup_var(scope, name) + if (r.v != null) { + operand.level = r.level + operand.function_nr = r.def_function_nr + if (r.v.scope_name != null) 
operand.scope_name = r.v.scope_name + } else { + operand.level = -1 + } + } + } + return null + } + + if (kind == "," || kind == "+" || kind == "-" || kind == "*" || + kind == "/" || kind == "%" || kind == "==" || kind == "!=" || + kind == "<" || kind == ">" || kind == "<=" || kind == ">=" || + kind == "&&" || kind == "||" || kind == "&" || + kind == "|" || kind == "^" || kind == "<<" || kind == ">>" || + kind == ">>>" || kind == "**" || kind == "in" || + kind == "." || kind == "[") { + sem_check_expr(scope, expr.left) + sem_check_expr(scope, expr.right) + return null + } + + if (kind == "then") { + sem_check_expr(scope, expr.expression) + sem_check_expr(scope, expr.then) + sem_check_expr(scope, expr.else) + return null + } + + if (kind == "(") { + sem_check_expr(scope, expr.expression) + i = 0 + while (i < length(expr.list)) { + sem_check_expr(scope, expr.list[i]) + i = i + 1 + } + return null + } + + if (kind == "!" || kind == "~" || kind == "delete" || + kind == "-unary" || kind == "+unary") { + sem_check_expr(scope, expr.expression) + return null + } + + if (kind == "array") { + i = 0 + while (i < length(expr.list)) { + sem_check_expr(scope, expr.list[i]) + i = i + 1 + } + return null + } + + if (kind == "record") { + i = 0 + while (i < length(expr.list)) { + prop = expr.list[i] + val = prop.right + sem_check_expr(scope, val) + i = i + 1 + } + return null + } + + if (kind == "function") { + fn_nr_val = expr.function_nr + if (fn_nr_val == null) fn_nr_val = scope.function_nr + fn_scope = make_scope(scope, fn_nr_val, {is_func: true}) + expr.outer = scope.function_nr + i = 0 + while (i < length(expr.list)) { + pname = expr.list[i].name + if (pname != null) sem_add_var(fn_scope, pname, {is_const: true, make: "input", fn_nr: fn_nr_val}) + def_val = expr.list[i].expression + if (def_val != null) sem_check_expr(fn_scope, def_val) + i = i + 1 + } + if (expr.statements != null) { + sem_predeclare_vars(fn_scope, expr.statements) + i = 0 + while (i < 
length(expr.statements)) { + sem_check_stmt(fn_scope, expr.statements[i]) + i = i + 1 + } + } + if (expr.disruption != null) { + i = 0 + while (i < length(expr.disruption)) { + sem_check_stmt(fn_scope, expr.disruption[i]) + i = i + 1 + } + } + sr = sem_build_scope_record(fn_scope) + push(scopes_array, sr.rec) + expr.nr_slots = sr.nr_slots + expr.nr_close_slots = sr.nr_close + return null + } + + if (kind == "text literal") { + i = 0 + while (i < length(expr.list)) { + sem_check_expr(scope, expr.list[i]) + i = i + 1 + } + return null + } + + if (kind == "name") { + name = expr.name + if (name != null) { + if (is_functino_name(name)) { + expr.make = "functino" + expr.level = -1 + return null + } + r = sem_lookup_var(scope, name) + if (r.v != null) { + expr.level = r.level + expr.function_nr = r.def_function_nr + r.v.nr_uses = r.v.nr_uses + 1 + if (r.level > 0) r.v.closure = 1 + if (r.v.scope_name != null) expr.scope_name = r.v.scope_name + } else { + expr.level = -1 + sem_add_intrinsic(name) + } + } + return null + } + } + + sem_check_stmt = function(scope, stmt) { + if (stmt == null) return null + var kind = stmt.kind + if (kind == null) return null + var name = null + var existing = null + var i = 0 + var sn = null + var then_scope = null + var list_scope = null + var else_scope = null + var loop_scope = null + var do_scope = null + var for_scope = null + var init_kind = null + var blk_scope = null + var fn_nr_val = null + var fn_scope = null + var pname = null + var def_val = null + var sr = null + + if (kind == "var_list") { + i = 0 + while (i < length(stmt.list)) { + sem_check_stmt(scope, stmt.list[i]) + i = i + 1 + } + return null + } + + if (kind == "var") { + name = stmt.left.name + if (name != null) { + existing = sem_find_var(scope, name) + if (existing != null && existing.is_const) { + sem_error(stmt.left, "cannot redeclare constant '" + name + "'") + } + if (existing == null || existing.function_nr != scope.function_nr || scope.block_depth > 0) { + 
sem_add_var(scope, name, {make: "var", fn_nr: scope.function_nr}) + } + if (scope.block_depth > 0) { + sn = "_" + name + "_" + text(block_var_counter) + block_var_counter = block_var_counter + 1 + scope.vars[length(scope.vars) - 1].scope_name = sn + stmt.left.scope_name = sn + } + } + sem_check_expr(scope, stmt.right) + return null + } + + if (kind == "def") { + name = stmt.left.name + if (name != null) { + existing = sem_find_var(scope, name) + if (existing != null && existing.is_const) { + sem_error(stmt.left, "cannot redeclare constant '" + name + "'") + } else if (existing != null && !existing.is_const && existing.function_nr == scope.function_nr) { + existing.is_const = 1 + existing.make = "def" + } else { + sem_add_var(scope, name, {is_const: true, make: "def", fn_nr: scope.function_nr}) + if (scope.block_depth > 0) { + sn = "_" + name + "_" + text(block_var_counter) + block_var_counter = block_var_counter + 1 + scope.vars[length(scope.vars) - 1].scope_name = sn + stmt.left.scope_name = sn + } + } + } + sem_check_expr(scope, stmt.right) + return null + } + + if (kind == "call") { + sem_check_expr(scope, stmt.expression) + return null + } + + if (kind == "if") { + sem_check_expr(scope, stmt.expression) + then_scope = make_scope(scope, scope.function_nr, {bdepth: scope.block_depth + 1}) + i = 0 + while (i < length(stmt.then)) { + sem_check_stmt(then_scope, stmt.then[i]) + i = i + 1 + } + sem_propagate_block_vars(scope, then_scope) + list_scope = make_scope(scope, scope.function_nr, {bdepth: scope.block_depth + 1}) + i = 0 + while (i < length(stmt.list)) { + sem_check_stmt(list_scope, stmt.list[i]) + i = i + 1 + } + sem_propagate_block_vars(scope, list_scope) + if (stmt.else != null) { + else_scope = make_scope(scope, scope.function_nr, {bdepth: scope.block_depth + 1}) + i = 0 + while (i < length(stmt.else)) { + sem_check_stmt(else_scope, stmt.else[i]) + i = i + 1 + } + sem_propagate_block_vars(scope, else_scope) + } + return null + } + + if (kind == "while") { 
+ sem_check_expr(scope, stmt.expression) + loop_scope = make_scope(scope, scope.function_nr, {in_loop: true, bdepth: scope.block_depth + 1}) + i = 0 + while (i < length(stmt.statements)) { + sem_check_stmt(loop_scope, stmt.statements[i]) + i = i + 1 + } + sem_propagate_block_vars(scope, loop_scope) + return null + } + + if (kind == "do") { + do_scope = make_scope(scope, scope.function_nr, {in_loop: true, bdepth: scope.block_depth + 1}) + i = 0 + while (i < length(stmt.statements)) { + sem_check_stmt(do_scope, stmt.statements[i]) + i = i + 1 + } + sem_propagate_block_vars(scope, do_scope) + sem_check_expr(scope, stmt.expression) + return null + } + + if (kind == "for") { + for_scope = make_scope(scope, scope.function_nr, {in_loop: true, bdepth: scope.block_depth + 1}) + if (stmt.init != null) { + init_kind = stmt.init.kind + if (init_kind == "var" || init_kind == "def") { + sem_check_stmt(for_scope, stmt.init) + } else { + sem_check_expr(for_scope, stmt.init) + } + } + sem_check_expr(for_scope, stmt.test) + sem_check_expr(for_scope, stmt.update) + i = 0 + while (i < length(stmt.statements)) { + sem_check_stmt(for_scope, stmt.statements[i]) + i = i + 1 + } + sem_propagate_block_vars(scope, for_scope) + return null + } + + if (kind == "return" || kind == "go") { + sem_check_expr(scope, stmt.expression) + return null + } + + if (kind == "disrupt") { + return null + } + + if (kind == "break") { + if (!sem_in_loop(scope)) { + sem_error(stmt, "'break' used outside of loop") + } + return null + } + + if (kind == "continue") { + if (!sem_in_loop(scope)) { + sem_error(stmt, "'continue' used outside of loop") + } + return null + } + + if (kind == "block") { + blk_scope = make_scope(scope, scope.function_nr, {bdepth: scope.block_depth + 1}) + i = 0 + while (i < length(stmt.statements)) { + sem_check_stmt(blk_scope, stmt.statements[i]) + i = i + 1 + } + sem_propagate_block_vars(scope, blk_scope) + return null + } + + if (kind == "label") { + sem_check_stmt(scope, 
stmt.statement) + return null + } + + if (kind == "function") { + name = stmt.name + if (name != null) sem_add_var(scope, name, {make: "function", fn_nr: scope.function_nr}) + fn_nr_val = stmt.function_nr + if (fn_nr_val == null) fn_nr_val = scope.function_nr + fn_scope = make_scope(scope, fn_nr_val, {is_func: true}) + stmt.outer = scope.function_nr + i = 0 + while (i < length(stmt.list)) { + pname = stmt.list[i].name + if (pname != null) sem_add_var(fn_scope, pname, {is_const: true, make: "input", fn_nr: fn_nr_val}) + def_val = stmt.list[i].expression + if (def_val != null) sem_check_expr(fn_scope, def_val) + i = i + 1 + } + sem_predeclare_vars(fn_scope, stmt.statements) + i = 0 + while (i < length(stmt.statements)) { + sem_check_stmt(fn_scope, stmt.statements[i]) + i = i + 1 + } + if (stmt.disruption != null) { + i = 0 + while (i < length(stmt.disruption)) { + sem_check_stmt(fn_scope, stmt.disruption[i]) + i = i + 1 + } + } + sr = sem_build_scope_record(fn_scope) + push(scopes_array, sr.rec) + stmt.nr_slots = sr.nr_slots + stmt.nr_close_slots = sr.nr_close + return null + } + } + + var semantic_check = function(ast) { + var global_scope = make_scope(null, 0, {is_func: true}) + var i = 0 + var stmt = null + var name = null + var sr = null + var new_scopes = null + + i = 0 + while (i < length(ast.functions)) { + name = ast.functions[i].name + if (name != null) sem_add_var(global_scope, name, {make: "function", fn_nr: 0}) + i = i + 1 + } + + i = 0 + while (i < length(ast.statements)) { + sem_check_stmt(global_scope, ast.statements[i]) + i = i + 1 + } + + i = 0 + while (i < length(ast.functions)) { + sem_check_stmt(global_scope, ast.functions[i]) + i = i + 1 + } + + sr = sem_build_scope_record(global_scope) + new_scopes = [sr.rec] + i = 0 + while (i < length(scopes_array)) { + push(new_scopes, scopes_array[i]) + i = i + 1 + } + scopes_array = new_scopes + + ast.scopes = scopes_array + ast.intrinsics = intrinsics + if (length(sem_errors) > 0) { + ast.errors = 
sem_errors + } + } + + // ============================================================ + // Main + // ============================================================ + + init_cursor() + var ast = parse_program() + + if (error_count == 0) { + semantic_check(ast) + } + + // Merge parse errors + var _mi = 0 + if (length(errors) > 0) { + if (ast.errors != null) { + _mi = 0 + while (_mi < length(errors)) { + push(ast.errors, errors[_mi]) + _mi = _mi + 1 + } + } else { + ast.errors = errors + } + } + + return ast +} + +return parse diff --git a/tokenize.ce b/tokenize.ce index f3d2abde..5284a078 100644 --- a/tokenize.ce +++ b/tokenize.ce @@ -1,569 +1,5 @@ +var tokenize = use("tokenize") var src = args[0] var filename = length(args) > 1 ? args[1] : "" - -// Convert to codepoint array - integers are GC-safe immediate values -var len = length(src) -var cp = [] -var _i = 0 -while (_i < len) { - push(cp, codepoint(src[_i])) - _i = _i + 1 -} - -var pos = 0 -var row = 0 -var col = 0 -var tokens = [] - -// Codepoint constants -def CP_LF = 10 -def CP_CR = 13 -def CP_TAB = 9 -def CP_SPACE = 32 -def CP_BANG = 33 -def CP_DQUOTE = 34 -def CP_HASH = 35 -def CP_DOLLAR = 36 -def CP_PERCENT = 37 -def CP_AMP = 38 -def CP_SQUOTE = 39 -def CP_LPAREN = 40 -def CP_RPAREN = 41 -def CP_STAR = 42 -def CP_PLUS = 43 -def CP_COMMA = 44 -def CP_MINUS = 45 -def CP_DOT = 46 -def CP_SLASH = 47 -def CP_0 = 48 -def CP_1 = 49 -def CP_7 = 55 -def CP_9 = 57 -def CP_COLON = 58 -def CP_SEMI = 59 -def CP_LT = 60 -def CP_EQ = 61 -def CP_GT = 62 -def CP_QMARK = 63 -def CP_AT = 64 -def CP_A = 65 -def CP_B = 66 -def CP_E = 69 -def CP_F = 70 -def CP_O = 79 -def CP_X = 88 -def CP_Z = 90 -def CP_LBRACKET = 91 -def CP_BSLASH = 92 -def CP_RBRACKET = 93 -def CP_CARET = 94 -def CP_UNDERSCORE = 95 -def CP_BACKTICK = 96 -def CP_a = 97 -def CP_b = 98 -def CP_e = 101 -def CP_f = 102 -def CP_n = 110 -def CP_o = 111 -def CP_r = 114 -def CP_t = 116 -def CP_x = 120 -def CP_z = 122 -def CP_LBRACE = 123 -def CP_PIPE = 124 -def 
CP_RBRACE = 125 -def CP_TILDE = 126 - -// Keywords lookup -var keywords = { - if: "if", in: "in", do: "do", go: "go", - var: "var", def: "def", for: "for", - else: "else", this: "this", null: "null", true: "true", - false: "false", while: "while", break: "break", - return: "return", delete: "delete", - disrupt: "disrupt", function: "function", continue: "continue", - disruption: "disruption" -} - -function pk() { - if (pos >= len) return -1 - return cp[pos] -} - -function pk_at(n) { - var idx = pos + n - if (idx >= len) return -1 - return cp[idx] -} - -function adv() { - var c = cp[pos] - pos = pos + 1 - if (c == CP_LF) { - row = row + 1 - col = 0 - } else { - col = col + 1 - } - return c -} - -function is_digit(c) { - return c >= CP_0 && c <= CP_9 -} - -function is_hex(c) { - return (c >= CP_0 && c <= CP_9) || (c >= CP_a && c <= CP_f) || (c >= CP_A && c <= CP_F) -} - -function is_alpha(c) { - return (c >= CP_a && c <= CP_z) || (c >= CP_A && c <= CP_Z) -} - -function is_alnum(c) { - return is_alpha(c) || is_digit(c) -} - -function is_ident_start(c) { - return is_alpha(c) || c == CP_UNDERSCORE || c == CP_DOLLAR -} - -function is_ident_char(c) { - return is_alnum(c) || c == CP_UNDERSCORE || c == CP_DOLLAR || c == CP_QMARK || c == CP_BANG -} - -function substr(start, end) { - var s = "" - var i = start - while (i < end) { - s = s + character(cp[i]) - i = i + 1 - } - return s -} - -function read_string(quote_cp) { - var start = pos - var start_row = row - var start_col = col - var value = "" - var esc = 0 - adv() // skip opening quote - while (pos < len && pk() != quote_cp) { - if (pk() == CP_BSLASH) { - adv() - esc = adv() - if (esc == CP_n) { value = value + "\n" } - else if (esc == CP_t) { value = value + "\t" } - else if (esc == CP_r) { value = value + "\r" } - else if (esc == CP_BSLASH) { value = value + "\\" } - else if (esc == CP_SQUOTE) { value = value + "'" } - else if (esc == CP_DQUOTE) { value = value + "\"" } - else if (esc == CP_0) { value = value + 
character(0) } - else if (esc == CP_BACKTICK) { value = value + "`" } - else { value = value + character(esc) } - } else { - value = value + character(adv()) - } - } - if (pos < len) adv() // skip closing quote - push(tokens, { - kind: "text", at: start, - from_row: start_row, from_column: start_col, - to_row: row, to_column: col, - value: value - }) -} - -function read_template() { - var start = pos - var start_row = row - var start_col = col - var value = "" - var esc = 0 - var depth = 0 - var tc = 0 - var q = 0 - adv() // skip opening backtick - while (pos < len && pk() != CP_BACKTICK) { - if (pk() == CP_BSLASH && pos + 1 < len) { - adv() - esc = adv() - if (esc == CP_n) { value = value + "\n" } - else if (esc == CP_t) { value = value + "\t" } - else if (esc == CP_r) { value = value + "\r" } - else if (esc == CP_BSLASH) { value = value + "\\" } - else if (esc == CP_BACKTICK) { value = value + "`" } - else if (esc == CP_DOLLAR) { value = value + "$" } - else if (esc == CP_0) { value = value + character(0) } - else { value = value + character(esc) } - } else if (pk() == CP_DOLLAR && pos + 1 < len && pk_at(1) == CP_LBRACE) { - adv() // $ - adv() // { - depth = 1 - while (pos < len && depth > 0) { - tc = pk() - if (tc == CP_LBRACE) { depth = depth + 1; adv() } - else if (tc == CP_RBRACE) { depth = depth - 1; adv() } - else if (tc == CP_SQUOTE || tc == CP_DQUOTE || tc == CP_BACKTICK) { - q = adv() - while (pos < len && pk() != q) { - if (pk() == CP_BSLASH && pos + 1 < len) adv() - adv() - } - if (pos < len) adv() - } else { adv() } - } - } else { - value = value + character(adv()) - } - } - if (pos < len) adv() // skip closing backtick - push(tokens, { - kind: "text", at: start, - from_row: start_row, from_column: start_col, - to_row: row, to_column: col, - value: value - }) -} - -function read_number() { - var start = pos - var start_row = row - var start_col = col - if (pk() == CP_0 && (pk_at(1) == CP_x || pk_at(1) == CP_X)) { - adv(); adv() - while (pos < len && 
(is_hex(pk()) || pk() == CP_UNDERSCORE)) adv() - } else if (pk() == CP_0 && (pk_at(1) == CP_b || pk_at(1) == CP_B)) { - adv(); adv() - while (pos < len && (pk() == CP_0 || pk() == CP_1 || pk() == CP_UNDERSCORE)) adv() - } else if (pk() == CP_0 && (pk_at(1) == CP_o || pk_at(1) == CP_O)) { - adv(); adv() - while (pos < len && pk() >= CP_0 && pk() <= CP_7) adv() - } else { - while (pos < len && (is_digit(pk()) || pk() == CP_UNDERSCORE)) adv() - if (pos < len && pk() == CP_DOT) { - adv() - while (pos < len && (is_digit(pk()) || pk() == CP_UNDERSCORE)) adv() - } - if (pos < len && (pk() == CP_e || pk() == CP_E)) { - adv() - if (pos < len && (pk() == CP_PLUS || pk() == CP_MINUS)) adv() - while (pos < len && is_digit(pk())) adv() - } - } - var raw = substr(start, pos) - push(tokens, { - kind: "number", at: start, - from_row: start_row, from_column: start_col, - to_row: row, to_column: col, - value: raw, number: number(raw) - }) -} - -function read_name() { - var start = pos - var start_row = row - var start_col = col - while (pos < len && is_ident_char(pk())) adv() - var name = substr(start, pos) - var kw = keywords[name] - if (kw != null) { - push(tokens, { - kind: kw, at: start, - from_row: start_row, from_column: start_col, - to_row: row, to_column: col - }) - } else { - push(tokens, { - kind: "name", at: start, - from_row: start_row, from_column: start_col, - to_row: row, to_column: col, - value: name - }) - } -} - -function read_comment() { - var start = pos - var start_row = row - var start_col = col - if (pk_at(1) == CP_SLASH) { - while (pos < len && pk() != CP_LF && pk() != CP_CR) adv() - } else { - adv(); adv() // skip /* - while (pos < len) { - if (pk() == CP_STAR && pk_at(1) == CP_SLASH) { - adv(); adv() - break - } - adv() - } - } - var raw = substr(start, pos) - push(tokens, { - kind: "comment", at: start, - from_row: start_row, from_column: start_col, - to_row: row, to_column: col, - value: raw - }) -} - -function emit_op(kind, count) { - var start = pos - 
var start_row = row - var start_col = col - var i = 0 - while (i < count) { adv(); i = i + 1 } - push(tokens, { - kind: kind, at: start, - from_row: start_row, from_column: start_col, - to_row: row, to_column: col - }) -} - -function emit_ident(count) { - var start = pos - var start_row = row - var start_col = col - var val = "" - var i = 0 - while (i < count) { val = val + character(adv()); i = i + 1 } - push(tokens, { - kind: "name", at: start, - from_row: start_row, from_column: start_col, - to_row: row, to_column: col, - value: val - }) -} - -function tokenize_one() { - var c = pk() - var start = 0 - var start_row = 0 - var start_col = 0 - var raw = "" - if (c == -1) return false - - // Newline - if (c == CP_LF) { - start = pos - start_row = row - start_col = col - adv() - push(tokens, { - kind: "newline", at: start, - from_row: start_row, from_column: start_col, - to_row: row, to_column: col, - value: "\n" - }) - return true - } - - if (c == CP_CR) { - start = pos - start_row = row - start_col = col - adv() - if (pos < len && pk() == CP_LF) adv() - push(tokens, { - kind: "newline", at: start, - from_row: start_row, from_column: start_col, - to_row: row, to_column: col, - value: "\n" - }) - return true - } - - // Whitespace - if (c == CP_SPACE || c == CP_TAB) { - start = pos - start_row = row - start_col = col - while (pos < len && (pk() == CP_SPACE || pk() == CP_TAB)) adv() - raw = substr(start, pos) - push(tokens, { - kind: "space", at: start, - from_row: start_row, from_column: start_col, - to_row: row, to_column: col, - value: raw - }) - return true - } - - // Strings - if (c == CP_SQUOTE || c == CP_DQUOTE) { - read_string(c) - return true - } - - // Template - if (c == CP_BACKTICK) { - read_template() - return true - } - - // Numbers - if (is_digit(c)) { - read_number() - return true - } - if (c == CP_DOT && is_digit(pk_at(1))) { - read_number() - return true - } - - // Identifiers and keywords - if (is_ident_start(c)) { - read_name() - return true - } - - 
// Comments and / - if (c == CP_SLASH) { - if (pk_at(1) == CP_SLASH || pk_at(1) == CP_STAR) { - read_comment() - return true - } - if (pk_at(1) == CP_EQ) { emit_op("/=", 2); return true } - if (pk_at(1) == CP_BANG) { emit_ident(2); return true } - emit_op("/", 1) - return true - } - - // Operators - if (c == CP_STAR) { - if (pk_at(1) == CP_STAR) { - if (pk_at(2) == CP_BANG) { emit_ident(3); return true } - if (pk_at(2) == CP_EQ) { emit_op("**=", 3); return true } - emit_op("**", 2); return true - } - if (pk_at(1) == CP_EQ) { emit_op("*=", 2); return true } - if (pk_at(1) == CP_BANG) { emit_ident(2); return true } - emit_op("*", 1); return true - } - - if (c == CP_PERCENT) { - if (pk_at(1) == CP_EQ) { emit_op("%=", 2); return true } - if (pk_at(1) == CP_BANG) { emit_ident(2); return true } - emit_op("%", 1); return true - } - - if (c == CP_PLUS) { - if (pk_at(1) == CP_EQ) { emit_op("+=", 2); return true } - if (pk_at(1) == CP_PLUS) { emit_op("++", 2); return true } - if (pk_at(1) == CP_BANG) { emit_ident(2); return true } - emit_op("+", 1); return true - } - - if (c == CP_MINUS) { - if (pk_at(1) == CP_EQ) { emit_op("-=", 2); return true } - if (pk_at(1) == CP_MINUS) { emit_op("--", 2); return true } - if (pk_at(1) == CP_BANG) { emit_ident(2); return true } - emit_op("-", 1); return true - } - - if (c == CP_LT) { - if (pk_at(1) == CP_EQ && pk_at(2) == CP_BANG) { emit_ident(3); return true } - if (pk_at(1) == CP_EQ) { emit_op("<=", 2); return true } - if (pk_at(1) == CP_LT) { - if (pk_at(2) == CP_BANG) { emit_ident(3); return true } - if (pk_at(2) == CP_EQ) { emit_op("<<=", 3); return true } - emit_op("<<", 2); return true - } - if (pk_at(1) == CP_BANG) { emit_ident(2); return true } - emit_op("<", 1); return true - } - - if (c == CP_GT) { - if (pk_at(1) == CP_EQ && pk_at(2) == CP_BANG) { emit_ident(3); return true } - if (pk_at(1) == CP_EQ) { emit_op(">=", 2); return true } - if (pk_at(1) == CP_GT) { - if (pk_at(2) == CP_GT) { - if (pk_at(3) == CP_BANG) { 
emit_ident(4); return true } - if (pk_at(3) == CP_EQ) { emit_op(">>>=", 4); return true } - emit_op(">>>", 3); return true - } - if (pk_at(2) == CP_BANG) { emit_ident(3); return true } - if (pk_at(2) == CP_EQ) { emit_op(">>=", 3); return true } - emit_op(">>", 2); return true - } - if (pk_at(1) == CP_BANG) { emit_ident(2); return true } - emit_op(">", 1); return true - } - - if (c == CP_EQ) { - if (pk_at(1) == CP_EQ) { - if (pk_at(2) == CP_EQ) { emit_op("===", 3); return true } - emit_op("==", 2); return true - } - if (pk_at(1) == CP_GT) { emit_op("=>", 2); return true } - if (pk_at(1) == CP_BANG) { emit_ident(2); return true } - emit_op("=", 1); return true - } - - if (c == CP_BANG) { - if (pk_at(1) == CP_EQ) { - if (pk_at(2) == CP_BANG) { emit_ident(3); return true } - if (pk_at(2) == CP_EQ) { emit_op("!==", 3); return true } - emit_op("!=", 2); return true - } - emit_op("!", 1); return true - } - - if (c == CP_AMP) { - if (pk_at(1) == CP_AMP) { - if (pk_at(2) == CP_BANG) { emit_ident(3); return true } - if (pk_at(2) == CP_EQ) { emit_op("&&=", 3); return true } - emit_op("&&", 2); return true - } - if (pk_at(1) == CP_EQ) { emit_op("&=", 2); return true } - if (pk_at(1) == CP_BANG) { emit_ident(2); return true } - emit_op("&", 1); return true - } - - if (c == CP_PIPE) { - if (pk_at(1) == CP_PIPE) { - if (pk_at(2) == CP_BANG) { emit_ident(3); return true } - if (pk_at(2) == CP_EQ) { emit_op("||=", 3); return true } - emit_op("||", 2); return true - } - if (pk_at(1) == CP_EQ) { emit_op("|=", 2); return true } - if (pk_at(1) == CP_BANG) { emit_ident(2); return true } - emit_op("|", 1); return true - } - - if (c == CP_CARET) { - if (pk_at(1) == CP_EQ) { emit_op("^=", 2); return true } - if (pk_at(1) == CP_BANG) { emit_ident(2); return true } - emit_op("^", 1); return true - } - - if (c == CP_LBRACKET) { - if (pk_at(1) == CP_RBRACKET && pk_at(2) == CP_BANG) { emit_ident(3); return true } - emit_op("[", 1); return true - } - - if (c == CP_TILDE) { - if (pk_at(1) == 
CP_BANG) { emit_ident(2); return true } - emit_op("~", 1); return true - } - - // Single character tokens - emit_op(character(c), 1) - return true -} - -// Main loop -while (pos < len) { - tokenize_one() -} - -// EOF token -push(tokens, { - kind: "eof", at: pos, - from_row: row, from_column: col, - to_row: row, to_column: col -}) - -print(json.encode({filename: filename, tokens: tokens})) +var result = tokenize(src, filename) +print(json.encode({filename: result.filename, tokens: result.tokens})) diff --git a/tokenize.cm b/tokenize.cm new file mode 100644 index 00000000..2e714e60 --- /dev/null +++ b/tokenize.cm @@ -0,0 +1,499 @@ +var tokenize = function(src, filename) { + var len = length(src) + var cp = [] + var _i = 0 + while (_i < len) { + push(cp, codepoint(src[_i])) + _i = _i + 1 + } + + var pos = 0 + var row = 0 + var col = 0 + var tokens = [] + + // Codepoint constants + def CP_LF = 10 + def CP_CR = 13 + def CP_TAB = 9 + def CP_SPACE = 32 + def CP_BANG = 33 + def CP_DQUOTE = 34 + def CP_HASH = 35 + def CP_DOLLAR = 36 + def CP_PERCENT = 37 + def CP_AMP = 38 + def CP_SQUOTE = 39 + def CP_LPAREN = 40 + def CP_RPAREN = 41 + def CP_STAR = 42 + def CP_PLUS = 43 + def CP_COMMA = 44 + def CP_MINUS = 45 + def CP_DOT = 46 + def CP_SLASH = 47 + def CP_0 = 48 + def CP_1 = 49 + def CP_7 = 55 + def CP_9 = 57 + def CP_COLON = 58 + def CP_SEMI = 59 + def CP_LT = 60 + def CP_EQ = 61 + def CP_GT = 62 + def CP_QMARK = 63 + def CP_AT = 64 + def CP_A = 65 + def CP_B = 66 + def CP_E = 69 + def CP_F = 70 + def CP_O = 79 + def CP_X = 88 + def CP_Z = 90 + def CP_LBRACKET = 91 + def CP_BSLASH = 92 + def CP_RBRACKET = 93 + def CP_CARET = 94 + def CP_UNDERSCORE = 95 + def CP_BACKTICK = 96 + def CP_a = 97 + def CP_b = 98 + def CP_e = 101 + def CP_f = 102 + def CP_n = 110 + def CP_o = 111 + def CP_r = 114 + def CP_t = 116 + def CP_x = 120 + def CP_z = 122 + def CP_LBRACE = 123 + def CP_PIPE = 124 + def CP_RBRACE = 125 + def CP_TILDE = 126 + + // Keywords lookup + var keywords = { + if: 
"if", in: "in", do: "do", go: "go",
+    var: "var", def: "def", for: "for",
+    else: "else", this: "this", null: "null", true: "true",
+    false: "false", while: "while", break: "break",
+    return: "return", delete: "delete",
+    disrupt: "disrupt", function: "function", continue: "continue",
+    disruption: "disruption"
+  }
+  // pk: current codepoint without consuming it, or -1 once pos has reached len.
+  var pk = function() {
+    if (pos >= len) return -1
+    return cp[pos]
+  }
+  // pk_at: codepoint n positions ahead of pos without consuming, or -1 when pos + n is at or past len.
+  var pk_at = function(n) {
+    var idx = pos + n
+    if (idx >= len) return -1
+    return cp[idx]
+  }
+  // adv: consume and return cp[pos], updating pos/row/col. No bounds check here: callers must ensure pos < len.
+  var adv = function() {
+    var c = cp[pos]
+    pos = pos + 1
+    if (c == CP_LF) { // a line feed starts a new row at column 0
+      row = row + 1
+      col = 0
+    } else {
+      col = col + 1
+    }
+    return c
+  }
+  // is_digit: true for codepoints CP_0..CP_9.
+  var is_digit = function(c) {
+    return c >= CP_0 && c <= CP_9
+  }
+  // is_hex: true for CP_0..CP_9, CP_a..CP_f and CP_A..CP_F.
+  var is_hex = function(c) {
+    return (c >= CP_0 && c <= CP_9) || (c >= CP_a && c <= CP_f) || (c >= CP_A && c <= CP_F)
+  }
+  // is_alpha: true for CP_a..CP_z and CP_A..CP_Z only.
+  var is_alpha = function(c) {
+    return (c >= CP_a && c <= CP_z) || (c >= CP_A && c <= CP_Z)
+  }
+  // is_alnum: is_alpha or is_digit.
+  var is_alnum = function(c) {
+    return is_alpha(c) || is_digit(c)
+  }
+  // is_ident_start: a letter, CP_UNDERSCORE or CP_DOLLAR may begin a name.
+  var is_ident_start = function(c) {
+    return is_alpha(c) || c == CP_UNDERSCORE || c == CP_DOLLAR
+  }
+  // is_ident_char: name continuation; besides alphanumerics, '_' and '$', CP_QMARK and CP_BANG are accepted.
+  var is_ident_char = function(c) {
+    return is_alnum(c) || c == CP_UNDERSCORE || c == CP_DOLLAR || c == CP_QMARK || c == CP_BANG
+  }
+  // substr: build a text from cp[start] up to (not including) cp[end], one character() at a time.
+  var substr = function(start, end) {
+    var s = ""
+    var i = start
+    while (i < end) {
+      s = s + character(cp[i])
+      i = i + 1
+    }
+    return s
+  }
+  // read_string: read a text delimited by quote_cp, decode backslash escapes, push one "text" token with the decoded value.
+  var read_string = function(quote_cp) {
+    var start = pos
+    var start_row = row
+    var start_col = col
+    var value = ""
+    var esc = 0
+    adv() // skip opening quote
+    while (pos < len && pk() != quote_cp) {
+      if (pk() == CP_BSLASH && pos + 1 < len) { // guard: never read the escape codepoint past the end; a final lone backslash is kept literally, as in read_template
+        adv()
+        esc = adv()
+        if (esc == CP_n) { value = value + "\n" }
+        else if (esc == CP_t) { value = value + "\t" }
+        else if (esc == CP_r) { value = value + "\r" }
+        else if (esc == CP_BSLASH) { value = value + "\\" }
+        else if (esc == CP_SQUOTE) { value = value + "'" }
+        else if (esc == CP_DQUOTE) { value = value + "\"" }
+        else if (esc == CP_0) { value = value + character(0) }
+        else if (esc == CP_BACKTICK) { value = value + "`" }
+        else { value = value + character(esc) } // unknown escape: keep the escaped codepoint itself
+      } else {
+        value = value + character(adv())
+      }
+    }
+    if (pos < len) adv() // skip closing quote
+    push(tokens, {
+      kind: "text", at: start,
+      from_row: start_row, from_column: start_col,
+      to_row: row, to_column: col,
+      value: value
+    })
+  }
+  // read_template: read a backtick-delimited text into one "text" token; ${...} sections are skipped and add nothing to value.
+  var read_template = function() {
+    var start = pos
+    var start_row = row
+    var start_col = col
+    var value = ""
+    var esc = 0
+    var depth = 0
+    var tc = 0
+    var q = 0
+    adv() // skip opening backtick
+    while (pos < len && pk() != CP_BACKTICK) {
+      if (pk() == CP_BSLASH && pos + 1 < len) {
+        adv()
+        esc = adv()
+        if (esc == CP_n) { value = value + "\n" }
+        else if (esc == CP_t) { value = value + "\t" }
+        else if (esc == CP_r) { value = value + "\r" }
+        else if (esc == CP_BSLASH) { value = value + "\\" }
+        else if (esc == CP_BACKTICK) { value = value + "`" }
+        else if (esc == CP_DOLLAR) { value = value + "$" }
+        else if (esc == CP_0) { value = value + character(0) }
+        else { value = value + character(esc) }
+      } else if (pk() == CP_DOLLAR && pos + 1 < len && pk_at(1) == CP_LBRACE) {
+        adv() // $
+        adv() // {
+        depth = 1
+        while (pos < len && depth > 0) { // skip to the matching '}', tracking nested braces
+          tc = pk()
+          if (tc == CP_LBRACE) { depth = depth + 1; adv() }
+          else if (tc == CP_RBRACE) { depth = depth - 1; adv() }
+          else if (tc == CP_SQUOTE || tc == CP_DQUOTE || tc == CP_BACKTICK) {
+            q = adv()
+            while (pos < len && pk() != q) { // skip a quoted run so braces inside it are not counted
+              if (pk() == CP_BSLASH && pos + 1 < len) adv()
+              adv()
+            }
+            if (pos < len) adv()
+          } else { adv() }
+        }
+      } else {
+        value = value + character(adv())
+      }
+    }
+    if (pos < len) adv() // skip closing backtick
+    push(tokens, {
+      kind: "text", at: start,
+      from_row: start_row, from_column: start_col,
+      to_row: row, to_column: col,
+      value: value
+    })
+  }
+  // read_number: read a 0x/0b/0o-prefixed or decimal literal and push a "number" token carrying raw text and number(raw).
+  var read_number = function() {
+    var start = pos
+    var start_row = row
+    var start_col = col
+    var raw = ""
+    if (pk() == CP_0 && (pk_at(1) == CP_x || pk_at(1) == CP_X)) {
+      adv(); adv()
+      while (pos < len && (is_hex(pk()) || pk() == CP_UNDERSCORE)) adv()
+    } else if (pk() == CP_0 && (pk_at(1) == CP_b || pk_at(1) == CP_B)) {
+      adv(); adv()
+      while (pos < len && (pk() == CP_0 || pk() == CP_1 || pk() == CP_UNDERSCORE)) adv()
+    } else if (pk() == CP_0 && (pk_at(1) == CP_o || pk_at(1) == CP_O)) {
+      adv(); adv()
+      while (pos < len && pk() >= CP_0 && pk() <= CP_7) adv() // NOTE(review): unlike the other radix loops, '_' is not accepted here; confirm intended
+    } else {
+      while (pos < len && (is_digit(pk()) || pk() == CP_UNDERSCORE)) adv()
+      if (pos < len && pk() == CP_DOT) {
+        adv()
+        while (pos < len && (is_digit(pk()) || pk() == CP_UNDERSCORE)) adv()
+      }
+      if (pos < len && (pk() == CP_e || pk() == CP_E)) {
+        adv()
+        if (pos < len && (pk() == CP_PLUS || pk() == CP_MINUS)) adv()
+        while (pos < len && is_digit(pk())) adv()
+      }
+    }
+    raw = substr(start, pos) // NOTE(review): raw may contain '_' or a radix prefix; confirm number() accepts these
+    push(tokens, {
+      kind: "number", at: start,
+      from_row: start_row, from_column: start_col,
+      to_row: row, to_column: col,
+      value: raw, number: number(raw)
+    })
+  }
+  // read_name: read an identifier; push a token whose kind is the keyword when keywords[name] is set, else a "name" token with value.
+  var read_name = function() {
+    var start = pos
+    var start_row = row
+    var start_col = col
+    var name = ""
+    var kw = null
+    while (pos < len && is_ident_char(pk())) adv()
+    name = substr(start, pos)
+    kw = keywords[name]
+    if (kw != null) {
+      push(tokens, {
+        kind: kw, at: start,
+        from_row: start_row, from_column: start_col,
+        to_row: row, to_column: col
+      })
+    } else {
+      push(tokens, {
+        kind: "name", at: start,
+        from_row: start_row, from_column: start_col,
+        to_row: row, to_column: col,
+        value: name
+      })
+    }
+  }
+  // read_comment: read a // comment (stops before LF/CR) or a /* ... */ comment and push a "comment" token holding the raw text.
+  var read_comment = function() {
+    var start = pos
+    var start_row = row
+    var start_col = col
+    var raw = ""
+    if (pk_at(1) == CP_SLASH) {
+      while (pos < len && pk() != CP_LF && pk() != CP_CR) adv()
+    } else {
+      adv(); adv() // skip /*
+      while (pos < len) { // an unterminated block comment simply runs to end of input
+        if (pk() == CP_STAR && pk_at(1) == CP_SLASH) {
+          adv(); adv()
+          break
+        }
+        adv()
+      }
+    }
+    raw = substr(start, pos)
+    push(tokens, {
+      kind: "comment", at: start,
+      from_row: start_row, from_column: start_col,
+      to_row: row, to_column: col,
+      value: raw
+    })
+  }
+  // emit_op: consume count codepoints and push a token of the given kind with position fields only (no value).
+  var emit_op = function(kind, count) {
+    var start = pos
+    var start_row = row
+    var start_col = col
+    var i = 0
+    while (i < count) { adv(); i = i + 1 }
+    push(tokens, {
+      kind: kind, at: start,
+      from_row: start_row, from_column: start_col,
+      to_row: row, to_column: col
+    })
+  }
+  // emit_ident: consume count codepoints and push them as a "name" token; every caller passes an operator spelling ending in '!'.
+  var emit_ident = function(count) {
+    var start = pos
+    var start_row = row
+    var start_col = col
+    var val = ""
+    var i = 0
+    while (i < count) { val = val + character(adv()); i = i + 1 }
+    push(tokens, {
+      kind: "name", at: start,
+      from_row: start_row, from_column: start_col,
+      to_row: row, to_column: col,
+      value: val
+    })
+  }
+  // tokenize_one: recognize and push exactly one token starting at pos; returns false only when pk() reports end of input.
+  var tokenize_one = function() {
+    var c = pk()
+    var start = 0
+    var start_row = 0
+    var start_col = 0
+    var raw = ""
+    if (c == -1) return false
+    // Newlines and runs of spaces/tabs are emitted as tokens rather than skipped.
+    if (c == CP_LF) {
+      start = pos; start_row = row; start_col = col
+      adv()
+      push(tokens, { kind: "newline", at: start, from_row: start_row, from_column: start_col, to_row: row, to_column: col, value: "\n" })
+      return true
+    }
+    if (c == CP_CR) {
+      start = pos; start_row = row; start_col = col
+      adv()
+      if (pos < len && pk() == CP_LF) adv() // CR LF yields a single newline token
+      push(tokens, { kind: "newline", at: start, from_row: start_row, from_column: start_col, to_row: row, to_column: col, value: "\n" })
+      return true
+    }
+    if (c == CP_SPACE || c == CP_TAB) {
+      start = pos; start_row = row; start_col = col
+      while (pos < len && (pk() == CP_SPACE || pk() == CP_TAB)) adv()
+      raw = substr(start, pos)
+      push(tokens, { kind: "space", at: start, from_row: start_row, from_column: start_col, to_row: row, to_column: col, value: raw })
+      return true
+    }
+    if (c == CP_SQUOTE || c == CP_DQUOTE) { read_string(c); return true }
+    if (c == CP_BACKTICK) { read_template(); return true }
+    if (is_digit(c)) { read_number(); return true }
+    if (c == CP_DOT && is_digit(pk_at(1))) { read_number(); return true }
+    if (is_ident_start(c)) { read_name(); return true }
+    if (c == CP_SLASH) { // operator branches test longer spellings before shorter ones
+      if (pk_at(1) == CP_SLASH || pk_at(1) == CP_STAR) { read_comment(); return true }
+      if (pk_at(1) == CP_EQ) { emit_op("/=", 2); return true }
+      if (pk_at(1) == CP_BANG) { emit_ident(2); return true }
+      emit_op("/", 1); return true
+    }
+    if (c == CP_STAR) {
+      if (pk_at(1) == CP_STAR) {
+        if (pk_at(2) == CP_BANG) { emit_ident(3); return true }
+        if (pk_at(2) == CP_EQ) { emit_op("**=", 3); return true }
+        emit_op("**", 2); return true
+      }
+      if (pk_at(1) == CP_EQ) { emit_op("*=", 2); return true }
+      if (pk_at(1) == CP_BANG) { emit_ident(2); return true }
+      emit_op("*", 1); return true
+    }
+    if (c == CP_PERCENT) {
+      if (pk_at(1) == CP_EQ) { emit_op("%=", 2); return true }
+      if (pk_at(1) == CP_BANG) { emit_ident(2); return true }
+      emit_op("%", 1); return true
+    }
+    if (c == CP_PLUS) {
+      if (pk_at(1) == CP_EQ) { emit_op("+=", 2); return true }
+      if (pk_at(1) == CP_PLUS) { emit_op("++", 2); return true }
+      if (pk_at(1) == CP_BANG) { emit_ident(2); return true }
+      emit_op("+", 1); return true
+    }
+    if (c == CP_MINUS) {
+      if (pk_at(1) == CP_EQ) { emit_op("-=", 2); return true }
+      if (pk_at(1) == CP_MINUS) { emit_op("--", 2); return true }
+      if (pk_at(1) == CP_BANG) { emit_ident(2); return true }
+      emit_op("-", 1); return true
+    }
+    if (c == CP_LT) {
+      if (pk_at(1) == CP_EQ && pk_at(2) == CP_BANG) { emit_ident(3); return true }
+      if (pk_at(1) == CP_EQ) { emit_op("<=", 2); return true }
+      if (pk_at(1) == CP_LT) {
+        if (pk_at(2) == CP_BANG) { emit_ident(3); return true }
+        if (pk_at(2) == CP_EQ) { emit_op("<<=", 3); return true }
+        emit_op("<<", 2); return true
+      }
+      if (pk_at(1) == CP_BANG) { emit_ident(2); return true }
+      emit_op("<", 1); return true
+    }
+    if (c == CP_GT) {
+      if (pk_at(1) == CP_EQ && pk_at(2) == CP_BANG) { emit_ident(3); return true }
+      if (pk_at(1) == CP_EQ) { emit_op(">=", 2); return true }
+      if (pk_at(1) == CP_GT) {
+        if (pk_at(2) == CP_GT) {
+          if (pk_at(3) == CP_BANG) { emit_ident(4); return true }
+          if (pk_at(3) == CP_EQ) { emit_op(">>>=", 4); return true }
+          emit_op(">>>", 3); return true
+        }
+        if (pk_at(2) == CP_BANG) { emit_ident(3); return true }
+        if (pk_at(2) == CP_EQ) { emit_op(">>=", 3); return true }
+        emit_op(">>", 2); return true
+      }
+      if (pk_at(1) == CP_BANG) { emit_ident(2); return true }
+      emit_op(">", 1); return true
+    }
+    if (c == CP_EQ) {
+      if (pk_at(1) == CP_EQ) {
+        if (pk_at(2) == CP_EQ) { emit_op("===", 3); return true }
+        emit_op("==", 2); return true
+      }
+      if (pk_at(1) == CP_GT) { emit_op("=>", 2); return true }
+      if (pk_at(1) == CP_BANG) { emit_ident(2); return true }
+      emit_op("=", 1); return true
+    }
+    if (c == CP_BANG) {
+      if (pk_at(1) == CP_EQ) {
+        if (pk_at(2) == CP_BANG) { emit_ident(3); return true }
+        if (pk_at(2) == CP_EQ) { emit_op("!==", 3); return true }
+        emit_op("!=", 2); return true
+      }
+      emit_op("!", 1); return true
+    }
+    if (c == CP_AMP) {
+      if (pk_at(1) == CP_AMP) {
+        if (pk_at(2) == CP_BANG) { emit_ident(3); return true }
+        if (pk_at(2) == CP_EQ) { emit_op("&&=", 3); return true }
+        emit_op("&&", 2); return true
+      }
+      if (pk_at(1) == CP_EQ) { emit_op("&=", 2); return true }
+      if (pk_at(1) == CP_BANG) { emit_ident(2); return true }
+      emit_op("&", 1); return true
+    }
+    if (c == CP_PIPE) {
+      if (pk_at(1) == CP_PIPE) {
+        if (pk_at(2) == CP_BANG) { emit_ident(3); return true }
+        if (pk_at(2) == CP_EQ) { emit_op("||=", 3); return true }
+        emit_op("||", 2); return true
+      }
+      if (pk_at(1) == CP_EQ) { emit_op("|=", 2); return true }
+      if (pk_at(1) == CP_BANG) { emit_ident(2); return true }
+      emit_op("|", 1); return true
+    }
+    if (c == CP_CARET) {
+      if (pk_at(1) == CP_EQ) { emit_op("^=", 2); return true }
+      if (pk_at(1) == CP_BANG) { emit_ident(2); return true }
+      emit_op("^", 1); return true
+    }
+    if (c == CP_LBRACKET) {
+      if (pk_at(1) == CP_RBRACKET && pk_at(2) == CP_BANG) { emit_ident(3); return true }
+      emit_op("[", 1); return true
+    }
+    if (c == CP_TILDE) {
+      if (pk_at(1) == CP_BANG) { emit_ident(2); return true }
+      emit_op("~", 1); return true
+    }
+    emit_op(character(c), 1) // any other codepoint becomes a one-character token whose kind is that character
+    return true
+  }
+
+  // Main loop
+  while (pos < len) {
+    tokenize_one()
+  }
+
+  // EOF token
+  push(tokens, { kind: "eof", at: pos, from_row: row, from_column: col, to_row: row, to_column: col })
+
+  return {filename: filename, tokens: tokens, cp: cp}
+}
+
+return tokenize