From 368511f666317fefeffca51777f062e436696682 Mon Sep 17 00:00:00 2001 From: John Alanbrook Date: Mon, 9 Feb 2026 11:56:09 -0600 Subject: [PATCH] parse.ce and tokenize.ce --- internal/bootstrap.cm | 15 +- parse.ce | 2373 +++++++++++++++++++++++++++++++++++++++++ source/cell.c | 16 +- source/quickjs.h | 3 + source/runtime.c | 45 +- tokenize.ce | 569 ++++++++++ 6 files changed, 3002 insertions(+), 19 deletions(-) create mode 100644 parse.ce create mode 100644 tokenize.ce diff --git a/internal/bootstrap.cm b/internal/bootstrap.cm index 3b53d01d..002a8519 100644 --- a/internal/bootstrap.cm +++ b/internal/bootstrap.cm @@ -1,14 +1,17 @@ -// Hidden vars (os, program) come from env +// Hidden vars (os, args) come from env +// args[0] = script filename, args[1..] = user args var load_internal = os.load_internal function use_embed(name) { return load_internal("js_" + name + "_use") } var fd = use_embed('fd') +var json = use_embed('json') var use_cache = {} use_cache['fd'] = fd use_cache['os'] = os +use_cache['json'] = json function use(path) { if (use_cache[path]) @@ -34,7 +37,15 @@ function use(path) { } // Load and run the user's program +var program = args[0] + +var user_args = [] +var _i = 1 +while (_i < length(args)) { + push(user_args, args[_i]) + _i = _i + 1 +} var blob = fd.slurp(program) stone(blob) var script = text(blob) -mach_eval(program, script, {use: use}) +mach_eval(program, script, {use: use, args: user_args, json: json}) diff --git a/parse.ce b/parse.ce new file mode 100644 index 00000000..70d0637d --- /dev/null +++ b/parse.ce @@ -0,0 +1,2373 @@ +// ============================================================ +// Section 1: Inline Tokenizer (from tokenize.ce) +// ============================================================ + +var src = args[0] +var filename = length(args) > 1 ? 
args[1] : "" + +// Convert to codepoint array +var _src_len = length(src) +var cp = [] +var _i = 0 +while (_i < _src_len) { + push(cp, codepoint(src[_i])) + _i = _i + 1 +} + +var pos = 0 +var row = 0 +var col = 0 +var tokens = [] + +// Codepoint constants +def CP_LF = 10 +def CP_CR = 13 +def CP_TAB = 9 +def CP_SPACE = 32 +def CP_BANG = 33 +def CP_DQUOTE = 34 +def CP_HASH = 35 +def CP_DOLLAR = 36 +def CP_PERCENT = 37 +def CP_AMP = 38 +def CP_SQUOTE = 39 +def CP_LPAREN = 40 +def CP_RPAREN = 41 +def CP_STAR = 42 +def CP_PLUS = 43 +def CP_COMMA = 44 +def CP_MINUS = 45 +def CP_DOT = 46 +def CP_SLASH = 47 +def CP_0 = 48 +def CP_1 = 49 +def CP_7 = 55 +def CP_9 = 57 +def CP_COLON = 58 +def CP_SEMI = 59 +def CP_LT = 60 +def CP_EQ = 61 +def CP_GT = 62 +def CP_QMARK = 63 +def CP_AT = 64 +def CP_A = 65 +def CP_B = 66 +def CP_E = 69 +def CP_F = 70 +def CP_O = 79 +def CP_X = 88 +def CP_Z = 90 +def CP_LBRACKET = 91 +def CP_BSLASH = 92 +def CP_RBRACKET = 93 +def CP_CARET = 94 +def CP_UNDERSCORE = 95 +def CP_BACKTICK = 96 +def CP_a = 97 +def CP_b = 98 +def CP_e = 101 +def CP_f = 102 +def CP_n = 110 +def CP_o = 111 +def CP_r = 114 +def CP_t = 116 +def CP_x = 120 +def CP_z = 122 +def CP_LBRACE = 123 +def CP_PIPE = 124 +def CP_RBRACE = 125 +def CP_TILDE = 126 + +var keywords = { + if: "if", in: "in", do: "do", go: "go", + var: "var", def: "def", for: "for", + else: "else", this: "this", null: "null", true: "true", + false: "false", while: "while", break: "break", + return: "return", delete: "delete", + disrupt: "disrupt", function: "function", continue: "continue", + disruption: "disruption" +} + +function pk() { + if (pos >= _src_len) return -1 + return cp[pos] +} + +function pk_at(n) { + var idx = pos + n + if (idx >= _src_len) return -1 + return cp[idx] +} + +function adv() { + var c = cp[pos] + pos = pos + 1 + if (c == CP_LF) { + row = row + 1 + col = 0 + } else { + col = col + 1 + } + return c +} + +function is_digit(c) { + return c >= CP_0 && c <= CP_9 +} + +function is_hex(c) { 
+ return (c >= CP_0 && c <= CP_9) || (c >= CP_a && c <= CP_f) || (c >= CP_A && c <= CP_F) +} + +function is_alpha(c) { + return (c >= CP_a && c <= CP_z) || (c >= CP_A && c <= CP_Z) +} + +function is_alnum(c) { + return is_alpha(c) || is_digit(c) +} + +function is_ident_start(c) { + return is_alpha(c) || c == CP_UNDERSCORE || c == CP_DOLLAR +} + +function is_ident_char(c) { + return is_alnum(c) || c == CP_UNDERSCORE || c == CP_DOLLAR || c == CP_QMARK || c == CP_BANG +} + +function substr(start, end) { + var s = "" + var i = start + while (i < end) { + s = s + character(cp[i]) + i = i + 1 + } + return s +} + +function read_string(quote_cp) { + var start = pos + var start_row = row + var start_col = col + var value = "" + var esc = 0 + adv() + while (pos < _src_len && pk() != quote_cp) { + if (pk() == CP_BSLASH && pos + 1 < _src_len) { /* same guard as read_template: a backslash that is the last character of the input is kept literally instead of reading an escape past the end of cp */ + adv() + esc = adv() + if (esc == CP_n) { value = value + "\n" } + else if (esc == CP_t) { value = value + "\t" } + else if (esc == CP_r) { value = value + "\r" } + else if (esc == CP_BSLASH) { value = value + "\\" } + else if (esc == CP_SQUOTE) { value = value + "'" } + else if (esc == CP_DQUOTE) { value = value + "\"" } + else if (esc == CP_0) { value = value + character(0) } + else if (esc == CP_BACKTICK) { value = value + "`" } + else { value = value + character(esc) } + } else { + value = value + character(adv()) + } + } + if (pos < _src_len) adv() + push(tokens, { + kind: "text", at: start, + from_row: start_row, from_column: start_col, + to_row: row, to_column: col, + value: value + }) +} + +function read_template() { + var start = pos + var start_row = row + var start_col = col + var value = "" + var esc = 0 + var depth = 0 + var tc = 0 + var q = 0 + adv() + while (pos < _src_len && pk() != CP_BACKTICK) { + if (pk() == CP_BSLASH && pos + 1 < _src_len) { + adv() + esc = adv() + if (esc == CP_n) { value = value + "\n" } + else if (esc == CP_t) { value = value + "\t" } + else if (esc == CP_r) { value = value + "\r" } + else if (esc == CP_BSLASH) { 
value = value + "\\" } + else if (esc == CP_BACKTICK) { value = value + "`" } + else if (esc == CP_DOLLAR) { value = value + "$" } + else if (esc == CP_0) { value = value + character(0) } + else { value = value + character(esc) } + } else if (pk() == CP_DOLLAR && pos + 1 < _src_len && pk_at(1) == CP_LBRACE) { + adv() + adv() + depth = 1 + while (pos < _src_len && depth > 0) { + tc = pk() + if (tc == CP_LBRACE) { depth = depth + 1; adv() } + else if (tc == CP_RBRACE) { depth = depth - 1; adv() } + else if (tc == CP_SQUOTE || tc == CP_DQUOTE || tc == CP_BACKTICK) { + q = adv() + while (pos < _src_len && pk() != q) { + if (pk() == CP_BSLASH && pos + 1 < _src_len) adv() + adv() + } + if (pos < _src_len) adv() + } else { adv() } + } + } else { + value = value + character(adv()) + } + } + if (pos < _src_len) adv() + push(tokens, { + kind: "text", at: start, + from_row: start_row, from_column: start_col, + to_row: row, to_column: col, + value: value + }) +} + +function read_number() { + var start = pos + var start_row = row + var start_col = col + var raw = "" + if (pk() == CP_0 && (pk_at(1) == CP_x || pk_at(1) == CP_X)) { + adv(); adv() + while (pos < _src_len && (is_hex(pk()) || pk() == CP_UNDERSCORE)) adv() + } else if (pk() == CP_0 && (pk_at(1) == CP_b || pk_at(1) == CP_B)) { + adv(); adv() + while (pos < _src_len && (pk() == CP_0 || pk() == CP_1 || pk() == CP_UNDERSCORE)) adv() + } else if (pk() == CP_0 && (pk_at(1) == CP_o || pk_at(1) == CP_O)) { + adv(); adv() + while (pos < _src_len && pk() >= CP_0 && pk() <= CP_7) adv() + } else { + while (pos < _src_len && (is_digit(pk()) || pk() == CP_UNDERSCORE)) adv() + if (pos < _src_len && pk() == CP_DOT) { + adv() + while (pos < _src_len && (is_digit(pk()) || pk() == CP_UNDERSCORE)) adv() + } + if (pos < _src_len && (pk() == CP_e || pk() == CP_E)) { + adv() + if (pos < _src_len && (pk() == CP_PLUS || pk() == CP_MINUS)) adv() + while (pos < _src_len && is_digit(pk())) adv() + } + } + raw = substr(start, pos) + push(tokens, 
{ + kind: "number", at: start, + from_row: start_row, from_column: start_col, + to_row: row, to_column: col, + value: raw, number: number(raw) + }) +} + +function read_name() { + var start = pos + var start_row = row + var start_col = col + var name = "" + var kw = null + while (pos < _src_len && is_ident_char(pk())) adv() + name = substr(start, pos) + kw = keywords[name] + if (kw != null) { + push(tokens, { + kind: kw, at: start, + from_row: start_row, from_column: start_col, + to_row: row, to_column: col + }) + } else { + push(tokens, { + kind: "name", at: start, + from_row: start_row, from_column: start_col, + to_row: row, to_column: col, + value: name + }) + } +} + +function read_comment() { + var start = pos + var start_row = row + var start_col = col + var raw = "" + if (pk_at(1) == CP_SLASH) { + while (pos < _src_len && pk() != CP_LF && pk() != CP_CR) adv() + } else { + adv(); adv() + while (pos < _src_len) { + if (pk() == CP_STAR && pk_at(1) == CP_SLASH) { + adv(); adv() + break + } + adv() + } + } + raw = substr(start, pos) + push(tokens, { + kind: "comment", at: start, + from_row: start_row, from_column: start_col, + to_row: row, to_column: col, + value: raw + }) +} + +function emit_op(kind, count) { + var start = pos + var start_row = row + var start_col = col + var i = 0 + while (i < count) { adv(); i = i + 1 } + push(tokens, { + kind: kind, at: start, + from_row: start_row, from_column: start_col, + to_row: row, to_column: col + }) +} + +function emit_ident(count) { + var start = pos + var start_row = row + var start_col = col + var val = "" + var i = 0 + while (i < count) { val = val + character(adv()); i = i + 1 } + push(tokens, { + kind: "name", at: start, + from_row: start_row, from_column: start_col, + to_row: row, to_column: col, + value: val + }) +} + +function tokenize_one() { + var c = pk() + var start = 0 + var start_row = 0 + var start_col = 0 + var raw = "" + if (c == -1) return false + + if (c == CP_LF) { + start = pos; start_row = row; 
start_col = col + adv() + push(tokens, { kind: "newline", at: start, from_row: start_row, from_column: start_col, to_row: row, to_column: col, value: "\n" }) + return true + } + if (c == CP_CR) { + start = pos; start_row = row; start_col = col + adv() + if (pos < _src_len && pk() == CP_LF) adv() + push(tokens, { kind: "newline", at: start, from_row: start_row, from_column: start_col, to_row: row, to_column: col, value: "\n" }) + return true + } + if (c == CP_SPACE || c == CP_TAB) { + start = pos; start_row = row; start_col = col + while (pos < _src_len && (pk() == CP_SPACE || pk() == CP_TAB)) adv() + raw = substr(start, pos) + push(tokens, { kind: "space", at: start, from_row: start_row, from_column: start_col, to_row: row, to_column: col, value: raw }) + return true + } + if (c == CP_SQUOTE || c == CP_DQUOTE) { read_string(c); return true } + if (c == CP_BACKTICK) { read_template(); return true } + if (is_digit(c)) { read_number(); return true } + if (c == CP_DOT && is_digit(pk_at(1))) { read_number(); return true } + if (is_ident_start(c)) { read_name(); return true } + if (c == CP_SLASH) { + if (pk_at(1) == CP_SLASH || pk_at(1) == CP_STAR) { read_comment(); return true } + if (pk_at(1) == CP_EQ) { emit_op("/=", 2); return true } + if (pk_at(1) == CP_BANG) { emit_ident(2); return true } + emit_op("/", 1); return true + } + if (c == CP_STAR) { + if (pk_at(1) == CP_STAR) { + if (pk_at(2) == CP_BANG) { emit_ident(3); return true } + if (pk_at(2) == CP_EQ) { emit_op("**=", 3); return true } + emit_op("**", 2); return true + } + if (pk_at(1) == CP_EQ) { emit_op("*=", 2); return true } + if (pk_at(1) == CP_BANG) { emit_ident(2); return true } + emit_op("*", 1); return true + } + if (c == CP_PERCENT) { + if (pk_at(1) == CP_EQ) { emit_op("%=", 2); return true } + if (pk_at(1) == CP_BANG) { emit_ident(2); return true } + emit_op("%", 1); return true + } + if (c == CP_PLUS) { + if (pk_at(1) == CP_EQ) { emit_op("+=", 2); return true } + if (pk_at(1) == CP_PLUS) { 
emit_op("++", 2); return true } + if (pk_at(1) == CP_BANG) { emit_ident(2); return true } + emit_op("+", 1); return true + } + if (c == CP_MINUS) { + if (pk_at(1) == CP_EQ) { emit_op("-=", 2); return true } + if (pk_at(1) == CP_MINUS) { emit_op("--", 2); return true } + if (pk_at(1) == CP_BANG) { emit_ident(2); return true } + emit_op("-", 1); return true + } + if (c == CP_LT) { + if (pk_at(1) == CP_EQ && pk_at(2) == CP_BANG) { emit_ident(3); return true } + if (pk_at(1) == CP_EQ) { emit_op("<=", 2); return true } + if (pk_at(1) == CP_LT) { + if (pk_at(2) == CP_BANG) { emit_ident(3); return true } + if (pk_at(2) == CP_EQ) { emit_op("<<=", 3); return true } + emit_op("<<", 2); return true + } + if (pk_at(1) == CP_BANG) { emit_ident(2); return true } + emit_op("<", 1); return true + } + if (c == CP_GT) { + if (pk_at(1) == CP_EQ && pk_at(2) == CP_BANG) { emit_ident(3); return true } + if (pk_at(1) == CP_EQ) { emit_op(">=", 2); return true } + if (pk_at(1) == CP_GT) { + if (pk_at(2) == CP_GT) { + if (pk_at(3) == CP_BANG) { emit_ident(4); return true } + if (pk_at(3) == CP_EQ) { emit_op(">>>=", 4); return true } + emit_op(">>>", 3); return true + } + if (pk_at(2) == CP_BANG) { emit_ident(3); return true } + if (pk_at(2) == CP_EQ) { emit_op(">>=", 3); return true } + emit_op(">>", 2); return true + } + if (pk_at(1) == CP_BANG) { emit_ident(2); return true } + emit_op(">", 1); return true + } + if (c == CP_EQ) { + if (pk_at(1) == CP_EQ) { + if (pk_at(2) == CP_EQ) { emit_op("===", 3); return true } + emit_op("==", 2); return true + } + if (pk_at(1) == CP_GT) { emit_op("=>", 2); return true } + if (pk_at(1) == CP_BANG) { emit_ident(2); return true } + emit_op("=", 1); return true + } + if (c == CP_BANG) { + if (pk_at(1) == CP_EQ) { + if (pk_at(2) == CP_BANG) { emit_ident(3); return true } + if (pk_at(2) == CP_EQ) { emit_op("!==", 3); return true } + emit_op("!=", 2); return true + } + emit_op("!", 1); return true + } + if (c == CP_AMP) { + if (pk_at(1) == CP_AMP) { + if 
(pk_at(2) == CP_BANG) { emit_ident(3); return true } + if (pk_at(2) == CP_EQ) { emit_op("&&=", 3); return true } + emit_op("&&", 2); return true + } + if (pk_at(1) == CP_EQ) { emit_op("&=", 2); return true } + if (pk_at(1) == CP_BANG) { emit_ident(2); return true } + emit_op("&", 1); return true + } + if (c == CP_PIPE) { + if (pk_at(1) == CP_PIPE) { + if (pk_at(2) == CP_BANG) { emit_ident(3); return true } + if (pk_at(2) == CP_EQ) { emit_op("||=", 3); return true } + emit_op("||", 2); return true + } + if (pk_at(1) == CP_EQ) { emit_op("|=", 2); return true } + if (pk_at(1) == CP_BANG) { emit_ident(2); return true } + emit_op("|", 1); return true + } + if (c == CP_CARET) { + if (pk_at(1) == CP_EQ) { emit_op("^=", 2); return true } + if (pk_at(1) == CP_BANG) { emit_ident(2); return true } + emit_op("^", 1); return true + } + if (c == CP_LBRACKET) { + if (pk_at(1) == CP_RBRACKET && pk_at(2) == CP_BANG) { emit_ident(3); return true } + emit_op("[", 1); return true + } + if (c == CP_TILDE) { + if (pk_at(1) == CP_BANG) { emit_ident(2); return true } + emit_op("~", 1); return true + } + emit_op(character(c), 1) + return true +} + +// Tokenize +while (pos < _src_len) { + tokenize_one() +} +push(tokens, { kind: "eof", at: pos, from_row: row, from_column: col, to_row: row, to_column: col }) + +// ============================================================ +// Section 2: Parser Cursor +// ============================================================ + +var cursor = 0 +var tok = null +var got_lf = false +var prev_tok = null + +function advance() { + var t = null + var k = null + prev_tok = tok + cursor = cursor + 1 + got_lf = false + while (cursor < length(tokens)) { + t = tokens[cursor] + k = t.kind + if (k == "space" || k == "comment") { + cursor = cursor + 1 + continue + } + if (k == "newline") { + got_lf = true + cursor = cursor + 1 + continue + } + tok = t + return null + } + tok = tokens[length(tokens) - 1] +} + +function peek_ahead(n) { + var c = cursor + 1 + var count 
= 0 + var t = null + var k = null + while (c < length(tokens)) { + t = tokens[c] + k = t.kind + if (k != "space" && k != "comment" && k != "newline") { + count = count + 1 + if (count == n) return t + } + c = c + 1 + } + return tokens[length(tokens) - 1] +} + +function init_cursor() { + cursor = -1 + advance() +} + +// ============================================================ +// Section 3: AST Helpers +// ============================================================ + +var errors = [] +var error_count = 0 +var function_nr = 1 + +function ast_node(kind, token) { + return { + kind: kind, + at: token.at, + from_row: token.from_row, + from_column: token.from_column + } +} + +function ast_node_end(node) { + node.to_row = prev_tok.to_row + node.to_column = prev_tok.to_column + return node +} + +function parse_error(token, msg) { + if (error_count >= 5) return null + error_count = error_count + 1 + push(errors, { + message: msg, + line: token.from_row + 1, + column: token.from_column + 1, + offset: token.at + }) +} + +function is_keyword(kind) { + return kind == "if" || kind == "in" || kind == "do" || kind == "go" || + kind == "var" || kind == "def" || kind == "for" || + kind == "else" || kind == "this" || kind == "null" || kind == "true" || + kind == "false" || kind == "while" || kind == "break" || + kind == "return" || kind == "delete" || + kind == "disrupt" || kind == "function" || kind == "continue" || + kind == "disruption" +} + +// ============================================================ +// Section 4: Expression Parsing +// ============================================================ + +// Forward declarations via var +var parse_expr = null +var parse_assign_expr = null +var parse_assign = null +var parse_statement = null +var parse_block_statements = null +var parse_function_inner = null +var parse_arrow_function = null + +function is_arrow_function() { + // Check if ( ... 
) => pattern + if (tok.kind != "(") return false + var c = cursor + 1 + var depth = 1 + var k = null + while (c < length(tokens) && depth > 0) { + k = tokens[c].kind + if (k == "(") { depth = depth + 1 } + else if (k == ")") { depth = depth - 1 } + else if (k == "text" || k == "number") { null } + c = c + 1 + } + // Skip whitespace/newline/comment tokens + while (c < length(tokens)) { + k = tokens[c].kind + if (k != "space" && k != "newline" && k != "comment") break + c = c + 1 + } + if (c >= length(tokens)) return false + return tokens[c].kind == "=>" +} + +function parse_primary() { + var start = tok + var node = null + var k = tok.kind + var list = null + var pair = null + var left = null + var right = null + var is_ident = false + var is_kw = false + var p1 = null + var elem = null + var fn_start = null + var fn = null + var name_item = null + var params = null + var param = null + var rpos = 0 + var pattern_str = "" + var flags = "" + + if (k == "number") { + node = ast_node("number", start) + node.value = tok.value + node.number = tok.number + advance() + ast_node_end(node) + return node + } + if (k == "text") { + node = ast_node("text", start) + node.value = tok.value + advance() + ast_node_end(node) + return node + } + if (k == "name") { + // Check for single-param arrow: name => + p1 = peek_ahead(1) + if (p1.kind == "=>") { + return parse_arrow_function() + } + node = ast_node("name", start) + node.name = tok.value + advance() + ast_node_end(node) + return node + } + if (k == "null") { + node = ast_node("null", start) + advance() + ast_node_end(node) + return node + } + if (k == "true") { + node = ast_node("true", start) + advance() + ast_node_end(node) + return node + } + if (k == "false") { + node = ast_node("false", start) + advance() + ast_node_end(node) + return node + } + if (k == "this") { + node = ast_node("this", start) + advance() + ast_node_end(node) + return node + } + if (k == "[") { + node = ast_node("array", start) + list = [] + node.list = 
list + advance() + while (tok.kind != "]" && tok.kind != "eof") { + elem = parse_assign_expr() + if (elem != null) push(list, elem) + if (tok.kind == ",") advance() + else break + } + ast_node_end(node) + if (tok.kind == "]") advance() + else if (tok.kind == "eof") parse_error(tok, "unterminated array literal, expected ']'") + return node + } + if (k == "{") { + node = ast_node("record", start) + list = [] + node.list = list + advance() + while (tok.kind != "}" && tok.kind != "eof") { + pair = {} + is_ident = (tok.kind == "name") + is_kw = is_keyword(tok.kind) + if (is_ident || is_kw || tok.kind == "text" || tok.kind == "number") { + if (is_kw) { + left = ast_node("name", tok) + left.name = tok.kind + advance() + ast_node_end(left) + } else { + left = parse_primary() + } + pair.left = left + } else if (tok.kind == "[") { + advance() + left = parse_assign_expr() + pair.left = left + if (tok.kind == "]") advance() + else parse_error(tok, "expected ']' after computed property") + } else { + parse_error(tok, "expected property name in object literal") + break + } + if (tok.kind == ":") { + advance() + right = parse_assign_expr() + pair.right = right + } else if (tok.kind == "(") { + // Method shorthand + fn_start = tok + fn = ast_node("function", fn_start) + name_item = pair.left + if (name_item != null && name_item.name != null) { + fn.name = name_item.name + } + params = [] + fn.list = params + advance() + while (tok.kind != ")" && tok.kind != "eof") { + if (tok.kind == "name") { + param = ast_node("name", tok) + param.name = tok.value + advance() + ast_node_end(param) + if (tok.kind == "=" || tok.kind == "|") { + advance() + param.expression = parse_assign_expr() /* not parse_expr: the default must stop at ',' so the remaining parameters are not swallowed by the comma operator (matches parse_function_inner) */ + } + push(params, param) + } else { + parse_error(tok, "expected parameter name") + break + } + if (tok.kind == ",") advance() + else break + } + if (tok.kind == ")") advance() + else if (tok.kind == "eof") parse_error(tok, "unterminated method parameter list") + if (length(params) > 4) parse_error(tok, 
"functions cannot have more than 4 parameters") + if (tok.kind == "{") { + advance() + fn.statements = parse_block_statements() + if (tok.kind == "}") advance() + else if (tok.kind == "eof") parse_error(tok, "unterminated method body") + } else { + parse_error(tok, "expected '{' for method body") + } + fn.function_nr = function_nr + function_nr = function_nr + 1 + ast_node_end(fn) + pair.right = fn + } else if (!(is_ident && (tok.kind == "," || tok.kind == "}"))) { + parse_error(tok, "expected ':' after property name") + } + push(list, pair) + if (tok.kind == ",") advance() + else break + } + ast_node_end(node) + if (tok.kind == "}") advance() + else if (tok.kind == "eof") parse_error(tok, "unterminated object literal, expected '}'") + return node + } + if (k == "(") { + if (is_arrow_function()) { + return parse_arrow_function() + } + advance() + node = parse_expr() + if (tok.kind == ")") advance() + else if (tok.kind == "eof") parse_error(tok, "unterminated parenthesized expression, expected ')'") + else parse_error(tok, "expected ')' after expression") + return node + } + if (k == "function") { + return parse_function_inner() + } + if (k == "/") { + // Regex literal + node = ast_node("regexp", start) + // Re-scan from token position to parse regex + rpos = tok.at + 1 + pattern_str = "" + flags = "" + while (rpos < _src_len && cp[rpos] != CP_SLASH) { + if (cp[rpos] == CP_BSLASH && rpos + 1 < _src_len) { + pattern_str = pattern_str + character(cp[rpos]) + character(cp[rpos + 1]) + rpos = rpos + 2 + } else { + pattern_str = pattern_str + character(cp[rpos]) + rpos = rpos + 1 + } + } + if (rpos < _src_len) rpos = rpos + 1 + while (rpos < _src_len && is_alpha(cp[rpos])) { + flags = flags + character(cp[rpos]) + rpos = rpos + 1 + } + node.pattern = pattern_str + if (length(flags) > 0) node.flags = flags + advance(); while (tok.kind != "eof" && tok.at < rpos) advance() /* the token array was built without regex awareness, so also skip every token lying inside the re-scanned literal (offset < rpos) */ + ast_node_end(node) + return node + } + + // Error + if (k == "eof") { + parse_error(start, "unexpected end of input") + } else { + parse_error(start, 
"unexpected token where expression expected") + } + advance() + return null +} + +function parse_postfix() { + var node = parse_primary() + var start = null + var new_node = null + var index = null + var arg = null + var args_list = null + if (node == null) return null + while (true) { + start = tok + if (tok.kind == ".") { + advance() + new_node = ast_node(".", start) + new_node.left = node + if (tok.kind == "name" || is_keyword(tok.kind)) { + if (tok.kind == "name") { + new_node.right = tok.value + } else { + new_node.right = tok.kind + } + advance() + } else { + parse_error(tok, "expected property name after '.'") + } + ast_node_end(new_node) + node = new_node + } else if (tok.kind == "[") { + advance() + new_node = ast_node("[", start) + new_node.left = node + if (tok.kind == "]") { + advance() + } else { + index = parse_assign_expr() + new_node.right = index + if (tok.kind == "]") advance() + else parse_error(tok, "expected ']'") + } + ast_node_end(new_node) + node = new_node + } else if (tok.kind == "(") { + advance() + new_node = ast_node("(", start) + new_node.expression = node + args_list = [] + new_node.list = args_list + while (tok.kind != ")" && tok.kind != "eof") { + arg = parse_assign_expr() + if (arg != null) push(args_list, arg) + if (tok.kind == ",") advance() + else break + } + if (tok.kind == ")") advance() + else parse_error(tok, "unterminated argument list, expected ')'") + ast_node_end(new_node) + node = new_node + } else if (tok.kind == "++") { + new_node = ast_node("++", start) + new_node.expression = node + new_node.postfix = true + advance() + ast_node_end(new_node) + node = new_node + } else if (tok.kind == "--") { + new_node = ast_node("--", start) + new_node.expression = node + new_node.postfix = true + advance() + ast_node_end(new_node) + node = new_node + } else { + break + } + } + return node +} + +function parse_unary() { + var start = tok + var node = null + var expr = null + var k = tok.kind + if (k == "!") { + advance() + node = 
ast_node("!", start) + node.expression = parse_unary() + ast_node_end(node) + return node + } + if (k == "~") { + advance() + node = ast_node("~", start) + node.expression = parse_unary() + ast_node_end(node) + return node + } + if (k == "+") { + advance() + node = ast_node("+unary", start) + node.expression = parse_unary() + ast_node_end(node) + return node + } + if (k == "-") { + advance() + node = ast_node("-unary", start) + node.expression = parse_unary() + ast_node_end(node) + return node + } + if (k == "++") { + advance() + node = ast_node("++", start) + node.expression = parse_unary() + node.postfix = false + ast_node_end(node) + return node + } + if (k == "--") { + advance() + node = ast_node("--", start) + node.expression = parse_unary() + node.postfix = false + ast_node_end(node) + return node + } + if (k == "delete") { + advance() + node = ast_node("delete", start) + node.expression = parse_unary() + ast_node_end(node) + return node + } + return parse_postfix() +} + +// Binary operator precedence +var binop_prec = { + "**": 14, + "*": 13, "/": 13, "%": 13, + "+": 12, "-": 12, + "<<": 11, ">>": 11, ">>>": 11, + "<": 10, ">": 10, "<=": 10, ">=": 10, in: 10, + "==": 9, "!=": 9, "===": 9, "!==": 9, + "&": 8, "^": 7, "|": 6, + "&&": 5, "||": 4 +} + +function parse_binary(min_prec) { + var left_node = parse_unary() + var start = null + var op = null + var prec = null + var next_prec = 0 + var right_node = null + var node = null + if (left_node == null) return null + while (true) { + start = tok + op = tok.kind + prec = binop_prec[op] + if (prec == null || prec < min_prec) break + advance() + next_prec = prec + 1 + if (prec == 14) next_prec = prec // right-assoc for ** + right_node = parse_binary(next_prec) + node = ast_node(op, start) + node.left = left_node + node.right = right_node + ast_node_end(node) + left_node = node + } + return left_node +} + +function parse_ternary() { + var cond = parse_binary(1) + var start = null + var then_expr = null + var 
else_expr = null + var node = null + if (cond == null) return null + if (tok.kind == "?") { + start = tok + advance() + then_expr = parse_expr() + if (tok.kind == ":") advance() + else parse_error(tok, "expected ':' in ternary expression") + else_expr = parse_assign_expr() /* not parse_expr: a ',' after the else branch belongs to the enclosing list or comma expression, e.g. [c ? 1 : 2, 3] has two elements */ + node = ast_node("then", start) + node.expression = cond + node.then = then_expr + node.else = else_expr + ast_node_end(node) + return node + } + return cond +} + +// Assign operators +var assign_ops = { + "=": "assign", "+=": "+=", "-=": "-=", "*=": "*=", "/=": "/=", "%=": "%=", + "<<=": "<<=", ">>=": ">>=", ">>>=": ">>>=", + "&=": "&=", "^=": "^=", "|=": "|=", "**=": "**=", + "&&=": "&&=", "||=": "||=" +} + +parse_assign = function(unused) { + var left_node = parse_ternary() + var start = null + var kind = null + var right_node = null + var node = null + var left_kind = null + var right_kind = null + if (left_node == null) return null + start = tok + kind = assign_ops[tok.kind] + if (kind == null) return left_node + + // Validate assignment target + left_kind = left_node.kind + if (left_kind != "name" && left_kind != "." 
&& left_kind != "[") { + parse_error(start, "invalid assignment left-hand side") + } + + advance() + right_node = parse_assign() + node = ast_node(kind, start) + node.left = left_node + node.right = right_node + + // Check push/pop bracket syntax + if (left_node.kind == "[" && left_node.right == null) node.push = true + if (right_node != null && right_node.kind == "[" && right_node.right == null) node.pop = true + + ast_node_end(node) + return node +} + +parse_assign_expr = function(unused) { + return parse_assign() +} + +parse_expr = function(unused) { + var left_node = parse_assign() + var start = null + var right_node = null + var node = null + if (left_node == null) return null + while (tok.kind == ",") { + start = tok + advance() + right_node = parse_assign() + node = ast_node(",", start) + node.left = left_node + node.right = right_node + ast_node_end(node) + left_node = node + } + return left_node +} + +// ============================================================ +// Section 5: Statement Parsing +// ============================================================ + +var in_disruption = 0 + +function expect_semi() { + if (tok.kind == ";") { advance(); return null } + if (tok.kind == "eof" || tok.kind == "}" || got_lf || tok.kind == "else") return null + parse_error(tok, "expecting ';'") +} + +function sync_to_statement() { + var k = null + while (tok.kind != "eof") { + k = tok.kind + if (k == ";") { advance(); return null } + if (k == "}") return null + if (k == "var" || k == "def" || k == "if" || k == "while" || + k == "for" || k == "return" || k == "disrupt" || + k == "function" || k == "break" || k == "continue" || k == "do") return null + advance() + } +} + +parse_block_statements = function(unused) { + var stmts = [] + var before = null + var stmt = null + while (tok.kind != "}" && tok.kind != "eof") { + before = cursor + stmt = parse_statement() + if (stmt != null) { + push(stmts, stmt) + } else if (cursor == before) { + sync_to_statement() + } + } + 
return stmts +} + +parse_function_inner = function(unused) { + var start = tok + var node = ast_node("function", start) + var params = [] + var stmts = null + var param = null + var prev_names = null + var pname = null + var dup = false + var j = 0 + var old_dis = 0 + + if (in_disruption) { + parse_error(tok, "cannot define function inside disruption clause") + } + + advance() // skip 'function' + + // Optional name + if (tok.kind == "name") { + node.name = tok.value + advance() + } + + // Parameters + node.list = params + if (tok.kind == "(") { + advance() + prev_names = [] + while (tok.kind != ")" && tok.kind != "eof") { + if (tok.kind == "name") { + param = ast_node("name", tok) + param.name = tok.value + // Check duplicate + pname = tok.value + dup = false + j = 0 + while (j < length(prev_names)) { + if (prev_names[j] == pname) { dup = true; break } + j = j + 1 + } + if (dup) parse_error(tok, "duplicate parameter name '" + pname + "'") + push(prev_names, pname) + advance() + ast_node_end(param) + if (tok.kind == "=" || tok.kind == "|") { + advance() + param.expression = parse_assign_expr() + } + push(params, param) + } else { + parse_error(tok, "expected parameter name") + break + } + if (tok.kind == ",") advance() + else break + } + if (tok.kind == ")") advance() + else if (tok.kind == "eof") parse_error(tok, "unterminated function parameter list, expected ')'") + } else { + parse_error(tok, "expected '(' after function name") + } + + if (length(params) > 4) parse_error(tok, "functions cannot have more than 4 parameters") + + // Body + if (tok.kind == "{") { + advance() + stmts = parse_block_statements() + node.statements = stmts + if (tok.kind == "}") advance() + else if (tok.kind == "eof") parse_error(tok, "unterminated function body, expected '}'") + } else { + parse_error(tok, "expected '{' for function body") + } + + // Disruption clause + if (tok.kind == "disruption") { + advance() + if (tok.kind == "{") { + advance() + old_dis = in_disruption + 
in_disruption = 1 + node.disruption = parse_block_statements() + in_disruption = old_dis + if (tok.kind == "}") advance() + else if (tok.kind == "eof") parse_error(tok, "unterminated disruption clause, expected '}'") + } else { + parse_error(tok, "expected '{' after disruption") + } + } + + node.function_nr = function_nr + function_nr = function_nr + 1 + ast_node_end(node) + return node +} + +parse_arrow_function = function(unused) { + var start = tok + var node = ast_node("function", start) + var params = [] + var param = null + var stmts = null + var ret = null + var expr = null + var prev_names = null + var pname = null + var dup = false + var j = 0 + node.arrow = true + + if (in_disruption) { + parse_error(tok, "cannot define function inside disruption clause") + } + + node.list = params + + if (tok.kind == "name") { + // Single param without parens + param = ast_node("name", tok) + param.name = tok.value + advance() + ast_node_end(param) + push(params, param) + } else if (tok.kind == "(") { + advance() + prev_names = [] + while (tok.kind != ")" && tok.kind != "eof") { + if (tok.kind == "name") { + param = ast_node("name", tok) + param.name = tok.value + pname = tok.value + dup = false + j = 0 + while (j < length(prev_names)) { + if (prev_names[j] == pname) { dup = true; break } + j = j + 1 + } + if (dup) parse_error(tok, "duplicate parameter name '" + pname + "'") + push(prev_names, pname) + advance() + ast_node_end(param) + if (tok.kind == "=" || tok.kind == "|") { + advance() + param.expression = parse_assign_expr() + } + push(params, param) + } else { + parse_error(tok, "expected parameter name") + break + } + if (tok.kind == ",") advance() + else break + } + if (tok.kind == ")") advance() + } + + if (length(params) > 4) parse_error(tok, "functions cannot have more than 4 parameters") + + // Arrow token + if (tok.kind != "=>") { + parse_error(tok, "expected '=>' in arrow function") + } else { + advance() + } + + // Body + if (tok.kind == "{") { + advance() 
+ stmts = parse_block_statements() + node.statements = stmts + if (tok.kind == "}") advance() + } else { + // Expression body + stmts = [] + ret = ast_node("return", tok) + expr = parse_assign_expr() + ret.expression = expr + ast_node_end(ret) + push(stmts, ret) + node.statements = stmts + } + + node.function_nr = function_nr + function_nr = function_nr + 1 + ast_node_end(node) + return node +} + +parse_statement = function(unused) { + var start = tok + var node = null + var k = tok.kind + var stmts = null + var cond = null + var then_stmts = null + var else_stmts = null + var else_ifs = null + var body = null + var expr = null + var init = null + var test = null + var update = null + var left_node = null + var right_node = null + var kind_name = null + var is_def = false + var decls = null + var decl_count = 0 + var var_name = null + var right_kind = null + var elif = null + var p1_tok = null + var labeled_stmt = null + + if (k == "{") { + node = ast_node("block", start) + advance() + stmts = parse_block_statements() + node.statements = stmts + if (tok.kind == "}") advance() + ast_node_end(node) + return node + } + + if (k == "var" || k == "def") { + kind_name = k + is_def = (k == "def") + advance() + if (tok.kind != "name") { + parse_error(tok, "expected identifier after '" + kind_name + "'") + return null + } + decls = [] + decl_count = 0 + while (tok.kind == "name") { + node = ast_node(kind_name, start) + left_node = ast_node("name", tok) + left_node.name = tok.value + var_name = tok.value + advance() + ast_node_end(left_node) + node.left = left_node + if (tok.kind == "=") { + advance() + right_node = parse_assign_expr() + node.right = right_node + if (right_node != null && right_node.kind == "[" && right_node.right == null) { + node.pop = true + } + } else if (is_def) { + parse_error(start, "missing initializer for constant '" + var_name + "'") + } + ast_node_end(node) + push(decls, node) + decl_count = decl_count + 1 + if (tok.kind == ",") advance() + else 
break + } + expect_semi() + if (decl_count == 1) { + return decls[0] + } + node = ast_node("var_list", start) + node.list = decls + ast_node_end(node) + return node + } + + if (k == "if") { + node = ast_node("if", start) + advance() + if (tok.kind == "(") advance() + else parse_error(tok, "expected '(' before condition") + cond = parse_expr() + node.expression = cond + if (tok.kind == ")") advance() + else parse_error(tok, "expected ')' after if condition") + then_stmts = [] + node.then = then_stmts + body = parse_statement() + if (body != null) push(then_stmts, body) + else_ifs = [] + node.list = else_ifs + if (tok.kind == "else") { + advance() + if (tok.kind == "if") { + elif = parse_statement() + if (elif != null) push(else_ifs, elif) + } else { + else_stmts = [] + node.else = else_stmts + body = parse_statement() + if (body != null) push(else_stmts, body) + } + } + ast_node_end(node) + return node + } + + if (k == "while") { + node = ast_node("while", start) + advance() + if (tok.kind == "(") advance() + else parse_error(tok, "expected '(' before condition") + cond = parse_expr() + node.expression = cond + if (tok.kind == ")") advance() + else parse_error(tok, "expected ')' after while condition") + stmts = [] + node.statements = stmts + body = parse_statement() + if (body != null) push(stmts, body) + ast_node_end(node) + return node + } + + if (k == "do") { + node = ast_node("do", start) + advance() + stmts = [] + node.statements = stmts + body = parse_statement() + if (body != null) push(stmts, body) + if (tok.kind == "while") advance() + else parse_error(tok, "expected 'while' after do body") + if (tok.kind == "(") advance() + else parse_error(tok, "expected '(' before condition") + cond = parse_expr() + node.expression = cond + if (tok.kind == ")") advance() + else parse_error(tok, "expected ')' after do-while condition") + expect_semi() + ast_node_end(node) + return node + } + + if (k == "for") { + node = ast_node("for", start) + advance() + if (tok.kind 
== "(") advance() + else parse_error(tok, "expected '(' after for") + if (tok.kind != ";") { + if (tok.kind == "var" || tok.kind == "def") { + init = parse_statement() + node.init = init + } else { + init = parse_expr() + node.init = init + if (tok.kind == ";") advance() + } + } else { + advance() + } + if (tok.kind != ";") { + test = parse_expr() + node.test = test + } + if (tok.kind == ";") advance() + if (tok.kind != ")") { + update = parse_expr() + node.update = update + } + if (tok.kind == ")") advance() + else parse_error(tok, "expected ')' after for clauses") + stmts = [] + node.statements = stmts + body = parse_statement() + if (body != null) push(stmts, body) + ast_node_end(node) + return node + } + + if (k == "return") { + node = ast_node("return", start) + advance() + if (tok.kind != ";" && tok.kind != "}" && !got_lf) { + expr = parse_expr() + node.expression = expr + } + expect_semi() + ast_node_end(node) + return node + } + + if (k == "go") { + node = ast_node("go", start) + advance() + if (tok.kind != ";" && tok.kind != "}" && !got_lf) { + expr = parse_expr() + node.expression = expr + } + expect_semi() + ast_node_end(node) + return node + } + + if (k == "disrupt") { + node = ast_node("disrupt", start) + advance() + expect_semi() + ast_node_end(node) + return node + } + + if (k == "break") { + node = ast_node("break", start) + advance() + if (tok.kind == "name" && !got_lf) { + node.name = tok.value + advance() + } + expect_semi() + ast_node_end(node) + return node + } + + if (k == "continue") { + node = ast_node("continue", start) + advance() + if (tok.kind == "name" && !got_lf) { + node.name = tok.value + advance() + } + expect_semi() + ast_node_end(node) + return node + } + + if (k == "function") { + return parse_function_inner() + } + + if (k == ";") { + advance() + return null + } + + if (k == "name") { + // Check for labeled statement + p1_tok = peek_ahead(1) + if (p1_tok.kind == ":") { + node = ast_node("label", start) + node.name = tok.value + 
advance() // skip identifier + advance() // skip colon + labeled_stmt = parse_statement() + node.statement = labeled_stmt + ast_node_end(node) + return node + } + } + + // Expression statement + expr = parse_expr() + if (expr != null) { + node = ast_node("call", start) + node.expression = expr + ast_node_end(node) + expect_semi() + return node + } + parse_error(start, "unexpected token at start of statement") + return null +} + +// ============================================================ +// Section 6: Program +// ============================================================ + +function parse_program() { + var root = {kind: "program", filename: filename} + var functions = [] + var statements = [] + var before = 0 + var stmt = null + root.functions = functions + root.statements = statements + + while (tok.kind != "eof") { + before = cursor + stmt = parse_statement() + if (stmt != null) { + if (stmt.kind == "function") { + push(functions, stmt) + } else { + push(statements, stmt) + } + } else if (cursor == before) { + sync_to_statement() + } + } + return root +} + +// ============================================================ +// Section 7: Semantic Analysis +// ============================================================ + +var sem_errors = [] +var scopes_array = [] +var intrinsics = [] +var block_var_counter = 0 + +function sem_error(node, msg) { + var err = {message: msg} + if (node.from_row != null) err.line = node.from_row + 1 + if (node.from_column != null) err.column = node.from_column + 1 + push(sem_errors, err) +} + +function make_scope(parent, fn_nr, opts) { + return { + parent: parent, + vars: [], + in_loop: opts.in_loop == true, + function_nr: fn_nr, + is_function_scope: opts.is_func == true, + block_depth: opts.bdepth != null ? 
opts.bdepth : 0 + } +} + +function sem_add_var(scope, name, make_opts) { + push(scope.vars, { + name: name, + scope_name: null, + is_const: make_opts.is_const == true, + make: make_opts.make, + function_nr: make_opts.fn_nr, + nr_uses: 0, + closure: 0 + }) +} + +function sem_lookup_var(scope, name) { + var result = {v: null, level: 0, def_function_nr: -1} + var cur_fn = scope.function_nr + var s = scope + var i = 0 + while (s != null) { + i = 0 + while (i < length(s.vars)) { + if (s.vars[i].name == name) { + result.v = s.vars[i] + result.def_function_nr = s.vars[i].function_nr + return result + } + i = i + 1 + } + if (s.parent != null && s.parent.function_nr != cur_fn) { + result.level = result.level + 1 + cur_fn = s.parent.function_nr + } + s = s.parent + } + return result +} + +function sem_find_var(scope, name) { + var r = sem_lookup_var(scope, name) + return r.v +} + +function sem_in_loop(scope) { + var s = scope + while (s != null) { + if (s.in_loop) return true + s = s.parent + } + return false +} + +function sem_add_intrinsic(name) { + var i = 0 + while (i < length(intrinsics)) { + if (intrinsics[i] == name) return null + i = i + 1 + } + push(intrinsics, name) +} + +var functino_names = { + "+!": true, "-!": true, "*!": true, "/!": true, "%!": true, "**!": true, + "<!": true, ">!": true, "<=!": true, ">=!": true, "=!": true, "!=!": true, + "&!": true, "|!": true, "^!": true, "<<!": true, ">>!": true, ">>>!": true, + "&&!": true, "||!": true, "~!": true, "[]!": true +} + +function is_functino_name(name) { + return functino_names[name] == true +} + +function sem_propagate_block_vars(parent, block) { + var i = 0 + var v = null + var sn = null + while (i < length(block.vars)) { + v = block.vars[i] + sn = v.scope_name + if (sn == null) sn = v.name + push(parent.vars, { + name: sn, + scope_name: null, + is_const: v.is_const, + make: v.make, + function_nr: v.function_nr, + nr_uses: v.nr_uses, + closure: v.closure + }) + i = i + 1 + } +} + +function sem_build_scope_record(scope) { + var rec = 
{function_nr: scope.function_nr} + var slots = 0 + var close_slots = 0 + var i = 0 + var v = null + while (i < length(scope.vars)) { + v = scope.vars[i] + rec[v.name] = { + make: v.make, + function_nr: v.function_nr, + nr_uses: v.nr_uses, + closure: v.closure == 1, + level: 0 + } + slots = slots + 1 + if (v.closure) close_slots = close_slots + 1 + i = i + 1 + } + return {rec: rec, nr_slots: slots, nr_close: close_slots} +} + +// Forward declarations +var sem_check_expr = null +var sem_check_stmt = null + +function sem_predeclare_vars(scope, stmts) { + var i = 0 + var stmt = null + var kind = null + var name = null + var item = null + var ik = null + var j = 0 + while (i < length(stmts)) { + stmt = stmts[i] + kind = stmt.kind + if (kind == "function") { + name = stmt.name + if (name != null && sem_find_var(scope, name) == null) { + sem_add_var(scope, name, {make: "function", fn_nr: scope.function_nr}) + } + } else if (kind == "var") { + name = stmt.left.name + if (name != null && sem_find_var(scope, name) == null) { + sem_add_var(scope, name, {make: "var", fn_nr: scope.function_nr}) + } + } else if (kind == "var_list") { + j = 0 + while (j < length(stmt.list)) { + item = stmt.list[j] + ik = item.kind + if (ik == "var") { + name = item.left.name + if (name != null && sem_find_var(scope, name) == null) { + sem_add_var(scope, name, {make: "var", fn_nr: scope.function_nr}) + } + } + j = j + 1 + } + } + i = i + 1 + } +} + +function sem_check_assign_target(scope, left_node) { + if (left_node == null) return null + var kind = left_node.kind + var name = null + var v = null + var r = null + var obj_expr = null + + if (kind == "name") { + name = left_node.name + if (name == null) return null + v = sem_find_var(scope, name) + if (v == null) { + sem_error(left_node, "cannot assign to unbound variable '" + name + "'") + } else if (v.is_const) { + sem_error(left_node, "cannot assign to constant '" + name + "'") + } + r = sem_lookup_var(scope, name) + if (r.v != null) { + 
left_node.level = r.level + left_node.function_nr = r.def_function_nr + if (r.v.scope_name != null) left_node.scope_name = r.v.scope_name + } else { + left_node.level = -1 + } + } else if (kind == "." || kind == "[") { + obj_expr = left_node.left + sem_check_expr(scope, obj_expr) + if (kind == "[" && left_node.right != null) { + sem_check_expr(scope, left_node.right) + } + } +} + +sem_check_expr = function(scope, expr) { + if (expr == null) return null + var kind = expr.kind + if (kind == null) return null + var name = null + var r = null + var i = 0 + var operand = null + var v = null + var prop = null + var val = null + var fn_nr_val = null + var fn_scope = null + var pname = null + var def_val = null + var sr = null + + // Assignment operators + if (kind == "assign" || kind == "+=" || kind == "-=" || kind == "*=" || + kind == "/=" || kind == "%=" || kind == "<<=" || kind == ">>=" || + kind == ">>>=" || kind == "&=" || kind == "^=" || kind == "|=" || + kind == "**=" || kind == "&&=" || kind == "||=") { + sem_check_assign_target(scope, expr.left) + sem_check_expr(scope, expr.right) + return null + } + + // Increment/decrement + if (kind == "++" || kind == "--") { + operand = expr.expression + if (operand != null && operand.kind == "name") { + name = operand.name + if (name != null) { + v = sem_find_var(scope, name) + if (v == null) { + sem_error(expr, "cannot assign to unbound variable '" + name + "'") + } else if (v.is_const) { + sem_error(expr, "cannot assign to constant '" + name + "'") + } + r = sem_lookup_var(scope, name) + if (r.v != null) { + operand.level = r.level + operand.function_nr = r.def_function_nr + if (r.v.scope_name != null) operand.scope_name = r.v.scope_name + } else { + operand.level = -1 + } + } + } + return null + } + + // Binary ops + if (kind == "," || kind == "+" || kind == "-" || kind == "*" || + kind == "/" || kind == "%" || kind == "==" || kind == "!=" || + kind == "<" || kind == ">" || kind == "<=" || kind == ">=" || + kind == "&&" 
|| kind == "||" || kind == "&" || + kind == "|" || kind == "^" || kind == "<<" || kind == ">>" || + kind == ">>>" || kind == "**" || kind == "in" || + kind == "." || kind == "[") { + sem_check_expr(scope, expr.left) + sem_check_expr(scope, expr.right) + return null + } + + // Ternary + if (kind == "then") { + sem_check_expr(scope, expr.expression) + sem_check_expr(scope, expr.then) + sem_check_expr(scope, expr.else) + return null + } + + // Call + if (kind == "(") { + sem_check_expr(scope, expr.expression) + i = 0 + while (i < length(expr.list)) { + sem_check_expr(scope, expr.list[i]) + i = i + 1 + } + return null + } + + // Unary ops + if (kind == "!" || kind == "~" || kind == "delete" || + kind == "-unary" || kind == "+unary") { + sem_check_expr(scope, expr.expression) + return null + } + + // Array literal + if (kind == "array") { + i = 0 + while (i < length(expr.list)) { + sem_check_expr(scope, expr.list[i]) + i = i + 1 + } + return null + } + + // Record literal + if (kind == "record") { + i = 0 + while (i < length(expr.list)) { + prop = expr.list[i] + val = prop.right + sem_check_expr(scope, val) + i = i + 1 + } + return null + } + + // Function expression + if (kind == "function") { + fn_nr_val = expr.function_nr + if (fn_nr_val == null) fn_nr_val = scope.function_nr + fn_scope = make_scope(scope, fn_nr_val, {is_func: true}) + expr.outer = scope.function_nr + // Add params + i = 0 + while (i < length(expr.list)) { + pname = expr.list[i].name + if (pname != null) sem_add_var(fn_scope, pname, {is_const: true, make: "input", fn_nr: fn_nr_val}) + def_val = expr.list[i].expression + if (def_val != null) sem_check_expr(fn_scope, def_val) + i = i + 1 + } + // Pre-register declarations + if (expr.statements != null) { + sem_predeclare_vars(fn_scope, expr.statements) + i = 0 + while (i < length(expr.statements)) { + sem_check_stmt(fn_scope, expr.statements[i]) + i = i + 1 + } + } + // Disruption + if (expr.disruption != null) { + i = 0 + while (i < 
length(expr.disruption)) { + sem_check_stmt(fn_scope, expr.disruption[i]) + i = i + 1 + } + } + // Build scope record + sr = sem_build_scope_record(fn_scope) + push(scopes_array, sr.rec) + expr.nr_slots = sr.nr_slots + expr.nr_close_slots = sr.nr_close + return null + } + + // Template literal + if (kind == "text literal") { + i = 0 + while (i < length(expr.list)) { + sem_check_expr(scope, expr.list[i]) + i = i + 1 + } + return null + } + + // Name + if (kind == "name") { + name = expr.name + if (name != null) { + if (is_functino_name(name)) { + expr.make = "functino" + expr.level = -1 + return null + } + r = sem_lookup_var(scope, name) + if (r.v != null) { + expr.level = r.level + expr.function_nr = r.def_function_nr + r.v.nr_uses = r.v.nr_uses + 1 + if (r.level > 0) r.v.closure = 1 + if (r.v.scope_name != null) expr.scope_name = r.v.scope_name + } else { + expr.level = -1 + sem_add_intrinsic(name) + } + } + return null + } + + // Leaf nodes: number, text, regexp, null, true, false, this +} + +sem_check_stmt = function(scope, stmt) { + if (stmt == null) return null + var kind = stmt.kind + if (kind == null) return null + var name = null + var existing = null + var i = 0 + var sn = null + var then_scope = null + var list_scope = null + var else_scope = null + var loop_scope = null + var do_scope = null + var for_scope = null + var init_kind = null + var blk_scope = null + var fn_nr_val = null + var fn_scope = null + var pname = null + var def_val = null + var sr = null + + if (kind == "var_list") { + i = 0 + while (i < length(stmt.list)) { + sem_check_stmt(scope, stmt.list[i]) + i = i + 1 + } + return null + } + + if (kind == "var") { + name = stmt.left.name + if (name != null) { + existing = sem_find_var(scope, name) + if (existing != null && existing.is_const) { + sem_error(stmt.left, "cannot redeclare constant '" + name + "'") + } + if (existing == null || existing.function_nr != scope.function_nr || scope.block_depth > 0) { + sem_add_var(scope, name, {make: 
"var", fn_nr: scope.function_nr}) + } + if (scope.block_depth > 0) { + sn = "_" + name + "_" + text(block_var_counter) + block_var_counter = block_var_counter + 1 + scope.vars[length(scope.vars) - 1].scope_name = sn + stmt.left.scope_name = sn + } + } + sem_check_expr(scope, stmt.right) + return null + } + + if (kind == "def") { + name = stmt.left.name + if (name != null) { + existing = sem_find_var(scope, name) + if (existing != null && existing.is_const) { + sem_error(stmt.left, "cannot redeclare constant '" + name + "'") + } else if (existing != null && !existing.is_const && existing.function_nr == scope.function_nr) { + existing.is_const = 1 + existing.make = "def" + } else { + sem_add_var(scope, name, {is_const: true, make: "def", fn_nr: scope.function_nr}) + if (scope.block_depth > 0) { + sn = "_" + name + "_" + text(block_var_counter) + block_var_counter = block_var_counter + 1 + scope.vars[length(scope.vars) - 1].scope_name = sn + stmt.left.scope_name = sn + } + } + } + sem_check_expr(scope, stmt.right) + return null + } + + if (kind == "call") { + sem_check_expr(scope, stmt.expression) + return null + } + + if (kind == "if") { + sem_check_expr(scope, stmt.expression) + // then + then_scope = make_scope(scope, scope.function_nr, {bdepth: scope.block_depth + 1}) + i = 0 + while (i < length(stmt.then)) { + sem_check_stmt(then_scope, stmt.then[i]) + i = i + 1 + } + sem_propagate_block_vars(scope, then_scope) + // else-if list + list_scope = make_scope(scope, scope.function_nr, {bdepth: scope.block_depth + 1}) + i = 0 + while (i < length(stmt.list)) { + sem_check_stmt(list_scope, stmt.list[i]) + i = i + 1 + } + sem_propagate_block_vars(scope, list_scope) + // else + if (stmt.else != null) { + else_scope = make_scope(scope, scope.function_nr, {bdepth: scope.block_depth + 1}) + i = 0 + while (i < length(stmt.else)) { + sem_check_stmt(else_scope, stmt.else[i]) + i = i + 1 + } + sem_propagate_block_vars(scope, else_scope) + } + return null + } + + if (kind == 
"while") { + sem_check_expr(scope, stmt.expression) + loop_scope = make_scope(scope, scope.function_nr, {in_loop: true, bdepth: scope.block_depth + 1}) + i = 0 + while (i < length(stmt.statements)) { + sem_check_stmt(loop_scope, stmt.statements[i]) + i = i + 1 + } + sem_propagate_block_vars(scope, loop_scope) + return null + } + + if (kind == "do") { + do_scope = make_scope(scope, scope.function_nr, {in_loop: true, bdepth: scope.block_depth + 1}) + i = 0 + while (i < length(stmt.statements)) { + sem_check_stmt(do_scope, stmt.statements[i]) + i = i + 1 + } + sem_propagate_block_vars(scope, do_scope) + sem_check_expr(scope, stmt.expression) + return null + } + + if (kind == "for") { + for_scope = make_scope(scope, scope.function_nr, {in_loop: true, bdepth: scope.block_depth + 1}) + if (stmt.init != null) { + init_kind = stmt.init.kind + if (init_kind == "var" || init_kind == "def") { + sem_check_stmt(for_scope, stmt.init) + } else { + sem_check_expr(for_scope, stmt.init) + } + } + sem_check_expr(for_scope, stmt.test) + sem_check_expr(for_scope, stmt.update) + i = 0 + while (i < length(stmt.statements)) { + sem_check_stmt(for_scope, stmt.statements[i]) + i = i + 1 + } + sem_propagate_block_vars(scope, for_scope) + return null + } + + if (kind == "return" || kind == "go") { + sem_check_expr(scope, stmt.expression) + return null + } + + if (kind == "disrupt") { + return null + } + + if (kind == "break") { + if (!sem_in_loop(scope)) { + sem_error(stmt, "'break' used outside of loop") + } + return null + } + + if (kind == "continue") { + if (!sem_in_loop(scope)) { + sem_error(stmt, "'continue' used outside of loop") + } + return null + } + + if (kind == "block") { + blk_scope = make_scope(scope, scope.function_nr, {bdepth: scope.block_depth + 1}) + i = 0 + while (i < length(stmt.statements)) { + sem_check_stmt(blk_scope, stmt.statements[i]) + i = i + 1 + } + sem_propagate_block_vars(scope, blk_scope) + return null + } + + if (kind == "label") { + sem_check_stmt(scope, 
stmt.statement) + return null + } + + if (kind == "function") { + name = stmt.name + if (name != null) sem_add_var(scope, name, {make: "function", fn_nr: scope.function_nr}) + fn_nr_val = stmt.function_nr + if (fn_nr_val == null) fn_nr_val = scope.function_nr + fn_scope = make_scope(scope, fn_nr_val, {is_func: true}) + stmt.outer = scope.function_nr + i = 0 + while (i < length(stmt.list)) { + pname = stmt.list[i].name + if (pname != null) sem_add_var(fn_scope, pname, {is_const: true, make: "input", fn_nr: fn_nr_val}) + def_val = stmt.list[i].expression + if (def_val != null) sem_check_expr(fn_scope, def_val) + i = i + 1 + } + sem_predeclare_vars(fn_scope, stmt.statements) + i = 0 + while (i < length(stmt.statements)) { + sem_check_stmt(fn_scope, stmt.statements[i]) + i = i + 1 + } + if (stmt.disruption != null) { + i = 0 + while (i < length(stmt.disruption)) { + sem_check_stmt(fn_scope, stmt.disruption[i]) + i = i + 1 + } + } + sr = sem_build_scope_record(fn_scope) + push(scopes_array, sr.rec) + stmt.nr_slots = sr.nr_slots + stmt.nr_close_slots = sr.nr_close + return null + } +} + +function semantic_check(ast) { + var global_scope = make_scope(null, 0, {is_func: true}) + var i = 0 + var stmt = null + var name = null + + // Pre-register top-level function names + i = 0 + while (i < length(ast.functions)) { + name = ast.functions[i].name + if (name != null) sem_add_var(global_scope, name, {make: "function", fn_nr: 0}) + i = i + 1 + } + + // Check all statements + i = 0 + while (i < length(ast.statements)) { + sem_check_stmt(global_scope, ast.statements[i]) + i = i + 1 + } + + // Check function bodies + i = 0 + while (i < length(ast.functions)) { + sem_check_stmt(global_scope, ast.functions[i]) + i = i + 1 + } + + // Build program scope record and prepend + var sr = sem_build_scope_record(global_scope) + var new_scopes = [sr.rec] + i = 0 + while (i < length(scopes_array)) { + push(new_scopes, scopes_array[i]) + i = i + 1 + } + scopes_array = new_scopes + + // Attach 
to AST + ast.scopes = scopes_array + ast.intrinsics = intrinsics + if (length(sem_errors) > 0) { + ast.errors = sem_errors + } +} + +// ============================================================ +// Section 8: Main +// ============================================================ + +init_cursor() +var ast = parse_program() + +if (error_count == 0) { + semantic_check(ast) +} + +// Merge parse errors +var _mi = 0 +if (length(errors) > 0) { + if (ast.errors != null) { + _mi = 0 + while (_mi < length(errors)) { + push(ast.errors, errors[_mi]) + _mi = _mi + 1 + } + } else { + ast.errors = errors + } +} + +print(json.encode(ast)) diff --git a/source/cell.c b/source/cell.c index 56367f2b..5c4ad392 100644 --- a/source/cell.c +++ b/source/cell.c @@ -727,7 +727,6 @@ int cell_init(int argc, char **argv) /* Check for --mach-run flag to compile and run through MACH VM */ if (argc >= 3 && strcmp(argv[1], "--mach-run") == 0) { - const char *filename = argv[2]; if (!find_cell_shop()) return 1; size_t boot_size; @@ -755,7 +754,7 @@ int cell_init(int argc, char **argv) cJSON_Delete(boot_ast); return 1; } - JSContext *ctx = JS_NewContextWithHeapSize(rt, 256 * 1024); + JSContext *ctx = JS_NewContextWithHeapSize(rt, 16 * 1024 * 1024); if (!ctx) { printf("Failed to create JS context\n"); cJSON_Delete(boot_ast); JS_FreeRuntime(rt); @@ -766,7 +765,12 @@ int cell_init(int argc, char **argv) JSValue hidden_env = JS_NewObject(ctx); JS_SetPropertyStr(ctx, hidden_env, "os", js_os_use(ctx)); - JS_SetPropertyStr(ctx, hidden_env, "program", JS_NewString(ctx, filename)); + JSValue args_arr = JS_NewArray(ctx); + for (int i = 2; i < argc; i++) { + JSValue str = JS_NewString(ctx, argv[i]); + JS_ArrayPush(ctx, &args_arr, str); + } + JS_SetPropertyStr(ctx, hidden_env, "args", args_arr); hidden_env = JS_Stone(ctx, hidden_env); JSValue result = JS_RunMachTree(ctx, boot_ast, hidden_env); @@ -775,7 +779,9 @@ int cell_init(int argc, char **argv) int exit_code = 0; if (JS_IsException(result)) { JSValue exc 
= JS_GetException(ctx); - const char *err_str = JS_ToCString(ctx, exc); +const char *err_str = NULL; +JSValue msg = JS_GetPropertyStr(ctx, exc, "message"); +err_str = JS_ToCString(ctx, msg); if (err_str) { printf("Error: %s\n", err_str); JS_FreeCString(ctx, err_str); @@ -921,4 +927,4 @@ int uncaught_exception(JSContext *js, JSValue v) JS_FreeValue(js, exp); JS_FreeValue(js, v); return 0; -} \ No newline at end of file +} diff --git a/source/quickjs.h b/source/quickjs.h index d00ce026..c4a78d69 100644 --- a/source/quickjs.h +++ b/source/quickjs.h @@ -697,6 +697,9 @@ JSValue JS_GetProperty (JSContext *ctx, JSValue this_obj, JSValue prop); // For records JSValue JS_GetPropertyStr (JSContext *ctx, JSValue this_obj, const char *prop); int JS_SetPropertyStr (JSContext *ctx, JSValue this_obj, const char *prop, JSValue val); + +// Set property on the global object +int JS_SetGlobalStr (JSContext *ctx, const char *prop, JSValue val); int JS_SetProperty (JSContext *ctx, JSValue this_obj, JSValue prop, JSValue val); JSValue JS_GetPrototype (JSContext *ctx, JSValue val); diff --git a/source/runtime.c b/source/runtime.c index 324df6ce..780a2a1d 100644 --- a/source/runtime.c +++ b/source/runtime.c @@ -6204,6 +6204,13 @@ static int js_json_to_str (JSContext *ctx, JSONStringifyContext *jsc, JSValue ho goto exception; } + /* Heap strings are JS_TAG_PTR but must be quoted, not iterated as objects */ + if (JS_IsText (val_ref.val) && !MIST_IsImmediateASCII (val_ref.val)) { + val_ref.val = JS_ToQuotedString (ctx, val_ref.val); + if (JS_IsException (val_ref.val)) goto exception; + goto concat_value; + } + if (JS_IsObject ( val_ref.val)) { /* includes arrays (OBJ_ARRAY) since they have JS_TAG_PTR */ v = js_array_includes (ctx, jsc->stack, 1, &val_ref.val); @@ -9013,17 +9020,28 @@ static JSValue js_cell_array (JSContext *ctx, JSValue this_val, int argc, JSValu if (argc < 2 || JS_IsNull (argv[1])) { /* Split into characters */ - JSValue result = JS_NewArrayLen (ctx, len); - if 
(JS_IsException (result)) { return result; } - JSArray *out = JS_VALUE_GET_ARRAY (result); + JSGCRef arr_ref, str_ref; + JS_PushGCRef (ctx, &arr_ref); + JS_PushGCRef (ctx, &str_ref); + str_ref.val = arg; + arr_ref.val = JS_NewArray (ctx); + if (JS_IsException (arr_ref.val)) { + JS_PopGCRef (ctx, &str_ref); + JS_PopGCRef (ctx, &arr_ref); + return JS_EXCEPTION; + } for (int i = 0; i < len; i++) { - JSValue ch = js_sub_string_val (ctx, arg, i, i + 1); + JSValue ch = js_sub_string_val (ctx, str_ref.val, i, i + 1); if (JS_IsException (ch)) { + JS_PopGCRef (ctx, &str_ref); + JS_PopGCRef (ctx, &arr_ref); return JS_EXCEPTION; } - out->values[i] = ch; + JS_ArrayPush (ctx, &arr_ref.val, ch); } - out->len = len; + JSValue result = arr_ref.val; + JS_PopGCRef (ctx, &str_ref); + JS_PopGCRef (ctx, &arr_ref); return result; } @@ -11404,11 +11422,11 @@ static JSValue js_cell_length (JSContext *ctx, JSValue this_val, int argc, JSVal int tag = JS_VALUE_GET_TAG (val); /* Strings return codepoint count */ - if (tag == JS_TAG_STRING_IMM) { + if (MIST_IsImmediateASCII (val)) { return JS_NewInt32 (ctx, MIST_GetImmediateASCIILen (val)); } - if (tag == JS_TAG_STRING) { - JSText *p = JS_VALUE_GET_STRING (val); + if (JS_IsPtr (val) && objhdr_type (*chase (val)) == OBJ_TEXT) { + JSText *p = (JSText *)chase (val); return JS_NewInt32 (ctx, (int)JSText_len (p)); } @@ -11582,8 +11600,7 @@ static JSValue js_cell_is_stone (JSContext *ctx, JSValue this_val, int argc, JSV /* is_text(val) */ static JSValue js_cell_is_text (JSContext *ctx, JSValue this_val, int argc, JSValue *argv) { if (argc < 1) return JS_FALSE; - int tag = JS_VALUE_GET_TAG (argv[0]); - return JS_NewBool (ctx, tag == JS_TAG_STRING || tag == JS_TAG_STRING_IMM); + return JS_NewBool (ctx, JS_IsText (argv[0])); } /* is_proto(val, master) - check if val has master in prototype chain */ @@ -11737,6 +11754,10 @@ static JSValue js_cell_some(JSContext *ctx, JSValue this_val, int argc, JSValue /* GC-SAFE: Helper to set a global function. 
Creates function first, then reads ctx->global_obj to ensure it's not stale if GC ran during function creation. */ +int JS_SetGlobalStr (JSContext *ctx, const char *prop, JSValue val) { + return JS_SetPropertyStr(ctx, ctx->global_obj, prop, val); +} + static void js_set_global_cfunc(JSContext *ctx, const char *name, JSCFunction *func, int length) { JSGCRef ref; JS_PushGCRef(ctx, &ref); @@ -11799,7 +11820,7 @@ static void JS_AddIntrinsicBaseObjects (JSContext *ctx) { /* Core functions - using GC-safe helper */ js_set_global_cfunc(ctx, "eval", js_cell_eval, 2); - js_set_global_cfunc(ctx, "mach_eval", js_mach_eval, 2); + js_set_global_cfunc(ctx, "mach_eval", js_mach_eval, 3); js_set_global_cfunc(ctx, "stone", js_cell_stone, 1); js_set_global_cfunc(ctx, "length", js_cell_length, 1); js_set_global_cfunc(ctx, "call", js_cell_call, 3);
diff --git a/tokenize.ce b/tokenize.ce
new file mode 100644
index 00000000..f3d2abde
--- /dev/null
+++ b/tokenize.ce
@@ -0,0 +1,634 @@
+// tokenize.ce - standalone tokenizer script.
+//
+// Input comes from the `args` array supplied by the host environment:
+//   args[0] - source text to tokenize
+//   args[1] - optional filename, echoed in the output ("" when absent)
+//
+// Output: prints json.encode({filename, tokens}). Every token has `kind`,
+// `at` (offset into the codepoint array of its first character) and a span
+// from_row/from_column .. to_row/to_column. Rows and columns are zero-based
+// and the "to" position is the scanner position just after the token.
+// Tokens that carry text also have `value`; number tokens add `number`.
+
+var src = args[0]
+var filename = length(args) > 1 ? args[1] : ""
+
+// Convert to codepoint array - integers are GC-safe immediate values
+var len = length(src)
+var cp = []
+var _i = 0
+while (_i < len) {
+  push(cp, codepoint(src[_i]))
+  _i = _i + 1
+}
+
+// Scanner state: pos indexes cp; row/col track the position of cp[pos].
+var pos = 0
+var row = 0
+var col = 0
+var tokens = []
+
+// Codepoint constants
+def CP_LF = 10
+def CP_CR = 13
+def CP_TAB = 9
+def CP_SPACE = 32
+def CP_BANG = 33
+def CP_DQUOTE = 34
+def CP_HASH = 35
+def CP_DOLLAR = 36
+def CP_PERCENT = 37
+def CP_AMP = 38
+def CP_SQUOTE = 39
+def CP_LPAREN = 40
+def CP_RPAREN = 41
+def CP_STAR = 42
+def CP_PLUS = 43
+def CP_COMMA = 44
+def CP_MINUS = 45
+def CP_DOT = 46
+def CP_SLASH = 47
+def CP_0 = 48
+def CP_1 = 49
+def CP_7 = 55
+def CP_9 = 57
+def CP_COLON = 58
+def CP_SEMI = 59
+def CP_LT = 60
+def CP_EQ = 61
+def CP_GT = 62
+def CP_QMARK = 63
+def CP_AT = 64
+def CP_A = 65
+def CP_B = 66
+def CP_E = 69
+def CP_F = 70
+def CP_O = 79
+def CP_X = 88
+def CP_Z = 90
+def CP_LBRACKET = 91
+def CP_BSLASH = 92
+def CP_RBRACKET = 93
+def CP_CARET = 94
+def CP_UNDERSCORE = 95
+def CP_BACKTICK = 96
+def CP_a = 97
+def CP_b = 98
+def CP_e = 101
+def CP_f = 102
+def CP_n = 110
+def CP_o = 111
+def CP_r = 114
+def CP_t = 116
+def CP_x = 120
+def CP_z = 122
+def CP_LBRACE = 123
+def CP_PIPE = 124
+def CP_RBRACE = 125
+def CP_TILDE = 126
+
+// Keywords lookup: maps a scanned identifier to its token kind.
+var keywords = {
+  if: "if", in: "in", do: "do", go: "go",
+  var: "var", def: "def", for: "for",
+  else: "else", this: "this", null: "null", true: "true",
+  false: "false", while: "while", break: "break",
+  return: "return", delete: "delete",
+  disrupt: "disrupt", function: "function", continue: "continue",
+  disruption: "disruption"
+}
+
+// Return the current codepoint without consuming it, or -1 at end of input.
+function pk() {
+  if (pos >= len) return -1
+  return cp[pos]
+}
+
+// Return the codepoint n positions ahead of pos without consuming it, or -1
+// when that position is at or past the end of input.
+function pk_at(n) {
+  var idx = pos + n
+  if (idx >= len) return -1
+  return cp[idx]
+}
+
+// Consume and return the current codepoint. LF moves to the next row and
+// resets the column; any other codepoint advances the column by one.
+// No bounds check is done here - callers only call adv() while pos < len.
+function adv() {
+  var c = cp[pos]
+  pos = pos + 1
+  if (c == CP_LF) {
+    row = row + 1
+    col = 0
+  } else {
+    col = col + 1
+  }
+  return c
+}
+
+// True for ASCII decimal digits 0-9.
+function is_digit(c) {
+  return c >= CP_0 && c <= CP_9
+}
+
+// True for ASCII hexadecimal digits 0-9, a-f, A-F.
+function is_hex(c) {
+  return (c >= CP_0 && c <= CP_9) || (c >= CP_a && c <= CP_f) || (c >= CP_A && c <= CP_F)
+}
+
+// True for ASCII letters a-z and A-Z.
+function is_alpha(c) {
+  return (c >= CP_a && c <= CP_z) || (c >= CP_A && c <= CP_Z)
+}
+
+// True for ASCII letters and decimal digits.
+function is_alnum(c) {
+  return is_alpha(c) || is_digit(c)
+}
+
+// True if c can begin an identifier: a letter, `_` or `$`.
+function is_ident_start(c) {
+  return is_alpha(c) || c == CP_UNDERSCORE || c == CP_DOLLAR
+}
+
+// True if c can continue an identifier: letters, digits, `_`, `$`, and also
+// `?` and `!`.
+function is_ident_char(c) {
+  return is_alnum(c) || c == CP_UNDERSCORE || c == CP_DOLLAR || c == CP_QMARK || c == CP_BANG
+}
+
+// Build the text for the codepoints cp[start] .. cp[end - 1].
+function substr(start, end) {
+  var s = ""
+  var i = start
+  while (i < end) {
+    s = s + character(cp[i])
+    i = i + 1
+  }
+  return s
+}
+
+// Scan a quoted string whose opening quote (quote_cp) is at pos and push a
+// "text" token holding the decoded value. Recognized escapes are \n \t \r
+// \\ \' \" \0 and \`; any other escaped character stands for itself. An
+// unterminated string simply runs to end of input.
+function read_string(quote_cp) {
+  var start = pos
+  var start_row = row
+  var start_col = col
+  var value = ""
+  var esc = 0
+  adv() // skip opening quote
+  while (pos < len && pk() != quote_cp) {
+    // Only treat `\` as an escape when a character follows it; a trailing
+    // backslash at end of input is kept literally so adv() never runs past
+    // the end of cp (same guard as read_template).
+    if (pk() == CP_BSLASH && pos + 1 < len) {
+      adv()
+      esc = adv()
+      if (esc == CP_n) { value = value + "\n" }
+      else if (esc == CP_t) { value = value + "\t" }
+      else if (esc == CP_r) { value = value + "\r" }
+      else if (esc == CP_BSLASH) { value = value + "\\" }
+      else if (esc == CP_SQUOTE) { value = value + "'" }
+      else if (esc == CP_DQUOTE) { value = value + "\"" }
+      else if (esc == CP_0) { value = value + character(0) }
+      else if (esc == CP_BACKTICK) { value = value + "`" }
+      else { value = value + character(esc) }
+    } else {
+      value = value + character(adv())
+    }
+  }
+  if (pos < len) adv() // skip closing quote
+  push(tokens, {
+    kind: "text", at: start,
+    from_row: start_row, from_column: start_col,
+    to_row: row, to_column: col,
+    value: value
+  })
+}
+
+// Scan a backtick template whose opening backtick is at pos and push a
+// "text" token. Escapes \n \t \r \\ \` \$ \0 are decoded; any other escaped
+// character stands for itself. A `${ ... }` section is skipped over (brace
+// depth is tracked, quoted strings inside are stepped past) and adds nothing.
+function read_template() {
+  var start = pos
+  var start_row = row
+  var start_col = col
+  var value = ""
+  var esc = 0
+  var depth = 0
+  var tc = 0
+  var q = 0
+  adv() // skip opening backtick
+  while (pos < len && pk() != CP_BACKTICK) {
+    if (pk() == CP_BSLASH && pos + 1 < len) {
+      adv()
+      esc = adv()
+      if (esc == CP_n) { value = value + "\n" }
+      else if (esc == CP_t) { value = value + "\t" }
+      else if (esc == CP_r) { value = value + "\r" }
+      else if (esc == CP_BSLASH) { value = value + "\\" }
+      else if (esc == CP_BACKTICK) { value = value + "`" }
+      else if (esc == CP_DOLLAR) { value = value + "$" }
+      else if (esc == CP_0) { value = value + character(0) }
+      else { value = value + character(esc) }
+    } else if (pk() == CP_DOLLAR && pos + 1 < len && pk_at(1) == CP_LBRACE) {
+      adv() // $
+      adv() // {
+      depth = 1
+      while (pos < len && depth > 0) {
+        tc = pk()
+        if (tc == CP_LBRACE) { depth = depth + 1; adv() }
+        else if (tc == CP_RBRACE) { depth = depth - 1; adv() }
+        else if (tc == CP_SQUOTE || tc == CP_DQUOTE || tc == CP_BACKTICK) {
+          // Skip a nested quoted string so braces inside it are not counted
+          q = adv()
+          while (pos < len && pk() != q) {
+            if (pk() == CP_BSLASH && pos + 1 < len) adv()
+            adv()
+          }
+          if (pos < len) adv()
+        } else { adv() }
+      }
+    } else {
+      value = value + character(adv())
+    }
+  }
+  if (pos < len) adv() // skip closing backtick
+  push(tokens, {
+    kind: "text", at: start,
+    from_row: start_row, from_column: start_col,
+    to_row: row, to_column: col,
+    value: value
+  })
+}
+
+// Scan a numeric literal starting at pos and push a "number" token whose
+// `value` is the raw text and whose `number` is number(raw). Handles 0x/0X
+// hex, 0b/0B binary and 0o/0O octal prefixes, and decimals with an optional
+// fraction and exponent. `_` is accepted among hex, binary and decimal
+// digits (integer and fraction parts).
+// NOTE(review): octal digits and exponent digits do not accept `_`, unlike
+// the other forms - confirm that is intended.
+function read_number() {
+  var start = pos
+  var start_row = row
+  var start_col = col
+  if (pk() == CP_0 && (pk_at(1) == CP_x || pk_at(1) == CP_X)) {
+    adv(); adv()
+    while (pos < len && (is_hex(pk()) || pk() == CP_UNDERSCORE)) adv()
+  } else if (pk() == CP_0 && (pk_at(1) == CP_b || pk_at(1) == CP_B)) {
+    adv(); adv()
+    while (pos < len && (pk() == CP_0 || pk() == CP_1 || pk() == CP_UNDERSCORE)) adv()
+  } else if (pk() == CP_0 && (pk_at(1) == CP_o || pk_at(1) == CP_O)) {
+    adv(); adv()
+    while (pos < len && pk() >= CP_0 && pk() <= CP_7) adv()
+  } else {
+    while (pos < len && (is_digit(pk()) || pk() == CP_UNDERSCORE)) adv()
+    if (pos < len && pk() == CP_DOT) {
+      adv()
+      while (pos < len && (is_digit(pk()) || pk() == CP_UNDERSCORE)) adv()
+    }
+    if (pos < len && (pk() == CP_e || pk() == CP_E)) {
+      adv()
+      if (pos < len && (pk() == CP_PLUS || pk() == CP_MINUS)) adv()
+      while (pos < len && is_digit(pk())) adv()
+    }
+  }
+  var raw = substr(start, pos)
+  push(tokens, {
+    kind: "number", at: start,
+    from_row: start_row, from_column: start_col,
+    to_row: row, to_column: col,
+    value: raw, number: number(raw)
+  })
+}
+
+// Scan an identifier starting at pos. If it is in `keywords`, push a token
+// whose kind is the keyword itself (no value); otherwise push a "name" token
+// carrying the identifier text.
+function read_name() {
+  var start = pos
+  var start_row = row
+  var start_col = col
+  while (pos < len && is_ident_char(pk())) adv()
+  var name = substr(start, pos)
+  var kw = keywords[name]
+  if (kw != null) {
+    push(tokens, {
+      kind: kw, at: start,
+      from_row: start_row, from_column: start_col,
+      to_row: row, to_column: col
+    })
+  } else {
+    push(tokens, {
+      kind: "name", at: start,
+      from_row: start_row, from_column: start_col,
+      to_row: row, to_column: col,
+      value: name
+    })
+  }
+}
+
+// Scan a comment starting at pos (the caller has seen `//` or `/*`) and push
+// a "comment" token whose value is the raw text including the delimiters.
+// A line comment stops before the LF or CR that ends it; a block comment
+// runs through the closing `*/`, or to end of input if it is never closed.
+function read_comment() {
+  var start = pos
+  var start_row = row
+  var start_col = col
+  if (pk_at(1) == CP_SLASH) {
+    while (pos < len && pk() != CP_LF && pk() != CP_CR) adv()
+  } else {
+    adv(); adv() // skip /*
+    while (pos < len) {
+      if (pk() == CP_STAR && pk_at(1) == CP_SLASH) {
+        adv(); adv()
+        break
+      }
+      adv()
+    }
+  }
+  var raw = substr(start, pos)
+  push(tokens, {
+    kind: "comment", at: start,
+    from_row: start_row, from_column: start_col,
+    to_row: row, to_column: col,
+    value: raw
+  })
+}
+
+// Consume `count` codepoints and push a token of the given kind with no
+// value. Used for operators and punctuation.
+function emit_op(kind, count) {
+  var start = pos
+  var start_row = row
+  var start_col = col
+  var i = 0
+  while (i < count) { adv(); i = i + 1 }
+  push(tokens, {
+    kind: kind, at: start,
+    from_row: start_row, from_column: start_col,
+    to_row: row, to_column: col
+  })
+}
+
+// Consume `count` codepoints and push them as a "name" token. Used for
+// operator spellings that end in `!` (for example `+!`), which are
+// tokenized as names rather than operators.
+function emit_ident(count) {
+  var start = pos
+  var start_row = row
+  var start_col = col
+  var val = ""
+  var i = 0
+  while (i < count) { val = val + character(adv()); i = i + 1 }
+  push(tokens, {
+    kind: "name", at: start,
+    from_row: start_row, from_column: start_col,
+    to_row: row, to_column: col,
+    value: val
+  })
+}
+
+// Scan exactly one token at pos and push it onto `tokens`. Returns false at
+// end of input (nothing pushed), true otherwise. Whitespace, newlines and
+// comments are emitted as tokens rather than skipped. Multi-character
+// operators are checked before their shorter prefixes.
+function tokenize_one() {
+  var c = pk()
+  var start = 0
+  var start_row = 0
+  var start_col = 0
+  var raw = ""
+  if (c == -1) return false
+
+  // Newline
+  if (c == CP_LF) {
+    start = pos
+    start_row = row
+    start_col = col
+    adv()
+    push(tokens, {
+      kind: "newline", at: start,
+      from_row: start_row, from_column: start_col,
+      to_row: row, to_column: col,
+      value: "\n"
+    })
+    return true
+  }
+
+  // CR or CRLF is reported as one newline token with value "\n".
+  // NOTE(review): adv() only bumps `row` on LF, so a lone CR does not start
+  // a new row - confirm that is intended.
+  if (c == CP_CR) {
+    start = pos
+    start_row = row
+    start_col = col
+    adv()
+    if (pos < len && pk() == CP_LF) adv()
+    push(tokens, {
+      kind: "newline", at: start,
+      from_row: start_row, from_column: start_col,
+      to_row: row, to_column: col,
+      value: "\n"
+    })
+    return true
+  }
+
+  // Whitespace
+  if (c == CP_SPACE || c == CP_TAB) {
+    start = pos
+    start_row = row
+    start_col = col
+    while (pos < len && (pk() == CP_SPACE || pk() == CP_TAB)) adv()
+    raw = substr(start, pos)
+    push(tokens, {
+      kind: "space", at: start,
+      from_row: start_row, from_column: start_col,
+      to_row: row, to_column: col,
+      value: raw
+    })
+    return true
+  }
+
+  // Strings
+  if (c == CP_SQUOTE || c == CP_DQUOTE) {
+    read_string(c)
+    return true
+  }
+
+  // Template
+  if (c == CP_BACKTICK) {
+    read_template()
+    return true
+  }
+
+  // Numbers
+  if (is_digit(c)) {
+    read_number()
+    return true
+  }
+  if (c == CP_DOT && is_digit(pk_at(1))) {
+    read_number()
+    return true
+  }
+
+  // Identifiers and keywords
+  if (is_ident_start(c)) {
+    read_name()
+    return true
+  }
+
+  // Comments and /
+  if (c == CP_SLASH) {
+    if (pk_at(1) == CP_SLASH || pk_at(1) == CP_STAR) {
+      read_comment()
+      return true
+    }
+    if (pk_at(1) == CP_EQ) { emit_op("/=", 2); return true }
+    if (pk_at(1) == CP_BANG) { emit_ident(2); return true }
+    emit_op("/", 1)
+    return true
+  }
+
+  // Operators
+  if (c == CP_STAR) {
+    if (pk_at(1) == CP_STAR) {
+      if (pk_at(2) == CP_BANG) { emit_ident(3); return true }
+      if (pk_at(2) == CP_EQ) { emit_op("**=", 3); return true }
+      emit_op("**", 2); return true
+    }
+    if (pk_at(1) == CP_EQ) { emit_op("*=", 2); return true }
+    if (pk_at(1) == CP_BANG) { emit_ident(2); return true }
+    emit_op("*", 1); return true
+  }
+
+  if (c == CP_PERCENT) {
+    if (pk_at(1) == CP_EQ) { emit_op("%=", 2); return true }
+    if (pk_at(1) == CP_BANG) { emit_ident(2); return true }
+    emit_op("%", 1); return true
+  }
+
+  if (c == CP_PLUS) {
+    if (pk_at(1) == CP_EQ) { emit_op("+=", 2); return true }
+    if (pk_at(1) == CP_PLUS) { emit_op("++", 2); return true }
+    if (pk_at(1) == CP_BANG) { emit_ident(2); return true }
+    emit_op("+", 1); return true
+  }
+
+  if (c == CP_MINUS) {
+    if (pk_at(1) == CP_EQ) { emit_op("-=", 2); return true }
+    if (pk_at(1) == CP_MINUS) { emit_op("--", 2); return true }
+    if (pk_at(1) == CP_BANG) { emit_ident(2); return true }
+    emit_op("-", 1); return true
+  }
+
+  if (c == CP_LT) {
+    if (pk_at(1) == CP_EQ && pk_at(2) == CP_BANG) { emit_ident(3); return true }
+    if (pk_at(1) == CP_EQ) { emit_op("<=", 2); return true }
+    if (pk_at(1) == CP_LT) {
+      if (pk_at(2) == CP_BANG) { emit_ident(3); return true }
+      if (pk_at(2) == CP_EQ) { emit_op("<<=", 3); return true }
+      emit_op("<<", 2); return true
+    }
+    if (pk_at(1) == CP_BANG) { emit_ident(2); return true }
+    emit_op("<", 1); return true
+  }
+
+  if (c == CP_GT) {
+    if (pk_at(1) == CP_EQ && pk_at(2) == CP_BANG) { emit_ident(3); return true }
+    if (pk_at(1) == CP_EQ) { emit_op(">=", 2); return true }
+    if (pk_at(1) == CP_GT) {
+      if (pk_at(2) == CP_GT) {
+        if (pk_at(3) == CP_BANG) { emit_ident(4); return true }
+        if (pk_at(3) == CP_EQ) { emit_op(">>>=", 4); return true }
+        emit_op(">>>", 3); return true
+      }
+      if (pk_at(2) == CP_BANG) { emit_ident(3); return true }
+      if (pk_at(2) == CP_EQ) { emit_op(">>=", 3); return true }
+      emit_op(">>", 2); return true
+    }
+    if (pk_at(1) == CP_BANG) { emit_ident(2); return true }
+    emit_op(">", 1); return true
+  }
+
+  if (c == CP_EQ) {
+    if (pk_at(1) == CP_EQ) {
+      if (pk_at(2) == CP_EQ) { emit_op("===", 3); return true }
+      emit_op("==", 2); return true
+    }
+    if (pk_at(1) == CP_GT) { emit_op("=>", 2); return true }
+    if (pk_at(1) == CP_BANG) { emit_ident(2); return true }
+    emit_op("=", 1); return true
+  }
+
+  if (c == CP_BANG) {
+    if (pk_at(1) == CP_EQ) {
+      if (pk_at(2) == CP_BANG) { emit_ident(3); return true }
+      if (pk_at(2) == CP_EQ) { emit_op("!==", 3); return true }
+      emit_op("!=", 2); return true
+    }
+    emit_op("!", 1); return true
+  }
+
+  if (c == CP_AMP) {
+    if (pk_at(1) == CP_AMP) {
+      if (pk_at(2) == CP_BANG) { emit_ident(3); return true }
+      if (pk_at(2) == CP_EQ) { emit_op("&&=", 3); return true }
+      emit_op("&&", 2); return true
+    }
+    if (pk_at(1) == CP_EQ) { emit_op("&=", 2); return true }
+    if (pk_at(1) == CP_BANG) { emit_ident(2); return true }
+    emit_op("&", 1); return true
+  }
+
+  if (c == CP_PIPE) {
+    if (pk_at(1) == CP_PIPE) {
+      if (pk_at(2) == CP_BANG) { emit_ident(3); return true }
+      if (pk_at(2) == CP_EQ) { emit_op("||=", 3); return true }
+      emit_op("||", 2); return true
+    }
+    if (pk_at(1) == CP_EQ) { emit_op("|=", 2); return true }
+    if (pk_at(1) == CP_BANG) { emit_ident(2); return true }
+    emit_op("|", 1); return true
+  }
+
+  if (c == CP_CARET) {
+    if (pk_at(1) == CP_EQ) { emit_op("^=", 2); return true }
+    if (pk_at(1) == CP_BANG) { emit_ident(2); return true }
+    emit_op("^", 1); return true
+  }
+
+  if (c == CP_LBRACKET) {
+    if (pk_at(1) == CP_RBRACKET && pk_at(2) == CP_BANG) { emit_ident(3); return true }
+    emit_op("[", 1); return true
+  }
+
+  if (c == CP_TILDE) {
+    if (pk_at(1) == CP_BANG) { emit_ident(2); return true }
+    emit_op("~", 1); return true
+  }
+
+  // Single character tokens
+  emit_op(character(c), 1)
+  return true
+}
+
+// Main loop
+while (pos < len) {
+  tokenize_one()
+}
+
+// EOF token
+push(tokens, {
+  kind: "eof", at: pos,
+  from_row: row, from_column: col,
+  to_row: row, to_column: col
+})
+
+print(json.encode({filename: filename, tokens: tokens}))