// Tokenizer: turns source text into a flat list of token records.
//
// tokenize(src, filename)
//   src      - the source text to scan
//   filename - carried through unchanged into the result
//
// Returns {filename, tokens, cp}:
//   cp     - array(array(src), codepoint); indexed by the same offsets as src
//            (assumes length(src) equals the number of entries in cp)
//   tokens - token records in source order, always ending with an "eof" token
//
// Every token has: kind, at (start offset), from_row/from_column (0-based start)
// and to_row/to_column (position just past the token). Tokens of kind "text",
// "number", "name", "comment", "space" and "newline" also carry value; "number"
// tokens additionally carry number (the result of number(raw)). Keywords and
// operators use their own spelling as kind and have no value field.
var tokenize = function(src, filename) {
  var len = length(src)
  var cp = array(array(src), codepoint)
  var pos = 0
  var row = 0
  var col = 0
  var tokens = []

  // Codepoint constants
  def CP_LF = 10
  def CP_CR = 13
  def CP_TAB = 9
  def CP_SPACE = 32
  def CP_BANG = 33
  def CP_DQUOTE = 34
  def CP_HASH = 35
  def CP_DOLLAR = 36
  def CP_PERCENT = 37
  def CP_AMP = 38
  def CP_SQUOTE = 39
  def CP_LPAREN = 40
  def CP_RPAREN = 41
  def CP_STAR = 42
  def CP_PLUS = 43
  def CP_COMMA = 44
  def CP_MINUS = 45
  def CP_DOT = 46
  def CP_SLASH = 47
  def CP_0 = 48
  def CP_1 = 49
  def CP_7 = 55
  def CP_9 = 57
  def CP_COLON = 58
  def CP_SEMI = 59
  def CP_LT = 60
  def CP_EQ = 61
  def CP_GT = 62
  def CP_QMARK = 63
  def CP_AT = 64
  def CP_A = 65
  def CP_B = 66
  def CP_E = 69
  def CP_F = 70
  def CP_O = 79
  def CP_X = 88
  def CP_Z = 90
  def CP_LBRACKET = 91
  def CP_BSLASH = 92
  def CP_RBRACKET = 93
  def CP_CARET = 94
  def CP_UNDERSCORE = 95
  def CP_BACKTICK = 96
  def CP_a = 97
  def CP_b = 98
  def CP_e = 101
  def CP_f = 102
  def CP_n = 110
  def CP_o = 111
  def CP_r = 114
  def CP_t = 116
  def CP_u = 117
  def CP_x = 120
  def CP_z = 122
  def CP_LBRACE = 123
  def CP_PIPE = 124
  def CP_RBRACE = 125
  def CP_TILDE = 126

  // Keywords lookup: a scanned name found here is emitted with the keyword as its kind.
  var keywords = {
    if: "if", in: "in", do: "do", go: "go",
    var: "var", def: "def", for: "for",
    else: "else", this: "this", null: "null", true: "true",
    false: "false", while: "while", break: "break",
    return: "return", delete: "delete",
    disrupt: "disrupt", function: "function", continue: "continue",
    disruption: "disruption"
  }

  // Current codepoint, or -1 at end of input.
  var pk = function() {
    if (pos >= len) return -1
    return cp[pos]
  }

  // Codepoint n positions ahead of the current one, or -1 past end of input.
  var pk_at = function(n) {
    var idx = pos + n
    if (idx >= len) return -1
    return cp[idx]
  }

  // Consume the current codepoint and return it, keeping row/col in step.
  // LF ends a line. A CR ends a line only when it is not immediately followed
  // by LF, so a CRLF pair advances the row once and CR-only input still
  // advances rows (previously a lone CR never did).
  // Callers must ensure pos < len before calling.
  var adv = function() {
    var c = cp[pos]
    pos = pos + 1
    if (c == CP_LF || (c == CP_CR && pk() != CP_LF)) {
      row = row + 1
      col = 0
    } else {
      col = col + 1
    }
    return c
  }

  var is_digit = function(c) {
    return c >= CP_0 && c <= CP_9
  }

  var is_hex = function(c) {
    return (c >= CP_0 && c <= CP_9) || (c >= CP_a && c <= CP_f) || (c >= CP_A && c <= CP_F)
  }

  // Numeric value of a hex digit codepoint; 0 for anything that is not a hex digit.
  var hex_val = function(c) {
    if (c >= CP_0 && c <= CP_9) return c - CP_0
    if (c >= CP_a && c <= CP_f) return c - CP_a + 10
    if (c >= CP_A && c <= CP_F) return c - CP_A + 10
    return 0
  }

  // Consume up to four hex digits following "\u" and return character() of the
  // accumulated value. Fewer than four digits are accepted as-is.
  var read_unicode_escape = function() {
    var cp_val = 0
    var hi = 0
    while (hi < 4 && pos < len && is_hex(pk())) {
      cp_val = cp_val * 16 + hex_val(adv())
      hi = hi + 1
    }
    return character(cp_val)
  }

  var is_alpha = function(c) {
    return (c >= CP_a && c <= CP_z) || (c >= CP_A && c <= CP_Z)
  }

  var is_alnum = function(c) {
    return is_alpha(c) || is_digit(c)
  }

  // Names start with a letter, "_" or "$".
  var is_ident_start = function(c) {
    return is_alpha(c) || c == CP_UNDERSCORE || c == CP_DOLLAR
  }

  // After the first character a name may also contain digits, "?" and "!".
  var is_ident_char = function(c) {
    return is_alnum(c) || c == CP_UNDERSCORE || c == CP_DOLLAR || c == CP_QMARK || c == CP_BANG
  }

  var substr = function(start, end) {
    return text(src, start, end)
  }

  // Scan a quoted string starting at the opening quote (quote_cp is that quote's
  // codepoint) and emit a "text" token whose value has escapes decoded.
  // An unterminated string runs to end of input.
  var read_string = function(quote_cp) {
    var start = pos
    var start_row = row
    var start_col = col
    var parts = []
    var run_start = 0
    var esc = 0
    adv() // skip opening quote
    run_start = pos
    while (pos < len && pk() != quote_cp) {
      // Only treat "\" as an escape when a character follows it. A backslash
      // that is the very last codepoint of the input is kept as literal text
      // instead of reading past the end of cp (same guard as read_template).
      if (pk() == CP_BSLASH && pos + 1 < len) {
        // Flush the literal run before the escape.
        if (pos > run_start) push(parts, text(src, run_start, pos))
        adv()
        esc = adv()
        if (esc == CP_n) {
          push(parts, "\n")
        } else if (esc == CP_t) {
          push(parts, "\t")
        } else if (esc == CP_r) {
          push(parts, "\r")
        } else if (esc == CP_BSLASH) {
          push(parts, "\\")
        } else if (esc == CP_SQUOTE) {
          push(parts, "'")
        } else if (esc == CP_DQUOTE) {
          push(parts, "\"")
        } else if (esc == CP_0) {
          push(parts, character(0))
        } else if (esc == CP_BACKTICK) {
          push(parts, "`")
        } else if (esc == CP_u) {
          push(parts, read_unicode_escape())
        } else {
          // Unknown escape: keep the escaped character itself.
          push(parts, character(esc))
        }
        run_start = pos
      } else {
        adv()
      }
    }
    if (pos > run_start) push(parts, text(src, run_start, pos))
    if (pos < len) adv() // skip closing quote
    push(tokens, {
      kind: "text", at: start,
      from_row: start_row, from_column: start_col,
      to_row: row, to_column: col,
      value: text(parts)
    })
  }

  // Scan a backtick template and emit a "text" token. Unlike read_string, the
  // value keeps escapes and ${...} interpolations as raw source text; this
  // function only finds where the template ends.
  var read_template = function() {
    var start = pos
    var start_row = row
    var start_col = col
    var parts = []
    var run_start = 0
    var depth = 0
    var tc = 0
    var q = 0
    var interp_start = 0
    adv() // skip opening backtick
    run_start = pos
    while (pos < len && pk() != CP_BACKTICK) {
      if (pk() == CP_BSLASH && pos + 1 < len) {
        // Copy the backslash and the escaped character verbatim.
        if (pos > run_start) push(parts, text(src, run_start, pos))
        push(parts, text(src, pos, pos + 2))
        adv()
        adv()
        run_start = pos
      } else if (pk() == CP_DOLLAR && pos + 1 < len && pk_at(1) == CP_LBRACE) {
        if (pos > run_start) push(parts, text(src, run_start, pos))
        interp_start = pos
        adv()
        adv() // $ {
        depth = 1
        // Skip to the matching "}", tracking nested braces and stepping over
        // quoted sections so braces inside them are not counted.
        while (pos < len && depth > 0) {
          tc = pk()
          if (tc == CP_LBRACE) {
            depth = depth + 1
            adv()
          } else if (tc == CP_RBRACE) {
            depth = depth - 1
            adv()
          } else if (tc == CP_SQUOTE || tc == CP_DQUOTE || tc == CP_BACKTICK) {
            q = adv()
            while (pos < len && pk() != q) {
              if (pk() == CP_BSLASH && pos + 1 < len) adv()
              adv()
            }
            if (pos < len) adv()
          } else {
            adv()
          }
        }
        push(parts, text(src, interp_start, pos))
        run_start = pos
      } else {
        adv()
      }
    }
    if (pos > run_start) push(parts, text(src, run_start, pos))
    if (pos < len) adv() // skip closing backtick
    push(tokens, {
      kind: "text", at: start,
      from_row: start_row, from_column: start_col,
      to_row: row, to_column: col,
      value: text(parts)
    })
  }

  // Scan a numeric literal and emit a "number" token. value is the raw source
  // text and number is number(raw). Accepts 0x/0b/0o prefixed integers and
  // decimals with an optional fraction and exponent. "_" is accepted between
  // digits in every radix.
  var read_number = function() {
    var start = pos
    var start_row = row
    var start_col = col
    var raw = ""
    if (pk() == CP_0 && (pk_at(1) == CP_x || pk_at(1) == CP_X)) {
      adv()
      adv()
      while (pos < len && (is_hex(pk()) || pk() == CP_UNDERSCORE)) adv()
    } else if (pk() == CP_0 && (pk_at(1) == CP_b || pk_at(1) == CP_B)) {
      adv()
      adv()
      while (pos < len && (pk() == CP_0 || pk() == CP_1 || pk() == CP_UNDERSCORE)) adv()
    } else if (pk() == CP_0 && (pk_at(1) == CP_o || pk_at(1) == CP_O)) {
      adv()
      adv()
      // "_" is allowed here too, matching the hex, binary and decimal branches.
      while (pos < len && ((pk() >= CP_0 && pk() <= CP_7) || pk() == CP_UNDERSCORE)) adv()
    } else {
      while (pos < len && (is_digit(pk()) || pk() == CP_UNDERSCORE)) adv()
      if (pos < len && pk() == CP_DOT) {
        adv()
        while (pos < len && (is_digit(pk()) || pk() == CP_UNDERSCORE)) adv()
      }
      if (pos < len && (pk() == CP_e || pk() == CP_E)) {
        adv()
        if (pos < len && (pk() == CP_PLUS || pk() == CP_MINUS)) adv()
        while (pos < len && is_digit(pk())) adv()
      }
    }
    raw = substr(start, pos)
    push(tokens, {
      kind: "number", at: start,
      from_row: start_row, from_column: start_col,
      to_row: row, to_column: col,
      value: raw, number: number(raw)
    })
  }

  // Scan an identifier. A keyword is emitted with the keyword as kind and no
  // value; anything else is emitted as a "name" token.
  var read_name = function() {
    var start = pos
    var start_row = row
    var start_col = col
    var name = ""
    var kw = null
    while (pos < len && is_ident_char(pk())) adv()
    name = substr(start, pos)
    kw = keywords[name]
    if (kw != null) {
      push(tokens, {
        kind: kw, at: start,
        from_row: start_row, from_column: start_col,
        to_row: row, to_column: col
      })
    } else {
      push(tokens, {
        kind: "name", at: start,
        from_row: start_row, from_column: start_col,
        to_row: row, to_column: col,
        value: name
      })
    }
  }

  // Scan a "//" line comment (up to, not including, the line ending) or a
  // "/* ... */" block comment (to end of input if unterminated) and emit a
  // "comment" token whose value includes the delimiters.
  var read_comment = function() {
    var start = pos
    var start_row = row
    var start_col = col
    var raw = ""
    if (pk_at(1) == CP_SLASH) {
      while (pos < len && pk() != CP_LF && pk() != CP_CR) adv()
    } else {
      adv()
      adv() // skip /*
      while (pos < len) {
        if (pk() == CP_STAR && pk_at(1) == CP_SLASH) {
          adv()
          adv()
          break
        }
        adv()
      }
    }
    raw = substr(start, pos)
    push(tokens, {
      kind: "comment", at: start,
      from_row: start_row, from_column: start_col,
      to_row: row, to_column: col,
      value: raw
    })
  }

  // Consume count codepoints and emit a token of the given kind (no value).
  var emit_op = function(kind, count) {
    var start = pos
    var start_row = row
    var start_col = col
    var i = 0
    while (i < count) {
      adv()
      i = i + 1
    }
    push(tokens, {
      kind: kind, at: start,
      from_row: start_row, from_column: start_col,
      to_row: row, to_column: col
    })
  }

  // Consume count codepoints and emit them as a "name" token whose value is the
  // raw text. Used for operator spellings followed by "!" (for example "+!").
  var emit_ident = function(count) {
    var start = pos
    var start_row = row
    var start_col = col
    var i = 0
    while (i < count) {
      adv()
      i = i + 1
    }
    push(tokens, {
      kind: "name", at: start,
      from_row: start_row, from_column: start_col,
      to_row: row, to_column: col,
      value: text(src, start, pos)
    })
  }

  // Scan exactly one token at the current position and append it to tokens.
  // Returns false only at end of input, true otherwise. Longer operator
  // spellings are tested before their prefixes.
  var tokenize_one = function() {
    var c = pk()
    var start = 0
    var start_row = 0
    var start_col = 0
    var raw = ""
    if (c == -1) return false

    if (c == CP_LF) {
      start = pos
      start_row = row
      start_col = col
      adv()
      push(tokens, {
        kind: "newline", at: start,
        from_row: start_row, from_column: start_col,
        to_row: row, to_column: col,
        value: "\n"
      })
      return true
    }

    if (c == CP_CR) {
      // CR or CRLF: one "newline" token, value normalized to "\n".
      start = pos
      start_row = row
      start_col = col
      adv()
      if (pos < len && pk() == CP_LF) adv()
      push(tokens, {
        kind: "newline", at: start,
        from_row: start_row, from_column: start_col,
        to_row: row, to_column: col,
        value: "\n"
      })
      return true
    }

    if (c == CP_SPACE || c == CP_TAB) {
      // A run of spaces and tabs becomes a single "space" token.
      start = pos
      start_row = row
      start_col = col
      while (pos < len && (pk() == CP_SPACE || pk() == CP_TAB)) adv()
      raw = substr(start, pos)
      push(tokens, {
        kind: "space", at: start,
        from_row: start_row, from_column: start_col,
        to_row: row, to_column: col,
        value: raw
      })
      return true
    }

    if (c == CP_SQUOTE || c == CP_DQUOTE) { read_string(c); return true }
    if (c == CP_BACKTICK) { read_template(); return true }
    if (is_digit(c)) { read_number(); return true }
    // A leading "." starts a number only when a digit follows (".5").
    if (c == CP_DOT && is_digit(pk_at(1))) { read_number(); return true }
    if (is_ident_start(c)) { read_name(); return true }

    if (c == CP_SLASH) {
      if (pk_at(1) == CP_SLASH || pk_at(1) == CP_STAR) { read_comment(); return true }
      if (pk_at(1) == CP_EQ) { emit_op("/=", 2); return true }
      if (pk_at(1) == CP_BANG) { emit_ident(2); return true }
      emit_op("/", 1); return true
    }

    if (c == CP_STAR) {
      if (pk_at(1) == CP_STAR) {
        if (pk_at(2) == CP_BANG) { emit_ident(3); return true }
        if (pk_at(2) == CP_EQ) { emit_op("**=", 3); return true }
        emit_op("**", 2); return true
      }
      if (pk_at(1) == CP_EQ) { emit_op("*=", 2); return true }
      if (pk_at(1) == CP_BANG) { emit_ident(2); return true }
      emit_op("*", 1); return true
    }

    if (c == CP_PERCENT) {
      if (pk_at(1) == CP_EQ) { emit_op("%=", 2); return true }
      if (pk_at(1) == CP_BANG) { emit_ident(2); return true }
      emit_op("%", 1); return true
    }

    if (c == CP_PLUS) {
      if (pk_at(1) == CP_EQ) { emit_op("+=", 2); return true }
      if (pk_at(1) == CP_PLUS) { emit_op("++", 2); return true }
      if (pk_at(1) == CP_BANG) { emit_ident(2); return true }
      emit_op("+", 1); return true
    }

    if (c == CP_MINUS) {
      if (pk_at(1) == CP_EQ) { emit_op("-=", 2); return true }
      if (pk_at(1) == CP_MINUS) { emit_op("--", 2); return true }
      if (pk_at(1) == CP_BANG) { emit_ident(2); return true }
      emit_op("-", 1); return true
    }

    if (c == CP_LT) {
      if (pk_at(1) == CP_EQ && pk_at(2) == CP_BANG) { emit_ident(3); return true }
      if (pk_at(1) == CP_EQ) { emit_op("<=", 2); return true }
      if (pk_at(1) == CP_LT) {
        if (pk_at(2) == CP_BANG) { emit_ident(3); return true }
        if (pk_at(2) == CP_EQ) { emit_op("<<=", 3); return true }
        emit_op("<<", 2); return true
      }
      if (pk_at(1) == CP_BANG) { emit_ident(2); return true }
      emit_op("<", 1); return true
    }

    if (c == CP_GT) {
      if (pk_at(1) == CP_EQ && pk_at(2) == CP_BANG) { emit_ident(3); return true }
      if (pk_at(1) == CP_EQ) { emit_op(">=", 2); return true }
      if (pk_at(1) == CP_GT) {
        if (pk_at(2) == CP_GT) {
          if (pk_at(3) == CP_BANG) { emit_ident(4); return true }
          if (pk_at(3) == CP_EQ) { emit_op(">>>=", 4); return true }
          emit_op(">>>", 3); return true
        }
        if (pk_at(2) == CP_BANG) { emit_ident(3); return true }
        if (pk_at(2) == CP_EQ) { emit_op(">>=", 3); return true }
        emit_op(">>", 2); return true
      }
      if (pk_at(1) == CP_BANG) { emit_ident(2); return true }
      emit_op(">", 1); return true
    }

    if (c == CP_EQ) {
      if (pk_at(1) == CP_EQ) {
        if (pk_at(2) == CP_EQ) { emit_op("===", 3); return true }
        emit_op("==", 2); return true
      }
      if (pk_at(1) == CP_GT) { emit_op("=>", 2); return true }
      if (pk_at(1) == CP_BANG) { emit_ident(2); return true }
      emit_op("=", 1); return true
    }

    if (c == CP_BANG) {
      if (pk_at(1) == CP_EQ) {
        if (pk_at(2) == CP_BANG) { emit_ident(3); return true }
        if (pk_at(2) == CP_EQ) { emit_op("!==", 3); return true }
        emit_op("!=", 2); return true
      }
      emit_op("!", 1); return true
    }

    if (c == CP_AMP) {
      if (pk_at(1) == CP_AMP) {
        if (pk_at(2) == CP_BANG) { emit_ident(3); return true }
        if (pk_at(2) == CP_EQ) { emit_op("&&=", 3); return true }
        emit_op("&&", 2); return true
      }
      if (pk_at(1) == CP_EQ) { emit_op("&=", 2); return true }
      if (pk_at(1) == CP_BANG) { emit_ident(2); return true }
      emit_op("&", 1); return true
    }

    if (c == CP_PIPE) {
      if (pk_at(1) == CP_PIPE) {
        if (pk_at(2) == CP_BANG) { emit_ident(3); return true }
        if (pk_at(2) == CP_EQ) { emit_op("||=", 3); return true }
        emit_op("||", 2); return true
      }
      if (pk_at(1) == CP_EQ) { emit_op("|=", 2); return true }
      if (pk_at(1) == CP_BANG) { emit_ident(2); return true }
      emit_op("|", 1); return true
    }

    if (c == CP_CARET) {
      if (pk_at(1) == CP_EQ) { emit_op("^=", 2); return true }
      if (pk_at(1) == CP_BANG) { emit_ident(2); return true }
      emit_op("^", 1); return true
    }

    if (c == CP_LBRACKET) {
      // "[]!" is scanned as a single name.
      if (pk_at(1) == CP_RBRACKET && pk_at(2) == CP_BANG) { emit_ident(3); return true }
      emit_op("[", 1); return true
    }

    if (c == CP_TILDE) {
      if (pk_at(1) == CP_BANG) { emit_ident(2); return true }
      emit_op("~", 1); return true
    }

    // Anything else becomes a one-character token whose kind is that character.
    emit_op(character(c), 1)
    return true
  }

  // Main loop
  while (pos < len) {
    tokenize_one()
  }

  // EOF token
  push(tokens, {
    kind: "eof", at: pos,
    from_row: row, from_column: col,
    to_row: row, to_column: col
  })

  return {filename: filename, tokens: tokens, cp: cp}
}

return tokenize