// tokenize(src, filename) -> {filename, tokens}
//
// Converts the source text `src` into a flat array of token records and
// returns it together with `filename` (which is passed through untouched).
//
// Every token record carries:
//   kind                    token type: "text", "number", "name", "comment",
//                           "newline", "space", "eof", a keyword spelling,
//                           or an operator/punctuator spelling
//   at                      offset of the token's first character in `src`
//   from_row / from_column  zero-based position of the first character
//   to_row / to_column      zero-based position just past the last character
//   value                   present on text/number/name/comment/newline/space
//   number                  present on "number" tokens only
//
// Whitespace, newlines and comments are emitted as tokens, not discarded.
// The list always ends with a single "eof" token.
var tokenize = function(src, filename) {
  var len = length(src)
  var pos = 0   // offset of the next unread character
  var row = 0   // zero-based line of `pos`
  var col = 0   // zero-based column of `pos`
  var tokens = []

  // Keywords lookup: an identifier found here is emitted with the keyword
  // itself as its `kind` (and no `value`) instead of as a "name" token.
  var keywords = {
    if: "if", in: "in", do: "do", go: "go",
    var: "var", def: "def", for: "for",
    else: "else", this: "this", null: "null", true: "true",
    false: "false", while: "while", break: "break",
    return: "return", delete: "delete",
    disrupt: "disrupt", function: "function", continue: "continue",
    disruption: "disruption"
  }

  // Single-character escapes recognised after a backslash in quoted strings.
  // Any other escaped character (except "u") stands for itself.
  var escape_map = {
    n: "\n", t: "\t", r: "\r", "\\": "\\",
    "'": "'", "\"": "\"", "`": "`",
    "0": character(0)
  }

  // Current character, or null at end of input.
  var pk = function() {
    if (pos >= len) return null
    return src[pos]
  }

  // Character `n` positions ahead of the current one, or null past the end.
  var pk_at = function(n) {
    var idx = pos + n
    if (idx >= len) return null
    return src[idx]
  }

  // Consumes and returns the current character, keeping row/col in sync.
  // A line break is "\n", or a "\r" that is NOT followed by "\n": in a
  // "\r\n" pair only the "\n" advances the row, so the pair counts once.
  // Callers must only call this while pos < len.
  var adv = function() {
    var c = src[pos]
    pos = pos + 1
    // pk() now looks at the character after `c` (null at end of input).
    if (c == "\n" || (c == "\r" && pk() != "\n")) {
      row = row + 1
      col = 0
    } else {
      col = col + 1
    }
    return c
  }

  var is_digit = function(c) {
    return c >= "0" && c <= "9"
  }

  var is_hex = function(c) {
    return (c >= "0" && c <= "9") || (c >= "a" && c <= "f") || (c >= "A" && c <= "F")
  }

  // Numeric value (0..15) of a hex digit; 0 for anything else.
  var hex_val = function(c) {
    if (c >= "0" && c <= "9") return codepoint(c) - codepoint("0")
    if (c >= "a" && c <= "f") return codepoint(c) - codepoint("a") + 10
    if (c >= "A" && c <= "F") return codepoint(c) - codepoint("A") + 10
    return 0
  }

  // Reads up to four hex digits following "\u" and returns the character
  // with that code point.
  // NOTE(review): fewer than four digits are accepted silently, and zero
  // digits yields character(0) - confirm this leniency is intended.
  var read_unicode_escape = function() {
    var cp_val = 0
    var hi = 0
    while (hi < 4 && pos < len && is_hex(pk())) {
      cp_val = cp_val * 16 + hex_val(adv())
      hi = hi + 1
    }
    return character(cp_val)
  }

  var is_alpha = function(c) {
    return (c >= "a" && c <= "z") || (c >= "A" && c <= "Z")
  }

  var is_alnum = function(c) {
    return is_alpha(c) || is_digit(c)
  }

  var is_ident_start = function(c) {
    return is_alpha(c) || c == "_" || c == "$"
  }

  // Identifier continuation characters; note that "?" and "!" are allowed
  // after the first character.
  var is_ident_char = function(c) {
    return is_alnum(c) || c == "_" || c == "$" || c == "?" || c == "!"
  }

  // Slice of the source between two offsets.
  var substr = function(start, end) {
    return text(src, start, end)
  }

  // Reads a '...' or "..." string starting at the opening quote and emits a
  // "text" token whose value has escape sequences decoded. An unterminated
  // string simply runs to end of input.
  var read_string = function(quote) {
    var start = pos
    var start_row = row
    var start_col = col
    var parts = []      // decoded pieces, joined at the end
    var run_start = 0   // start of the current run of unescaped characters
    var esc = null
    var esc_val = null
    adv() // skip opening quote
    run_start = pos
    while (pos < len && pk() != quote) {
      // Only treat the backslash as an escape when a character follows it.
      // A lone backslash at end of input is kept as literal text instead of
      // reading past the end of `src` (same guard as read_template).
      if (pk() == "\\" && pos + 1 < len) {
        if (pos > run_start) parts[] = text(src, run_start, pos)
        adv()
        esc = adv()
        esc_val = escape_map[esc]
        if (esc_val != null) {
          parts[] = esc_val
        } else if (esc == "u") {
          parts[] = read_unicode_escape()
        } else {
          // Unknown escape: the escaped character stands for itself.
          parts[] = esc
        }
        run_start = pos
      } else {
        adv()
      }
    }
    if (pos > run_start) parts[] = text(src, run_start, pos)
    if (pos < len) adv() // skip closing quote
    tokens[] = {
      kind: "text", at: start,
      from_row: start_row, from_column: start_col,
      to_row: row, to_column: col,
      value: text(parts)
    }
  }

  // Reads a `...` template starting at the opening backtick and emits a
  // "text" token. Unlike read_string, nothing is decoded here: escape
  // sequences and ${...} interpolations are copied into the value as raw
  // source text.
  var read_template = function() {
    var start = pos
    var start_row = row
    var start_col = col
    var parts = []
    var run_start = 0
    var depth = 0          // brace nesting inside the current ${...}
    var tc = null
    var q = null           // quote character of a string nested in ${...}
    var interp_start = 0
    adv() // skip opening backtick
    run_start = pos
    while (pos < len && pk() != "`") {
      if (pk() == "\\" && pos + 1 < len) {
        // Keep the backslash and the escaped character verbatim.
        if (pos > run_start) parts[] = text(src, run_start, pos)
        parts[] = text(src, pos, pos + 2)
        adv(); adv()
        run_start = pos
      } else if (pk() == "$" && pos + 1 < len && pk_at(1) == "{") {
        if (pos > run_start) parts[] = text(src, run_start, pos)
        interp_start = pos
        adv(); adv() // $ {
        depth = 1
        // Skip to the matching "}", stepping over nested braces and over
        // quoted strings so that braces inside them are not counted.
        while (pos < len && depth > 0) {
          tc = pk()
          if (tc == "{") {
            depth = depth + 1; adv()
          } else if (tc == "}") {
            depth = depth - 1
            adv()
          } else if (tc == "'" || tc == "\"" || tc == "`") {
            q = adv()
            while (pos < len && pk() != q) {
              // Step over the backslash so an escaped quote does not end
              // the nested string.
              if (pk() == "\\" && pos + 1 < len) adv()
              adv()
            }
            if (pos < len) adv()
          } else {
            adv()
          }
        }
        parts[] = text(src, interp_start, pos)
        run_start = pos
      } else {
        adv()
      }
    }
    if (pos > run_start) parts[] = text(src, run_start, pos)
    if (pos < len) adv() // skip closing backtick
    tokens[] = {
      kind: "text", at: start,
      from_row: start_row, from_column: start_col,
      to_row: row, to_column: col,
      value: text(parts)
    }
  }

  // Reads a numeric literal and emits a "number" token. `value` is the raw
  // source spelling (prefix and "_" separators included); `number` is the
  // result of passing that raw spelling to number().
  var read_number = function() {
    var start = pos
    var start_row = row
    var start_col = col
    var raw = ""
    if (pk() == "0" && (pk_at(1) == "x" || pk_at(1) == "X")) {
      // Hexadecimal: 0x / 0X
      adv(); adv()
      while (pos < len && (is_hex(pk()) || pk() == "_")) adv()
    } else if (pk() == "0" && (pk_at(1) == "b" || pk_at(1) == "B")) {
      // Binary: 0b / 0B
      adv(); adv()
      while (pos < len && (pk() == "0" || pk() == "1" || pk() == "_")) adv()
    } else if (pk() == "0" && (pk_at(1) == "o" || pk_at(1) == "O")) {
      // Octal: 0o / 0O
      // NOTE(review): unlike the other branches, "_" separators are not
      // accepted here - confirm whether that is intentional.
      adv(); adv()
      while (pos < len && pk() >= "0" && pk() <= "7") adv()
    } else {
      // Decimal: digits, optional fraction, optional exponent.
      while (pos < len && (is_digit(pk()) || pk() == "_")) adv()
      if (pos < len && pk() == ".") {
        adv()
        while (pos < len && (is_digit(pk()) || pk() == "_")) adv()
      }
      if (pos < len && (pk() == "e" || pk() == "E")) {
        adv()
        if (pos < len && (pk() == "+" || pk() == "-")) adv()
        while (pos < len && is_digit(pk())) adv()
      }
    }
    raw = substr(start, pos)
    tokens[] = {
      kind: "number", at: start,
      from_row: start_row, from_column: start_col,
      to_row: row, to_column: col,
      value: raw, number: number(raw)
    }
  }

  // Reads an identifier and emits either a keyword token (kind = the
  // keyword, no value) or a "name" token (value = the identifier).
  var read_name = function() {
    var start = pos
    var start_row = row
    var start_col = col
    var name = ""
    var kw = null
    while (pos < len && is_ident_char(pk())) adv()
    name = substr(start, pos)
    kw = keywords[name]
    if (kw != null) {
      tokens[] = {
        kind: kw, at: start,
        from_row: start_row, from_column: start_col,
        to_row: row, to_column: col
      }
    } else {
      tokens[] = {
        kind: "name", at: start,
        from_row: start_row, from_column: start_col,
        to_row: row, to_column: col,
        value: name
      }
    }
  }

  // Reads a "//" line comment (up to, not including, the line break) or a
  // "/* ... */" block comment and emits a "comment" token whose value is
  // the raw text including the delimiters. An unterminated block comment
  // runs to end of input.
  var read_comment = function() {
    var start = pos
    var start_row = row
    var start_col = col
    var raw = ""
    if (pk_at(1) == "/") {
      while (pos < len && pk() != "\n" && pk() != "\r") adv()
    } else {
      adv(); adv() // skip /*
      while (pos < len) {
        if (pk() == "*" && pk_at(1) == "/") {
          adv(); adv()
          break
        }
        adv()
      }
    }
    raw = substr(start, pos)
    tokens[] = {
      kind: "comment", at: start,
      from_row: start_row, from_column: start_col,
      to_row: row, to_column: col,
      value: raw
    }
  }

  // Consumes `count` characters and emits them as one token of the given
  // kind (used for operators and punctuation; no value field).
  var emit_op = function(kind, count) {
    var start = pos
    var start_row = row
    var start_col = col
    var i = 0
    while (i < count) { adv(); i = i + 1 }
    tokens[] = {
      kind: kind, at: start,
      from_row: start_row, from_column: start_col,
      to_row: row, to_column: col
    }
  }

  // Consumes `count` characters and emits them as a "name" token whose
  // value is the consumed text. Used for operator spellings that are
  // immediately followed by "!" (e.g. "+!", "<=!", "[]!").
  // NOTE(review): presumably these name the operator as a function value -
  // confirm against the parser.
  var emit_ident = function(count) {
    var start = pos
    var start_row = row
    var start_col = col
    var i = 0
    while (i < count) { adv(); i = i + 1 }
    tokens[] = {
      kind: "name", at: start,
      from_row: start_row, from_column: start_col,
      to_row: row, to_column: col,
      value: text(src, start, pos)
    }
  }

  // Scans exactly one token at `pos` and appends it to `tokens`.
  // Returns false only at end of input, true otherwise.
  var tokenize_one = function() {
    var c = pk()
    var start = 0
    var start_row = 0
    var start_col = 0
    var raw = ""
    if (c == null) return false

    if (c == "\n") {
      start = pos; start_row = row; start_col = col
      adv()
      tokens[] = {
        kind: "newline", at: start,
        from_row: start_row, from_column: start_col,
        to_row: row, to_column: col,
        value: "\n"
      }
      return true
    }

    // "\r" and "\r\n" both become a single newline token, with the value
    // normalised to "\n".
    if (c == "\r") {
      start = pos; start_row = row; start_col = col
      adv()
      if (pos < len && pk() == "\n") adv()
      tokens[] = {
        kind: "newline", at: start,
        from_row: start_row, from_column: start_col,
        to_row: row, to_column: col,
        value: "\n"
      }
      return true
    }

    // A run of spaces and tabs becomes one "space" token.
    if (c == " " || c == "\t") {
      start = pos; start_row = row; start_col = col
      while (pos < len && (pk() == " " || pk() == "\t")) adv()
      raw = substr(start, pos)
      tokens[] = {
        kind: "space", at: start,
        from_row: start_row, from_column: start_col,
        to_row: row, to_column: col,
        value: raw
      }
      return true
    }

    if (c == "'" || c == "\"") { read_string(c); return true }
    if (c == "`") { read_template(); return true }
    if (is_digit(c)) { read_number(); return true }
    // A leading "." followed by a digit starts a number such as ".5".
    if (c == "." && is_digit(pk_at(1))) { read_number(); return true }
    if (is_ident_start(c)) { read_name(); return true }

    // Operators. Within each group the order of the checks decides which
    // spelling wins, so longer spellings are tested before shorter ones.
    if (c == "/") {
      if (pk_at(1) == "/" || pk_at(1) == "*") { read_comment(); return true }
      if (pk_at(1) == "=") { emit_op("/=", 2); return true }
      if (pk_at(1) == "!") { emit_ident(2); return true }
      emit_op("/", 1); return true
    }
    if (c == "*") {
      if (pk_at(1) == "*") {
        if (pk_at(2) == "!") { emit_ident(3); return true }
        if (pk_at(2) == "=") { emit_op("**=", 3); return true }
        emit_op("**", 2); return true
      }
      if (pk_at(1) == "=") { emit_op("*=", 2); return true }
      if (pk_at(1) == "!") { emit_ident(2); return true }
      emit_op("*", 1); return true
    }
    if (c == "%") {
      if (pk_at(1) == "=") { emit_op("%=", 2); return true }
      if (pk_at(1) == "!") { emit_ident(2); return true }
      emit_op("%", 1); return true
    }
    if (c == "+") {
      if (pk_at(1) == "=") { emit_op("+=", 2); return true }
      if (pk_at(1) == "+") { emit_op("++", 2); return true }
      if (pk_at(1) == "!") { emit_ident(2); return true }
      emit_op("+", 1); return true
    }
    if (c == "-") {
      if (pk_at(1) == "=") { emit_op("-=", 2); return true }
      if (pk_at(1) == "-") { emit_op("--", 2); return true }
      if (pk_at(1) == "!") { emit_ident(2); return true }
      emit_op("-", 1); return true
    }
    if (c == "<") {
      if (pk_at(1) == "=" && pk_at(2) == "!") { emit_ident(3); return true }
      if (pk_at(1) == "=") { emit_op("<=", 2); return true }
      if (pk_at(1) == "<") {
        if (pk_at(2) == "!") { emit_ident(3); return true }
        if (pk_at(2) == "=") { emit_op("<<=", 3); return true }
        emit_op("<<", 2); return true
      }
      if (pk_at(1) == "!") { emit_ident(2); return true }
      emit_op("<", 1); return true
    }
    if (c == ">") {
      if (pk_at(1) == "=" && pk_at(2) == "!") { emit_ident(3); return true }
      if (pk_at(1) == "=") { emit_op(">=", 2); return true }
      if (pk_at(1) == ">") {
        if (pk_at(2) == ">") {
          if (pk_at(3) == "!") { emit_ident(4); return true }
          if (pk_at(3) == "=") { emit_op(">>>=", 4); return true }
          emit_op(">>>", 3); return true
        }
        if (pk_at(2) == "!") { emit_ident(3); return true }
        if (pk_at(2) == "=") { emit_op(">>=", 3); return true }
        emit_op(">>", 2); return true
      }
      if (pk_at(1) == "!") { emit_ident(2); return true }
      emit_op(">", 1); return true
    }
    if (c == "=") {
      if (pk_at(1) == "=") {
        if (pk_at(2) == "=") { emit_op("===", 3); return true }
        emit_op("==", 2); return true
      }
      if (pk_at(1) == ">") { emit_op("=>", 2); return true }
      if (pk_at(1) == "!") { emit_ident(2); return true }
      emit_op("=", 1); return true
    }
    if (c == "!") {
      if (pk_at(1) == "=") {
        if (pk_at(2) == "!") { emit_ident(3); return true }
        if (pk_at(2) == "=") { emit_op("!==", 3); return true }
        emit_op("!=", 2); return true
      }
      emit_op("!", 1); return true
    }
    if (c == "&") {
      if (pk_at(1) == "&") {
        if (pk_at(2) == "!") { emit_ident(3); return true }
        if (pk_at(2) == "=") { emit_op("&&=", 3); return true }
        emit_op("&&", 2); return true
      }
      if (pk_at(1) == "=") { emit_op("&=", 2); return true }
      if (pk_at(1) == "!") { emit_ident(2); return true }
      emit_op("&", 1); return true
    }
    if (c == "|") {
      if (pk_at(1) == "|") {
        if (pk_at(2) == "!") { emit_ident(3); return true }
        if (pk_at(2) == "=") { emit_op("||=", 3); return true }
        emit_op("||", 2); return true
      }
      if (pk_at(1) == "=") { emit_op("|=", 2); return true }
      if (pk_at(1) == "!") { emit_ident(2); return true }
      emit_op("|", 1); return true
    }
    if (c == "^") {
      if (pk_at(1) == "=") { emit_op("^=", 2); return true }
      if (pk_at(1) == "!") { emit_ident(2); return true }
      emit_op("^", 1); return true
    }
    if (c == "[") {
      if (pk_at(1) == "]" && pk_at(2) == "!") { emit_ident(3); return true }
      emit_op("[", 1); return true
    }
    if (c == "~") {
      if (pk_at(1) == "!") { emit_ident(2); return true }
      emit_op("~", 1); return true
    }

    // Anything else becomes a one-character token whose kind is the
    // character itself.
    emit_op(c, 1)
    return true
  }

  // Main loop
  while (pos < len) {
    tokenize_one()
  }

  // EOF token: zero-width, positioned at the end of input.
  tokens[] = {
    kind: "eof", at: pos,
    from_row: row, from_column: col,
    to_row: row, to_column: col
  }

  return {filename: filename, tokens: tokens}
}

return tokenize