// Tokenizer: turns source text into a flat list of position-tagged tokens.
// tokenize(src, filename)
// Scans the source text `src` from its first character to its last and
// returns a record {filename: filename, tokens: tokens}. `filename` is not
// inspected; it is only passed through into the result.
//
// Every token is a record carrying `kind`, `at` (the index in `src` where the
// token starts) and a zero-based span: from_row/from_column for the start and
// to_row/to_column for the cursor position just after the token. Literal,
// name, comment and whitespace tokens also carry `value`; "number" tokens
// carry `number` as well.
var tokenize = function(src, filename) {
// Output list; the readers and emitters below append to it.
var tokens = []
// Scan cursor: an index into src plus the zero-based row/column it maps to.
var pos = 0
var row = 0
var col = 0
// Input size, measured once; every bounds check compares pos against it.
var len = length(src)
// Reserved words. read_name looks every scanned identifier up here; a hit
// supplies the token kind (the keyword's own spelling) in place of "name".
var keywords = {
  if: "if",
  in: "in",
  do: "do",
  go: "go",
  var: "var",
  def: "def",
  for: "for",
  else: "else",
  this: "this",
  null: "null",
  true: "true",
  false: "false",
  while: "while",
  break: "break",
  return: "return",
  delete: "delete",
  disrupt: "disrupt",
  function: "function",
  continue: "continue",
  disruption: "disruption"
}
// Single-character escapes recognised inside quoted strings: the key is the
// character that follows the backslash, the value is the text it stands for.
// "u" has no entry because read_string handles \u escapes separately, and a
// character with no entry stands for itself there.
var escape_map = {
  n: "\n",
  t: "\t",
  r: "\r",
  "\\": "\\",
  "'": "'",
  "\"": "\"",
  "`": "`",
  "0": character(0) // presumably the character with code point 0 (NUL) - confirm
}
// pk()
// Peeks at the character under the cursor without consuming it.
// Returns null once the cursor has reached the end of the input.
var pk = function() {
  if (pos < len) {
    return src[pos]
  }
  return null
}
// pk_at(n)
// Looks ahead n characters from the cursor without consuming anything
// (pk_at(0) is the character under the cursor).
// Returns null when that position lies at or beyond the end of the input.
var pk_at = function(n) {
  var target = pos + n
  if (target < len) {
    return src[target]
  }
  return null
}
// adv()
// Consumes the character under the cursor and returns it, keeping the
// row/column counters in step: "\n" starts a new row at column 0, any other
// character (including "\r") moves one column to the right.
// No bounds check is made here; callers test pos < len before advancing.
var adv = function() {
  var ch = src[pos]
  pos = pos + 1
  if (ch != "\n") {
    col = col + 1
  } else {
    row = row + 1
    col = 0
  }
  return ch
}
// is_digit(c): true when c lies in the range "0".."9".
var is_digit = function(c) {
  if (c >= "0") {
    return c <= "9"
  }
  return false
}
// is_hex(c): true when c is a hexadecimal digit (0-9, a-f or A-F).
var is_hex = function(c) {
  if (c >= "0" && c <= "9") return true
  if (c >= "a" && c <= "f") return true
  return c >= "A" && c <= "F"
}
// hex_val(c)
// Numeric value (0..15) of the hexadecimal digit c, in either letter case.
// Any character that is not a hex digit yields 0.
var hex_val = function(c) {
  var digit = 0
  if (c >= "0" && c <= "9") {
    digit = codepoint(c) - codepoint("0")
  } else if (c >= "a" && c <= "f") {
    digit = codepoint(c) - codepoint("a") + 10
  } else if (c >= "A" && c <= "F") {
    digit = codepoint(c) - codepoint("A") + 10
  }
  return digit
}
// read_unicode_escape()
// Called with the cursor just past "\u". Consumes up to four hexadecimal
// digits, stopping early at the first non-hex character or at end of input,
// and returns character() of the accumulated value. With no digits at all
// the accumulated value is 0.
var read_unicode_escape = function() {
  var code = 0
  var taken = 0
  while (taken < 4 && pos < len) {
    if (is_hex(pk())) {
      code = code * 16 + hex_val(adv())
      taken = taken + 1
    } else {
      break
    }
  }
  return character(code)
}
// is_alpha(c): true when c is an ASCII letter, a-z or A-Z.
var is_alpha = function(c) {
  if (c >= "a" && c <= "z") return true
  return c >= "A" && c <= "Z"
}
// is_alnum(c): true when c is an ASCII letter or a decimal digit.
var is_alnum = function(c) {
  if (is_alpha(c)) return true
  return is_digit(c)
}
// is_ident_start(c): true when c may begin a name, i.e. it is an ASCII
// letter, "_" or "$".
var is_ident_start = function(c) {
  if (is_alpha(c)) return true
  if (c == "_") return true
  return c == "$"
}
// is_ident_char(c): true when c may continue a name. Besides letters, digits,
// "_" and "$", the characters "?" and "!" are accepted inside names.
var is_ident_char = function(c) {
  if (is_alnum(c)) return true
  if (c == "_" || c == "$") return true
  if (c == "?") return true
  return c == "!"
}
// substr(start, end)
// Shorthand for text(src, start, end): the piece of the input between the two
// indices. The scanners call it with end set to the cursor position just past
// the last consumed character, so `end` is presumably exclusive - confirm
// against the text() intrinsic.
var substr = function(start, end) {
  var piece = text(src, start, end)
  return piece
}
// read_string(quote)
// Scans a quoted string literal that starts at the opening quote under the
// cursor and appends one "text" token whose value has its escape sequences
// decoded.
//   quote - the opening quote character. The literal ends at the next
//           unescaped occurrence of it, or at end of input when the literal
//           is unterminated.
// Escapes: a character found in escape_map is replaced by its mapped text,
// "\u" is decoded by read_unicode_escape, and any other escaped character
// stands for itself.
// The token's `at` and from_row/from_column refer to the opening quote;
// to_row/to_column is the cursor position after the closing quote (or the end
// of input).
var read_string = function(quote) {
  var start = pos
  var start_row = row
  var start_col = col
  var parts = []
  var run_start = 0
  var esc = null
  var esc_val = null
  adv() // skip opening quote
  run_start = pos
  while (pos < len && pk() != quote) {
    // Treat "\" as an escape only when a character actually follows it (the
    // same guard read_template uses). A lone backslash at the very end of the
    // input is kept as an ordinary character, so the cursor never runs past
    // `len` and no null part is appended.
    if (pk() == "\\" && pos + 1 < len) {
      // Flush the undecoded run that precedes the backslash.
      if (pos > run_start) parts[] = text(src, run_start, pos)
      adv() // the backslash
      esc = adv() // the escaped character
      esc_val = escape_map[esc]
      if (esc_val != null) { parts[] = esc_val }
      else if (esc == "u") { parts[] = read_unicode_escape() }
      else { parts[] = esc }
      run_start = pos
    } else {
      adv()
    }
  }
  // Flush whatever plain text remains before the closing quote.
  if (pos > run_start) parts[] = text(src, run_start, pos)
  if (pos < len) adv() // skip closing quote
  tokens[] = {
    kind: "text", at: start,
    from_row: start_row, from_column: start_col,
    to_row: row, to_column: col,
    value: text(parts)
  }
}
// read_template()
// Scans a backtick-delimited template literal that starts at the opening
// backtick under the cursor and appends one "text" token for it.
//
// Nothing is decoded here: a backslash escape is copied as its two raw
// characters, and each ${ ... } interpolation is copied verbatim, delimiters
// included. Inside an interpolation, braces are counted to find the matching
// "}", and quoted stretches ('...', "..." or `...`) are skipped so that braces
// inside them are not counted. An unterminated literal runs to the end of the
// input.
var read_template = function() {
  var start = pos
  var start_row = row
  var start_col = col
  var pieces = []
  var plain_from = 0
  var nesting = 0
  var ch = null
  var closer = null
  var open_at = 0
  adv() // step over the opening backtick
  plain_from = pos
  while (pos < len && pk() != "`") {
    if (pk() == "\\" && pos + 1 < len) {
      // Escape: flush the pending plain run, then keep the backslash and the
      // character after it exactly as written.
      if (pos > plain_from) {
        pieces[] = text(src, plain_from, pos)
      }
      pieces[] = text(src, pos, pos + 2)
      adv()
      adv()
      plain_from = pos
    } else if (pk() == "$" && pos + 1 < len && pk_at(1) == "{") {
      // Interpolation: flush the plain run, then copy "${ ... }" as one piece.
      if (pos > plain_from) {
        pieces[] = text(src, plain_from, pos)
      }
      open_at = pos
      adv() // $
      adv() // {
      nesting = 1
      while (pos < len && nesting > 0) {
        ch = pk()
        if (ch == "'" || ch == "\"" || ch == "`") {
          // Skip a quoted stretch up to its matching quote, stepping over
          // backslash-escaped characters on the way.
          closer = adv()
          while (pos < len && pk() != closer) {
            if (pk() == "\\" && pos + 1 < len) {
              adv()
            }
            adv()
          }
          if (pos < len) {
            adv()
          }
        } else {
          if (ch == "{") {
            nesting = nesting + 1
          } else if (ch == "}") {
            nesting = nesting - 1
          }
          adv()
        }
      }
      pieces[] = text(src, open_at, pos)
      plain_from = pos
    } else {
      adv()
    }
  }
  if (pos > plain_from) {
    pieces[] = text(src, plain_from, pos)
  }
  if (pos < len) {
    adv() // step over the closing backtick
  }
  tokens[] = {
    kind: "text", at: start,
    from_row: start_row, from_column: start_col,
    to_row: row, to_column: col,
    value: text(pieces)
  }
}
// read_number()
// Scans a numeric literal starting at the cursor and appends one "number"
// token. `value` holds the literal exactly as written and `number` holds
// number(value); how that intrinsic treats prefixes and "_" is not visible
// here.
//
// Recognised forms:
//   0x / 0X  followed by hex digits and "_"
//   0b / 0B  followed by 0, 1 and "_"
//   0o / 0O  followed by the digits 0-7 only ("_" is not accepted here)
//   decimal  digits and "_", an optional "." fraction, and an optional
//            e/E exponent with an optional sign
// The exponent marker is consumed even when no digits follow it. The caller
// also routes a leading "." followed by a digit here, in which case the
// integer part is empty.
var read_number = function() {
  var start = pos
  var start_row = row
  var start_col = col
  var raw = ""
  var lead = pk()
  var mark = pk_at(1)
  var radix = 10
  // Classify the prefix first, then scan the digits for that radix.
  if (lead == "0") {
    if (mark == "x" || mark == "X") { radix = 16 }
    else if (mark == "b" || mark == "B") { radix = 2 }
    else if (mark == "o" || mark == "O") { radix = 8 }
  }
  if (radix == 16) {
    adv(); adv() // 0x
    while (pos < len && (is_hex(pk()) || pk() == "_")) {
      adv()
    }
  } else if (radix == 2) {
    adv(); adv() // 0b
    while (pos < len && (pk() == "0" || pk() == "1" || pk() == "_")) {
      adv()
    }
  } else if (radix == 8) {
    adv(); adv() // 0o
    while (pos < len && pk() >= "0" && pk() <= "7") {
      adv()
    }
  } else {
    // Integer part.
    while (pos < len && (is_digit(pk()) || pk() == "_")) {
      adv()
    }
    // Optional fraction.
    if (pos < len && pk() == ".") {
      adv()
      while (pos < len && (is_digit(pk()) || pk() == "_")) {
        adv()
      }
    }
    // Optional exponent.
    if (pos < len && (pk() == "e" || pk() == "E")) {
      adv()
      if (pos < len && (pk() == "+" || pk() == "-")) {
        adv()
      }
      while (pos < len && is_digit(pk())) {
        adv()
      }
    }
  }
  raw = substr(start, pos)
  tokens[] = {
    kind: "number", at: start,
    from_row: start_row, from_column: start_col,
    to_row: row, to_column: col,
    value: raw, number: number(raw)
  }
}
// read_name()
// Scans a run of identifier characters starting at the cursor and appends one
// token for it. A word found in `keywords` becomes a token whose kind is the
// keyword itself and which has no `value`; any other word becomes a "name"
// token whose `value` is the word.
var read_name = function() {
  var start = pos
  var start_row = row
  var start_col = col
  var word = ""
  var keyword_kind = null
  while (pos < len && is_ident_char(pk())) {
    adv()
  }
  word = substr(start, pos)
  keyword_kind = keywords[word]
  if (keyword_kind == null) {
    tokens[] = {
      kind: "name", at: start,
      from_row: start_row, from_column: start_col,
      to_row: row, to_column: col,
      value: word
    }
  } else {
    tokens[] = {
      kind: keyword_kind, at: start,
      from_row: start_row, from_column: start_col,
      to_row: row, to_column: col
    }
  }
}
// read_comment()
// Called with the cursor on a "/" that the caller has already checked is
// followed by "/" or "*". Appends one "comment" token whose `value` is the
// comment text including its delimiters.
//   line comment  - runs up to, but not including, the next "\n" or "\r"
//   block comment - runs through the closing "*/", or to the end of input
//                   when it is never closed
var read_comment = function() {
  var start = pos
  var start_row = row
  var start_col = col
  var raw = ""
  var ch = null
  var closed = false
  if (pk_at(1) == "/") {
    while (pos < len) {
      ch = pk()
      if (ch == "\n" || ch == "\r") {
        break
      }
      adv()
    }
  } else {
    adv(); adv() // step over the opening "/*"
    while (pos < len && closed == false) {
      if (pk() == "*" && pk_at(1) == "/") {
        adv(); adv()
        closed = true
      } else {
        adv()
      }
    }
  }
  raw = substr(start, pos)
  tokens[] = {
    kind: "comment", at: start,
    from_row: start_row, from_column: start_col,
    to_row: row, to_column: col,
    value: raw
  }
}
// emit_op(kind, count)
// Consumes `count` characters from the cursor and appends a token of the
// given kind spanning them. Operator tokens carry no `value`.
//   kind  - token kind to record (the operator's spelling at the call sites)
//   count - number of characters the operator occupies
var emit_op = function(kind, count) {
  var start = pos
  var start_row = row
  var start_col = col
  var remaining = count
  while (remaining > 0) {
    adv()
    remaining = remaining - 1
  }
  tokens[] = {
    kind: kind, at: start,
    from_row: start_row, from_column: start_col,
    to_row: row, to_column: col
  }
}
// emit_ident(count)
// Consumes `count` characters from the cursor and appends them as a "name"
// token whose `value` is the consumed text. tokenize_one uses this for an
// operator spelling immediately followed by "!" (for example "+!" or "<=!"),
// which is emitted as a name and not as an operator.
//   count - number of characters to consume
var emit_ident = function(count) {
  var start = pos
  var start_row = row
  var start_col = col
  var remaining = count
  while (remaining > 0) {
    adv()
    remaining = remaining - 1
  }
  tokens[] = {
    kind: "name", at: start,
    from_row: start_row, from_column: start_col,
    to_row: row, to_column: col,
    value: text(src, start, pos)
  }
}
// tokenize_one()
// Scans exactly one token at the cursor and appends it to `tokens`.
// Returns false, emitting nothing, when the cursor is already at the end of
// the input; returns true after consuming a token.
//
// Dispatch is on the character under the cursor:
//   "\n", "\r"      one "newline" token with value "\n"; "\r\n" is folded into
//                   a single token. A lone "\r" does not start a new row,
//                   because adv only bumps the row on "\n".
//   " ", "\t"       one "space" token covering the whole run
//   quotes          read_string / read_template
//   digit, or "." followed by a digit    read_number
//   identifier start                      read_name
//   "/" followed by "/" or "*"           read_comment
//   operators       longest match via emit_op; an operator spelling directly
//                   followed by "!" is emitted as a name via emit_ident
//   anything else   a one-character token whose kind is the character itself
var tokenize_one = function() {
  var ch = pk()
  var at = 0
  var at_row = 0
  var at_col = 0
  var blank = ""
  var n1 = null
  var n2 = null
  var n3 = null
  if (ch == null) return false

  // Lookahead characters; pk_at yields null past the end of the input.
  n1 = pk_at(1)
  n2 = pk_at(2)
  n3 = pk_at(3)

  if (ch == "\n" || ch == "\r") {
    at = pos
    at_row = row
    at_col = col
    adv()
    if (ch == "\r" && pos < len && pk() == "\n") {
      adv()
    }
    tokens[] = { kind: "newline", at: at, from_row: at_row, from_column: at_col, to_row: row, to_column: col, value: "\n" }
  } else if (ch == " " || ch == "\t") {
    at = pos
    at_row = row
    at_col = col
    while (pos < len && (pk() == " " || pk() == "\t")) {
      adv()
    }
    blank = substr(at, pos)
    tokens[] = { kind: "space", at: at, from_row: at_row, from_column: at_col, to_row: row, to_column: col, value: blank }
  } else if (ch == "'" || ch == "\"") {
    read_string(ch)
  } else if (ch == "`") {
    read_template()
  } else if (is_digit(ch) || (ch == "." && is_digit(n1))) {
    read_number()
  } else if (is_ident_start(ch)) {
    read_name()
  } else if (ch == "/") {
    if (n1 == "/" || n1 == "*") { read_comment() }
    else if (n1 == "=") { emit_op("/=", 2) }
    else if (n1 == "!") { emit_ident(2) }
    else { emit_op("/", 1) }
  } else if (ch == "*") {
    if (n1 == "*") {
      if (n2 == "!") { emit_ident(3) }
      else if (n2 == "=") { emit_op("**=", 3) }
      else { emit_op("**", 2) }
    }
    else if (n1 == "=") { emit_op("*=", 2) }
    else if (n1 == "!") { emit_ident(2) }
    else { emit_op("*", 1) }
  } else if (ch == "%") {
    if (n1 == "=") { emit_op("%=", 2) }
    else if (n1 == "!") { emit_ident(2) }
    else { emit_op("%", 1) }
  } else if (ch == "+") {
    if (n1 == "=") { emit_op("+=", 2) }
    else if (n1 == "+") { emit_op("++", 2) }
    else if (n1 == "!") { emit_ident(2) }
    else { emit_op("+", 1) }
  } else if (ch == "-") {
    if (n1 == "=") { emit_op("-=", 2) }
    else if (n1 == "-") { emit_op("--", 2) }
    else if (n1 == "!") { emit_ident(2) }
    else { emit_op("-", 1) }
  } else if (ch == "<") {
    if (n1 == "=") {
      if (n2 == "!") { emit_ident(3) }
      else { emit_op("<=", 2) }
    }
    else if (n1 == "<") {
      if (n2 == "!") { emit_ident(3) }
      else if (n2 == "=") { emit_op("<<=", 3) }
      else { emit_op("<<", 2) }
    }
    else if (n1 == "!") { emit_ident(2) }
    else { emit_op("<", 1) }
  } else if (ch == ">") {
    if (n1 == "=") {
      if (n2 == "!") { emit_ident(3) }
      else { emit_op(">=", 2) }
    }
    else if (n1 == ">") {
      if (n2 == ">") {
        if (n3 == "!") { emit_ident(4) }
        else if (n3 == "=") { emit_op(">>>=", 4) }
        else { emit_op(">>>", 3) }
      }
      else if (n2 == "!") { emit_ident(3) }
      else if (n2 == "=") { emit_op(">>=", 3) }
      else { emit_op(">>", 2) }
    }
    else if (n1 == "!") { emit_ident(2) }
    else { emit_op(">", 1) }
  } else if (ch == "=") {
    if (n1 == "=") {
      if (n2 == "=") { emit_op("===", 3) }
      else { emit_op("==", 2) }
    }
    else if (n1 == ">") { emit_op("=>", 2) }
    else if (n1 == "!") { emit_ident(2) }
    else { emit_op("=", 1) }
  } else if (ch == "!") {
    if (n1 == "=") {
      if (n2 == "!") { emit_ident(3) }
      else if (n2 == "=") { emit_op("!==", 3) }
      else { emit_op("!=", 2) }
    }
    else { emit_op("!", 1) }
  } else if (ch == "&") {
    if (n1 == "&") {
      if (n2 == "!") { emit_ident(3) }
      else if (n2 == "=") { emit_op("&&=", 3) }
      else { emit_op("&&", 2) }
    }
    else if (n1 == "=") { emit_op("&=", 2) }
    else if (n1 == "!") { emit_ident(2) }
    else { emit_op("&", 1) }
  } else if (ch == "|") {
    if (n1 == "|") {
      if (n2 == "!") { emit_ident(3) }
      else if (n2 == "=") { emit_op("||=", 3) }
      else { emit_op("||", 2) }
    }
    else if (n1 == "=") { emit_op("|=", 2) }
    else if (n1 == "!") { emit_ident(2) }
    else { emit_op("|", 1) }
  } else if (ch == "^") {
    if (n1 == "=") { emit_op("^=", 2) }
    else if (n1 == "!") { emit_ident(2) }
    else { emit_op("^", 1) }
  } else if (ch == "[") {
    // "[]!" is a name; a plain "[" is an ordinary one-character token.
    if (n1 == "]" && n2 == "!") { emit_ident(3) }
    else { emit_op("[", 1) }
  } else if (ch == "~") {
    if (n1 == "!") { emit_ident(2) }
    else { emit_op("~", 1) }
  } else {
    // Any other character is its own one-character token.
    emit_op(ch, 1)
  }
  return true
}
// Main loop: scan one token per iteration until the cursor reaches the end of
// the input.
while (pos < len) {
  tokenize_one()
}

// Terminating "eof" token: an empty span at the final cursor position.
tokens[] = {
  kind: "eof", at: pos,
  from_row: row, from_column: col,
  to_row: row, to_column: col
}

return {filename: filename, tokens: tokens}
}

// The file's top level returns the tokenize function (presumably this is how
// the module exposes it - confirm against the module loader).
return tokenize