Files
cell/tokenize.cm
2026-02-26 08:13:18 -06:00

455 lines
13 KiB
Plaintext

var tokenize = function(src, filename) {
// Size of the source as reported by length(src); scanning stops once pos reaches it.
var len = length(src)
// Cursor: index into src of the next unread character.
var pos = 0
// Zero-based line and column of the cursor; both are maintained by adv().
var row = 0
var col = 0
// Output list; every reader appends its token here in source order.
var tokens = []
// Reserved words. Each key maps to the token kind emitted for that word;
// read_name looks a scanned identifier up here and falls back to a plain
// "name" token when the lookup yields null.
var keywords = {
  if: "if",
  in: "in",
  do: "do",
  go: "go",
  var: "var",
  def: "def",
  for: "for",
  else: "else",
  this: "this",
  null: "null",
  true: "true",
  false: "false",
  while: "while",
  break: "break",
  return: "return",
  delete: "delete",
  disrupt: "disrupt",
  function: "function",
  continue: "continue",
  disruption: "disruption"
}
// Single-character escapes recognised by read_string: the key is the
// character that follows the backslash, the value is the text it stands for.
var escape_map = {
  n: "\n",
  t: "\t",
  r: "\r",
  "\\": "\\",
  "'": "'",
  "\"": "\"",
  "`": "`",
  "0": character(0)
}
// Peek at the character under the cursor without consuming it.
// Returns null once the cursor has reached the end of the source.
var pk = function() {
  if (pos < len) {
    return src[pos]
  }
  return null
}
// Look ahead n characters from the cursor without consuming anything.
// Returns null when that position lies at or beyond the end of the source.
var pk_at = function(n) {
  var target = pos + n
  if (target < len) {
    return src[target]
  }
  return null
}
// Consume the character under the cursor and return it, keeping the
// row/col bookkeeping in step.
//
// At end of input nothing is consumed and null is returned, so pos can
// never run past len (callers such as escape handling may otherwise ask
// for one character too many and skew the position of the EOF token).
//
// Line accounting: "\n" starts a new row. A "\r" that is NOT followed by
// "\n" also starts a new row, matching the rest of the tokenizer which
// treats a lone "\r" as a line terminator. For "\r\n" the "\r" only bumps
// the column and the following "\n" performs the row change.
var adv = function() {
  var c = null
  if (pos >= len) return null
  c = src[pos]
  pos = pos + 1
  if (c == "\n" || (c == "\r" && (pos >= len || src[pos] != "\n"))) {
    row = row + 1
    col = 0
  } else {
    col = col + 1
  }
  return c
}
// True when c is one of the decimal digits "0" through "9".
var is_digit = function(c) {
  return "0" <= c && c <= "9"
}
// True when c is a hexadecimal digit: 0-9, a-f or A-F.
var is_hex = function(c) {
  if (c >= "0" && c <= "9") return true
  if (c >= "a" && c <= "f") return true
  return c >= "A" && c <= "F"
}
// Numeric value (0..15) of the hexadecimal digit c.
// Any character that is not a hex digit yields 0.
var hex_val = function(c) {
  var value = 0
  if (c >= "0" && c <= "9") {
    value = codepoint(c) - codepoint("0")
  } else if (c >= "a" && c <= "f") {
    value = codepoint(c) - codepoint("a") + 10
  } else if (c >= "A" && c <= "F") {
    value = codepoint(c) - codepoint("A") + 10
  }
  return value
}
// Consume up to four hexadecimal digits at the cursor (the part of a "\u"
// escape after the "u") and return character(code) for the value they
// spell. Scanning stops early at the first non-hex character or at end of
// input; with no digits at all the accumulated code is 0.
var read_unicode_escape = function() {
  var code = 0
  var digits = 0
  while (digits < 4) {
    if (pos < len && is_hex(pk())) {
      code = code * 16 + hex_val(adv())
      digits = digits + 1
    } else {
      break
    }
  }
  return character(code)
}
// True when c is an ASCII letter, lower or upper case.
var is_alpha = function(c) {
  if (c >= "a" && c <= "z") return true
  return c >= "A" && c <= "Z"
}
// True when c is an ASCII letter or a decimal digit.
var is_alnum = function(c) {
  if (is_alpha(c)) return true
  return is_digit(c)
}
// True when c may begin an identifier: an ASCII letter, "_" or "$".
var is_ident_start = function(c) {
  if (c == "_" || c == "$") return true
  return is_alpha(c)
}
// True when c may continue an identifier: a letter, a digit, "_", "$",
// "?" or "!".
var is_ident_char = function(c) {
  if (is_alnum(c)) return true
  return c == "_" || c == "$" || c == "?" || c == "!"
}
// Slice of the source being tokenized: text(src, start, end).
var substr = function(start, end) {
  var piece = text(src, start, end)
  return piece
}
// Scan a quoted string literal starting at the cursor and append one
// "text" token for it.
//
// quote: the opening quote character; the literal ends at the next
//        unescaped occurrence of the same character, or at end of input
//        if it is never closed.
//
// The token value is built from pieces: unescaped runs are copied from the
// source, and each backslash escape is replaced by its meaning:
//   - a key of escape_map  -> the mapped text
//   - "u"                  -> the result of read_unicode_escape()
//   - anything else        -> the escaped character itself
// A backslash that is the very last character of the input has nothing to
// escape; it is kept as an ordinary character instead of reading past the
// end of the source (same guard as the template reader uses).
var read_string = function(quote) {
  var start = pos
  var start_row = row
  var start_col = col
  var parts = []
  var run_start = 0
  var esc = null
  var esc_val = null
  adv() // skip opening quote
  run_start = pos
  while (pos < len && pk() != quote) {
    if (pk() == "\\" && pos + 1 < len) {
      // flush the literal run that precedes the escape
      if (pos > run_start) parts[] = text(src, run_start, pos)
      adv() // the backslash
      esc = adv() // the escaped character (guaranteed to exist)
      esc_val = escape_map[esc]
      if (esc_val != null) { parts[] = esc_val }
      else if (esc == "u") { parts[] = read_unicode_escape() }
      else { parts[] = esc }
      run_start = pos
    } else {
      adv()
    }
  }
  // flush whatever literal text is left before the closing quote
  if (pos > run_start) parts[] = text(src, run_start, pos)
  if (pos < len) adv() // skip closing quote
  tokens[] = {
    kind: "text", at: start,
    from_row: start_row, from_column: start_col,
    to_row: row, to_column: col,
    value: text(parts)
  }
}
// Scan a backtick-delimited template literal starting at the cursor and
// append one "text" token for it.
//
// Nothing inside the template is interpreted here: backslash escapes are
// copied as the two raw source characters, and each "${ ... }"
// interpolation is copied as raw source text including its delimiters.
// The token value is text(pieces) over those raw slices. An unterminated
// template simply runs to the end of the input.
var read_template = function() {
  var tok_at = pos
  var tok_row = row
  var tok_col = col
  var pieces = []
  var literal_from = 0
  var hole_from = 0
  // Step over a quoted run inside an interpolation so braces inside it do
  // not disturb the nesting count. The cursor sits on the opening quote.
  var skip_quoted = function() {
    var closer = adv()
    while (pos < len && pk() != closer) {
      // a backslash takes the following character with it
      if (pk() == "\\" && pos + 1 < len) adv()
      adv()
    }
    if (pos < len) adv()
  }
  // Step over "${ ... }" with the cursor on the "$"; stops just after the
  // matching "}" or at end of input.
  var skip_interpolation = function() {
    var nesting = 1
    var ch = null
    adv(); adv() // $ {
    while (pos < len && nesting > 0) {
      ch = pk()
      if (ch == "{") {
        nesting = nesting + 1
        adv()
      } else if (ch == "}") {
        nesting = nesting - 1
        adv()
      } else if (ch == "'" || ch == "\"" || ch == "`") {
        skip_quoted()
      } else {
        adv()
      }
    }
  }
  adv() // skip opening backtick
  literal_from = pos
  while (pos < len && pk() != "`") {
    if (pk() == "\\" && pos + 1 < len) {
      if (pos > literal_from) pieces[] = text(src, literal_from, pos)
      pieces[] = text(src, pos, pos + 2)
      adv(); adv()
      literal_from = pos
    } else if (pk() == "$" && pos + 1 < len && pk_at(1) == "{") {
      if (pos > literal_from) pieces[] = text(src, literal_from, pos)
      hole_from = pos
      skip_interpolation()
      pieces[] = text(src, hole_from, pos)
      literal_from = pos
    } else {
      adv()
    }
  }
  if (pos > literal_from) pieces[] = text(src, literal_from, pos)
  if (pos < len) adv() // skip closing backtick
  tokens[] = {
    kind: "text", at: tok_at,
    from_row: tok_row, from_column: tok_col,
    to_row: row, to_column: col,
    value: text(pieces)
  }
}
// Scan a numeric literal starting at the cursor and append one "number"
// token for it.
//
// Recognised forms:
//   0x / 0X  followed by hex digits
//   0b / 0B  followed by 0 and 1
//   0o / 0O  followed by digits 0-7
//   decimal  digits, optional "." fraction, optional e/E exponent with an
//            optional sign
// "_" is accepted as a digit separator in the digits of every prefixed
// form and in the integer and fraction parts of a decimal literal (the
// octal form previously rejected it, unlike its hex and binary siblings).
//
// The token carries the raw lexeme in `value` and number(raw) in `number`.
var read_number = function() {
  var start = pos
  var start_row = row
  var start_col = col
  var raw = ""
  if (pk() == "0" && (pk_at(1) == "x" || pk_at(1) == "X")) {
    adv(); adv()
    while (pos < len && (is_hex(pk()) || pk() == "_")) adv()
  } else if (pk() == "0" && (pk_at(1) == "b" || pk_at(1) == "B")) {
    adv(); adv()
    while (pos < len && (pk() == "0" || pk() == "1" || pk() == "_")) adv()
  } else if (pk() == "0" && (pk_at(1) == "o" || pk_at(1) == "O")) {
    adv(); adv()
    while (pos < len && ((pk() >= "0" && pk() <= "7") || pk() == "_")) adv()
  } else {
    // integer part (may be empty when the literal starts with ".")
    while (pos < len && (is_digit(pk()) || pk() == "_")) adv()
    if (pos < len && pk() == ".") {
      adv()
      while (pos < len && (is_digit(pk()) || pk() == "_")) adv()
    }
    if (pos < len && (pk() == "e" || pk() == "E")) {
      adv()
      if (pos < len && (pk() == "+" || pk() == "-")) adv()
      while (pos < len && is_digit(pk())) adv()
    }
  }
  raw = substr(start, pos)
  tokens[] = {
    kind: "number", at: start,
    from_row: start_row, from_column: start_col,
    to_row: row, to_column: col,
    value: raw, number: number(raw)
  }
}
// Scan an identifier starting at the cursor and append one token for it.
// If the scanned word is a key of the keywords table the token's kind is
// the mapped keyword and it carries no value; otherwise the token has
// kind "name" and the word itself as its value.
var read_name = function() {
  var first = pos
  var first_row = row
  var first_col = col
  var word = ""
  var kind = null
  while (pos < len && is_ident_char(pk())) {
    adv()
  }
  word = substr(first, pos)
  kind = keywords[word]
  if (kind == null) {
    tokens[] = {
      kind: "name", at: first,
      from_row: first_row, from_column: first_col,
      to_row: row, to_column: col,
      value: word
    }
  } else {
    tokens[] = {
      kind: kind, at: first,
      from_row: first_row, from_column: first_col,
      to_row: row, to_column: col
    }
  }
}
// Scan a comment starting at the cursor (which sits on the leading "/")
// and append one "comment" token whose value is the raw comment text,
// delimiters included.
//   "//" comments run up to, but not including, the next "\n" or "\r".
//   "/*" comments run through the closing "*/", or to end of input when
//   the comment is never closed.
var read_comment = function() {
  var tok_at = pos
  var tok_row = row
  var tok_col = col
  var body = ""
  var ch = null
  var open = true
  if (pk_at(1) == "/") {
    ch = pk()
    while (pos < len && ch != "\n" && ch != "\r") {
      adv()
      ch = pk()
    }
  } else {
    adv(); adv() // skip /*
    while (open && pos < len) {
      if (pk() == "*" && pk_at(1) == "/") {
        adv(); adv()
        open = false
      } else {
        adv()
      }
    }
  }
  body = substr(tok_at, pos)
  tokens[] = {
    kind: "comment", at: tok_at,
    from_row: tok_row, from_column: tok_col,
    to_row: row, to_column: col,
    value: body
  }
}
// Consume `count` characters at the cursor and append a token of the
// given kind spanning them. Operator tokens carry no value.
var emit_op = function(kind, count) {
  var tok_at = pos
  var tok_row = row
  var tok_col = col
  var left = count
  while (left > 0) {
    adv()
    left = left - 1
  }
  tokens[] = {
    kind: kind, at: tok_at,
    from_row: tok_row, from_column: tok_col,
    to_row: row, to_column: col
  }
}
// Consume `count` characters at the cursor and append them as a "name"
// token whose value is the consumed source text. tokenize_one uses this
// for operator spellings that end in "!".
var emit_ident = function(count) {
  var tok_at = pos
  var tok_row = row
  var tok_col = col
  var left = count
  while (left > 0) {
    adv()
    left = left - 1
  }
  tokens[] = {
    kind: "name", at: tok_at,
    from_row: tok_row, from_column: tok_col,
    to_row: row, to_column: col,
    value: substr(tok_at, pos)
  }
}
// Scan exactly one token at the cursor and append it to tokens.
// Returns false when the cursor is at end of input (nothing is emitted),
// true after a token has been emitted.
//
// Dispatch is on the first character, in this order: newlines, blank runs,
// string and template literals, numbers, identifiers/keywords, then
// operators. Within each operator family the tests are ordered so the
// longest spelling wins; do not reorder them. An operator spelling
// immediately followed by "!" is emitted as a "name" token (via
// emit_ident) rather than as an operator.
var tokenize_one = function() {
var c = pk()
var start = 0
var start_row = 0
var start_col = 0
var raw = ""
if (c == null) return false
// "\n": one newline token.
if (c == "\n") {
start = pos; start_row = row; start_col = col
adv()
tokens[] = { kind: "newline", at: start, from_row: start_row, from_column: start_col, to_row: row, to_column: col, value: "\n" }
return true
}
// "\r" or "\r\n": one newline token; the value is normalised to "\n".
if (c == "\r") {
start = pos; start_row = row; start_col = col
adv()
if (pos < len && pk() == "\n") adv()
tokens[] = { kind: "newline", at: start, from_row: start_row, from_column: start_col, to_row: row, to_column: col, value: "\n" }
return true
}
// A run of spaces and tabs becomes a single "space" token holding the raw run.
if (c == " " || c == "\t") {
start = pos; start_row = row; start_col = col
while (pos < len && (pk() == " " || pk() == "\t")) adv()
raw = substr(start, pos)
tokens[] = { kind: "space", at: start, from_row: start_row, from_column: start_col, to_row: row, to_column: col, value: raw }
return true
}
// Literals and names are delegated to their dedicated readers.
if (c == "'" || c == "\"") { read_string(c); return true }
if (c == "`") { read_template(); return true }
if (is_digit(c)) { read_number(); return true }
// A "." directly followed by a digit starts a number (e.g. ".5").
if (c == "." && is_digit(pk_at(1))) { read_number(); return true }
if (is_ident_start(c)) { read_name(); return true }
// "/": comments ("//", "/*") take priority over "/=", "/!" and "/".
if (c == "/") {
if (pk_at(1) == "/" || pk_at(1) == "*") { read_comment(); return true }
if (pk_at(1) == "=") { emit_op("/=", 2); return true }
if (pk_at(1) == "!") { emit_ident(2); return true }
emit_op("/", 1); return true
}
// "*": "**!", "**=", "**", "*=", "*!", "*".
if (c == "*") {
if (pk_at(1) == "*") {
if (pk_at(2) == "!") { emit_ident(3); return true }
if (pk_at(2) == "=") { emit_op("**=", 3); return true }
emit_op("**", 2); return true
}
if (pk_at(1) == "=") { emit_op("*=", 2); return true }
if (pk_at(1) == "!") { emit_ident(2); return true }
emit_op("*", 1); return true
}
// "%": "%=", "%!", "%".
if (c == "%") {
if (pk_at(1) == "=") { emit_op("%=", 2); return true }
if (pk_at(1) == "!") { emit_ident(2); return true }
emit_op("%", 1); return true
}
// "+": "+=", "++", "+!", "+".
if (c == "+") {
if (pk_at(1) == "=") { emit_op("+=", 2); return true }
if (pk_at(1) == "+") { emit_op("++", 2); return true }
if (pk_at(1) == "!") { emit_ident(2); return true }
emit_op("+", 1); return true
}
// "-": "-=", "--", "-!", "-".
if (c == "-") {
if (pk_at(1) == "=") { emit_op("-=", 2); return true }
if (pk_at(1) == "-") { emit_op("--", 2); return true }
if (pk_at(1) == "!") { emit_ident(2); return true }
emit_op("-", 1); return true
}
// "<": "<=!" is checked before "<=" so the three-character name wins;
// then "<<!", "<<=", "<<", "<!", "<".
if (c == "<") {
if (pk_at(1) == "=" && pk_at(2) == "!") { emit_ident(3); return true }
if (pk_at(1) == "=") { emit_op("<=", 2); return true }
if (pk_at(1) == "<") {
if (pk_at(2) == "!") { emit_ident(3); return true }
if (pk_at(2) == "=") { emit_op("<<=", 3); return true }
emit_op("<<", 2); return true
}
if (pk_at(1) == "!") { emit_ident(2); return true }
emit_op("<", 1); return true
}
// ">": ">=!", ">=", then the shift family from longest to shortest
// (">>>!", ">>>=", ">>>", ">>!", ">>=", ">>"), then ">!", ">".
if (c == ">") {
if (pk_at(1) == "=" && pk_at(2) == "!") { emit_ident(3); return true }
if (pk_at(1) == "=") { emit_op(">=", 2); return true }
if (pk_at(1) == ">") {
if (pk_at(2) == ">") {
if (pk_at(3) == "!") { emit_ident(4); return true }
if (pk_at(3) == "=") { emit_op(">>>=", 4); return true }
emit_op(">>>", 3); return true
}
if (pk_at(2) == "!") { emit_ident(3); return true }
if (pk_at(2) == "=") { emit_op(">>=", 3); return true }
emit_op(">>", 2); return true
}
if (pk_at(1) == "!") { emit_ident(2); return true }
emit_op(">", 1); return true
}
// "=": "===", "==", "=>", "=!", "=".
// NOTE(review): unlike "!=", "<=" and ">=", there is no "==!" name form
// here - confirm that is intentional.
if (c == "=") {
if (pk_at(1) == "=") {
if (pk_at(2) == "=") { emit_op("===", 3); return true }
emit_op("==", 2); return true
}
if (pk_at(1) == ">") { emit_op("=>", 2); return true }
if (pk_at(1) == "!") { emit_ident(2); return true }
emit_op("=", 1); return true
}
// "!": "!=!", "!==", "!=", "!".
if (c == "!") {
if (pk_at(1) == "=") {
if (pk_at(2) == "!") { emit_ident(3); return true }
if (pk_at(2) == "=") { emit_op("!==", 3); return true }
emit_op("!=", 2); return true
}
emit_op("!", 1); return true
}
// "&": "&&!", "&&=", "&&", "&=", "&!", "&".
if (c == "&") {
if (pk_at(1) == "&") {
if (pk_at(2) == "!") { emit_ident(3); return true }
if (pk_at(2) == "=") { emit_op("&&=", 3); return true }
emit_op("&&", 2); return true
}
if (pk_at(1) == "=") { emit_op("&=", 2); return true }
if (pk_at(1) == "!") { emit_ident(2); return true }
emit_op("&", 1); return true
}
// "|": "||!", "||=", "||", "|=", "|!", "|".
if (c == "|") {
if (pk_at(1) == "|") {
if (pk_at(2) == "!") { emit_ident(3); return true }
if (pk_at(2) == "=") { emit_op("||=", 3); return true }
emit_op("||", 2); return true
}
if (pk_at(1) == "=") { emit_op("|=", 2); return true }
if (pk_at(1) == "!") { emit_ident(2); return true }
emit_op("|", 1); return true
}
// "^": "^=", "^!", "^".
if (c == "^") {
if (pk_at(1) == "=") { emit_op("^=", 2); return true }
if (pk_at(1) == "!") { emit_ident(2); return true }
emit_op("^", 1); return true
}
// "[": the exact sequence "[]!" is a name; otherwise a plain "[".
if (c == "[") {
if (pk_at(1) == "]" && pk_at(2) == "!") { emit_ident(3); return true }
emit_op("[", 1); return true
}
// "~": "~!", "~".
if (c == "~") {
if (pk_at(1) == "!") { emit_ident(2); return true }
emit_op("~", 1); return true
}
// Anything else becomes a one-character token whose kind is the character itself.
emit_op(c, 1)
return true
}
// Drive the scanner one token at a time until the source is exhausted.
while (pos < len) tokenize_one()
// Close the stream with a zero-width "eof" token at the final cursor position.
tokens[] = {
  kind: "eof", at: pos,
  from_row: row, from_column: col,
  to_row: row, to_column: col
}
// Result: the filename as given, paired with the full token list.
return {filename: filename, tokens: tokens}
}
return tokenize