fix template-literal interpolation and escape handling in parse.cm and tokenize.cm

2026-02-09 17:43:44 -06:00
parent 68e2395b92
commit d5209e1d59
5 changed files with 201 additions and 57 deletions
--- a/internal/bootstrap.cm
+++ b/internal/bootstrap.cm
@@ -38,7 +38,7 @@ if (use_mcode) {
 // analyze: tokenize + parse, check for errors
 function analyze(src, filename) {
  var tok_result = tokenize_mod(src, filename)
-  var ast = parse_mod(tok_result.tokens, src, filename)
+  var ast = parse_mod(tok_result.tokens, src, filename, tokenize_mod)
  var _i = 0
  var prev_line = -1
  var prev_msg = null
--- a/parse.cm
+++ b/parse.cm
@@ -5,7 +5,7 @@ var is_alpha = function(c) {
  return (c >= 65 && c <= 90) || (c >= 97 && c <= 122)
 }

-var parse = function(tokens, src, filename) {
+var parse = function(tokens, src, filename, tokenizer) {
  var _src_len = length(src)
  var cp = []
  var _i = 0
@@ -167,6 +167,23 @@ var parse = function(tokens, src, filename) {
    var rpos = 0
    var pattern_str = ""
    var flags = ""
+    var tv = null
+    var has_interp = false
+    var ti = 0
+    var tpl_list = null
+    var fmt = null
+    var idx = 0
+    var tvi = 0
+    var tvlen = 0
+    var depth = 0
+    var expr_str = null
+    var tc = null
+    var tq = null
+    var esc_ch = null
+    var expr_tokens = null
+    var sub_ast = null
+    var sub_stmt = null
+    var sub_expr = null

    if (k == "number") {
      node = ast_node("number", start)
@@ -177,8 +194,96 @@ var parse = function(tokens, src, filename) {
      return node
    }
    if (k == "text") {
-      node = ast_node("text", start)
-      node.value = tok.value
+      // Check for template interpolation: ${...}
+      tv = tok.value
+      has_interp = false
+      ti = 0
+      while (ti < length(tv) - 1) {
+        if (tv[ti] == "$" && tv[ti + 1] == "{") {
+          if (ti == 0 || tv[ti - 1] != "\\") {
+            has_interp = true
+            break
+          }
+        }
+        ti = ti + 1
+      }
+      if (!has_interp || tokenizer == null) {
+        node = ast_node("text", start)
+        node.value = tok.value
+        advance()
+        ast_node_end(node)
+        return node
+      }
+      // Template literal with interpolation
+      node = ast_node("text literal", start)
+      tpl_list = []
+      node.list = tpl_list
+      fmt = ""
+      idx = 0
+      tvi = 0
+      tvlen = length(tv)
+      while (tvi < tvlen) {
+        if (tv[tvi] == "\\" && tvi + 1 < tvlen) {
+          esc_ch = tv[tvi + 1]
+          if (esc_ch == "n") { fmt = fmt + "\n" }
+          else if (esc_ch == "t") { fmt = fmt + "\t" }
+          else if (esc_ch == "r") { fmt = fmt + "\r" }
+          else if (esc_ch == "\\") { fmt = fmt + "\\" }
+          else if (esc_ch == "`") { fmt = fmt + "`" }
+          else if (esc_ch == "$") { fmt = fmt + "$" }
+          else if (esc_ch == "0") { fmt = fmt + character(0) }
+          else { fmt = fmt + esc_ch }
+          tvi = tvi + 2
+        } else if (tv[tvi] == "$" && tvi + 1 < tvlen && tv[tvi + 1] == "{") {
+          tvi = tvi + 2
+          depth = 1
+          expr_str = ""
+          while (tvi < tvlen && depth > 0) {
+            tc = tv[tvi]
+            if (tc == "{") { depth = depth + 1; expr_str = expr_str + tc; tvi = tvi + 1 }
+            else if (tc == "}") {
+              depth = depth - 1
+              if (depth > 0) { expr_str = expr_str + tc }
+              tvi = tvi + 1
+            }
+            else if (tc == "'" || tc == "\"" || tc == "`") {
+              tq = tc
+              expr_str = expr_str + tc
+              tvi = tvi + 1
+              while (tvi < tvlen && tv[tvi] != tq) {
+                if (tv[tvi] == "\\" && tvi + 1 < tvlen) {
+                  expr_str = expr_str + tv[tvi]
+                  tvi = tvi + 1
+                }
+                expr_str = expr_str + tv[tvi]
+                tvi = tvi + 1
+              }
+              if (tvi < tvlen) { expr_str = expr_str + tv[tvi]; tvi = tvi + 1 }
+            } else {
+              expr_str = expr_str + tc
+              tvi = tvi + 1
+            }
+          }
+          expr_tokens = tokenizer(expr_str, "<template>").tokens
+          sub_ast = parse(expr_tokens, expr_str, "<template>", tokenizer)
+          if (sub_ast != null && sub_ast.statements != null && length(sub_ast.statements) > 0) {
+            sub_stmt = sub_ast.statements[0]
+            sub_expr = null
+            if (sub_stmt.kind == "call") {
+              sub_expr = sub_stmt.expression
+            } else {
+              sub_expr = sub_stmt
+            }
+            push(tpl_list, sub_expr)
+          }
+          fmt = fmt + "{" + text(idx) + "}"
+          idx = idx + 1
+        } else {
+          fmt = fmt + tv[tvi]
+          tvi = tvi + 1
+        }
+      }
+      node.value = fmt
      advance()
      ast_node_end(node)
      return node
--- a/source/cell.c
+++ b/source/cell.c
@@ -552,59 +552,81 @@ int cell_init(int argc, char **argv)

  /* Check for --mach-run flag to compile and run through MACH VM */
  if (argc >= 3 && strcmp(argv[1], "--mach-run") == 0) {
-    if (!find_cell_shop()) return 1;
+    const char *script_name = argv[2];
+    char *script = NULL;
+    char *allocated_script = NULL;
+    const char *filename = script_name;

-    size_t boot_size;
-    char *boot_data = load_core_file("internal/bootstrap.cm", &boot_size);
-    if (!boot_data) {
-      printf("ERROR: Could not load internal/bootstrap.cm from %s\n", core_path);
+    struct stat st;
+    if (stat(script_name, &st) == 0 && S_ISREG(st.st_mode)) {
+      /* Exact name found */
+    } else {
+      /* Try .ce then .cm extension */
+      static char pathbuf[4096];
+      snprintf(pathbuf, sizeof(pathbuf), "%s.ce", script_name);
+      if (stat(pathbuf, &st) == 0 && S_ISREG(st.st_mode)) {
+        script_name = pathbuf;
+        filename = pathbuf;
+      } else {
+        snprintf(pathbuf, sizeof(pathbuf), "%s.cm", script_name);
+        if (stat(pathbuf, &st) == 0 && S_ISREG(st.st_mode)) {
+          script_name = pathbuf;
+          filename = pathbuf;
+        } else {
+          printf("Failed to find file: %s\n", argv[2]);
+          return 1;
+        }
+      }
+    }
+
+    FILE *f = fopen(script_name, "r");
+    if (!f) {
+      printf("Failed to open file: %s\n", script_name);
+      return 1;
+    }
+    allocated_script = malloc(st.st_size + 1);
+    if (!allocated_script) {
+      fclose(f);
+      printf("Failed to allocate memory for script\n");
+      return 1;
+    }
+    size_t read_size = fread(allocated_script, 1, st.st_size, f);
+    fclose(f);
+    allocated_script[read_size] = '\0';
+    script = allocated_script;
+
+    cJSON *ast = JS_ASTTree(script, read_size, filename);
+    free(allocated_script);
+    if (!ast) {
+      printf("Failed to parse %s\n", filename);
      return 1;
    }

-    cJSON *boot_ast = JS_ASTTree(boot_data, boot_size, "internal/bootstrap.cm");
-    free(boot_data);
-    if (!boot_ast) {
-      printf("Failed to parse internal/bootstrap.cm\n");
-      return 1;
-    }
-
-    if (print_tree_errors(boot_ast)) {
-      cJSON_Delete(boot_ast);
+    if (print_tree_errors(ast)) {
+      cJSON_Delete(ast);
      return 1;
    }

    JSRuntime *rt = JS_NewRuntime();
    if (!rt) {
      printf("Failed to create JS runtime\n");
-      cJSON_Delete(boot_ast);
+      cJSON_Delete(ast);
      return 1;
    }
    JSContext *ctx = JS_NewContextWithHeapSize(rt, 16 * 1024 * 1024);
    if (!ctx) {
      printf("Failed to create JS context\n");
-      cJSON_Delete(boot_ast); JS_FreeRuntime(rt);
+      cJSON_Delete(ast); JS_FreeRuntime(rt);
      return 1;
    }

    JS_FreeValue(ctx, js_blob_use(ctx));

-    JSValue hidden_env = JS_NewObject(ctx);
-    JS_SetPropertyStr(ctx, hidden_env, "os", js_os_use(ctx));
-    JS_SetPropertyStr(ctx, hidden_env, "core_path", JS_NewString(ctx, core_path));
-    JSValue args_arr = JS_NewArray(ctx);
-    for (int i = 2; i < argc; i++) {
-      JSValue str = JS_NewString(ctx, argv[i]);
-      JS_ArrayPush(ctx, &args_arr, str);
-    }
-    JS_SetPropertyStr(ctx, hidden_env, "args", args_arr);
-    hidden_env = JS_Stone(ctx, hidden_env);
-
-    JSValue result = JS_RunMachTree(ctx, boot_ast, hidden_env);
-    cJSON_Delete(boot_ast);
+    JSValue result = JS_RunMachTree(ctx, ast, JS_NULL);
+    cJSON_Delete(ast);

    int exit_code = 0;
    if (JS_IsException(result)) {
-      /* Error already printed to stderr by JS_Throw* */
      JS_GetException(ctx);
      exit_code = 1;
    } else if (!JS_IsNull(result)) {
--- a/source/runtime.c
+++ b/source/runtime.c
@@ -6283,8 +6283,8 @@ static JSValue js_cell_number (JSContext *ctx, JSValue this_val, int argc, JSVal
    return val;
  }

-  /* Handle string */
-  if (tag == JS_TAG_STRING || tag == JS_TAG_STRING_IMM) {
+  /* Handle string (immediate ASCII or heap JSText) */
+  if (JS_IsText (val)) {
    const char *str = JS_ToCString (ctx, val);
    if (!str) return JS_EXCEPTION;

@@ -6925,7 +6925,7 @@ JSValue js_cell_character (JSContext *ctx, JSValue this_val, int argc, JSValue *
  int tag = JS_VALUE_GET_TAG (arg);

  /* Handle string - return first character */
-  if (tag == JS_TAG_STRING || tag == JS_TAG_STRING_IMM) {
+  if (JS_IsText (arg)) {
    if (js_string_value_len (arg) == 0) return JS_NewString (ctx, "");
    return js_sub_string_val (ctx, arg, 0, 1);
  }
@@ -6978,7 +6978,7 @@ static JSValue js_cell_text (JSContext *ctx, JSValue this_val, int argc, JSValue
  int tag = JS_VALUE_GET_TAG (arg);

  /* Handle string / rope */
-  if (tag == JS_TAG_STRING || tag == JS_TAG_STRING_IMM) {
+  if (JS_IsText (arg)) {
    JSValue str = JS_ToString (ctx, arg); /* owned + flattens rope */
    if (JS_IsException (str)) return JS_EXCEPTION;

--- a/tokenize.cm
+++ b/tokenize.cm
@@ -64,6 +64,7 @@ var tokenize = function(src, filename) {
  def CP_o = 111
  def CP_r = 114
  def CP_t = 116
+  def CP_u = 117
  def CP_x = 120
  def CP_z = 122
  def CP_LBRACE = 123
@@ -113,6 +114,23 @@ var tokenize = function(src, filename) {
    return (c >= CP_0 && c <= CP_9) || (c >= CP_a && c <= CP_f) || (c >= CP_A && c <= CP_F)
  }

+  var hex_val = function(c) { // value 0-15 of a hex-digit code point; any other code point yields 0
+    if (c >= CP_A && c <= CP_F) return 10 + (c - CP_A) // 'A'..'F'
+    if (c >= CP_a && c <= CP_f) return 10 + (c - CP_a) // 'a'..'f'
+    if (c >= CP_0 && c <= CP_9) return c - CP_0 // '0'..'9'
+    return 0
+  }
+
+  var read_unicode_escape = function() { // consume up to 4 hex digits at the cursor and return that code point as text
+    var u_code = 0 // value accumulated from the digits read so far
+    var u_digits = 0 // number of hex digits consumed
+    while (u_digits < 4 && pos < len && is_hex(pk())) {
+      u_code = u_code * 16 + hex_val(adv()) // shift in the next nibble, advancing the cursor
+      u_digits = u_digits + 1
+    }
+    return character(u_code) // NOTE(review): fewer than 4 digits (even zero) is accepted silently — confirm this is intended
+  }
+
  var is_alpha = function(c) {
    return (c >= CP_a && c <= CP_z) || (c >= CP_A && c <= CP_Z)
  }
@@ -158,6 +176,7 @@ var tokenize = function(src, filename) {
        else if (esc == CP_DQUOTE) { value = value + "\"" }
        else if (esc == CP_0) { value = value + character(0) }
        else if (esc == CP_BACKTICK) { value = value + "`" }
+        else if (esc == CP_u) { value = value + read_unicode_escape() }
        else { value = value + character(esc) }
      } else {
        value = value + character(adv())
@@ -177,39 +196,37 @@ var tokenize = function(src, filename) {
    var start_row = row
    var start_col = col
    var value = ""
-    var esc = 0
    var depth = 0
    var tc = 0
    var q = 0
    adv() // skip opening backtick
    while (pos < len && pk() != CP_BACKTICK) {
      if (pk() == CP_BSLASH && pos + 1 < len) {
-        adv()
-        esc = adv()
-        if (esc == CP_n) { value = value + "\n" }
-        else if (esc == CP_t) { value = value + "\t" }
-        else if (esc == CP_r) { value = value + "\r" }
-        else if (esc == CP_BSLASH) { value = value + "\\" }
-        else if (esc == CP_BACKTICK) { value = value + "`" }
-        else if (esc == CP_DOLLAR) { value = value + "$" }
-        else if (esc == CP_0) { value = value + character(0) }
-        else { value = value + character(esc) }
+        value = value + character(adv())
+        value = value + character(adv())
      } else if (pk() == CP_DOLLAR && pos + 1 < len && pk_at(1) == CP_LBRACE) {
-        adv() // $
-        adv() // {
+        value = value + character(adv()) // $
+        value = value + character(adv()) // {
        depth = 1
        while (pos < len && depth > 0) {
          tc = pk()
-          if (tc == CP_LBRACE) { depth = depth + 1; adv() }
-          else if (tc == CP_RBRACE) { depth = depth - 1; adv() }
+          if (tc == CP_LBRACE) { depth = depth + 1; value = value + character(adv()) }
+          else if (tc == CP_RBRACE) {
+            depth = depth - 1
+            if (depth > 0) { value = value + character(adv()) }
+            else { value = value + character(adv()) }
+          }
          else if (tc == CP_SQUOTE || tc == CP_DQUOTE || tc == CP_BACKTICK) {
            q = adv()
+            value = value + character(q)
            while (pos < len && pk() != q) {
-              if (pk() == CP_BSLASH && pos + 1 < len) adv()
-              adv()
+              if (pk() == CP_BSLASH && pos + 1 < len) {
+                value = value + character(adv())
+              }
+              value = value + character(adv())
            }
-            if (pos < len) adv()
-          } else { adv() }
+            if (pos < len) { value = value + character(adv()) }
+          } else { value = value + character(adv()) }
        }
      } else {
        value = value + character(adv())