From a24331aae5b5f7c18e48ad5334e4f7422906963c Mon Sep 17 00:00:00 2001
From: John Alanbrook
Date: Thu, 5 Feb 2026 11:21:34 -0600
Subject: [PATCH] tokenize

---
 source/cell.c    |  59 ++++++
 source/quickjs.c | 534 +++++++++++++++++++++++++++++++++++++++++++++++
 source/quickjs.h |   5 +
 3 files changed, 598 insertions(+)

diff --git a/source/cell.c b/source/cell.c
index fb889c4a..c8776d4d 100644
--- a/source/cell.c
+++ b/source/cell.c
@@ -397,6 +397,65 @@ int cell_init(int argc, char **argv)
         return json ? 0 : 1;
     }
 
+    /* Check for --tokenize flag to output token array JSON */
+    if (argc >= 3 && strcmp(argv[1], "--tokenize") == 0) {
+        const char *script_or_file = argv[2];
+        char *script = NULL;
+        char *allocated_script = NULL;
+        const char *filename = "";
+
+        struct stat st;
+        if (stat(script_or_file, &st) == 0 && S_ISREG(st.st_mode)) {
+            FILE *f = fopen(script_or_file, "r");
+            if (!f) {
+                printf("Failed to open file: %s\n", script_or_file);
+                return 1;
+            }
+            allocated_script = malloc(st.st_size + 1);
+            if (!allocated_script) {
+                fclose(f);
+                printf("Failed to allocate memory for script\n");
+                return 1;
+            }
+            size_t read_size = fread(allocated_script, 1, st.st_size, f);
+            fclose(f);
+            allocated_script[read_size] = '\0';
+            script = allocated_script;
+            filename = script_or_file;
+        } else {
+            script = (char *)script_or_file;
+        }
+
+        JSRuntime *rt = JS_NewRuntime();
+        if (!rt) {
+            printf("Failed to create JS runtime\n");
+            free(allocated_script);
+            return 1;
+        }
+        JSContext *ctx = JS_NewContext(rt);
+        if (!ctx) {
+            printf("Failed to create JS context\n");
+            JS_FreeRuntime(rt);
+            free(allocated_script);
+            return 1;
+        }
+
+        char *json = JS_Tokenize(ctx, script, strlen(script), filename);
+        /* Record the outcome now: json must not be inspected after free() */
+        int tokenize_ok = (json != NULL);
+        if (tokenize_ok) {
+            printf("%s\n", json);
+            free(json);
+        } else {
+            printf("Failed to tokenize\n");
+        }
+
+        JS_FreeContext(ctx);
+        JS_FreeRuntime(rt);
+        free(allocated_script);
+        return tokenize_ok ? 0 : 1;
+    }
+
     /* Check for --mach flag to output machine code JSON */
     if (argc >= 3 && strcmp(argv[1], "--mach") == 0) {
         const char *script_or_file = argv[2];
diff --git a/source/quickjs.c b/source/quickjs.c
index a660efda..75a9e291 100644
--- a/source/quickjs.c
+++ b/source/quickjs.c
@@ -8513,6 +8513,10 @@ enum {
   TOK_ERROR,
   TOK_PRIVATE_NAME,
   TOK_EOF,
+  /* whitespace/comment tokens for tokenizer */
+  TOK_COMMENT,
+  TOK_NEWLINE,
+  TOK_SPACE,
   /* keywords: WARNING: same order as atoms */
   TOK_NULL, /* must be first */
   TOK_FALSE,
@@ -8574,6 +8578,116 @@ enum {
 #define CP_LS 0x2028
 #define CP_PS 0x2029
 
+/* Map token values to kind strings for tokenizer output.
+   Returns a pointer to static storage; the caller must not free or modify it. */
+static const char *ast_token_kind_str(int token_val) {
+  /* One 2-byte string per ASCII code, so the pointer returned for one
+     punctuator is not overwritten by a later call for a different one */
+  static char single_char[128][2];
+  switch (token_val) {
+  case TOK_NUMBER: return "number";
+  case TOK_STRING: return "text";
+  case TOK_TEMPLATE: return "text";
+  case TOK_IDENT: return "name";
+  case TOK_COMMENT: return "comment";
+  case TOK_NEWLINE: return "newline";
+  case TOK_SPACE: return "space";
+  case TOK_REGEXP: return "regexp";
+  case TOK_PRIVATE_NAME: return "private_name";
+  case TOK_EOF: return "eof";
+  case TOK_ERROR: return "error";
+  /* compound operators */
+  case TOK_MUL_ASSIGN: return "*=";
+  case TOK_DIV_ASSIGN: return "/=";
+  case TOK_MOD_ASSIGN: return "%=";
+  case TOK_PLUS_ASSIGN: return "+=";
+  case TOK_MINUS_ASSIGN: return "-=";
+  case TOK_SHL_ASSIGN: return "<<=";
+  case TOK_SAR_ASSIGN: return ">>=";
+  case TOK_SHR_ASSIGN: return ">>>=";
+  case TOK_AND_ASSIGN: return "&=";
+  case TOK_XOR_ASSIGN: return "^=";
+  case TOK_OR_ASSIGN: return "|=";
+  case TOK_POW_ASSIGN: return "**=";
+  case TOK_LAND_ASSIGN: return "&&=";
+  case TOK_LOR_ASSIGN: return "||=";
+  case TOK_DOUBLE_QUESTION_MARK_ASSIGN: return "?\?=";
+  case TOK_DEC: return "--";
+  case TOK_INC: return "++";
+  case TOK_SHL: return "<<";
+  case TOK_SAR: return ">>";
+  case TOK_SHR: return ">>>";
+  case TOK_LT: return "<";
+  case TOK_LTE: return "<=";
+  case TOK_GT: return ">";
+  case TOK_GTE: return ">=";
+  case TOK_EQ: return "==";
+  case TOK_STRICT_EQ: return "===";
+  case TOK_NEQ: return "!=";
+  case TOK_STRICT_NEQ: return "!==";
+  case TOK_LAND: return "&&";
+  case TOK_LOR: return "||";
+  case TOK_POW: return "**";
+  case TOK_ARROW: return "=>";
+  case TOK_DOUBLE_QUESTION_MARK: return "??";
+  case TOK_QUESTION_MARK_DOT: return "?.";
+  /* keywords */
+  case TOK_NULL: return "null";
+  case TOK_FALSE: return "false";
+  case TOK_TRUE: return "true";
+  case TOK_IF: return "if";
+  case TOK_ELSE: return "else";
+  case TOK_RETURN: return "return";
+  case TOK_GO: return "go";
+  case TOK_VAR: return "var";
+  case TOK_DEF: return "def";
+  case TOK_THIS: return "this";
+  case TOK_DELETE: return "delete";
+  case TOK_VOID: return "void";
+  case TOK_NEW: return "new";
+  case TOK_IN: return "in";
+  case TOK_DO: return "do";
+  case TOK_WHILE: return "while";
+  case TOK_FOR: return "for";
+  case TOK_BREAK: return "break";
+  case TOK_CONTINUE: return "continue";
+  case TOK_SWITCH: return "switch";
+  case TOK_CASE: return "case";
+  case TOK_DEFAULT: return "default";
+  case TOK_THROW: return "throw";
+  case TOK_TRY: return "try";
+  case TOK_CATCH: return "catch";
+  case TOK_FINALLY: return "finally";
+  case TOK_FUNCTION: return "function";
+  case TOK_DEBUGGER: return "debugger";
+  case TOK_WITH: return "with";
+  case TOK_CLASS: return "class";
+  case TOK_CONST: return "const";
+  case TOK_ENUM: return "enum";
+  case TOK_EXPORT: return "export";
+  case TOK_EXTENDS: return "extends";
+  case TOK_IMPORT: return "import";
+  case TOK_SUPER: return "super";
+  case TOK_IMPLEMENTS: return "implements";
+  case TOK_INTERFACE: return "interface";
+  case TOK_LET: return "let";
+  case TOK_PRIVATE: return "private";
+  case TOK_PROTECTED: return "protected";
+  case TOK_PUBLIC: return "public";
+  case TOK_STATIC: return "static";
+  case TOK_YIELD: return "yield";
+  case TOK_AWAIT: return "await";
+  case TOK_OF: return "of";
+  default:
+    /* Single character tokens */
+    if (token_val >= 0 && token_val < 128) {
+      single_char[token_val][0] = (char)token_val;
+      return single_char[token_val];
+    }
+    return "unknown";
+  }
+}
+
 typedef struct BlockEnv {
   struct BlockEnv *prev;
   JSValue label_name; /* JS_NULL if none */
@@ -28249,6 +28363,306 @@ redo:
   return 0;
 }
 
+/* Tokenizer function that does NOT skip whitespace/comments - emits them as
+   tokens. Scans one token at s->buf_ptr: sets s->token_ptr to its start,
+   s->token_val to its kind (payload in s->token_u), and advances s->buf_ptr
+   past it. Lookahead reads past the current byte, so the buffer must be
+   NUL-terminated at s->buf_end. Returns 0, or -1 on allocation failure. */
+static int tokenize_next (ASTParseState *s) {
+  const uint8_t *p;
+  int c;
+  BOOL ident_has_escape;
+
+  ast_free_token (s);
+  p = s->buf_ptr;
+  s->got_lf = FALSE;
+
+  s->token_ptr = p;
+  c = *p;
+  switch (c) {
+  case 0:
+    if (p >= s->buf_end) {
+      s->token_val = TOK_EOF;
+    } else {
+      goto def_token;
+    }
+    break;
+  case '`': {
+    const uint8_t *start = p;
+    p++;
+    while (p < s->buf_end && *p != '`') {
+      /* "${...}" parts are not split out yet - the whole body is one token */
+      if (*p == '\\' && p + 1 < s->buf_end) p++;
+      p++;
+    }
+    /* Measure the body before stepping over the closing quote: an
+       unterminated template has no delimiter to exclude */
+    size_t body_len = (size_t)(p - (start + 1));
+    if (p < s->buf_end) p++;
+    s->token_val = TOK_TEMPLATE;
+    s->token_u.str.str = JS_NewStringLen (s->ctx, (const char *)(start + 1), body_len);
+  } break;
+  case '\'':
+  case '\"': {
+    const uint8_t *start = p;
+    int quote = c;
+    p++;
+    while (p < s->buf_end && *p != quote) {
+      if (*p == '\\' && p + 1 < s->buf_end) p++;
+      p++;
+    }
+    /* Same as for templates: do not count a closing quote that is absent */
+    size_t body_len = (size_t)(p - (start + 1));
+    if (p < s->buf_end) p++;
+    s->token_val = TOK_STRING;
+    s->token_u.str.str = JS_NewStringLen (s->ctx, (const char *)(start + 1), body_len);
+  } break;
+  case '\r':
+    if (p[1] == '\n') p++;
+    /* fall through */
+  case '\n':
+    p++;
+    s->got_lf = TRUE;
+    s->token_val = TOK_NEWLINE;
+    break;
+  case '\f':
+  case '\v':
+  case ' ':
+  case '\t': {
+    /* Collect consecutive whitespace (excluding newlines) */
+    while (p < s->buf_end && (*p == ' ' || *p == '\t' || *p == '\f' || *p == '\v')) p++;
+    s->token_val = TOK_SPACE;
+  } break;
+  case '/':
+    if (p[1] == '*') {
+      /* Multi-line comment */
+      p += 2;
+      while (p < s->buf_end) {
+        if (p[0] == '*' && p[1] == '/') {
+          p += 2;
+          break;
+        }
+        if (*p == '\n' || *p == '\r') s->got_lf = TRUE;
+        p++;
+      }
+      s->token_val = TOK_COMMENT;
+    } else if (p[1] == '/') {
+      /* Single-line comment */
+      p += 2;
+      while (p < s->buf_end && *p != '\n' && *p != '\r') p++;
+      s->token_val = TOK_COMMENT;
+    } else if (p[1] == '=') {
+      p += 2;
+      s->token_val = TOK_DIV_ASSIGN;
+    } else {
+      p++;
+      s->token_val = c;
+    }
+    break;
+  case '\\':
+    goto def_token;
+  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
+  case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
+  case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
+  case 'v': case 'w': case 'x': case 'y': case 'z':
+  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
+  case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
+  case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
+  case 'V': case 'W': case 'X': case 'Y': case 'Z':
+  case '_': case '$': {
+    /* Keywords recognized by the tokenizer; anything else is a plain name */
+    static const struct {
+      const char *name;
+      int tok;
+    } keywords[] = {
+      { "if", TOK_IF }, { "in", TOK_IN }, { "do", TOK_DO }, { "go", TOK_GO },
+      { "var", TOK_VAR }, { "def", TOK_DEF }, { "for", TOK_FOR }, { "new", TOK_NEW },
+      { "try", TOK_TRY }, { "else", TOK_ELSE }, { "this", TOK_THIS }, { "null", TOK_NULL },
+      { "true", TOK_TRUE }, { "void", TOK_VOID }, { "case", TOK_CASE }, { "false", TOK_FALSE },
+      { "while", TOK_WHILE }, { "break", TOK_BREAK }, { "throw", TOK_THROW }, { "catch", TOK_CATCH },
+      { "return", TOK_RETURN }, { "delete", TOK_DELETE }, { "switch", TOK_SWITCH },
+      { "default", TOK_DEFAULT }, { "finally", TOK_FINALLY }, { "function", TOK_FUNCTION },
+      { "continue", TOK_CONTINUE },
+    };
+    const uint8_t *start = p;
+    ident_has_escape = FALSE;
+    p++;
+    while (p < s->buf_end) {
+      c = *p;
+      if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
+          (c >= '0' && c <= '9') || c == '_' || c == '$') {
+        p++;
+      } else if (c >= 0x80) {
+        p++;
+        while (p < s->buf_end && (*p & 0xc0) == 0x80) p++;
+      } else {
+        break;
+      }
+    }
+    size_t len = p - start;
+    s->token_u.ident.str = JS_NewStringLen (s->ctx, (const char *)start, len);
+    s->token_u.ident.has_escape = ident_has_escape;
+    s->token_u.ident.is_reserved = FALSE;
+    s->token_val = TOK_IDENT;
+
+    /* Check for keywords */
+    for (size_t i = 0; i < sizeof (keywords) / sizeof (keywords[0]); i++) {
+      if (strlen (keywords[i].name) == len && !memcmp (start, keywords[i].name, len)) {
+        s->token_val = keywords[i].tok;
+        break;
+      }
+    }
+  } break;
+  case '.':
+    if (p[1] >= '0' && p[1] <= '9') {
+      goto tokenize_number;
+    } else {
+      goto def_token;
+    }
+    break;
+  case '0': case '1': case '2': case '3': case '4':
+  case '5': case '6': case '7': case '8': case '9':
+  tokenize_number: {
+    const uint8_t *start = p;
+    double val = 0;
+    int radix = 10;
+    if (p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) radix = 16;
+    else if (p[0] == '0' && (p[1] == 'b' || p[1] == 'B')) radix = 2;
+    else if (p[0] == '0' && (p[1] == 'o' || p[1] == 'O')) radix = 8;
+    if (radix != 10) {
+      /* Prefixed integer: accumulate digits by hand, since strtod knows
+         neither the 0b/0o prefixes nor '_' separators */
+      p += 2;
+      while (p < s->buf_end) {
+        int digit;
+        c = *p;
+        if (c == '_') {
+          p++;
+          continue;
+        }
+        if (c >= '0' && c <= '9') digit = c - '0';
+        else if (c >= 'a' && c <= 'f') digit = c - 'a' + 10;
+        else if (c >= 'A' && c <= 'F') digit = c - 'A' + 10;
+        else break;
+        if (digit >= radix) break;
+        val = val * radix + digit;
+        p++;
+      }
+    } else {
+      while (p < s->buf_end && ((*p >= '0' && *p <= '9') || *p == '_')) p++;
+      if (p < s->buf_end && *p == '.') {
+        p++;
+        while (p < s->buf_end && ((*p >= '0' && *p <= '9') || *p == '_')) p++;
+      }
+      if (p < s->buf_end && (*p == 'e' || *p == 'E')) {
+        p++;
+        if (p < s->buf_end && (*p == '+' || *p == '-')) p++;
+        while (p < s->buf_end && (*p >= '0' && *p <= '9')) p++;
+      }
+      /* Hand strtod a copy without '_' separators so it sees every digit */
+      char *numstr = sys_malloc ((size_t)(p - start) + 1);
+      if (!numstr) {
+        s->token_val = TOK_ERROR; /* no payload was stored for this token */
+        s->buf_ptr = p;
+        return -1;
+      }
+      size_t n = 0;
+      for (const uint8_t *q = start; q < p; q++) {
+        if (*q != '_') numstr[n++] = (char)*q;
+      }
+      numstr[n] = '\0';
+      val = strtod (numstr, NULL);
+      sys_free (numstr);
+    }
+    s->token_val = TOK_NUMBER;
+    s->token_u.num.val = JS_NewFloat64 (s->ctx, val);
+  } break;
+  case '*':
+    if (p[1] == '=') { p += 2; s->token_val = TOK_MUL_ASSIGN; }
+    else if (p[1] == '*') {
+      if (p[2] == '=') { p += 3; s->token_val = TOK_POW_ASSIGN; }
+      else { p += 2; s->token_val = TOK_POW; }
+    } else { goto def_token; }
+    break;
+  case '%':
+    if (p[1] == '=') { p += 2; s->token_val = TOK_MOD_ASSIGN; }
+    else { goto def_token; }
+    break;
+  case '+':
+    if (p[1] == '=') { p += 2; s->token_val = TOK_PLUS_ASSIGN; }
+    else if (p[1] == '+') { p += 2; s->token_val = TOK_INC; }
+    else { goto def_token; }
+    break;
+  case '-':
+    if (p[1] == '=') { p += 2; s->token_val = TOK_MINUS_ASSIGN; }
+    else if (p[1] == '-') { p += 2; s->token_val = TOK_DEC; }
+    else { goto def_token; }
+    break;
+  case '<':
+    if (p[1] == '=') { p += 2; s->token_val = TOK_LTE; }
+    else if (p[1] == '<') {
+      if (p[2] == '=') { p += 3; s->token_val = TOK_SHL_ASSIGN; }
+      else { p += 2; s->token_val = TOK_SHL; }
+    } else { goto def_token; }
+    break;
+  case '>':
+    if (p[1] == '=') { p += 2; s->token_val = TOK_GTE; }
+    else if (p[1] == '>') {
+      if (p[2] == '>') {
+        if (p[3] == '=') { p += 4; s->token_val = TOK_SHR_ASSIGN; }
+        else { p += 3; s->token_val = TOK_SHR; }
+      } else if (p[2] == '=') { p += 3; s->token_val = TOK_SAR_ASSIGN; }
+      else { p += 2; s->token_val = TOK_SAR; }
+    } else { goto def_token; }
+    break;
+  case '=':
+    if (p[1] == '=') {
+      if (p[2] == '=') { p += 3; s->token_val = TOK_STRICT_EQ; }
+      else { p += 2; s->token_val = TOK_EQ; }
+    } else if (p[1] == '>') { p += 2; s->token_val = TOK_ARROW; }
+    else { goto def_token; }
+    break;
+  case '!':
+    if (p[1] == '=') {
+      if (p[2] == '=') { p += 3; s->token_val = TOK_STRICT_NEQ; }
+      else { p += 2; s->token_val = TOK_NEQ; }
+    } else { goto def_token; }
+    break;
+  case '&':
+    if (p[1] == '&') {
+      if (p[2] == '=') { p += 3; s->token_val = TOK_LAND_ASSIGN; }
+      else { p += 2; s->token_val = TOK_LAND; }
+    } else if (p[1] == '=') { p += 2; s->token_val = TOK_AND_ASSIGN; }
+    else { goto def_token; }
+    break;
+  case '|':
+    if (p[1] == '|') {
+      if (p[2] == '=') { p += 3; s->token_val = TOK_LOR_ASSIGN; }
+      else { p += 2; s->token_val = TOK_LOR; }
+    } else if (p[1] == '=') { p += 2; s->token_val = TOK_OR_ASSIGN; }
+    else { goto def_token; }
+    break;
+  case '^':
+    if (p[1] == '=') { p += 2; s->token_val = TOK_XOR_ASSIGN; }
+    else { goto def_token; }
+    break;
+  case '?':
+    if (p[1] == '?') {
+      if (p[2] == '=') { p += 3; s->token_val = TOK_DOUBLE_QUESTION_MARK_ASSIGN; }
+      else { p += 2; s->token_val = TOK_DOUBLE_QUESTION_MARK; }
+    } else if (p[1] == '.') { p += 2; s->token_val = TOK_QUESTION_MARK_DOT; }
+    else { goto def_token; }
+    break;
+  default:
+  def_token:
+    p++;
+    s->token_val = c;
+    break;
+  }
+  s->buf_ptr = p;
+  return 0;
+}
+
 static cJSON *ast_parse_primary (ASTParseState *s) {
   const uint8_t *start = s->token_ptr;
   cJSON *node = NULL;
@@ -29394,6 +29808,126 @@ char *JS_AST (JSContext *ctx, const char *source, size_t len, const char *filena
   return json;
 }
 
+/* Copy the current token's raw source text into tok["value"].
+   Returns 0 on success, -1 if the scratch buffer cannot be allocated. */
+static int token_add_source_text (cJSON *tok, const ASTParseState *s) {
+  size_t len = (size_t)(s->buf_ptr - s->token_ptr);
+  char *text = sys_malloc (len + 1);
+  if (!text) return -1;
+  memcpy (text, s->token_ptr, len);
+  text[len] = '\0';
+  cJSON_AddStringToObject (tok, "value", text);
+  sys_free (text);
+  return 0;
+}
+
+/* Build a token object for the tokenizer output: kind, source position
+   and, for literals/names/trivia, a "value" field. Returns NULL (with the
+   partial object already deleted) if memory runs out. */
+static cJSON *build_token_object (ASTParseState *s) {
+  cJSON *tok = cJSON_CreateObject ();
+  if (!tok) return NULL;
+  cJSON_AddStringToObject (tok, "kind", ast_token_kind_str (s->token_val));
+
+  /* Position info */
+  int at = (int)(s->token_ptr - s->buf_start);
+  int from_row, from_col;
+  ast_get_line_col (s, s->token_ptr, &from_row, &from_col);
+  int to_row, to_col;
+  ast_get_line_col (s, s->buf_ptr, &to_row, &to_col);
+
+  cJSON_AddNumberToObject (tok, "at", at);
+  cJSON_AddNumberToObject (tok, "from_row", from_row);
+  cJSON_AddNumberToObject (tok, "from_column", from_col);
+  cJSON_AddNumberToObject (tok, "to_row", to_row);
+  cJSON_AddNumberToObject (tok, "to_column", to_col);
+
+  /* Value field based on token type */
+  switch (s->token_val) {
+  case TOK_NUMBER: {
+    /* Store original source text as value */
+    if (token_add_source_text (tok, s) < 0) goto fail;
+    /* Store parsed number; read the float payload only when the tag says
+       the value is not an int */
+    double d;
+    if (JS_VALUE_GET_TAG (s->token_u.num.val) == JS_TAG_INT) {
+      d = JS_VALUE_GET_INT (s->token_u.num.val);
+    } else {
+      d = JS_VALUE_GET_FLOAT64 (s->token_u.num.val);
+    }
+    cJSON_AddNumberToObject (tok, "number", d);
+  } break;
+  case TOK_STRING:
+  case TOK_TEMPLATE: {
+    const char *str = JS_ToCString (s->ctx, s->token_u.str.str);
+    cJSON_AddStringToObject (tok, "value", str ? str : "");
+    JS_FreeCString (s->ctx, str);
+  } break;
+  case TOK_IDENT: {
+    const char *str = JS_ToCString (s->ctx, s->token_u.ident.str);
+    cJSON_AddStringToObject (tok, "value", str ? str : "");
+    JS_FreeCString (s->ctx, str);
+  } break;
+  case TOK_COMMENT:
+  case TOK_SPACE:
+  case TOK_NEWLINE:
+    /* Store the raw source text */
+    if (token_add_source_text (tok, s) < 0) goto fail;
+    break;
+  default:
+    /* No value field for operators/punctuators/keywords */
+    break;
+  }
+
+  return tok;
+
+fail:
+  cJSON_Delete (tok);
+  return NULL;
+}
+
+/* Tokenize source (len bytes, NUL-terminated at source[len]) and return the
+   token list as a JSON string produced by cJSON_Print, or NULL on error. */
+char *JS_Tokenize (JSContext *ctx, const char *source, size_t len, const char *filename) {
+  ASTParseState s;
+  memset (&s, 0, sizeof (s));
+
+  s.ctx = ctx;
+  s.filename = filename;
+  s.buf_start = (const uint8_t *)source;
+  s.buf_ptr = (const uint8_t *)source;
+  s.buf_end = (const uint8_t *)source + len;
+  s.function_nr = 0;
+  s.errors = NULL;
+  s.has_error = 0;
+
+  char *json = NULL;
+  cJSON *root = cJSON_CreateObject ();
+  if (!root) return NULL;
+  cJSON_AddStringToObject (root, "filename", filename);
+  cJSON *tokens = cJSON_AddArrayToObject (root, "tokens");
+  if (!tokens) goto fail;
+
+  /* Tokenize all tokens including whitespace */
+  while (1) {
+    if (tokenize_next (&s) < 0) goto fail;
+    cJSON *tok = build_token_object (&s);
+    if (!tok) goto fail;
+    cJSON_AddItemToArray (tokens, tok);
+    if (s.token_val == TOK_EOF) break;
+  }
+
+  json = cJSON_Print (root);
+  cJSON_Delete (root);
+  return json;
+
+fail:
+  /* Release whatever the current token still holds, then the partial tree */
+  ast_free_token (&s);
+  cJSON_Delete (root);
+  return NULL;
+}
+
 /* ============================================================
    Register-Based Machine Code Generator
    ============================================================ */
diff --git a/source/quickjs.h b/source/quickjs.h
index 80b621e0..d304e578 100644
--- a/source/quickjs.h
+++ b/source/quickjs.h
@@ -1222,6 +1222,11 @@ CellModule *JS_CompileModule (JSContext *ctx, const char *input, size_t input_le
    Returns malloc'd JSON string (caller must free), or NULL on error. */
 char *JS_AST (JSContext *ctx, const char *source, size_t len, const char *filename);
 
+/* Tokenize source code and return token array as JSON string.
+   source must be NUL-terminated at source[len].
+   Returns malloc'd JSON string (caller must free), or NULL on error. */
+char *JS_Tokenize (JSContext *ctx, const char *source, size_t len, const char *filename);
+
 /* Generate register-based machine code from AST JSON.
    Returns malloc'd JSON string (caller must free), or NULL on error. */
 char *JS_Mach (JSContext *ctx, const char *ast_json);