diff --git a/docs/_index.md b/docs/_index.md index 88290af8..3efc5a65 100644 --- a/docs/_index.md +++ b/docs/_index.md @@ -57,7 +57,9 @@ Modules loaded with `use()`: ## Tools - [**Command Line**](/docs/cli/) — the `pit` tool +- [**Semantic Index**](/docs/semantic-index/) — index and query symbols, references, and call sites - [**Testing**](/docs/testing/) — writing and running tests +- [**Compiler Inspection**](/docs/compiler-tools/) — dump AST, mcode, and optimizer reports - [**Writing C Modules**](/docs/c-modules/) — native extensions ## Architecture diff --git a/docs/semantic-index.md b/docs/semantic-index.md new file mode 100644 index 00000000..260d5f30 --- /dev/null +++ b/docs/semantic-index.md @@ -0,0 +1,270 @@ +--- +title: "Semantic Index" +description: "Index and query symbols, references, and call sites in source files" +weight: 55 +type: "docs" +--- + +ƿit includes a semantic indexer that extracts symbols, references, call sites, and imports from source files. The index powers the LSP (find references, rename) and is available as a CLI tool for scripting and debugging. + +## Overview + +The indexer walks the parsed AST without modifying it. It produces a JSON structure that maps every declaration, every reference to that declaration, and every call site in a file. + +``` +source → tokenize → parse → fold → index + ↓ + symbols, references, + call sites, imports, + exports, reverse refs +``` + +Two CLI commands expose this: + +| Command | Purpose | +|---------|---------| +| `pit index ` | Produce the full semantic index as JSON | +| `pit explain` | Query the index for a specific symbol or position | + +## pit index + +Index a source file and print the result as JSON. 
+ +```bash +pit index +pit index -o output.json +``` + +### Output + +The index contains these sections: + +| Section | Description | +|---------|-------------| +| `imports` | All `use()` calls with local name, module path, and span | +| `symbols` | Every declaration: vars, defs, functions, params | +| `references` | Every use of a name, classified as read, write, or call | +| `call_sites` | Every function call with callee, args count, and enclosing function | +| `exports` | For `.cm` modules, the keys of the top-level `return` record | +| `reverse_refs` | Inverted index: name to list of reference spans | + +### Example + +Given a file `graph.ce` with functions `make_node`, `connect`, and `build_graph`: + +```bash +pit index graph.ce +``` + +```json +{ + "version": 1, + "path": "graph.ce", + "is_actor": true, + "imports": [ + {"local_name": "json", "module_path": "json", "span": {"from_row": 2, "from_col": 0, "to_row": 2, "to_col": 22}} + ], + "symbols": [ + { + "symbol_id": "graph.ce:make_node:fn", + "name": "make_node", + "kind": "fn", + "params": ["name", "kind"], + "doc_comment": "// A node in the graph.", + "decl_span": {"from_row": 6, "from_col": 0, "to_row": 8, "to_col": 1}, + "scope_fn_nr": 0 + } + ], + "references": [ + {"node_id": 20, "name": "make_node", "ref_kind": "call", "span": {"from_row": 17, "from_col": 13, "to_row": 17, "to_col": 22}} + ], + "call_sites": [ + {"node_id": 20, "callee": "make_node", "args_count": 2, "span": {"from_row": 17, "from_col": 22, "to_row": 17, "to_col": 40}} + ], + "exports": [], + "reverse_refs": { + "make_node": [ + {"node_id": 20, "ref_kind": "call", "span": {"from_row": 17, "from_col": 13, "to_row": 17, "to_col": 22}} + ] + } +} +``` + +### Symbol Kinds + +| Kind | Description | +|------|-------------| +| `fn` | Function (var or def with function value) | +| `var` | Mutable variable | +| `def` | Constant | +| `param` | Function parameter | + +Each symbol has a `symbol_id` in the format `filename:name:kind` and a 
`decl_span` with `from_row`, `from_col`, `to_row`, `to_col` (0-based). + +### Reference Kinds + +| Kind | Description | +|------|-------------| +| `read` | Value is read | +| `write` | Value is assigned | +| `call` | Used as a function call target | + +### Module Exports + +For `.cm` files, the indexer detects the top-level `return` statement. If it returns a record literal, each key becomes an export linked to its symbol: + +```javascript +// math_utils.cm +var add = function(a, b) { return a + b } +var sub = function(a, b) { return a - b } +return {add: add, sub: sub} +``` + +```bash +pit index math_utils.cm +``` + +The `exports` section will contain: + +```json +[ + {"name": "add", "symbol_id": "math_utils.cm:add:fn"}, + {"name": "sub", "symbol_id": "math_utils.cm:sub:fn"} +] +``` + +## pit explain + +Query the semantic index for a specific symbol or cursor position. This is the targeted query interface — instead of dumping the full index, it answers a specific question. + +```bash +pit explain --span :: +pit explain --symbol +``` + +### --span: What is at this position? + +Point at a line and column (0-based) to find out what symbol or reference is there. + +```bash +pit explain --span demo.ce:6:4 +``` + +If the position lands on a declaration, that symbol is returned along with all its references and call sites. If it lands on a reference, the indexer traces back to the declaration and returns the same information. 
+ +The result includes: + +| Field | Description | +|-------|-------------| +| `symbol` | The resolved declaration (name, kind, params, doc comment, span) | +| `reference` | The reference at the cursor, if the cursor was on a reference | +| `references` | All references to this symbol across the file | +| `call_sites` | All call sites for this symbol | +| `imports` | The file's imports (for context) | + +```json +{ + "symbol": { + "name": "build_graph", + "symbol_id": "demo.ce:build_graph:fn", + "kind": "fn", + "params": [], + "doc_comment": "// Build a sample graph and return it." + }, + "references": [ + {"node_id": 71, "ref_kind": "call", "span": {"from_row": 39, "from_col": 12, "to_row": 39, "to_col": 23}} + ], + "call_sites": [] +} +``` + +### --symbol: Find a symbol by name + +Look up a symbol by name, returning all matching declarations and every reference. + +```bash +pit explain --symbol connect demo.ce +``` + +```json +{ + "symbols": [ + { + "name": "connect", + "symbol_id": "demo.ce:connect:fn", + "kind": "fn", + "params": ["from", "to", "label"], + "doc_comment": "// Connect two nodes with a labeled edge." + } + ], + "references": [ + {"node_id": 29, "ref_kind": "call", "span": {"from_row": 21, "from_col": 2, "to_row": 21, "to_col": 9}}, + {"node_id": 33, "ref_kind": "call", "span": {"from_row": 22, "from_col": 2, "to_row": 22, "to_col": 9}}, + {"node_id": 37, "ref_kind": "call", "span": {"from_row": 23, "from_col": 2, "to_row": 23, "to_col": 9}} + ], + "call_sites": [ + {"callee": "connect", "args_count": 3, "span": {"from_row": 21, "from_col": 9, "to_row": 21, "to_col": 29}}, + {"callee": "connect", "args_count": 3, "span": {"from_row": 22, "from_col": 9, "to_row": 22, "to_col": 31}}, + {"callee": "connect", "args_count": 3, "span": {"from_row": 23, "from_col": 9, "to_row": 23, "to_col": 29}} + ] +} +``` + +This tells you: `connect` is a function taking `(from, to, label)`, declared on line 11, and called 3 times inside `build_graph`. 
+ +## Programmatic Use + +The index and explain modules can be used directly from ƿit scripts: + +### index.cm + +```javascript +var tokenize_mod = use('tokenize') +var parse_mod = use('parse') +var fold_mod = use('fold') +var index_mod = use('index') + +var pipeline = {tokenize: tokenize_mod, parse: parse_mod, fold: fold_mod} +var idx = index_mod.index_file(src, filename, pipeline) +``` + +`index_file` runs the full pipeline (tokenize, parse, fold) and returns the index. If you already have a parsed AST and tokens, use `index_ast` instead: + +```javascript +var idx = index_mod.index_ast(ast, tokens, filename) +``` + +### explain.cm + +```javascript +var explain_mod = use('explain') +var expl = explain_mod.make(idx) + +// What is at line 10, column 5? +var result = expl.at_span(10, 5) + +// Find all symbols named "connect" +var result = expl.by_symbol("connect") + +// Get callers and callees of a symbol +var chain = expl.call_chain("demo.ce:connect:fn", 2) +``` + +For cross-file queries: + +```javascript +var result = explain_mod.explain_across([idx1, idx2, idx3], "connect") +``` + +## LSP Integration + +The semantic index powers these LSP features: + +| Feature | LSP Method | Description | +|---------|------------|-------------| +| Find References | `textDocument/references` | All references to the symbol under the cursor | +| Rename | `textDocument/rename` | Rename a symbol and all its references | +| Prepare Rename | `textDocument/prepareRename` | Validate that the cursor is on a renameable symbol | +| Go to Definition | `textDocument/definition` | Jump to a symbol's declaration (index-backed with AST fallback) | + +These work automatically in any editor with ƿit LSP support. The index is rebuilt on every file change. diff --git a/editors/vscode/lsp/analysis.cm b/editors/vscode/lsp/analysis.cm index 9f3d7b4b..be5ad58c 100644 --- a/editors/vscode/lsp/analysis.cm +++ b/editors/vscode/lsp/analysis.cm @@ -1,10 +1,10 @@ // Document analysis module. 
-// Call make(tokenize_mod, parse_mod) to get an analysis object. +// Call make(tokenize_mod, parse_mod, index_mod) to get an analysis object. var json = use('json') -// Create an analysis module bound to the tokenize and parse functions. -var make = function(tokenize_mod, parse_mod) { +// Create an analysis module bound to the tokenize, parse, and index functions. +var make = function(tokenize_mod, parse_mod, index_mod) { // Tokenize and parse a document, storing the results. var update = function(docs, uri, params) { @@ -36,13 +36,24 @@ var make = function(tokenize_mod, parse_mod) { } } + var idx = null + var do_index = function() { + idx = index_mod.index_ast(ast, (tok_result != null) ? tok_result.tokens : [], uri) + } disruption { + // indexing failure is non-fatal + } + if (ast != null && index_mod != null) { + do_index() + } + doc = { uri: uri, text: src, version: version, tokens: (tok_result != null) ? tok_result.tokens : [], ast: ast, - errors: errors + errors: errors, + index: idx } docs[uri] = doc return doc diff --git a/editors/vscode/lsp/lsp.ce b/editors/vscode/lsp/lsp.ce index f0233091..e7654841 100644 --- a/editors/vscode/lsp/lsp.ce +++ b/editors/vscode/lsp/lsp.ce @@ -13,9 +13,11 @@ var symbols = use('symbols') // These are the same functions the compiler uses internally. 
var tokenize_mod = use('tokenize') var parse_mod = use('parse') +var index_mod = use('index') +var explain_mod = use('explain') -// Create analysis module bound to tokenize/parse -var analysis = analysis_make(tokenize_mod, parse_mod) +// Create analysis module bound to tokenize/parse/index +var analysis = analysis_make(tokenize_mod, parse_mod, index_mod) // Document store: URI -> {text, version, ast, tokens, errors} var docs = {} @@ -54,7 +56,9 @@ var handle_initialize = function(id, params) { }, hoverProvider: true, definitionProvider: true, - documentSymbolProvider: true + documentSymbolProvider: true, + referencesProvider: true, + renameProvider: {prepareProvider: true} }, serverInfo: { name: "pit-lsp", @@ -144,6 +148,159 @@ var handle_document_symbol = function(id, params) { protocol.respond(id, result) } +// Handle textDocument/references request. +var handle_references = function(id, params) { + var uri = params.textDocument.uri + var pos = params.position + var doc = docs[uri] + var result = [] + var tok = null + var name = null + var refs = null + var _i = 0 + var ref = null + var expl = null + var sym_result = null + if (doc != null && doc.index != null) { + tok = analysis.token_at(doc, pos.line, pos.character) + if (tok != null && tok.kind == "name" && tok.value != null) { + name = tok.value + refs = doc.index.reverse_refs[name] + if (refs != null) { + _i = 0 + while (_i < length(refs)) { + ref = refs[_i] + if (ref.span != null) { + result[] = { + uri: uri, + range: { + start: {line: ref.span.from_row, character: ref.span.from_col}, + end: {line: ref.span.to_row, character: ref.span.to_col} + } + } + } + _i = _i + 1 + } + } + // Also include the declaration itself if found + expl = explain_mod.make(doc.index) + sym_result = expl.by_symbol(name) + if (sym_result != null && length(sym_result.symbols) > 0) { + _i = 0 + while (_i < length(sym_result.symbols)) { + if (sym_result.symbols[_i].decl_span != null) { + result[] = { + uri: uri, + range: { + start: 
{line: sym_result.symbols[_i].decl_span.from_row, character: sym_result.symbols[_i].decl_span.from_col}, + end: {line: sym_result.symbols[_i].decl_span.to_row, character: sym_result.symbols[_i].decl_span.to_col} + } + } + } + _i = _i + 1 + } + } + } + } + protocol.respond(id, result) +} + +// Handle textDocument/prepareRename request. +var handle_prepare_rename = function(id, params) { + var uri = params.textDocument.uri + var pos = params.position + var doc = docs[uri] + var tok = null + var name = null + var result = null + var expl = null + var sym_result = null + if (doc != null) { + tok = analysis.token_at(doc, pos.line, pos.character) + if (tok != null && tok.kind == "name" && tok.value != null) { + name = tok.value + // Don't allow renaming intrinsics + if (doc.index != null) { + expl = explain_mod.make(doc.index) + sym_result = expl.by_symbol(name) + if (sym_result != null && length(sym_result.symbols) > 0) { + result = { + range: { + start: {line: tok.from_row, character: tok.from_column}, + end: {line: tok.to_row, character: tok.to_column} + }, + placeholder: name + } + } + } + } + } + protocol.respond(id, result) +} + +// Handle textDocument/rename request. 
+var handle_rename = function(id, params) { + var uri = params.textDocument.uri + var pos = params.position + var new_name = params.newName + var doc = docs[uri] + var tok = null + var name = null + var edits = [] + var refs = null + var _i = 0 + var ref = null + var expl = null + var sym_result = null + if (doc != null && doc.index != null) { + tok = analysis.token_at(doc, pos.line, pos.character) + if (tok != null && tok.kind == "name" && tok.value != null) { + name = tok.value + expl = explain_mod.make(doc.index) + sym_result = expl.by_symbol(name) + // Add edit for declaration + if (sym_result != null && length(sym_result.symbols) > 0) { + _i = 0 + while (_i < length(sym_result.symbols)) { + if (sym_result.symbols[_i].decl_span != null) { + edits[] = { + range: { + start: {line: sym_result.symbols[_i].decl_span.from_row, character: sym_result.symbols[_i].decl_span.from_col}, + end: {line: sym_result.symbols[_i].decl_span.to_row, character: sym_result.symbols[_i].decl_span.to_col} + }, + newText: new_name + } + } + _i = _i + 1 + } + } + // Add edits for all references + refs = doc.index.reverse_refs[name] + if (refs != null) { + _i = 0 + while (_i < length(refs)) { + ref = refs[_i] + if (ref.span != null) { + edits[] = { + range: { + start: {line: ref.span.from_row, character: ref.span.from_col}, + end: {line: ref.span.to_row, character: ref.span.to_col} + }, + newText: new_name + } + } + _i = _i + 1 + } + } + } + } + var changes = {} + if (length(edits) > 0) { + changes[uri] = edits + } + protocol.respond(id, {changes: changes}) +} + // Dispatch a single message. Wrapped in a function for disruption handling. 
var dispatch_message = function(msg) { var method = msg.method @@ -167,6 +324,12 @@ var dispatch_message = function(msg) { handle_definition(msg.id, msg.params) } else if (method == "textDocument/documentSymbol") { handle_document_symbol(msg.id, msg.params) + } else if (method == "textDocument/references") { + handle_references(msg.id, msg.params) + } else if (method == "textDocument/prepareRename") { + handle_prepare_rename(msg.id, msg.params) + } else if (method == "textDocument/rename") { + handle_rename(msg.id, msg.params) } else if (method == "shutdown") { protocol.respond(msg.id, null) return "shutdown" diff --git a/editors/vscode/lsp/symbols.cm b/editors/vscode/lsp/symbols.cm index 86b95044..9b9b3d0e 100644 --- a/editors/vscode/lsp/symbols.cm +++ b/editors/vscode/lsp/symbols.cm @@ -91,14 +91,12 @@ var document_symbols = function(doc) { } // Find the declaration location of a name at a given position. +// Uses the semantic index when available, falls back to AST walk. var definition = function(doc, line, col, token_at) { var tok = token_at(doc, line, col) - var ast = doc.ast var name = null var _i = 0 - var _j = 0 - var scope = null - var v = null + var sym = null var decl = null if (tok == null || tok.kind != "name" || tok.value == null) { @@ -107,32 +105,18 @@ var definition = function(doc, line, col, token_at) { name = tok.value - if (ast == null) { - return null - } - - // Search through scopes for the variable declaration - if (ast.scopes != null) { + // Use the semantic index if available + if (doc.index != null) { _i = 0 - while (_i < length(ast.scopes)) { - scope = ast.scopes[_i] - if (scope.vars != null) { - _j = 0 - while (_j < length(scope.vars)) { - v = scope.vars[_j] - if (v.name == name) { - decl = find_declaration(ast.statements, name) - if (decl != null) { - return { - uri: doc.uri, - range: { - start: {line: decl.from_row, character: decl.from_column}, - end: {line: decl.to_row, character: decl.to_column} - } - } - } + while (_i < 
length(doc.index.symbols)) { + sym = doc.index.symbols[_i] + if (sym.name == name && sym.decl_span != null) { + return { + uri: doc.uri, + range: { + start: {line: sym.decl_span.from_row, character: sym.decl_span.from_col}, + end: {line: sym.decl_span.to_row, character: sym.decl_span.to_col} } - _j = _j + 1 } } _i = _i + 1 @@ -140,13 +124,15 @@ var definition = function(doc, line, col, token_at) { } // Fallback: walk statements for var/def with this name - decl = find_declaration(ast.statements, name) - if (decl != null) { - return { - uri: doc.uri, - range: { - start: {line: decl.from_row, character: decl.from_column}, - end: {line: decl.to_row, character: decl.to_column} + if (doc.ast != null) { + decl = find_declaration(doc.ast.statements, name) + if (decl != null) { + return { + uri: doc.uri, + range: { + start: {line: decl.from_row, character: decl.from_column}, + end: {line: decl.to_row, character: decl.to_column} + } } } } diff --git a/explain.ce b/explain.ce new file mode 100644 index 00000000..6b2c182a --- /dev/null +++ b/explain.ce @@ -0,0 +1,127 @@ +// cell explain — Query the semantic index for a source file. 
+// +// Usage: +// cell explain --span file.ce:10:5 Find symbol at position +// cell explain --symbol add_node Find symbol by name +// cell explain --symbol add_node file.ce Limit to specific file +// cell explain --help Show this help + +var fd = use('fd') +var json = use('json') +var tokenize_mod = use('tokenize') +var parse_mod = use('parse') +var fold_mod = use('fold') +var index_mod = use('index') +var explain_mod = use('explain') + +var mode = null +var span_arg = null +var symbol_name = null +var file_arg = null +var i = 0 +var parts = null +var filename = null +var line = null +var col = null +var src = null +var idx = null +var explain = null +var result = null +var pipeline = {tokenize: tokenize_mod, parse: parse_mod, fold: fold_mod} + +for (i = 0; i < length(args); i++) { + if (args[i] == '--span') { + mode = "span" + if (i + 1 < length(args)) { + span_arg = args[i + 1] + i = i + 1 + } else { + log.error('--span requires file:line:col') + $stop() + } + } else if (args[i] == '--symbol') { + mode = "symbol" + if (i + 1 < length(args)) { + symbol_name = args[i + 1] + i = i + 1 + } else { + log.error('--symbol requires a name') + $stop() + } + } else if (args[i] == '--help' || args[i] == '-h') { + log.console("Usage: cell explain [options]") + log.console("") + log.console("Query the semantic index for a source file.") + log.console("") + log.console("Options:") + log.console(" --span file:line:col Find symbol at position") + log.console(" --symbol name [file] Find symbol by name") + $stop() + } else if (!starts_with(args[i], '-')) { + if (file_arg == null) { + file_arg = args[i] + } + } +} + +if (mode == null) { + log.error('Specify --span or --symbol. 
Use --help for usage.') + $stop() +} + +if (mode == "span") { + parts = array(span_arg, ":") + if (length(parts) < 3) { + log.error('--span requires file:line:col format') + $stop() + } + + filename = parts[0] + line = number(parts[1]) + col = number(parts[2]) + + if (!fd.is_file(filename)) { + log.error('File not found: ' + filename) + $stop() + } + + src = text(fd.slurp(filename)) + idx = index_mod.index_file(src, filename, pipeline) + explain = explain_mod.make(idx) + result = explain.at_span(line, col) + + if (result == null) { + log.console("Nothing found at " + filename + ":" + text(line) + ":" + text(col)) + } else { + print(json.encode(result, true)) + print("\n") + } +} + +if (mode == "symbol") { + filename = file_arg + + if (filename == null) { + log.error('--symbol requires a file argument') + $stop() + } + + if (!fd.is_file(filename)) { + log.error('File not found: ' + filename) + $stop() + } + + src = text(fd.slurp(filename)) + idx = index_mod.index_file(src, filename, pipeline) + explain = explain_mod.make(idx) + result = explain.by_symbol(symbol_name) + + if (result == null || length(result.symbols) == 0) { + log.console("Symbol '" + symbol_name + "' not found in " + filename) + } else { + print(json.encode(result, true)) + print("\n") + } +} + +$stop() diff --git a/explain.cm b/explain.cm new file mode 100644 index 00000000..36e8fe09 --- /dev/null +++ b/explain.cm @@ -0,0 +1,235 @@ +// explain.cm — Query module over a semantic index. +// +// Usage: +// var explain = use('explain').make(index) +// explain.at_span(line, col) +// explain.by_symbol(name) +// explain.call_chain(symbol_id, depth) + +// Check if a position (line, col) falls inside a span. 
+var span_contains = function(span, line, col) { + if (line < span.from_row || line > span.to_row) return false + if (line == span.from_row && col < span.from_col) return false + if (line == span.to_row && col > span.to_col) return false + return true +} + +// Create an explain interface bound to a single file index. +var make = function(index) { + + // Find symbol or reference at a given line/col position. + var at_span = function(line, col) { + var _i = 0 + var sym = null + var ref = null + var found_sym = null + var found_ref = null + var result_refs = [] + var result_calls = [] + + // Search symbols for one whose decl_span contains (line, col). + _i = 0 + while (_i < length(index.symbols)) { + sym = index.symbols[_i] + if (sym.decl_span != null && span_contains(sym.decl_span, line, col)) { + found_sym = sym + break + } + _i = _i + 1 + } + + // If no symbol found, search references. + if (found_sym == null) { + _i = 0 + while (_i < length(index.references)) { + ref = index.references[_i] + if (ref.span != null && span_contains(ref.span, line, col)) { + found_ref = ref + // Look up the symbol this reference points to. + if (ref.symbol_id != null) { + _i = 0 + while (_i < length(index.symbols)) { + if (index.symbols[_i].symbol_id == ref.symbol_id) { + found_sym = index.symbols[_i] + break + } + _i = _i + 1 + } + } + break + } + _i = _i + 1 + } + } + + if (found_sym == null && found_ref == null) return null + + // Gather all references to this symbol. + if (found_sym != null && index.reverse_refs[found_sym.name] != null) { + result_refs = index.reverse_refs[found_sym.name] + } + + // Gather call sites. 
+ _i = 0 + while (_i < length(index.call_sites)) { + if (found_sym != null && index.call_sites[_i].callee_symbol_id == found_sym.symbol_id) { + result_calls[] = index.call_sites[_i] + } + _i = _i + 1 + } + + return { + symbol: found_sym, + reference: found_ref, + references: result_refs, + call_sites: result_calls, + imports: index.imports + } + } + + // Find all symbols matching a name. + var by_symbol = function(name) { + var _i = 0 + var matches = [] + var result_refs = [] + var result_calls = [] + + // Find matching symbols. + _i = 0 + while (_i < length(index.symbols)) { + if (index.symbols[_i].name == name) { + matches[] = index.symbols[_i] + } + _i = _i + 1 + } + + // Gather all references to this name. + if (index.reverse_refs[name] != null) { + result_refs = index.reverse_refs[name] + } + + // Gather call sites where this name is the callee. + _i = 0 + while (_i < length(index.call_sites)) { + if (index.call_sites[_i].callee == name) { + result_calls[] = index.call_sites[_i] + } + _i = _i + 1 + } + + return { + symbols: matches, + references: result_refs, + call_sites: result_calls + } + } + + // Build a call chain from/to a symbol. + var call_chain = function(symbol_id, depth) { + var max_depth = (depth != null) ? depth : 2 + var callers = [] + var callees = [] + var _i = 0 + var cs = null + + // Callees: calls made FROM this symbol. + _i = 0 + while (_i < length(index.call_sites)) { + cs = index.call_sites[_i] + if (cs.enclosing == symbol_id) { + callees[] = { + callee: cs.callee, + callee_symbol_id: cs.callee_symbol_id, + span: cs.span, + args_count: cs.args_count + } + } + _i = _i + 1 + } + + // Callers: calls TO this symbol. 
+ _i = 0 + while (_i < length(index.call_sites)) { + cs = index.call_sites[_i] + if (cs.callee_symbol_id == symbol_id) { + callers[] = { + from: cs.enclosing, + span: cs.span, + args_count: cs.args_count + } + } + _i = _i + 1 + } + + return { + symbol_id: symbol_id, + callers: callers, + callees: callees, + depth: max_depth + } + } + + return { + at_span: at_span, + by_symbol: by_symbol, + call_chain: call_chain + } +} + +// Search across multiple file indexes. +var explain_across = function(indexes, name) { + var _i = 0 + var _j = 0 + var all_symbols = [] + var all_refs = [] + var all_calls = [] + var idx = null + var refs = null + + _i = 0 + while (_i < length(indexes)) { + idx = indexes[_i] + + // Gather symbols. + _j = 0 + while (_j < length(idx.symbols)) { + if (idx.symbols[_j].name == name) { + all_symbols[] = idx.symbols[_j] + } + _j = _j + 1 + } + + // Gather references. + refs = idx.reverse_refs[name] + if (refs != null) { + _j = 0 + while (_j < length(refs)) { + all_refs[] = refs[_j] + _j = _j + 1 + } + } + + // Gather call sites. + _j = 0 + while (_j < length(idx.call_sites)) { + if (idx.call_sites[_j].callee == name) { + all_calls[] = idx.call_sites[_j] + } + _j = _j + 1 + } + + _i = _i + 1 + } + + return { + symbols: all_symbols, + references: all_refs, + call_sites: all_calls + } +} + +return { + make: make, + explain_across: explain_across, + span_contains: span_contains +} diff --git a/index.ce b/index.ce new file mode 100644 index 00000000..c991ce0e --- /dev/null +++ b/index.ce @@ -0,0 +1,64 @@ +// cell index — Build semantic index for a source file. 
+// +// Usage: +// cell index Index one file, output JSON to stdout +// cell index -o Index one file, write to file +// cell index --help Show this help + +var fd = use('fd') +var json = use('json') +var tokenize_mod = use('tokenize') +var parse_mod = use('parse') +var fold_mod = use('fold') +var index_mod = use('index') + +var filename = null +var output_path = null +var i = 0 + +for (i = 0; i < length(args); i++) { + if (args[i] == '-o' || args[i] == '--output') { + if (i + 1 < length(args)) { + output_path = args[i + 1] + i = i + 1 + } else { + log.error('-o requires a file path') + $stop() + } + } else if (args[i] == '--help' || args[i] == '-h') { + log.console("Usage: cell index [options]") + log.console("") + log.console("Build a semantic index for a source file.") + log.console("") + log.console("Options:") + log.console(" -o Write output to file instead of stdout") + $stop() + } else if (!starts_with(args[i], '-')) { + filename = args[i] + } +} + +if (filename == null) { + log.error('No file specified. Usage: cell index ') + $stop() +} + +if (!fd.is_file(filename)) { + log.error('File not found: ' + filename) + $stop() +} + +var src = text(fd.slurp(filename)) +var pipeline = {tokenize: tokenize_mod, parse: parse_mod, fold: fold_mod} +var idx = index_mod.index_file(src, filename, pipeline) +var out = json.encode(idx, true) + +if (output_path != null) { + fd.slurpwrite(output_path, out) + log.console('Wrote index to ' + output_path) +} else { + print(out) + print("\n") +} + +$stop() diff --git a/index.cm b/index.cm new file mode 100644 index 00000000..d76b8b43 --- /dev/null +++ b/index.cm @@ -0,0 +1,619 @@ +// index.cm — Core semantic indexing module. +// Walks AST output from parse (+ optional fold) to build a semantic index. 
+// +// Two entry points: +// index_file(src, filename, tokenize_mod, parse_mod, fold_mod) — full pipeline +// index_ast(ast, tokens, filename) — index a pre-parsed AST + +var make_span = function(node) { + return { + from_row: node.from_row, + from_col: node.from_column, + to_row: node.to_row, + to_col: node.to_column + } +} + +// Index an already-parsed AST. Tokens are optional (used for doc comments). +var index_ast = function(ast, tokens, filename) { + var is_actor = ends_with(filename, ".ce") + var imports = [] + var symbols = [] + var references = [] + var call_sites = [] + var exports_list = [] + var node_counter = 0 + var fn_map = {} + var _i = 0 + var _j = 0 + var fn = null + var sym_id = null + var params_list = null + var scope = null + var keys = null + var key = null + var entry = null + var reverse = {} + + // Build function_nr -> {name, outer, from_row} map from ast.functions. + if (ast.functions != null) { + _i = 0 + while (_i < length(ast.functions)) { + fn = ast.functions[_i] + fn_map[text(fn.function_nr)] = { + name: fn.name, + outer: fn.outer, + from_row: fn.from_row + } + _i = _i + 1 + } + } + + // Walk scope chain upward by `lvl` levels from func_nr. + var resolve_scope_nr = function(func_nr, lvl) { + var current = func_nr + var remaining = lvl + var info = null + if (remaining == null || remaining < 0) return null + while (remaining > 0 && current != null) { + info = fn_map[text(current)] + if (info != null) { + current = info.outer + } else { + return null + } + remaining = remaining - 1 + } + return current + } + + // Resolve a name node to its symbol_id using scope chain. 
+ var resolve_symbol_id = function(name_node) { + var decl_fn_nr = resolve_scope_nr(name_node.function_nr, name_node.level) + var _si = 0 + var s = null + var e = null + var kind_str = null + if (decl_fn_nr == null) return null + if (ast.scopes == null) return null + _si = 0 + while (_si < length(ast.scopes)) { + s = ast.scopes[_si] + if (s.function_nr == decl_fn_nr) { + e = s[name_node.name] + if (e != null) { + kind_str = e.make + if (kind_str == "function") kind_str = "fn" + if (kind_str == "input") kind_str = "param" + return filename + ":" + name_node.name + ":" + kind_str + } + } + _si = _si + 1 + } + return null + } + + // Get enclosing symbol id for a function_nr. + var get_enclosing = function(func_nr) { + var info = fn_map[text(func_nr)] + if (info == null || func_nr == 0) return null + if (info.name != null) return filename + ":" + info.name + ":fn" + return null + } + + // Find doc comment in tokens immediately before target_row. + var find_doc_comment = function(target_row) { + var _ti = 0 + var tok = null + var lines = [] + var line_nr = null + if (tokens == null) return null + _ti = 0 + while (_ti < length(tokens)) { + tok = tokens[_ti] + if (tok.kind == "comment" && tok.from_row >= target_row - 10 && tok.from_row < target_row) { + lines[] = tok.value + } + if (tok.from_row >= target_row) break + _ti = _ti + 1 + } + if (length(lines) > 0) return text(lines, "\n") + return null + } + + // Allocate a monotonic node id. + var next_id = function() { + node_counter = node_counter + 1 + return node_counter + } + + // Forward declarations for mutual recursion. + var walk_expr = null + var walk_stmts = null + var walk_stmt = null + + // Walk an expression node, collecting references and call sites. 
// walk_expr(node, enclosing, is_lhs)
//   node      — expression AST node (null is tolerated)
//   enclosing — symbol id of the enclosing function, or null at top level
//   is_lhs    — true when this expression is an assignment target, so a
//               plain name is recorded as a "write" instead of a "read"
// Appends to the index's references[] and call_sites[] accumulators.
walk_expr = function(node, enclosing, is_lhs) {
    var nid = 0
    var ref_kind = null
    var callee_name = null
    var callee_sym = null
    var arg_count = 0
    var _ai = 0
    var enc = null
    var param_name = null

    if (node == null) return

    nid = next_id()

    // Name reference — has function_nr when it's a true variable reference.
    // Intrinsics are skipped: they have no user-visible declaration.
    if (node.kind == "name" && node.name != null && node.function_nr != null) {
        if (node.intrinsic != true) {
            ref_kind = is_lhs ? "write" : "read"
            references[] = {
                node_id: nid,
                name: node.name,
                symbol_id: resolve_symbol_id(node),
                span: make_span(node),
                enclosing: enclosing,
                ref_kind: ref_kind
            }
        }
    }

    // Call expression.
    if (node.kind == "(") {
        callee_name = null
        callee_sym = null
        arg_count = (node.list != null) ? length(node.list) : 0

        if (node.expression != null) {
            if (node.expression.kind == "name") {
                callee_name = node.expression.name
                if (node.expression.intrinsic != true && node.expression.function_nr != null) {
                    callee_sym = resolve_symbol_id(node.expression)
                }
            } else if (node.expression.kind == ".") {
                // Method-style call: report a dotted "object.method" callee.
                if (node.expression.left != null && node.expression.left.kind == "name") {
                    callee_name = node.expression.left.name
                }
                if (node.expression.right != null && node.expression.right.name != null) {
                    callee_name = (callee_name != null ? callee_name + "." : "") + node.expression.right.name
                }
            }
        }

        // use() calls are reported in the imports section, not as call sites.
        if (callee_name != "use") {
            call_sites[] = {
                node_id: nid,
                callee: callee_name,
                callee_symbol_id: callee_sym,
                span: make_span(node),
                enclosing: enclosing,
                args_count: arg_count
            }
        }

        // Also record the callee name as a "call" reference.
        if (node.expression != null && node.expression.kind == "name" &&
            node.expression.function_nr != null && node.expression.intrinsic != true) {
            references[] = {
                node_id: nid,
                name: node.expression.name,
                symbol_id: resolve_symbol_id(node.expression),
                span: make_span(node.expression),
                enclosing: enclosing,
                ref_kind: "call"
            }
        }

        // Walk callee expression (skip name — already recorded above).
        if (node.expression != null && node.expression.kind != "name") {
            walk_expr(node.expression, enclosing, false)
        }

        // Walk arguments.
        if (node.list != null) {
            _ai = 0
            while (_ai < length(node.list)) {
                walk_expr(node.list[_ai], enclosing, false)
                _ai = _ai + 1
            }
        }
        return
    }

    // Function / arrow function expression — walk body.
    if (node.kind == "function" || node.kind == "arrow function") {
        // Named nested functions become the new enclosing symbol for
        // references found inside; anonymous ones keep the outer one.
        enc = enclosing
        if (node.name != null && node.function_nr != null) {
            enc = filename + ":" + node.name + ":fn"
        }
        // Record params as symbols.
        if (node.list != null) {
            _ai = 0
            while (_ai < length(node.list)) {
                param_name = node.list[_ai].name
                if (param_name != null) {
                    symbols[] = {
                        symbol_id: filename + ":" + param_name + ":param",
                        name: param_name,
                        kind: "param",
                        decl_span: make_span(node.list[_ai]),
                        doc_comment: null,
                        scope_fn_nr: node.function_nr,
                        params: null
                    }
                }
                _ai = _ai + 1
            }
        }
        walk_stmts(node.statements, enc)
        walk_stmts(node.disruption, enc)
        return
    }

    // Assignment operators — left side is a write.
    if (node.kind == "=" || node.kind == "+=" || node.kind == "-=" ||
        node.kind == "*=" || node.kind == "/=" || node.kind == "%=") {
        walk_expr(node.left, enclosing, true)
        walk_expr(node.right, enclosing, false)
        return
    }

    // Property access — only walk left (right is property name, not a ref).
    if (node.kind == ".") {
        walk_expr(node.left, enclosing, false)
        return
    }

    // Index access — both sides are reads.
    if (node.kind == "[") {
        walk_expr(node.left, enclosing, false)
        walk_expr(node.right, enclosing, false)
        return
    }

    // Array literal.
    if (node.kind == "array" && node.list != null) {
        _ai = 0
        while (_ai < length(node.list)) {
            walk_expr(node.list[_ai], enclosing, false)
            _ai = _ai + 1
        }
        return
    }

    // Record literal — only walk values, not keys.
    if (node.kind == "record" && node.list != null) {
        _ai = 0
        while (_ai < length(node.list)) {
            if (node.list[_ai] != null) {
                walk_expr(node.list[_ai].right, enclosing, false)
            }
            _ai = _ai + 1
        }
        return
    }

    // Template literal — walk interpolated parts.
    if (node.kind == "template" && node.list != null) {
        _ai = 0
        while (_ai < length(node.list)) {
            walk_expr(node.list[_ai], enclosing, false)
            _ai = _ai + 1
        }
        return
    }

    // Prefix/postfix increment/decrement — treat as write.
    if (node.kind == "++" || node.kind == "--") {
        walk_expr(node.expression, enclosing, true)
        return
    }

    // Ternary — condition, then-branch, else-branch are all reads.
    if (node.kind == "?" || node.kind == "then") {
        walk_expr(node.expression, enclosing, false)
        walk_expr(node.then, enclosing, false)
        walk_expr(node.else, enclosing, false)
        return
    }

    // Generic fallthrough: walk left, right, expression.
    // is_lhs is propagated only to left (e.g. nested assignment targets).
    if (node.left != null) walk_expr(node.left, enclosing, is_lhs)
    if (node.right != null) walk_expr(node.right, enclosing, false)
    if (node.expression != null) walk_expr(node.expression, enclosing, false)
}

// Walk an array of statements, in order; null arrays are tolerated.
walk_stmts = function(stmts, enclosing) {
    var _wi = 0
    if (stmts == null) return
    _wi = 0
    while (_wi < length(stmts)) {
        walk_stmt(stmts[_wi], enclosing)
        _wi = _wi + 1
    }
}

// Walk a single statement, collecting declarations (symbols, imports)
// and delegating sub-expressions to walk_expr.
walk_stmt = function(stmt, enclosing) {
    var sym_kind = null
    var s_id = null
    var p_list = null
    var _di = 0
    var local_name = null

    if (stmt == null) return

    // Variable/constant declaration.
    if (stmt.kind == "var" || stmt.kind == "def") {
        if (stmt.left != null && stmt.left.name != null) {
            sym_kind = stmt.kind
            p_list = null

            // Check if RHS is a function expression — then the symbol is
            // reported as kind "fn" with its parameter names.
            if (stmt.right != null && (stmt.right.kind == "function" || stmt.right.kind == "arrow function")) {
                sym_kind = "fn"
                p_list = []
                if (stmt.right.list != null) {
                    _di = 0
                    while (_di < length(stmt.right.list)) {
                        if (stmt.right.list[_di].name != null) {
                            p_list[] = stmt.right.list[_di].name
                        }
                        _di = _di + 1
                    }
                }
            }

            s_id = filename + ":" + stmt.left.name + ":" + sym_kind
            symbols[] = {
                symbol_id: s_id,
                name: stmt.left.name,
                kind: sym_kind,
                decl_span: make_span(stmt),
                doc_comment: find_doc_comment(stmt.from_row),
                scope_fn_nr: 0,
                params: p_list
            }

            // Check for import: var x = use('path').
            if (stmt.right != null && stmt.right.kind == "(" &&
                stmt.right.expression != null && stmt.right.expression.name == "use" &&
                stmt.right.list != null && length(stmt.right.list) > 0 &&
                stmt.right.list[0].kind == "text") {
                imports[] = {
                    local_name: stmt.left.name,
                    module_path: stmt.right.list[0].value,
                    span: make_span(stmt)
                }
            }
        }

        walk_expr(stmt.right, enclosing, false)
        return
    }

    // Multiple declarations (var_list) — recurse on each declaration.
    if (stmt.kind == "var_list" && stmt.list != null) {
        _di = 0
        while (_di < length(stmt.list)) {
            walk_stmt(stmt.list[_di], enclosing)
            _di = _di + 1
        }
        return
    }

    // Expression statement.
    if (stmt.kind == "call") {
        // Check for bare use() as expression statement — an import with
        // no local binding (local_name is null).
        if (stmt.expression != null && stmt.expression.kind == "(" &&
            stmt.expression.expression != null && stmt.expression.expression.name == "use" &&
            stmt.expression.list != null && length(stmt.expression.list) > 0 &&
            stmt.expression.list[0].kind == "text") {
            imports[] = {
                local_name: null,
                module_path: stmt.expression.list[0].value,
                span: make_span(stmt)
            }
        }
        walk_expr(stmt.expression, enclosing, false)
        return
    }

    // If statement.
    if (stmt.kind == "if") {
        walk_expr(stmt.expression, enclosing, false)
        walk_stmts(stmt.then, enclosing)
        if (stmt.else != null) {
            walk_stmts(stmt.else, enclosing)
        }
        // else-if chain.
        if (stmt.list != null) {
            walk_stmts(stmt.list, enclosing)
        }
        return
    }

    // While loop.
    if (stmt.kind == "while") {
        walk_expr(stmt.expression, enclosing, false)
        walk_stmts(stmt.statements, enclosing)
        return
    }

    // For loop.
    if (stmt.kind == "for") {
        walk_expr(stmt.init, enclosing, false)
        walk_expr(stmt.test, enclosing, false)
        walk_expr(stmt.update, enclosing, false)
        walk_stmts(stmt.statements, enclosing)
        return
    }

    // Do-while loop.
    if (stmt.kind == "do") {
        walk_stmts(stmt.statements, enclosing)
        walk_expr(stmt.expression, enclosing, false)
        return
    }

    // Return statement.
    if (stmt.kind == "return") {
        walk_expr(stmt.expression, enclosing, false)
        return
    }

    // Disrupt.
    if (stmt.kind == "disrupt") {
        walk_expr(stmt.expression, enclosing, false)
        return
    }

    // Block.
    if (stmt.kind == "block") {
        walk_stmts(stmt.statements, enclosing)
        return
    }

    // Fallthrough: walk any sub-nodes (walkers tolerate nulls).
    walk_expr(stmt.expression, enclosing, false)
    walk_expr(stmt.left, enclosing, false)
    walk_expr(stmt.right, enclosing, false)
    walk_stmts(stmt.statements, enclosing)
}

// --- 1. Process named functions from ast.functions ---
// Each compiled function gets a symbol entry (anonymous ones are keyed
// "anon_<nr>"), its params get symbol entries, and its body is walked
// with itself as the enclosing symbol.
if (ast.functions != null) {
    _i = 0
    while (_i < length(ast.functions)) {
        fn = ast.functions[_i]
        sym_id = filename + ":" + (fn.name != null ? fn.name : "anon_" + text(fn.function_nr)) + ":fn"
        params_list = []
        if (fn.list != null) {
            _j = 0
            while (_j < length(fn.list)) {
                if (fn.list[_j].name != null) {
                    params_list[] = fn.list[_j].name
                }
                _j = _j + 1
            }
        }

        symbols[] = {
            symbol_id: sym_id,
            name: fn.name,
            kind: "fn",
            decl_span: make_span(fn),
            doc_comment: find_doc_comment(fn.from_row),
            scope_fn_nr: fn.outer != null ? fn.outer : 0,
            params: params_list
        }

        // Record params as symbols.
        if (fn.list != null) {
            _j = 0
            while (_j < length(fn.list)) {
                if (fn.list[_j].name != null) {
                    symbols[] = {
                        symbol_id: filename + ":" + fn.list[_j].name + ":param",
                        name: fn.list[_j].name,
                        kind: "param",
                        decl_span: make_span(fn.list[_j]),
                        doc_comment: null,
                        scope_fn_nr: fn.function_nr,
                        params: null
                    }
                }
                _j = _j + 1
            }
        }

        // Walk function body (and disruption handler, if any).
        walk_stmts(fn.statements, sym_id)
        walk_stmts(fn.disruption, sym_id)

        _i = _i + 1
    }
}

// --- 2. Walk top-level statements ---
walk_stmts(ast.statements, null)

// --- 3. Detect exports for .cm modules ---
// Modules (not actors) export the keys of the top-level return record;
// scan statements from the end to find the last top-level return.
if (!is_actor && ast.statements != null) {
    _i = length(ast.statements) - 1
    while (_i >= 0) {
        if (ast.statements[_i].kind == "return" && ast.statements[_i].expression != null) {
            // Check if the return expression is a record literal with key-value pairs.
            if (ast.statements[_i].expression.list != null) {
                _j = 0
                while (_j < length(ast.statements[_i].expression.list)) {
                    entry = ast.statements[_i].expression.list[_j]
                    if (entry != null && entry.left != null && entry.left.name != null) {
                        // Link the export to a symbol if the value is a name reference.
                        sym_id = null
                        if (entry.right != null && entry.right.kind == "name" && entry.right.function_nr != null) {
                            sym_id = resolve_symbol_id(entry.right)
                        }
                        exports_list[] = {
                            name: entry.left.name,
                            symbol_id: sym_id
                        }
                    }
                    _j = _j + 1
                }
            }
            break
        }
        _i = _i - 1
    }
}

// --- 4. Build reverse refs ---
// Inverted index: name -> list of its reference records (no symbol_id;
// consumers resolve through the references section when needed).
_i = 0
while (_i < length(references)) {
    key = references[_i].name
    if (reverse[key] == null) {
        reverse[key] = []
    }
    reverse[key][] = {
        node_id: references[_i].node_id,
        span: references[_i].span,
        enclosing: references[_i].enclosing,
        ref_kind: references[_i].ref_kind
    }
    _i = _i + 1
}

// Assemble the full semantic index (version 1 schema — see docs).
return {
    version: 1,
    path: filename,
    is_actor: is_actor,
    imports: imports,
    symbols: symbols,
    references: references,
    call_sites: call_sites,
    exports: exports_list,
    reverse_refs: reverse
}
}

// Run the full pipeline (tokenize -> parse -> fold) and index.
// pipeline is {tokenize, parse, fold} — pass fold as null to skip folding.
// Returns the semantic index record produced by index_ast.
var index_file = function(src, filename, pipeline) {
    var tok_result = pipeline.tokenize(src, filename)
    var ast = pipeline.parse(tok_result.tokens, src, filename, pipeline.tokenize)
    if (pipeline.fold != null) {
        ast = pipeline.fold(ast)
    }
    return index_ast(ast, tok_result.tokens, filename)
}

// Module exports: the pipeline-driving entry point and the raw
// AST indexer (for callers that already parsed the source).
return {
    index_file: index_file,
    index_ast: index_ast
}