From 2633fb986f5129d8ac84fe981b64c5c57229aa59 Mon Sep 17 00:00:00 2001 From: John Alanbrook Date: Tue, 17 Feb 2026 01:08:10 -0600 Subject: [PATCH] improved semantic indexing --- docs/semantic-index.md | 81 ++++++++++++++++++++++++++++++++++-- explain.ce | 93 ++++++++++++++++++++++++++++++++---------- explain.cm | 7 +++- index.ce | 25 ++++++++++++ internal/shop.cm | 76 +++++++++++++++++----------------- 5 files changed, 217 insertions(+), 65 deletions(-) diff --git a/docs/semantic-index.md b/docs/semantic-index.md index 260d5f30..09c183ec 100644 --- a/docs/semantic-index.md +++ b/docs/semantic-index.md @@ -41,7 +41,7 @@ The index contains these sections: | Section | Description | |---------|-------------| -| `imports` | All `use()` calls with local name, module path, and span | +| `imports` | All `use()` calls with local name, module path, resolved filesystem path, and span | | `symbols` | Every declaration: vars, defs, functions, params | | `references` | Every use of a name, classified as read, write, or call | | `call_sites` | Every function call with callee, args count, and enclosing function | @@ -62,7 +62,7 @@ pit index graph.ce "path": "graph.ce", "is_actor": true, "imports": [ - {"local_name": "json", "module_path": "json", "span": {"from_row": 2, "from_col": 0, "to_row": 2, "to_col": 22}} + {"local_name": "json", "module_path": "json", "resolved_path": ".cell/packages/core/json.cm", "span": {"from_row": 2, "from_col": 0, "to_row": 2, "to_col": 22}} ], "symbols": [ { @@ -139,7 +139,7 @@ Query the semantic index for a specific symbol or cursor position. This is the t ```bash pit explain --span <file>:<line>:<col> -pit explain --symbol <name> <file> +pit explain --symbol <name> <file>... ``` ### --span: What is at this position? @@ -180,10 +180,11 @@ The result includes: ### --symbol: Find a symbol by name -Look up a symbol by name, returning all matching declarations and every reference. +Look up a symbol by name.
Pass one file for a focused result, or multiple files (including shell globs) to search across them all: ```bash pit explain --symbol connect demo.ce +pit explain --symbol connect *.ce *.cm ``` ```json @@ -268,3 +269,75 @@ The semantic index powers these LSP features: | Go to Definition | `textDocument/definition` | Jump to a symbol's declaration (index-backed with AST fallback) | These work automatically in any editor with ƿit LSP support. The index is rebuilt on every file change. + +## LLM / AI Assistance + +The semantic index is designed to give LLMs the context they need to read and edit ƿit code accurately. ƿit is not in any training set, so an LLM cannot rely on memorized patterns — it needs structured information about names, scopes, and call relationships. The commands below are the recommended way to provide that. + +### Understand a file before editing + +Before modifying a file, index it to see its structure: + +```bash +pit index file.ce +``` + +This gives the LLM every declaration, every reference, every call site, and the import list with resolved paths. Key things to extract: + +- **`symbols`** — what functions exist, their parameters, and their doc comments. This is enough to understand the file's API without reading every line. +- **`imports`** with `resolved_path` — which modules are used, and where they live on disk. The LLM can follow these paths to read dependency source when it needs to understand a called function. Imports without a `resolved_path` are C built-ins (like `json`) with no script source to read. +- **`exports`** — for `.cm` modules, what the public API is. This tells the LLM what names other files can access. + +### Investigate a specific symbol + +When the LLM needs to rename, refactor, or understand a specific function: + +```bash +pit explain --symbol update analysis.cm +``` + +This returns the declaration (with doc comment and parameter list), every reference, and every call site. 
The LLM can use this to: + +- **Rename safely** — the references list has exact spans for every use of the name. +- **Understand callers** — `call_sites` shows where and how the function is called, including argument counts. +- **Read the doc comment** — often enough to understand intent without reading the function body. + +### Investigate a cursor position + +When the LLM is looking at a specific line and column (e.g., from an error message or a user selection): + +```bash +pit explain --span file.ce:17:4 +``` + +This resolves whatever is at that position — declaration or reference — back to the underlying symbol, then returns all references and call sites. Useful for "what is this name?" queries. + +### Search across files + +To find a symbol across multiple files, pass them all: + +```bash +pit explain --symbol connect *.ce *.cm +pit explain --symbol send server.ce client.ce protocol.cm +``` + +This indexes each file and searches across all of them. The result merges all matching declarations, references, and call sites. Use this when the LLM needs to understand cross-file usage before making a change that touches multiple files. + +### Import resolution + +Every import in the index includes the original `module_path` (the string passed to `use()`). For script modules, it also includes `resolved_path` — the filesystem path the module resolves to. This lets the LLM follow dependency chains: + +```json +{"local_name": "fd", "module_path": "fd", "resolved_path": ".cell/packages/core/fd.cm"} +{"local_name": "json", "module_path": "json"} +``` + +An import without `resolved_path` is a C built-in — no script source to read. + +### Recommended workflow + +1. **Start with `pit index`** on the file to edit. Scan imports and symbols for an overview. +2. **Use `pit explain --symbol`** to drill into any function the LLM needs to understand or modify. The doc comment and parameter list are usually sufficient. +3. 
**Follow `resolved_path`** on imports when the LLM needs to understand a dependency — index or read the resolved file. +4. **Before renaming**, use `pit explain --symbol` (or `--span`) to get all reference spans, then apply edits to each span. +5. **For cross-file changes**, pass all affected files to `pit explain --symbol` to see the full picture before editing. diff --git a/explain.ce b/explain.ce index 6b2c182a..d066e249 100644 --- a/explain.ce +++ b/explain.ce @@ -13,11 +13,35 @@ var parse_mod = use('parse') var fold_mod = use('fold') var index_mod = use('index') var explain_mod = use('explain') +var shop = use('internal/shop') + +// Resolve import paths on an index in-place. +var resolve_imports = function(idx_obj, fname) { + var fi = shop.file_info(fd.realpath(fname)) + var ctx = fi.package + var ri = 0 + var rp = null + var lp = null + while (ri < length(idx_obj.imports)) { + rp = shop.resolve_use_path(idx_obj.imports[ri].module_path, ctx) + // Fallback: check sibling files in the same directory. + if (rp == null) { + lp = fd.dirname(fd.realpath(fname)) + '/' + idx_obj.imports[ri].module_path + '.cm' + if (fd.is_file(lp)) { + rp = lp + } + } + if (rp != null) { + idx_obj.imports[ri].resolved_path = rp + } + ri = ri + 1 + } +} var mode = null var span_arg = null var symbol_name = null -var file_arg = null +var files = [] var i = 0 var parts = null var filename = null @@ -25,6 +49,7 @@ var line = null var col = null var src = null var idx = null +var indexes = [] var explain = null var result = null var pipeline = {tokenize: tokenize_mod, parse: parse_mod, fold: fold_mod} @@ -55,12 +80,10 @@ for (i = 0; i < length(args); i++) { log.console("") log.console("Options:") log.console(" --span file:line:col Find symbol at position") - log.console(" --symbol name [file] Find symbol by name") + log.console(" --symbol name ... 
Find symbol by name across files") $stop() } else if (!starts_with(args[i], '-')) { - if (file_arg == null) { - file_arg = args[i] - } + files[] = args[i] } } @@ -87,6 +110,7 @@ if (mode == "span") { src = text(fd.slurp(filename)) idx = index_mod.index_file(src, filename, pipeline) + resolve_imports(idx, filename) explain = explain_mod.make(idx) result = explain.at_span(line, col) @@ -99,28 +123,55 @@ if (mode == "span") { } if (mode == "symbol") { - filename = file_arg - - if (filename == null) { - log.error('--symbol requires a file argument') + if (length(files) == 0) { + log.error('--symbol requires at least one file argument') $stop() } - if (!fd.is_file(filename)) { - log.error('File not found: ' + filename) - $stop() + // Validate all files exist. + i = 0 + while (i < length(files)) { + if (!fd.is_file(files[i])) { + log.error('File not found: ' + files[i]) + $stop() + } + i = i + 1 } - src = text(fd.slurp(filename)) - idx = index_mod.index_file(src, filename, pipeline) - explain = explain_mod.make(idx) - result = explain.by_symbol(symbol_name) + if (length(files) == 1) { + // Single file: use by_symbol for a focused result. + filename = files[0] + src = text(fd.slurp(filename)) + idx = index_mod.index_file(src, filename, pipeline) + resolve_imports(idx, filename) + explain = explain_mod.make(idx) + result = explain.by_symbol(symbol_name) - if (result == null || length(result.symbols) == 0) { - log.console("Symbol '" + symbol_name + "' not found in " + filename) - } else { - print(json.encode(result, true)) - print("\n") + if (result == null || length(result.symbols) == 0) { + log.console("Symbol '" + symbol_name + "' not found in " + filename) + } else { + print(json.encode(result, true)) + print("\n") + } + } else if (length(files) > 1) { + // Multiple files: index each and search across all. 
+ indexes = [] + i = 0 + while (i < length(files)) { + src = text(fd.slurp(files[i])) + idx = index_mod.index_file(src, files[i], pipeline) + resolve_imports(idx, files[i]) + indexes[] = idx + i = i + 1 + } + result = explain_mod.explain_across(indexes, symbol_name) + + if (result == null || length(result.symbols) == 0) { + log.console("Symbol '" + symbol_name + "' not found in " + text(length(files)) + " files") + } else { + print(json.encode(result, true)) + print("\n") + } } } diff --git a/explain.cm b/explain.cm index 36e8fe09..d945042c 100644 --- a/explain.cm +++ b/explain.cm @@ -72,8 +72,11 @@ var make = function(index) { // Gather call sites. _i = 0 while (_i < length(index.call_sites)) { - if (found_sym != null && index.call_sites[_i].callee_symbol_id == found_sym.symbol_id) { - result_calls[] = index.call_sites[_i] + if (found_sym != null) { + if (index.call_sites[_i].callee_symbol_id == found_sym.symbol_id || + (index.call_sites[_i].callee_symbol_id == null && index.call_sites[_i].callee == found_sym.name)) { + result_calls[] = index.call_sites[_i] + } } _i = _i + 1 } diff --git a/index.ce b/index.ce index c991ce0e..ab5359a0 100644 --- a/index.ce +++ b/index.ce @@ -11,10 +11,15 @@ var tokenize_mod = use('tokenize') var parse_mod = use('parse') var fold_mod = use('fold') var index_mod = use('index') +var shop = use('internal/shop') var filename = null var output_path = null var i = 0 +var file_info = null +var pkg_ctx = null +var resolved = null +var local_path = null for (i = 0; i < length(args); i++) { if (args[i] == '-o' || args[i] == '--output') { @@ -51,6 +56,26 @@ if (!fd.is_file(filename)) { var src = text(fd.slurp(filename)) var pipeline = {tokenize: tokenize_mod, parse: parse_mod, fold: fold_mod} var idx = index_mod.index_file(src, filename, pipeline) + +// Resolve import paths to filesystem locations. 
+file_info = shop.file_info(fd.realpath(filename)) +pkg_ctx = file_info.package +i = 0 +while (i < length(idx.imports)) { + resolved = shop.resolve_use_path(idx.imports[i].module_path, pkg_ctx) + // Fallback: check sibling files in the same directory. + if (resolved == null) { + local_path = fd.dirname(fd.realpath(filename)) + '/' + idx.imports[i].module_path + '.cm' + if (fd.is_file(local_path)) { + resolved = local_path + } + } + if (resolved != null) { + idx.imports[i].resolved_path = resolved + } + i = i + 1 +} + var out = json.encode(idx, true) if (output_path != null) { diff --git a/internal/shop.cm b/internal/shop.cm index 7517c6d5..6c4ea826 100644 --- a/internal/shop.cm +++ b/internal/shop.cm @@ -580,16 +580,20 @@ function resolve_mod_fn(path, pkg) { // given a path and a package context // return module info about where it was found -function resolve_locator(path, ctx) +// Resolve a module path to {path, scope, pkg} without compiling. +function resolve_path(path, ctx) { var explicit = split_explicit_package_import(path) var explicit_path = null - var fn = null var core_dir = null var core_file_path = null var is_core = null var scope = null var alias_path = null + var ctx_dir = null + var ctx_path = null + var alias = null + var package_path = null if (explicit) { if (is_internal_path(explicit.path) && ctx && explicit.package != ctx) @@ -597,72 +601,60 @@ function resolve_locator(path, ctx) } if (explicit) { explicit_path = get_packages_dir() + '/' + safe_package_path(explicit.package) + '/' + explicit.path - if (fd.is_file(explicit_path)) { - fn = resolve_mod_fn(explicit_path, explicit.package) - return {path: explicit_path, scope: SCOPE_PACKAGE, symbol: fn} - } + if (fd.is_file(explicit_path)) + return {path: explicit_path, scope: SCOPE_PACKAGE, pkg: explicit.package} } - // 1. 
If no context, resolve from core only if (!ctx) { core_dir = Shop.get_core_dir() core_file_path = core_dir + '/' + path - if (fd.is_file(core_file_path)) { - fn = resolve_mod_fn(core_file_path, 'core') - return {path: core_file_path, scope: SCOPE_CORE, symbol: fn} - } + if (fd.is_file(core_file_path)) + return {path: core_file_path, scope: SCOPE_CORE, pkg: 'core'} return null } - // check in ctx package - // If ctx is an absolute path (starts with /), use it directly - // Otherwise, look it up in the packages directory - var ctx_dir = null - if (starts_with(ctx, '/')) { + if (starts_with(ctx, '/')) ctx_dir = ctx - } else { + else ctx_dir = get_packages_dir() + '/' + safe_package_path(ctx) - } - var ctx_path = ctx_dir + '/' + path + ctx_path = ctx_dir + '/' + path if (fd.is_file(ctx_path)) { - fn = resolve_mod_fn(ctx_path, ctx) - // Check if ctx is the core package (either by name or by path) is_core = (ctx == 'core') || (ctx_dir == Shop.get_core_dir()) scope = is_core ? SCOPE_CORE : SCOPE_LOCAL - return {path: ctx_path, scope: scope, symbol: fn} + return {path: ctx_path, scope: scope, pkg: ctx} } if (is_internal_path(path)) return null - // check for aliased dependency - var alias = pkg_tools.split_alias(ctx, path) + alias = pkg_tools.split_alias(ctx, path) if (alias) { alias_path = get_packages_dir() + '/' + safe_package_path(alias.package) + '/' + alias.path - if (fd.is_file(alias_path)) { - fn = resolve_mod_fn(alias_path, ctx) - return {path: alias_path, scope:SCOPE_PACKAGE, symbol:fn} - } + if (fd.is_file(alias_path)) + return {path: alias_path, scope: SCOPE_PACKAGE, pkg: ctx} } - var package_path = get_packages_dir() + '/' + safe_package_path(path) - if (fd.is_file(package_path)) { - fn = resolve_mod_fn(package_path, ctx) - return {path: package_path, scope: SCOPE_PACKAGE, symbol: fn} - } + package_path = get_packages_dir() + '/' + safe_package_path(path) + if (fd.is_file(package_path)) + return {path: package_path, scope: SCOPE_PACKAGE, pkg: ctx} - // 4. 
Check core as fallback core_dir = Shop.get_core_dir() core_file_path = core_dir + '/' + path - if (fd.is_file(core_file_path)) { - fn = resolve_mod_fn(core_file_path, 'core') - return {path: core_file_path, scope: SCOPE_CORE, symbol: fn} - } + if (fd.is_file(core_file_path)) + return {path: core_file_path, scope: SCOPE_CORE, pkg: 'core'} return null } +function resolve_locator(path, ctx) +{ + var info = resolve_path(path, ctx) + if (info == null) return null + var fn = resolve_mod_fn(info.path, info.pkg) + return {path: info.path, scope: info.scope, symbol: fn} +} + // Generate symbol name for a C module file // e.g., make_c_symbol('core', 'math') -> 'js_core_math_use' function make_c_symbol(pkg, file) { @@ -1018,6 +1010,14 @@ Shop.use = function use(path, package_context) { Shop.resolve_locator = resolve_locator +// Resolve a use() module path to a filesystem path without compiling. +// Returns the absolute path string, or null if not found. +Shop.resolve_use_path = function(path, ctx) { + var info = resolve_path(path + '.cm', ctx) + if (info == null) return null + return info.path +} + // Get cache path for a package and commit function get_cache_path(pkg, commit) { return global_shop_path + '/cache/' + replace(replace(pkg, '@','_'), '/','_') + '_' + commit + '.zip'