improved semantic indexing

2026-02-17 01:08:10 -06:00
parent 0ac575db85
commit 2633fb986f
5 changed files with 217 additions and 65 deletions
--- a/docs/semantic-index.md
+++ b/docs/semantic-index.md
@@ -41,7 +41,7 @@ The index contains these sections:

 | Section | Description |
 |---------|-------------|
-| `imports` | All `use()` calls with local name, module path, and span |
+| `imports` | All `use()` calls with local name, module path, resolved filesystem path, and span |
 | `symbols` | Every declaration: vars, defs, functions, params |
 | `references` | Every use of a name, classified as read, write, or call |
 | `call_sites` | Every function call with callee, args count, and enclosing function |
@@ -62,7 +62,7 @@ pit index graph.ce
  "path": "graph.ce",
  "is_actor": true,
  "imports": [
-    {"local_name": "json", "module_path": "json", "span": {"from_row": 2, "from_col": 0, "to_row": 2, "to_col": 22}}
+    {"local_name": "json", "module_path": "json", "resolved_path": ".cell/packages/core/json.cm", "span": {"from_row": 2, "from_col": 0, "to_row": 2, "to_col": 22}}
  ],
  "symbols": [
    {
@@ -139,7 +139,7 @@ Query the semantic index for a specific symbol or cursor position. This is the t

 ```bash
 pit explain --span <file>:<line>:<col>
-pit explain --symbol <name> <file>
+pit explain --symbol <name> <file>...
 ```

 ### --span: What is at this position?
@@ -180,10 +180,11 @@ The result includes:

 ### --symbol: Find a symbol by name

-Look up a symbol by name, returning all matching declarations and every reference.
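+Look up a symbol by name, returning all matching declarations, references, and call sites. Pass one file for a focused result, or several files (shell globs such as `*.ce` are expanded by your shell before `pit` sees them) to search across them all: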

 ```bash
 pit explain --symbol connect demo.ce
+pit explain --symbol connect *.ce *.cm
 ```

 ```json
@@ -268,3 +269,75 @@ The semantic index powers these LSP features:
 | Go to Definition | `textDocument/definition` | Jump to a symbol's declaration (index-backed with AST fallback) |

 These work automatically in any editor with ƿit LSP support. The index is rebuilt on every file change.
+
+## LLM / AI Assistance
+
+The semantic index is designed to give LLMs the context they need to read and edit ƿit code accurately. ƿit is not in any training set, so an LLM cannot rely on memorized patterns — it needs structured information about names, scopes, and call relationships. The commands below are the recommended way to provide that.
+
+### Understand a file before editing
+
+Before modifying a file, index it to see its structure:
+
+```bash
+pit index file.ce
+```
+
+This gives the LLM every declaration, every reference, every call site, and the import list with resolved paths. Key things to extract:
+
+- **`symbols`** — what functions exist, their parameters, and their doc comments. This is enough to understand the file's API without reading every line.
+- **`imports`** with `resolved_path` — which modules are used, and where they live on disk. The LLM can follow these paths to read dependency source when it needs to understand a called function. An import without a `resolved_path` has no script source to read: it is typically a C built-in, or a module path that could not be located on disk.
+- **`exports`** — for `.cm` modules, what the public API is. This tells the LLM what names other files can access.
+
+### Investigate a specific symbol
+
+When the LLM needs to rename, refactor, or understand a specific function:
+
+```bash
+pit explain --symbol update analysis.cm
+```
+
+This returns the declaration (with doc comment and parameter list), every reference, and every call site. The LLM can use this to:
+
+- **Rename safely** — the references list has exact spans for every use of the name.
+- **Understand callers** — `call_sites` shows where and how the function is called, including argument counts.
+- **Read the doc comment** — often enough to understand intent without reading the function body.
+
+### Investigate a cursor position
+
+When the LLM is looking at a specific line and column (e.g., from an error message or a user selection):
+
+```bash
+pit explain --span file.ce:17:4
+```
+
+This resolves whatever is at that position — declaration or reference — back to the underlying symbol, then returns all references and call sites. Useful for "what is this name?" queries.
+
+### Search across files
+
+To find a symbol across multiple files, pass them all:
+
+```bash
+pit explain --symbol connect *.ce *.cm
+pit explain --symbol send server.ce client.ce protocol.cm
+```
+
+This indexes each file and searches across all of them. The result merges all matching declarations, references, and call sites. Use this when the LLM needs to understand cross-file usage before making a change that touches multiple files.
+
+### Import resolution
+
+Every import in the index includes the original `module_path` (the string passed to `use()`). For script modules, it also includes `resolved_path` — the filesystem path the module resolves to. This lets the LLM follow dependency chains:
+
+```json
+{"local_name": "fd", "module_path": "fd", "resolved_path": ".cell/packages/core/fd.cm"}
+{"local_name": "json", "module_path": "json"}
+```
+
+An import without `resolved_path` has no script source to read — it is typically a C built-in, or a module path that could not be resolved to a `.cm` file.
+
+### Recommended workflow
+
+1. **Start with `pit index`** on the file to edit. Scan imports and symbols for an overview.
+2. **Use `pit explain --symbol`** to drill into any function the LLM needs to understand or modify. The doc comment and parameter list are usually sufficient.
+3. **Follow `resolved_path`** on imports when the LLM needs to understand a dependency — index or read the resolved file.
+4. **Before renaming**, use `pit explain --symbol` (or `--span`) to get all reference spans, then apply edits to each span.
+5. **For cross-file changes**, pass all affected files to `pit explain --symbol` to see the full picture before editing.
--- a/explain.ce
+++ b/explain.ce
@@ -13,11 +13,35 @@ var parse_mod = use('parse')
 var fold_mod = use('fold')
 var index_mod = use('index')
 var explain_mod = use('explain')
+var shop = use('internal/shop')
+
+// Resolve import paths on an index in-place: sets `resolved_path` on each entry of idx_obj.imports that can be located, using the package of `fname` as context.
+var resolve_imports = function(idx_obj, fname) {
+  var fi = shop.file_info(fd.realpath(fname))
+  var ctx = fi.package // package context handed to shop.resolve_use_path
+  var ri = 0 // index into idx_obj.imports
+  var rp = null // resolved path for the current import, or null if not found
+  var lp = null // candidate sibling path: <directory of fname>/<module_path>.cm
+  while (ri < length(idx_obj.imports)) {
+    rp = shop.resolve_use_path(idx_obj.imports[ri].module_path, ctx)
+    // Fallback: the shop lookup found nothing, so check for a sibling <module_path>.cm next to fname.
+    if (rp == null) {
+      lp = fd.dirname(fd.realpath(fname)) + '/' + idx_obj.imports[ri].module_path + '.cm'
+      if (fd.is_file(lp)) {
+        rp = lp
+      }
+    }
+    if (rp != null) { // imports that stay unresolved are left without a resolved_path field
+      idx_obj.imports[ri].resolved_path = rp
+    }
+    ri = ri + 1
+  }
+}

 var mode = null
 var span_arg = null
 var symbol_name = null
-var file_arg = null
+var files = []
 var i = 0
 var parts = null
 var filename = null
@@ -25,6 +49,7 @@ var line = null
 var col = null
 var src = null
 var idx = null
+var indexes = []
 var explain = null
 var result = null
 var pipeline = {tokenize: tokenize_mod, parse: parse_mod, fold: fold_mod}
@@ -55,12 +80,10 @@ for (i = 0; i < length(args); i++) {
    log.console("")
    log.console("Options:")
    log.console("  --span file:line:col   Find symbol at position")
-    log.console("  --symbol name [file]   Find symbol by name")
+    log.console("  --symbol name <file>...  Find symbol by name across files")
    $stop()
  } else if (!starts_with(args[i], '-')) {
-    if (file_arg == null) {
-      file_arg = args[i]
-    }
+    files[] = args[i]
  }
 }

@@ -87,6 +110,7 @@ if (mode == "span") {

  src = text(fd.slurp(filename))
  idx = index_mod.index_file(src, filename, pipeline)
+  resolve_imports(idx, filename)
  explain = explain_mod.make(idx)
  result = explain.at_span(line, col)

@@ -99,28 +123,55 @@ if (mode == "span") {
 }

 if (mode == "symbol") {
-  filename = file_arg
-
-  if (filename == null) {
-    log.error('--symbol requires a file argument')
+  if (length(files) == 0) {
+    log.error('--symbol requires at least one file argument')
    $stop()
  }

-  if (!fd.is_file(filename)) {
-    log.error('File not found: ' + filename)
-    $stop()
+  // Validate all files exist.
+  i = 0
+  while (i < length(files)) {
+    if (!fd.is_file(files[i])) {
+      log.error('File not found: ' + files[i])
+      $stop()
+    }
+    i = i + 1
  }

-  src = text(fd.slurp(filename))
-  idx = index_mod.index_file(src, filename, pipeline)
-  explain = explain_mod.make(idx)
-  result = explain.by_symbol(symbol_name)
+  if (length(files) == 1) {
+    // Single file: use by_symbol for a focused result.
+    filename = files[0]
+    src = text(fd.slurp(filename))
+    idx = index_mod.index_file(src, filename, pipeline)
+    resolve_imports(idx, filename)
+    explain = explain_mod.make(idx)
+    result = explain.by_symbol(symbol_name)

-  if (result == null || length(result.symbols) == 0) {
-    log.console("Symbol '" + symbol_name + "' not found in " + filename)
-  } else {
-    print(json.encode(result, true))
-    print("\n")
+    if (result == null || length(result.symbols) == 0) {
+      log.console("Symbol '" + symbol_name + "' not found in " + filename)
+    } else {
+      print(json.encode(result, true))
+      print("\n")
+    }
+  } else if (length(files) > 1) {
+    // Multiple files: index each and search across all.
+    indexes = []
+    i = 0
+    while (i < length(files)) {
+      src = text(fd.slurp(files[i]))
+      idx = index_mod.index_file(src, files[i], pipeline)
+      resolve_imports(idx, files[i])
+      indexes[] = idx
+      i = i + 1
+    }
+    result = explain_mod.explain_across(indexes, symbol_name)
+
+    if (result == null || length(result.symbols) == 0) {
+      log.console("Symbol '" + symbol_name + "' not found in " + text(length(files)) + " files")
+    } else {
+      print(json.encode(result, true))
+      print("\n")
+    }
  }
 }

--- a/explain.cm
+++ b/explain.cm
@@ -72,8 +72,11 @@ var make = function(index) {
    // Gather call sites.
    _i = 0
    while (_i < length(index.call_sites)) {
-      if (found_sym != null && index.call_sites[_i].callee_symbol_id == found_sym.symbol_id) {
-        result_calls[] = index.call_sites[_i]
+      if (found_sym != null) {
+        if (index.call_sites[_i].callee_symbol_id == found_sym.symbol_id ||
+            (index.call_sites[_i].callee_symbol_id == null && index.call_sites[_i].callee == found_sym.name)) {
+          result_calls[] = index.call_sites[_i]
+        }
      }
      _i = _i + 1
    }
--- a/index.ce
+++ b/index.ce
@@ -11,10 +11,15 @@ var tokenize_mod = use('tokenize')
 var parse_mod = use('parse')
 var fold_mod = use('fold')
 var index_mod = use('index')
+var shop = use('internal/shop')

 var filename = null
 var output_path = null
 var i = 0
+var file_info = null
+var pkg_ctx = null
+var resolved = null
+var local_path = null

 for (i = 0; i < length(args); i++) {
  if (args[i] == '-o' || args[i] == '--output') {
@@ -51,6 +56,26 @@ if (!fd.is_file(filename)) {
 var src = text(fd.slurp(filename))
 var pipeline = {tokenize: tokenize_mod, parse: parse_mod, fold: fold_mod}
 var idx = index_mod.index_file(src, filename, pipeline)
+
+// Resolve import paths to filesystem locations (same logic as resolve_imports in explain.ce); sets resolved_path on each import that can be located.
+file_info = shop.file_info(fd.realpath(filename))
+pkg_ctx = file_info.package
+i = 0
+while (i < length(idx.imports)) {
+  resolved = shop.resolve_use_path(idx.imports[i].module_path, pkg_ctx)
+  // Fallback: the shop lookup found nothing, so check for a sibling <module_path>.cm next to the indexed file.
+  if (resolved == null) {
+    local_path = fd.dirname(fd.realpath(filename)) + '/' + idx.imports[i].module_path + '.cm'
+    if (fd.is_file(local_path)) {
+      resolved = local_path
+    }
+  }
+  if (resolved != null) { // imports that stay unresolved are emitted without a resolved_path field
+    idx.imports[i].resolved_path = resolved
+  }
+  i = i + 1
+}
+
 var out = json.encode(idx, true)

 if (output_path != null) {
--- a/internal/shop.cm
+++ b/internal/shop.cm
@@ -580,16 +580,20 @@ function resolve_mod_fn(path, pkg) {

 // given a path and a package context
 // return module info about where it was found
-function resolve_locator(path, ctx)
+// Resolve a module path to {path, scope, pkg} without compiling.
+function resolve_path(path, ctx)
 {
  var explicit = split_explicit_package_import(path)
  var explicit_path = null
-  var fn = null
  var core_dir = null
  var core_file_path = null
  var is_core = null
  var scope = null
  var alias_path = null
+  var ctx_dir = null
+  var ctx_path = null
+  var alias = null
+  var package_path = null

  if (explicit) {
    if (is_internal_path(explicit.path) && ctx && explicit.package != ctx)
@@ -597,72 +601,60 @@ function resolve_locator(path, ctx)
  }
  if (explicit) {
    explicit_path = get_packages_dir() + '/' + safe_package_path(explicit.package) + '/' + explicit.path
-    if (fd.is_file(explicit_path)) {
-      fn = resolve_mod_fn(explicit_path, explicit.package)
-      return {path: explicit_path, scope: SCOPE_PACKAGE, symbol: fn}
-    }
+    if (fd.is_file(explicit_path))
+      return {path: explicit_path, scope: SCOPE_PACKAGE, pkg: explicit.package}
  }

-  // 1. If no context, resolve from core only
  if (!ctx) {
    core_dir = Shop.get_core_dir()
    core_file_path = core_dir + '/' + path
-    if (fd.is_file(core_file_path)) {
-      fn = resolve_mod_fn(core_file_path, 'core')
-      return {path: core_file_path, scope: SCOPE_CORE, symbol: fn}
-    }
+    if (fd.is_file(core_file_path))
+      return {path: core_file_path, scope: SCOPE_CORE, pkg: 'core'}
    return null
  }

-  // check in ctx package
-  // If ctx is an absolute path (starts with /), use it directly
-  // Otherwise, look it up in the packages directory
-  var ctx_dir = null
-  if (starts_with(ctx, '/')) {
+  if (starts_with(ctx, '/'))
    ctx_dir = ctx
-  } else {
+  else
    ctx_dir = get_packages_dir() + '/' + safe_package_path(ctx)
-  }
-  var ctx_path = ctx_dir + '/' + path
+  ctx_path = ctx_dir + '/' + path

  if (fd.is_file(ctx_path)) {
-    fn = resolve_mod_fn(ctx_path, ctx)
-    // Check if ctx is the core package (either by name or by path)
    is_core = (ctx == 'core') || (ctx_dir == Shop.get_core_dir())
    scope = is_core ? SCOPE_CORE : SCOPE_LOCAL
-    return {path: ctx_path, scope: scope, symbol: fn}
+    return {path: ctx_path, scope: scope, pkg: ctx}
  }

  if (is_internal_path(path))
    return null

-  // check for aliased dependency
-  var alias = pkg_tools.split_alias(ctx, path)
+  alias = pkg_tools.split_alias(ctx, path)
  if (alias) {
    alias_path = get_packages_dir() + '/' + safe_package_path(alias.package) + '/' + alias.path
-    if (fd.is_file(alias_path)) {
-      fn = resolve_mod_fn(alias_path, ctx)
-      return {path: alias_path, scope:SCOPE_PACKAGE, symbol:fn}
-    }
+    if (fd.is_file(alias_path))
+      return {path: alias_path, scope: SCOPE_PACKAGE, pkg: ctx}
  }

-  var package_path = get_packages_dir() + '/' + safe_package_path(path)
-  if (fd.is_file(package_path)) {
-    fn = resolve_mod_fn(package_path, ctx)
-    return {path: package_path, scope: SCOPE_PACKAGE, symbol: fn}
-  }
+  package_path = get_packages_dir() + '/' + safe_package_path(path)
+  if (fd.is_file(package_path))
+    return {path: package_path, scope: SCOPE_PACKAGE, pkg: ctx}

-  // 4. Check core as fallback
  core_dir = Shop.get_core_dir()
  core_file_path = core_dir + '/' + path
-  if (fd.is_file(core_file_path)) {
-    fn = resolve_mod_fn(core_file_path, 'core')
-    return {path: core_file_path, scope: SCOPE_CORE, symbol: fn}
-  }
+  if (fd.is_file(core_file_path))
+    return {path: core_file_path, scope: SCOPE_CORE, pkg: 'core'}

  return null
 }

+function resolve_locator(path, ctx) // wraps resolve_path and adds the `symbol` obtained from resolve_mod_fn
+{
+  var info = resolve_path(path, ctx)
+  if (info == null) return null // module not found in any location resolve_path checked
+  var fn = resolve_mod_fn(info.path, info.pkg)
+  return {path: info.path, scope: info.scope, symbol: fn} // same {path, scope, symbol} shape callers received before resolve_path was split out
+}
+
 // Generate symbol name for a C module file
 // e.g., make_c_symbol('core', 'math') -> 'js_core_math_use'
 function make_c_symbol(pkg, file) {
@@ -1018,6 +1010,14 @@ Shop.use = function use(path, package_context) {

 Shop.resolve_locator = resolve_locator

+// Resolve a use() module path to a filesystem path without compiling. Appends '.cm' to `path`, so only script modules can match.
+// Returns the path string found by resolve_path, or null if not found. NOTE(review): not verified to be absolute — the docs examples show `.cell/packages/...`.
+Shop.resolve_use_path = function(path, ctx) {
+  var info = resolve_path(path + '.cm', ctx)
+  if (info == null) return null
+  return info.path
+}
+
 // Get cache path for a package and commit
 function get_cache_path(pkg, commit) {
  return global_shop_path + '/cache/' + replace(replace(pkg, '@','_'), '/','_') + '_' + commit + '.zip'