From 97ece8e5cb6bea7b62e76cbd2ac90ef604829dda Mon Sep 17 00:00:00 2001 From: John Alanbrook Date: Sat, 17 Jan 2026 15:48:43 -0600 Subject: [PATCH] text extract --- internal/engine.cm | 4 +- source/quickjs.c | 734 ++++++++++++++++++++++++++++++++++----------- tests/suite.cm | 98 ++++++ 3 files changed, 663 insertions(+), 173 deletions(-) diff --git a/internal/engine.cm b/internal/engine.cm index e479c0c2..5e09663a 100644 --- a/internal/engine.cm +++ b/internal/engine.cm @@ -124,10 +124,10 @@ function caller_data(depth = 0) var caller = array(Error().stack, "\n")[1+depth] if (caller) { - var md = caller.match(/\((.*)\:/) + var md = extract(caller, /\((.*)\:/) var m = md ? md[1] : "SCRIPT" if (m) file = m - md = caller.match(/\:(\d*)\)/) + md = extract(caller, /\:(\d*)\)/) m = md ? md[1] : 0 if (m) line = m } diff --git a/source/quickjs.c b/source/quickjs.c index 9f4015e3..a3715618 100644 --- a/source/quickjs.c +++ b/source/quickjs.c @@ -30399,56 +30399,6 @@ static int check_regexp_g_flag(JSContext *ctx, JSValueConst regexp) return 0; } -static JSValue js_string_match(JSContext *ctx, JSValueConst this_val, - int argc, JSValueConst *argv, int atom) -{ - // match(rx), search(rx), matchAll(rx) - // atom is JS_ATOM_Symbol_match, JS_ATOM_Symbol_search, or JS_ATOM_Symbol_matchAll - JSValueConst O = this_val, regexp = argv[0], args[2]; - JSValue matcher, S, rx, result, str; - int args_len; - - if (JS_IsNull(O) || JS_IsNull(O)) - return JS_ThrowTypeError(ctx, "cannot convert to object"); - - if (!JS_IsNull(regexp) && !JS_IsNull(regexp)) { - matcher = JS_GetProperty(ctx, regexp, atom); - if (JS_IsException(matcher)) - return JS_EXCEPTION; - if (atom == JS_ATOM_Symbol_matchAll) { - if (check_regexp_g_flag(ctx, regexp) < 0) { - JS_FreeValue(ctx, matcher); - return JS_EXCEPTION; - } - } - if (!JS_IsNull(matcher) && !JS_IsNull(matcher)) { - return JS_CallFree(ctx, matcher, regexp, 1, &O); - } - } - S = JS_ToString(ctx, O); - if (JS_IsException(S)) - return JS_EXCEPTION; - args_len = 1; - args[0] = regexp; - str = JS_NULL; - if (atom == JS_ATOM_Symbol_matchAll) { - str = js_new_string8(ctx, "g"); - if (JS_IsException(str)) - goto fail; - args[args_len++] = (JSValueConst)str; - } - rx = JS_CallConstructor(ctx, ctx->regexp_ctor, args_len, args); - JS_FreeValue(ctx, str); - if (JS_IsException(rx)) { - fail: - JS_FreeValue(ctx, S); - return JS_EXCEPTION; - } - result = JS_InvokeFree(ctx, rx, atom, 1, (JSValueConst *)&S); - JS_FreeValue(ctx, S); - return result; -} - static JSValue js_string___GetSubstitution(JSContext *ctx, JSValueConst this_val, int argc, JSValueConst *argv) { @@ -30738,7 +30688,6 @@ static const JSCFunctionListEntry js_string_proto_funcs[] = { JS_CFUNC_DEF("concat", 1, js_string_concat), JS_CFUNC_MAGIC_DEF("indexOf", 1, js_string_indexOf, 0 ), JS_CFUNC_MAGIC_DEF("lastIndexOf", 1, js_string_indexOf, 1 ), - JS_CFUNC_MAGIC_DEF("match", 1, js_string_match, JS_ATOM_Symbol_match ), JS_CFUNC_MAGIC_DEF("replace", 2, js_string_replace, 0 ), JS_CFUNC_MAGIC_DEF("replaceAll", 2, js_string_replace, 1 ), JS_CFUNC_DEF("toString", 0, js_string_toString ), @@ -34345,60 +34294,6 @@ static JSValue js_cell_text_codepoint(JSContext *ctx, JSValueConst this_val, return JS_NewInt32(ctx, c); } -/* text.search(str, target, from) - find substring */ -static JSValue js_cell_text_search(JSContext *ctx, JSValueConst this_val, - int argc, JSValueConst *argv) -{ - if (argc < 2) return JS_NULL; - - int tag1 = JS_VALUE_GET_TAG(argv[0]); - int tag2 = JS_VALUE_GET_TAG(argv[1]); - if ((tag1 != JS_TAG_STRING && tag1 != JS_TAG_STRING_ROPE) || - (tag2 != JS_TAG_STRING && tag2 != JS_TAG_STRING_ROPE)) - return JS_NULL; - - JSValue str = JS_ToString(ctx, argv[0]); - if (JS_IsException(str)) return str; - - JSValue target = JS_ToString(ctx, argv[1]); - if (JS_IsException(target)) { - JS_FreeValue(ctx, str); - return target; - } - - JSString *p = JS_VALUE_GET_STRING(str); - JSString *t = JS_VALUE_GET_STRING(target); - - int from = 0; - if (argc > 2 && !JS_IsNull(argv[2])) { - if (JS_ToInt32(ctx, &from, argv[2])) { - JS_FreeValue(ctx, str); - JS_FreeValue(ctx, target); - return JS_NULL; - } - if (from < 0) from += p->len; - if (from < 0) from = 0; - } - - int result = -1; - int len = p->len; - int t_len = t->len; - - if (len >= t_len) { - for (int i = from; i <= len - t_len; i++) { - if (!string_cmp(p, t, i, 0, t_len)) { - result = i; - break; - } - } - } - - JS_FreeValue(ctx, str); - JS_FreeValue(ctx, target); - - if (result == -1) return JS_NULL; - return JS_NewInt32(ctx, result); -} /* Helpers (C, not C++). Put these above js_cell_text_replace in the same C file. */ static int sb_concat_value_to_string_free(JSContext *ctx, StringBuffer *b, JSValue v) @@ -34438,6 +34333,18 @@ static JSValue make_replacement(JSContext *ctx, int argc, JSValueConst *argv, in return JS_AtomToString(ctx, JS_ATOM_empty_string); } +static int JS_IsRegExp(JSContext *ctx, JSValueConst v) +{ + if (!JS_IsObject(v)) return 0; + + JSValue exec = JS_GetPropertyStr(ctx, v, "exec"); + if (JS_IsException(exec)) return -1; + + int ok = JS_IsFunction(ctx, exec); + JS_FreeValue(ctx, exec); + return ok; +} + /* text.replace(text, target, replacement, limit) * * Return a new text in which the target is replaced by the replacement. @@ -34451,145 +34358,502 @@ static JSValue make_replacement(JSContext *ctx, int argc, JSValueConst *argv, in * Example: replace("abc", "", "-") => "-a-b-c-" * Boundaries count toward limit even if replacement returns null. */ -static JSValue js_cell_text_replace(JSContext *ctx, JSValueConst this_val, - int argc, JSValueConst *argv) + +static JSValue js_cell_text_replace(JSContext *ctx, JSValueConst this_val, int argc, JSValueConst *argv) { if (argc < 2) return JS_NULL; - /* Require text + target be strings (or ropes) */ + int tag_text = JS_VALUE_GET_TAG(argv[0]); + if (tag_text != JS_TAG_STRING && tag_text != JS_TAG_STRING_ROPE) return JS_NULL; + + int target_is_regex = 0; { - int tag_text = JS_VALUE_GET_TAG(argv[0]); int tag_tgt = JS_VALUE_GET_TAG(argv[1]); - if ((tag_text != JS_TAG_STRING && tag_text != JS_TAG_STRING_ROPE) || - (tag_tgt != JS_TAG_STRING && tag_tgt != JS_TAG_STRING_ROPE)) + if (tag_tgt == JS_TAG_STRING || tag_tgt == JS_TAG_STRING_ROPE) { + target_is_regex = 0; + } else if (JS_IsObject(argv[1]) && JS_IsRegExp(ctx, argv[1])) { + target_is_regex = 1; + } else { return JS_NULL; + } } JSValue str = JS_ToString(ctx, argv[0]); if (JS_IsException(str)) return str; - JSValue target = JS_ToString(ctx, argv[1]); - if (JS_IsException(target)) { - JS_FreeValue(ctx, str); - return target; - } - JSString *sp = JS_VALUE_GET_STRING(str); - JSString *tp = JS_VALUE_GET_STRING(target); + int len = (int)sp->len; - int32_t limit = -1; /* -1 means unlimited */ + int32_t limit = -1; if (argc > 3 && !JS_IsNull(argv[3])) { if (JS_ToInt32(ctx, &limit, argv[3])) { JS_FreeValue(ctx, str); - JS_FreeValue(ctx, target); return JS_NULL; } if (limit < 0) limit = -1; } - int len = (int)sp->len; - int t_len = (int)tp->len; - StringBuffer b_s, *b = &b_s; string_buffer_init(ctx, b, len); - /* Empty target: boundary replacements */ - if (t_len == 0) { + if (!target_is_regex) { + JSValue target = JS_ToString(ctx, argv[1]); + if (JS_IsException(target)) { + JS_FreeValue(ctx, str); + return target; + } + + JSString *tp = JS_VALUE_GET_STRING(target); + int t_len = (int)tp->len; + + if (t_len == 0) { + int32_t count = 0; + + for (int boundary = 0; boundary <= len; boundary++) { + if (limit >= 0 && count >= limit) break; + + JSValue match = JS_AtomToString(ctx, JS_ATOM_empty_string); + if (JS_IsException(match)) goto fail_str_target; + + JSValue rep = make_replacement(ctx, argc, argv, boundary, match); + if (JS_IsException(rep)) goto fail_str_target; + + count++; + + if (!JS_IsNull(rep)) { + if (sb_concat_value_to_string_free(ctx, b, rep) < 0) goto fail_str_target; + } else { + JS_FreeValue(ctx, rep); + } + + if (boundary < len) { + JSValue ch = js_sub_string(ctx, sp, boundary, boundary + 1); + if (JS_IsException(ch)) goto fail_str_target; + if (string_buffer_concat_value_free(b, ch)) goto fail_str_target; + } + } + + JS_FreeValue(ctx, str); + JS_FreeValue(ctx, target); + return string_buffer_end(b); + } + + int pos = 0; int32_t count = 0; - for (int boundary = 0; boundary <= len; boundary++) { - if (limit >= 0 && count >= limit) break; + while (pos <= len - t_len && (limit < 0 || count < limit)) { + int found = -1; - /* match text is "" */ - JSValue match = JS_AtomToString(ctx, JS_ATOM_empty_string); - if (JS_IsException(match)) goto fail; + for (int i = pos; i <= len - t_len; i++) { + if (!string_cmp(sp, tp, i, 0, t_len)) { + found = i; + break; + } + } + if (found < 0) break; - JSValue rep = make_replacement(ctx, argc, argv, boundary, match); - if (JS_IsException(rep)) goto fail; + if (found > pos) { + JSValue sub = js_sub_string(ctx, sp, pos, found); + if (JS_IsException(sub)) goto fail_str_target; + if (string_buffer_concat_value_free(b, sub)) goto fail_str_target; + } + + JSValue match = js_sub_string(ctx, sp, found, found + t_len); + if (JS_IsException(match)) goto fail_str_target; + + JSValue rep = make_replacement(ctx, argc, argv, found, match); + if (JS_IsException(rep)) goto fail_str_target; - /* Count includes null matches */ count++; if (!JS_IsNull(rep)) { - if (sb_concat_value_to_string_free(ctx, b, rep) < 0) goto fail; + if (sb_concat_value_to_string_free(ctx, b, rep) < 0) goto fail_str_target; } else { JS_FreeValue(ctx, rep); } - /* Copy next character between boundaries (does not affect count) */ - if (boundary < len) { - JSValue ch = js_sub_string(ctx, sp, boundary, boundary + 1); - if (JS_IsException(ch)) goto fail; - if (string_buffer_concat_value_free(b, ch)) goto fail; - } + pos = found + t_len; + } + + if (pos < len) { + JSValue sub = js_sub_string(ctx, sp, pos, len); + if (JS_IsException(sub)) goto fail_str_target; + if (string_buffer_concat_value_free(b, sub)) goto fail_str_target; } JS_FreeValue(ctx, str); JS_FreeValue(ctx, target); return string_buffer_end(b); + + fail_str_target: + string_buffer_free(b); + JS_FreeValue(ctx, str); + JS_FreeValue(ctx, target); + return JS_EXCEPTION; } - /* Non-empty target: left-to-right, non-overlapping */ + /* Regex target */ + JSValue rx = argv[1]; + JSValue orig_last_index = JS_GetPropertyStr(ctx, rx, "lastIndex"); + if (JS_IsException(orig_last_index)) goto fail_rx; + int have_orig_last_index = 1; + int pos = 0; int32_t count = 0; - while (pos <= len - t_len && (limit < 0 || count < limit)) { - int found = -1; + while (pos <= len && (limit < 0 || count < limit)) { + if (JS_SetPropertyStr(ctx, rx, "lastIndex", JS_NewInt32(ctx, 0)) < 0) goto fail_rx; - /* Find next occurrence (naive search) */ - for (int i = pos; i <= len - t_len; i++) { - if (!string_cmp(sp, tp, i, 0, t_len)) { - found = i; - break; - } + JSValue sub_str = js_sub_string(ctx, sp, pos, len); + if (JS_IsException(sub_str)) goto fail_rx; + + JSValue exec_res = JS_Invoke(ctx, rx, JS_ATOM_exec, 1, (JSValueConst *)&sub_str); + JS_FreeValue(ctx, sub_str); + if (JS_IsException(exec_res)) goto fail_rx; + + if (JS_IsNull(exec_res)) { + JS_FreeValue(ctx, exec_res); + break; + } + + JSValue idx_val = JS_GetPropertyStr(ctx, exec_res, "index"); + if (JS_IsException(idx_val)) { + JS_FreeValue(ctx, exec_res); + goto fail_rx; + } + + int32_t local_index = 0; + if (JS_ToInt32(ctx, &local_index, idx_val)) { + JS_FreeValue(ctx, idx_val); + JS_FreeValue(ctx, exec_res); + goto fail_rx; + } + JS_FreeValue(ctx, idx_val); + + if (local_index < 0) local_index = 0; + int found = pos + local_index; + if (found < pos) found = pos; + if (found > len) { + JS_FreeValue(ctx, exec_res); + break; + } + + JSValue match = JS_GetPropertyUint32(ctx, exec_res, 0); + JS_FreeValue(ctx, exec_res); + if (JS_IsException(match)) goto fail_rx; + + int match_len = 0; + { + JSValue mstr = JS_ToString(ctx, match); + if (JS_IsException(mstr)) goto fail_rx; + JSString *mp = JS_VALUE_GET_STRING(mstr); + match_len = (int)mp->len; + JS_FreeValue(ctx, mstr); } - if (found < 0) break; - /* Copy prefix up to match */ if (found > pos) { - JSValue sub = js_sub_string(ctx, sp, pos, found); - if (JS_IsException(sub)) goto fail; - if (string_buffer_concat_value_free(b, sub)) goto fail; + JSValue prefix = js_sub_string(ctx, sp, pos, found); + if (JS_IsException(prefix)) goto fail_rx; + if (string_buffer_concat_value_free(b, prefix)) goto fail_rx; } - /* Match text for callback */ - JSValue match = js_sub_string(ctx, sp, found, found + t_len); - if (JS_IsException(match)) goto fail; - JSValue rep = make_replacement(ctx, argc, argv, found, match); - if (JS_IsException(rep)) goto fail; + if (JS_IsException(rep)) goto fail_rx; - /* Count includes null matches */ count++; if (!JS_IsNull(rep)) { - if (sb_concat_value_to_string_free(ctx, b, rep) < 0) goto fail; + if (sb_concat_value_to_string_free(ctx, b, rep) < 0) goto fail_rx; } else { JS_FreeValue(ctx, rep); } - pos = found + t_len; + pos = found + match_len; + if (match_len == 0) { + if (pos < len) pos++; + else break; + } } - /* Copy remainder */ if (pos < len) { - JSValue sub = js_sub_string(ctx, sp, pos, len); - if (JS_IsException(sub)) goto fail; - if (string_buffer_concat_value_free(b, sub)) goto fail; + JSValue tail = js_sub_string(ctx, sp, pos, len); + if (JS_IsException(tail)) goto fail_rx; + if (string_buffer_concat_value_free(b, tail)) goto fail_rx; } + if (have_orig_last_index) JS_SetPropertyStr(ctx, rx, "lastIndex", orig_last_index); + JS_FreeValue(ctx, str); - JS_FreeValue(ctx, target); return string_buffer_end(b); -fail: +fail_rx: string_buffer_free(b); + if (!JS_IsNull(orig_last_index) && !JS_IsException(orig_last_index)) { + JS_SetPropertyStr(ctx, rx, "lastIndex", orig_last_index); + } else { + JS_FreeValue(ctx, orig_last_index); + } JS_FreeValue(ctx, str); - JS_FreeValue(ctx, target); return JS_EXCEPTION; } +/* text.search(str, target, from) - find substring or regex match */ +static JSValue js_cell_text_search(JSContext *ctx, JSValueConst this_val, int argc, JSValueConst *argv) +{ + if (argc < 2) return JS_NULL; + + int tag1 = JS_VALUE_GET_TAG(argv[0]); + if (tag1 != JS_TAG_STRING && tag1 != JS_TAG_STRING_ROPE) return JS_NULL; + + int target_is_regex = 0; + int tag2 = JS_VALUE_GET_TAG(argv[1]); + if (tag2 == JS_TAG_STRING || tag2 == JS_TAG_STRING_ROPE) { + target_is_regex = 0; + } else if (JS_IsObject(argv[1]) && JS_IsRegExp(ctx, argv[1])) { + target_is_regex = 1; + } else { + return JS_NULL; + } + + JSValue str = JS_ToString(ctx, argv[0]); + if (JS_IsException(str)) return str; + + JSString *p = JS_VALUE_GET_STRING(str); + int len = (int)p->len; + + int from = 0; + if (argc > 2 && !JS_IsNull(argv[2])) { + if (JS_ToInt32(ctx, &from, argv[2])) { + JS_FreeValue(ctx, str); + return JS_NULL; + } + if (from < 0) from += len; + if (from < 0) from = 0; + } + if (from > len) { + JS_FreeValue(ctx, str); + return JS_NULL; + } + + if (!target_is_regex) { + JSValue target = JS_ToString(ctx, argv[1]); + if (JS_IsException(target)) { + JS_FreeValue(ctx, str); + return target; + } + + JSString *t = JS_VALUE_GET_STRING(target); + int t_len = (int)t->len; + + int result = -1; + if (len >= t_len) { + for (int i = from; i <= len - t_len; i++) { + if (!string_cmp(p, t, i, 0, t_len)) { + result = i; + break; + } + } + } + + JS_FreeValue(ctx, str); + JS_FreeValue(ctx, target); + + if (result == -1) return JS_NULL; + return JS_NewInt32(ctx, result); + } + + /* Regex target */ + JSValue rx = argv[1]; + JSValue orig_last_index = JS_GetPropertyStr(ctx, rx, "lastIndex"); + if (JS_IsException(orig_last_index)) { + JS_FreeValue(ctx, str); + return JS_EXCEPTION; + } + int have_orig_last_index = 1; + + if (JS_SetPropertyStr(ctx, rx, "lastIndex", JS_NewInt32(ctx, 0)) < 0) goto fail_rx_search; + + JSValue sub_str = js_sub_string(ctx, p, from, len); + if (JS_IsException(sub_str)) goto fail_rx_search; + + JSValue exec_res = JS_Invoke(ctx, rx, JS_ATOM_exec, 1, (JSValueConst *)&sub_str); + JS_FreeValue(ctx, sub_str); + if (JS_IsException(exec_res)) goto fail_rx_search; + + if (JS_IsNull(exec_res)) { + JS_FreeValue(ctx, exec_res); + if (have_orig_last_index) JS_SetPropertyStr(ctx, rx, "lastIndex", orig_last_index); + JS_FreeValue(ctx, str); + return JS_NULL; + } + + JSValue idx_val = JS_GetPropertyStr(ctx, exec_res, "index"); + if (JS_IsException(idx_val)) { + JS_FreeValue(ctx, exec_res); + goto fail_rx_search; + } + + int32_t local_index = 0; + if (JS_ToInt32(ctx, &local_index, idx_val)) { + JS_FreeValue(ctx, idx_val); + JS_FreeValue(ctx, exec_res); + goto fail_rx_search; + } + JS_FreeValue(ctx, idx_val); + JS_FreeValue(ctx, exec_res); + + if (local_index < 0) local_index = 0; + + if (have_orig_last_index) JS_SetPropertyStr(ctx, rx, "lastIndex", orig_last_index); + + JS_FreeValue(ctx, str); + return JS_NewInt32(ctx, from + local_index); + +fail_rx_search: + if (!JS_IsNull(orig_last_index) && !JS_IsException(orig_last_index)) { + JS_SetPropertyStr(ctx, rx, "lastIndex", orig_last_index); + } else { + JS_FreeValue(ctx, orig_last_index); + } + JS_FreeValue(ctx, str); + return JS_EXCEPTION; +} +static inline uint16_t js_str_get(JSString *s, int idx) { + return s->is_wide_char ? s->u.str16[idx] : s->u.str8[idx]; +} + +static int js_str_find_range(JSString *hay, int from, int to, JSString *needle) { + int nlen = (int)needle->len; + int hlen = (int)hay->len; + + if (from < 0) from = 0; + if (to < 0) to = 0; + if (to > hlen) to = hlen; + if (from > to) return -1; + + if (nlen == 0) return from; + if (nlen > (to - from)) return -1; + + int limit = to - nlen; + for (int i = from; i <= limit; i++) { + int j = 0; + for (; j < nlen; j++) { + if (js_str_get(hay, i + j) != js_str_get(needle, j)) break; + } + if (j == nlen) return i; + } + return -1; +} + +/* text_extract(text, pattern, from?, to?) - extract match using regexp or literal text */ +static JSValue js_cell_text_extract(JSContext *ctx, JSValueConst this_val, + int argc, JSValueConst *argv) +{ + if (argc < 2) return JS_NULL; + + JSValue str = JS_ToString(ctx, argv[0]); + if (JS_IsException(str)) return JS_EXCEPTION; + + JSString *p = JS_VALUE_GET_STRING(str); + int len = (int)p->len; + + int from = 0; + if (argc >= 3 && !JS_IsNull(argv[2])) { + if (JS_ToInt32(ctx, &from, argv[2])) { + JS_FreeValue(ctx, str); + return JS_EXCEPTION; + } + if (from < 0) from += len; + if (from < 0) from = 0; + if (from > len) from = len; + } + + int to = len; + if (argc >= 4 && !JS_IsNull(argv[3])) { + if (JS_ToInt32(ctx, &to, argv[3])) { + JS_FreeValue(ctx, str); + return JS_EXCEPTION; + } + if (to < 0) to += len; + if (to < 0) to = 0; + if (to > len) to = len; + } + + if (from > to) { + JS_FreeValue(ctx, str); + return JS_NULL; + } + + /* RegExp path */ + if (js_is_regexp(ctx, argv[1])) { + JSValue substr; + + if (from == 0 && to == len) { + substr = JS_DupValue(ctx, str); + } else { + substr = js_sub_string(ctx, p, from, to); + if (JS_IsException(substr)) { + JS_FreeValue(ctx, str); + return JS_EXCEPTION; + } + } + + JSValue exec_func = JS_GetPropertyStr(ctx, argv[1], "exec"); + if (JS_IsException(exec_func)) { + JS_FreeValue(ctx, substr); + JS_FreeValue(ctx, str); + return JS_EXCEPTION; + } + + JSValue result = JS_Call(ctx, exec_func, argv[1], 1, &substr); + + JS_FreeValue(ctx, exec_func); + JS_FreeValue(ctx, substr); + JS_FreeValue(ctx, str); + + if (JS_IsException(result)) return JS_EXCEPTION; + return result; + } + + /* Literal text path */ + JSValue needle_val = JS_ToString(ctx, argv[1]); + if (JS_IsException(needle_val)) { + JS_FreeValue(ctx, str); + return JS_EXCEPTION; + } + + JSString *needle = JS_VALUE_GET_STRING(needle_val); + int pos = js_str_find_range(p, from, to, needle); + + JS_FreeValue(ctx, needle_val); + + if (pos < 0) { + JS_FreeValue(ctx, str); + return JS_NULL; + } + + JSValue arr = JS_NewArray(ctx); + if (JS_IsException(arr)) { + JS_FreeValue(ctx, str); + return JS_EXCEPTION; + } + + JSValue match = js_sub_string(ctx, p, pos, pos + (int)needle->len); + if (JS_IsException(match)) { + JS_FreeValue(ctx, arr); + JS_FreeValue(ctx, str); + return JS_EXCEPTION; + } + + JS_DefinePropertyValueUint32(ctx, arr, 0, match, JS_PROP_C_W_E); + JS_DefinePropertyValueStr(ctx, arr, "index", JS_NewInt32(ctx, pos), JS_PROP_C_W_E); + JS_DefinePropertyValueStr(ctx, arr, "input", JS_DupValue(ctx, str), JS_PROP_C_W_E); + + JS_FreeValue(ctx, str); + return arr; +} + /* ---------------------------------------------------------------------------- * array function and sub-functions * ---------------------------------------------------------------------------- */ @@ -34833,7 +35097,8 @@ static JSValue js_cell_array(JSContext *ctx, JSValueConst this_val, return result; } - int tag2 = JS_VALUE_GET_TAG(argv[1]); + int tag2 = JS_VALUE_GET_TAG(argv[1]); + if (tag2 == JS_TAG_STRING || tag2 == JS_TAG_STRING_ROPE) { /* Split by separator */ const char *cstr = JS_ToCString(ctx, str); @@ -34859,7 +35124,6 @@ static JSValue js_cell_array(JSContext *ctx, JSValueConst this_val, const char *found; if (sep_len == 0) { - /* Split into characters */ for (int i = 0; i < len; i++) { JSValue ch = js_sub_string(ctx, p, i, i + 1); JS_SetPropertyInt64(ctx, result, idx++, ch); @@ -34880,6 +35144,131 @@ static JSValue js_cell_array(JSContext *ctx, JSValueConst this_val, return result; } + if (JS_IsObject(argv[1]) && JS_IsRegExp(ctx, argv[1])) { + /* Split by regex (manual "global" iteration; ignore g flag semantics) */ + JSValue rx = argv[1]; + + JSValue result = JS_NewArray(ctx); + if (JS_IsException(result)) { + JS_FreeValue(ctx, str); + return result; + } + + /* Save & restore lastIndex to avoid mutating caller-visible state */ + JSValue orig_last_index = JS_GetPropertyStr(ctx, rx, "lastIndex"); + if (JS_IsException(orig_last_index)) { + JS_FreeValue(ctx, result); + JS_FreeValue(ctx, str); + return JS_EXCEPTION; + } + + int pos = 0; + int64_t out_idx = 0; + + while (pos <= len) { + /* force lastIndex = 0 so flags don't matter and we fully control iteration */ + if (JS_SetPropertyStr(ctx, rx, "lastIndex", JS_NewInt32(ctx, 0)) < 0) goto fail_rx_split; + + JSValue sub_str = js_sub_string(ctx, p, pos, len); + if (JS_IsException(sub_str)) goto fail_rx_split; + + JSValue exec_res = JS_Invoke(ctx, rx, JS_ATOM_exec, 1, (JSValueConst *)&sub_str); + JS_FreeValue(ctx, sub_str); + if (JS_IsException(exec_res)) goto fail_rx_split; + + if (JS_IsNull(exec_res)) { + JS_FreeValue(ctx, exec_res); + /* remainder */ + JSValue tail = js_sub_string(ctx, p, pos, len); + if (JS_IsException(tail)) goto fail_rx_split; + JS_SetPropertyInt64(ctx, result, out_idx++, tail); + break; + } + + /* local match index within sub_str */ + JSValue idx_val = JS_GetPropertyStr(ctx, exec_res, "index"); + if (JS_IsException(idx_val)) { + JS_FreeValue(ctx, exec_res); + goto fail_rx_split; + } + + int32_t local_index = 0; + if (JS_ToInt32(ctx, &local_index, idx_val)) { + JS_FreeValue(ctx, idx_val); + JS_FreeValue(ctx, exec_res); + goto fail_rx_split; + } + JS_FreeValue(ctx, idx_val); + + if (local_index < 0) local_index = 0; + + int found = pos + local_index; + if (found < pos) found = pos; + if (found > len) { + /* treat as no more matches */ + JS_FreeValue(ctx, exec_res); + JSValue tail = js_sub_string(ctx, p, pos, len); + if (JS_IsException(tail)) goto fail_rx_split; + JS_SetPropertyInt64(ctx, result, out_idx++, tail); + break; + } + + /* match text is exec_res[0] */ + JSValue match = JS_GetPropertyUint32(ctx, exec_res, 0); + JS_FreeValue(ctx, exec_res); + if (JS_IsException(match)) goto fail_rx_split; + + /* compute match length in code units */ + int match_len = 0; + { + JSValue mstr = JS_ToString(ctx, match); + if (JS_IsException(mstr)) { + JS_FreeValue(ctx, match); + goto fail_rx_split; + } + JSString *mp = JS_VALUE_GET_STRING(mstr); + match_len = (int)mp->len; + JS_FreeValue(ctx, mstr); + } + JS_FreeValue(ctx, match); + + /* emit piece before match */ + JSValue part = js_sub_string(ctx, p, pos, found); + if (JS_IsException(part)) goto fail_rx_split; + JS_SetPropertyInt64(ctx, result, out_idx++, part); + + /* advance past match; ensure progress on empty matches */ + pos = found + match_len; + if (match_len == 0) { + if (found >= len) { + /* match at end: add trailing empty field and stop */ + JSValue empty = JS_NewStringLen(ctx, "", 0); + if (JS_IsException(empty)) goto fail_rx_split; + JS_SetPropertyInt64(ctx, result, out_idx++, empty); + break; + } + pos = found + 1; + } + } + + /* restore lastIndex */ + JS_SetPropertyStr(ctx, rx, "lastIndex", orig_last_index); + + JS_FreeValue(ctx, str); + return result; + + fail_rx_split: + /* best-effort restore lastIndex */ + if (!JS_IsException(orig_last_index)) { + JS_SetPropertyStr(ctx, rx, "lastIndex", orig_last_index); + } else { + JS_FreeValue(ctx, orig_last_index); + } + JS_FreeValue(ctx, result); + JS_FreeValue(ctx, str); + return JS_EXCEPTION; + } + if (tag2 == JS_TAG_INT || tag2 == JS_TAG_FLOAT64) { /* Dice into chunks */ int chunk_len; @@ -37126,6 +37515,9 @@ void JS_AddIntrinsicBaseObjects(JSContext *ctx) JS_DefinePropertyValueStr(ctx, ctx->global_obj, "search", JS_NewCFunction(ctx, js_cell_text_search, "search", 3), JS_PROP_WRITABLE | JS_PROP_CONFIGURABLE); + JS_DefinePropertyValueStr(ctx, ctx->global_obj, "extract", + JS_NewCFunction(ctx, js_cell_text_extract, "extract", 3), + JS_PROP_WRITABLE | JS_PROP_CONFIGURABLE); JS_DefinePropertyValueStr(ctx, ctx->global_obj, "reduce", JS_NewCFunction(ctx, js_cell_array_reduce, "reduce", 4), JS_PROP_WRITABLE | JS_PROP_CONFIGURABLE); diff --git a/tests/suite.cm b/tests/suite.cm index 3e61ae6a..dbdf9101 100644 --- a/tests/suite.cm +++ b/tests/suite.cm @@ -2858,6 +2858,26 @@ return { if (result != "he[2][3]o") throw "replace with function failed: " + result }, + test_replace_with_function_limit: function() { + var result = replace("banana", "a", (match, pos) => `[${pos}]`, 2) + if (result != "b[1]n[3]na") throw "replace with function limit failed: " + result + }, + + test_replace_with_regex: function() { + var result = replace("banana", /a/, "o") + if (result != "bonono") throw "replace with regex failed" + }, + + test_replace_with_regex_limit: function() { + var result = replace("banana", /a/, "o", 2) + if (result != "bonona") throw "replace with regex limit failed: " + result + }, + + test_replace_with_regex_function: function() { + var result = replace("hello", /l/, (match, pos) => `[${pos}]`) + if (result != "he[2][3]o") throw "replace with regex function failed: " + result + }, + // ============================================================================ // TEXT FUNCTION (Conversion and Slicing) // ============================================================================ @@ -3520,4 +3540,82 @@ return { if (result != 42) throw "immediately invoked function failed" }, + test_text_split_text: function() { + var text = "hello world" + var result = array(text, " ") + if (result.length != 2) throw "text split failed" + if (result[0] != "hello") throw "text split failed" + if (result[1] != "world") throw "text split failed" + }, + + test_text_split_regex: function() { + var text = "hello world" + var result = array(text, /\s+/) + if (result.length != 2) throw "text split failed" + if (result[0] != "hello") throw "text split failed" + if (result[1] != "world") throw "text split failed" + }, + + test_text_search_text: function() { + var text = "hello world" + var result = search(text, "world") + if (result != 6) throw "text search failed" + }, + + test_text_search_regex: function() { + var text = "hello world" + var result = search(text, /world/) + if (result != 6) throw "text search failed" + }, + + test_extract_basic_text: function() { + var text = "hello world" + var result = extract(text, "world") + if (result[0] != "world") throw "extract basic text failed" + }, + + test_extract_text_not_found: function() { + var text = "hello world" + var result = extract(text, "xyz") + if (result != null) throw "extract not found should return null" + }, + + test_extract_regex_basic: function() { + var text = "hello world" + var result = extract(text, /world/) + if (result[0] != "world") throw "extract regex basic failed" + }, + + test_extract_regex_with_capture_group: function() { + var text = "hello world" + var result = extract(text, /(\w+) (\w+)/) + if (result[0] != "hello world") throw "extract regex full match failed" + if (result[1] != "hello") throw "extract regex capture group 1 failed" + if (result[2] != "world") throw "extract regex capture group 2 failed" + }, + + test_extract_regex_digits: function() { + var text = "abc123def456" + var result = extract(text, /(\d+)/) + if (result[0] != "123") throw "extract regex digits failed" + if (result[1] != "123") throw "extract regex digits capture failed" + }, + + test_extract_with_from: function() { + var text = "hello hello world" + var result = extract(text, "hello", 1) + if (result[0] != "hello") throw "extract with from failed" + }, + + test_extract_with_from_to: function() { + var text = "hello world hello" + var result = extract(text, "hello", 0, 10) + if (result[0] != "hello") throw "extract with from to failed" + }, + + test_extract_regex_case_insensitive: function() { + var text = "Hello World" + var result = extract(text, /hello/i) + if (result[0] != "Hello") throw "extract regex case insensitive failed" + }, }