text extract

This commit is contained in:
2026-01-17 15:48:43 -06:00
parent 45ee4a337c
commit 97ece8e5cb
3 changed files with 663 additions and 173 deletions

View File

@@ -30399,56 +30399,6 @@ static int check_regexp_g_flag(JSContext *ctx, JSValueConst regexp)
return 0;
}
/*
 * Shared native behind match(rx), search(rx) and matchAll(rx).
 *
 * `atom` selects the well-known symbol to dispatch on: JS_ATOM_Symbol_match,
 * JS_ATOM_Symbol_search or JS_ATOM_Symbol_matchAll.
 *
 * If argv[0] is non-null and has a non-null property under `atom`, that
 * property is called with argv[0] as receiver and the original `this` value
 * as its single argument.  Otherwise `this` is converted to a string, a
 * RegExp is constructed from argv[0] through ctx->regexp_ctor (with the extra
 * flag string "g" for matchAll) and its `atom` method is invoked on the string.
 *
 * Returns the value produced by that call, or JS_EXCEPTION on failure.
 */
static JSValue js_string_match(JSContext *ctx, JSValueConst this_val,
                               int argc, JSValueConst *argv, int atom)
{
    JSValueConst O = this_val, regexp = argv[0], args[2];
    JSValue matcher, S, rx, result, str;
    int args_len;

    if (JS_IsNull(O))
        return JS_ThrowTypeError(ctx, "cannot convert to object");

    if (!JS_IsNull(regexp)) {
        matcher = JS_GetProperty(ctx, regexp, atom);
        if (JS_IsException(matcher))
            return JS_EXCEPTION;
        if (atom == JS_ATOM_Symbol_matchAll &&
            check_regexp_g_flag(ctx, regexp) < 0) {
            JS_FreeValue(ctx, matcher);
            return JS_EXCEPTION;
        }
        if (!JS_IsNull(matcher))
            return JS_CallFree(ctx, matcher, regexp, 1, &O);
    }

    /* Fallback: build a RegExp from the argument and invoke its method. */
    S = JS_ToString(ctx, O);
    if (JS_IsException(S))
        return JS_EXCEPTION;

    args[0] = regexp;
    args_len = 1;
    str = JS_NULL;
    if (atom == JS_ATOM_Symbol_matchAll) {
        str = js_new_string8(ctx, "g");
        if (JS_IsException(str))
            goto fail;
        args[args_len++] = (JSValueConst)str;
    }

    rx = JS_CallConstructor(ctx, ctx->regexp_ctor, args_len, args);
    JS_FreeValue(ctx, str);
    if (JS_IsException(rx))
        goto fail;

    result = JS_InvokeFree(ctx, rx, atom, 1, (JSValueConst *)&S);
    JS_FreeValue(ctx, S);
    return result;

fail:
    JS_FreeValue(ctx, S);
    return JS_EXCEPTION;
}
static JSValue js_string___GetSubstitution(JSContext *ctx, JSValueConst this_val,
int argc, JSValueConst *argv)
{
@@ -30738,7 +30688,6 @@ static const JSCFunctionListEntry js_string_proto_funcs[] = {
JS_CFUNC_DEF("concat", 1, js_string_concat),
JS_CFUNC_MAGIC_DEF("indexOf", 1, js_string_indexOf, 0 ),
JS_CFUNC_MAGIC_DEF("lastIndexOf", 1, js_string_indexOf, 1 ),
JS_CFUNC_MAGIC_DEF("match", 1, js_string_match, JS_ATOM_Symbol_match ),
JS_CFUNC_MAGIC_DEF("replace", 2, js_string_replace, 0 ),
JS_CFUNC_MAGIC_DEF("replaceAll", 2, js_string_replace, 1 ),
JS_CFUNC_DEF("toString", 0, js_string_toString ),
@@ -34345,60 +34294,6 @@ static JSValue js_cell_text_codepoint(JSContext *ctx, JSValueConst this_val,
return JS_NewInt32(ctx, c);
}
/* text.search(str, target, from) - find substring
 *
 * argv[0]  text to search; must be a string or rope
 * argv[1]  text to look for; must be a string or rope
 * argv[2]  optional start index; a negative value counts from the end of
 *          the text and is clamped to 0
 *
 * Returns the index of the first occurrence at or after `from`, JS_NULL when
 * fewer than two arguments are given, when either of the first two is not a
 * string, or when nothing is found, and JS_EXCEPTION when a conversion fails.
 */
static JSValue js_cell_text_search(JSContext *ctx, JSValueConst this_val,
                                   int argc, JSValueConst *argv)
{
    if (argc < 2) return JS_NULL;

    int tag1 = JS_VALUE_GET_TAG(argv[0]);
    int tag2 = JS_VALUE_GET_TAG(argv[1]);
    if ((tag1 != JS_TAG_STRING && tag1 != JS_TAG_STRING_ROPE) ||
        (tag2 != JS_TAG_STRING && tag2 != JS_TAG_STRING_ROPE))
        return JS_NULL;

    JSValue str = JS_ToString(ctx, argv[0]);
    if (JS_IsException(str)) return str;

    JSValue target = JS_ToString(ctx, argv[1]);
    if (JS_IsException(target)) {
        JS_FreeValue(ctx, str);
        return target;
    }

    JSString *p = JS_VALUE_GET_STRING(str);
    JSString *t = JS_VALUE_GET_STRING(target);
    int len = p->len;
    int t_len = t->len;
    JSValue ret = JS_NULL;

    int from = 0;
    if (argc > 2 && !JS_IsNull(argv[2])) {
        if (JS_ToInt32(ctx, &from, argv[2])) {
            /* Report the conversion failure instead of answering
               "not found" while the error is left behind. */
            ret = JS_EXCEPTION;
            goto done;
        }
        if (from < 0) from += len;
        if (from < 0) from = 0;
    }

    if (len >= t_len) {
        for (int i = from; i <= len - t_len; i++) {
            if (!string_cmp(p, t, i, 0, t_len)) {
                ret = JS_NewInt32(ctx, i);
                break;
            }
        }
    }

done:
    JS_FreeValue(ctx, str);
    JS_FreeValue(ctx, target);
    return ret;
}
/* Helpers (C, not C++). Put these above js_cell_text_replace in the same C file. */
static int sb_concat_value_to_string_free(JSContext *ctx, StringBuffer *b, JSValue v)
@@ -34438,6 +34333,18 @@ static JSValue make_replacement(JSContext *ctx, int argc, JSValueConst *argv, in
return JS_AtomToString(ctx, JS_ATOM_empty_string);
}
/*
 * Duck-typed regex test: an object counts as a regex when its "exec"
 * property is a function.
 *
 * Returns the JS_IsFunction() result for that property, 0 when `v` is not an
 * object, and -1 when reading "exec" raised an exception.
 */
static int JS_IsRegExp(JSContext *ctx, JSValueConst v)
{
    JSValue exec_prop;
    int callable;

    if (!JS_IsObject(v))
        return 0;

    exec_prop = JS_GetPropertyStr(ctx, v, "exec");
    if (JS_IsException(exec_prop))
        return -1;

    callable = JS_IsFunction(ctx, exec_prop);
    JS_FreeValue(ctx, exec_prop);
    return callable;
}
/* text.replace(text, target, replacement, limit)
*
* Return a new text in which the target is replaced by the replacement.
@@ -34451,145 +34358,502 @@ static JSValue make_replacement(JSContext *ctx, int argc, JSValueConst *argv, in
* Example: replace("abc", "", "-") => "-a-b-c-"
* Boundaries count toward limit even if replacement returns null.
*/
/* Builds a new text from argv[0] in which matches of argv[1] are swapped for
 * whatever make_replacement() yields, with argv[3] as an optional limit on the
 * number of replacements; the output is accumulated in a StringBuffer.
 *
 * NOTE(review): this span carries two signatures, two declarations of `limit`
 * and both a `fail:` and a `fail_rx:` label, so it appears to interleave the
 * removed and the added lines of a diff hunk. Confirm against the real file
 * before relying on the statement order shown here.
 */
static JSValue js_cell_text_replace(JSContext *ctx, JSValueConst this_val,
int argc, JSValueConst *argv)
static JSValue js_cell_text_replace(JSContext *ctx, JSValueConst this_val, int argc, JSValueConst *argv)
{
if (argc < 2) return JS_NULL;
/* Require text + target be strings (or ropes) */
int tag_text = JS_VALUE_GET_TAG(argv[0]);
if (tag_text != JS_TAG_STRING && tag_text != JS_TAG_STRING_ROPE) return JS_NULL;
int target_is_regex = 0;
{
int tag_text = JS_VALUE_GET_TAG(argv[0]);
int tag_tgt = JS_VALUE_GET_TAG(argv[1]);
if ((tag_text != JS_TAG_STRING && tag_text != JS_TAG_STRING_ROPE) ||
(tag_tgt != JS_TAG_STRING && tag_tgt != JS_TAG_STRING_ROPE))
if (tag_tgt == JS_TAG_STRING || tag_tgt == JS_TAG_STRING_ROPE) {
target_is_regex = 0;
/* NOTE(review): JS_IsRegExp() returns -1 when reading "exec" throws; in this
   boolean context -1 selects the regex path - confirm that is intended. */
} else if (JS_IsObject(argv[1]) && JS_IsRegExp(ctx, argv[1])) {
target_is_regex = 1;
} else {
return JS_NULL;
}
}
JSValue str = JS_ToString(ctx, argv[0]);
if (JS_IsException(str)) return str;
JSValue target = JS_ToString(ctx, argv[1]);
if (JS_IsException(target)) {
JS_FreeValue(ctx, str);
return target;
}
JSString *sp = JS_VALUE_GET_STRING(str);
JSString *tp = JS_VALUE_GET_STRING(target);
int len = (int)sp->len;
int32_t limit = -1; /* -1 means unlimited */
int32_t limit = -1;
if (argc > 3 && !JS_IsNull(argv[3])) {
/* NOTE(review): a failed conversion returns JS_NULL rather than JS_EXCEPTION;
   presumably the conversion error stays pending - confirm. */
if (JS_ToInt32(ctx, &limit, argv[3])) {
JS_FreeValue(ctx, str);
JS_FreeValue(ctx, target);
return JS_NULL;
}
/* Any negative limit is normalised to -1 (unlimited). */
if (limit < 0) limit = -1;
}
int len = (int)sp->len;
int t_len = (int)tp->len;
StringBuffer b_s, *b = &b_s;
string_buffer_init(ctx, b, len);
/* Empty target: boundary replacements */
if (t_len == 0) {
if (!target_is_regex) {
JSValue target = JS_ToString(ctx, argv[1]);
if (JS_IsException(target)) {
JS_FreeValue(ctx, str);
return target;
}
JSString *tp = JS_VALUE_GET_STRING(target);
int t_len = (int)tp->len;
if (t_len == 0) {
int32_t count = 0;
/* One replacement per boundary: before each character and once at the end. */
for (int boundary = 0; boundary <= len; boundary++) {
if (limit >= 0 && count >= limit) break;
JSValue match = JS_AtomToString(ctx, JS_ATOM_empty_string);
if (JS_IsException(match)) goto fail_str_target;
JSValue rep = make_replacement(ctx, argc, argv, boundary, match);
if (JS_IsException(rep)) goto fail_str_target;
count++;
if (!JS_IsNull(rep)) {
if (sb_concat_value_to_string_free(ctx, b, rep) < 0) goto fail_str_target;
} else {
JS_FreeValue(ctx, rep);
}
if (boundary < len) {
JSValue ch = js_sub_string(ctx, sp, boundary, boundary + 1);
if (JS_IsException(ch)) goto fail_str_target;
if (string_buffer_concat_value_free(b, ch)) goto fail_str_target;
}
}
JS_FreeValue(ctx, str);
JS_FreeValue(ctx, target);
return string_buffer_end(b);
}
int pos = 0;
int32_t count = 0;
for (int boundary = 0; boundary <= len; boundary++) {
if (limit >= 0 && count >= limit) break;
while (pos <= len - t_len && (limit < 0 || count < limit)) {
int found = -1;
/* match text is "" */
JSValue match = JS_AtomToString(ctx, JS_ATOM_empty_string);
if (JS_IsException(match)) goto fail;
/* Naive scan for the next occurrence of the target at or after pos. */
for (int i = pos; i <= len - t_len; i++) {
if (!string_cmp(sp, tp, i, 0, t_len)) {
found = i;
break;
}
}
if (found < 0) break;
JSValue rep = make_replacement(ctx, argc, argv, boundary, match);
if (JS_IsException(rep)) goto fail;
if (found > pos) {
JSValue sub = js_sub_string(ctx, sp, pos, found);
if (JS_IsException(sub)) goto fail_str_target;
if (string_buffer_concat_value_free(b, sub)) goto fail_str_target;
}
JSValue match = js_sub_string(ctx, sp, found, found + t_len);
if (JS_IsException(match)) goto fail_str_target;
JSValue rep = make_replacement(ctx, argc, argv, found, match);
if (JS_IsException(rep)) goto fail_str_target;
/* Count includes null matches */
count++;
if (!JS_IsNull(rep)) {
if (sb_concat_value_to_string_free(ctx, b, rep) < 0) goto fail;
if (sb_concat_value_to_string_free(ctx, b, rep) < 0) goto fail_str_target;
} else {
JS_FreeValue(ctx, rep);
}
/* Copy next character between boundaries (does not affect count) */
if (boundary < len) {
JSValue ch = js_sub_string(ctx, sp, boundary, boundary + 1);
if (JS_IsException(ch)) goto fail;
if (string_buffer_concat_value_free(b, ch)) goto fail;
}
pos = found + t_len;
}
if (pos < len) {
JSValue sub = js_sub_string(ctx, sp, pos, len);
if (JS_IsException(sub)) goto fail_str_target;
if (string_buffer_concat_value_free(b, sub)) goto fail_str_target;
}
JS_FreeValue(ctx, str);
JS_FreeValue(ctx, target);
return string_buffer_end(b);
fail_str_target:
string_buffer_free(b);
JS_FreeValue(ctx, str);
JS_FreeValue(ctx, target);
return JS_EXCEPTION;
}
/* Non-empty target: left-to-right, non-overlapping */
/* Regex target */
JSValue rx = argv[1];
/* lastIndex is read here so it can be written back before returning. */
JSValue orig_last_index = JS_GetPropertyStr(ctx, rx, "lastIndex");
if (JS_IsException(orig_last_index)) goto fail_rx;
int have_orig_last_index = 1;
int pos = 0;
int32_t count = 0;
while (pos <= len - t_len && (limit < 0 || count < limit)) {
int found = -1;
while (pos <= len && (limit < 0 || count < limit)) {
/* lastIndex is reset to 0 before every exec call; exec is run on the
   remaining suffix, so the reported index is relative to pos. */
if (JS_SetPropertyStr(ctx, rx, "lastIndex", JS_NewInt32(ctx, 0)) < 0) goto fail_rx;
/* Find next occurrence (naive search) */
for (int i = pos; i <= len - t_len; i++) {
if (!string_cmp(sp, tp, i, 0, t_len)) {
found = i;
break;
}
JSValue sub_str = js_sub_string(ctx, sp, pos, len);
if (JS_IsException(sub_str)) goto fail_rx;
JSValue exec_res = JS_Invoke(ctx, rx, JS_ATOM_exec, 1, (JSValueConst *)&sub_str);
JS_FreeValue(ctx, sub_str);
if (JS_IsException(exec_res)) goto fail_rx;
if (JS_IsNull(exec_res)) {
JS_FreeValue(ctx, exec_res);
break;
}
JSValue idx_val = JS_GetPropertyStr(ctx, exec_res, "index");
if (JS_IsException(idx_val)) {
JS_FreeValue(ctx, exec_res);
goto fail_rx;
}
int32_t local_index = 0;
if (JS_ToInt32(ctx, &local_index, idx_val)) {
JS_FreeValue(ctx, idx_val);
JS_FreeValue(ctx, exec_res);
goto fail_rx;
}
JS_FreeValue(ctx, idx_val);
/* Clamp the reported index into [pos, len]; beyond len ends the loop. */
if (local_index < 0) local_index = 0;
int found = pos + local_index;
if (found < pos) found = pos;
if (found > len) {
JS_FreeValue(ctx, exec_res);
break;
}
JSValue match = JS_GetPropertyUint32(ctx, exec_res, 0);
JS_FreeValue(ctx, exec_res);
if (JS_IsException(match)) goto fail_rx;
int match_len = 0;
{
/* NOTE(review): `match` is not released when this conversion (or the prefix
   copy below) jumps to fail_rx - confirm whether that leaks a reference. */
JSValue mstr = JS_ToString(ctx, match);
if (JS_IsException(mstr)) goto fail_rx;
JSString *mp = JS_VALUE_GET_STRING(mstr);
match_len = (int)mp->len;
JS_FreeValue(ctx, mstr);
}
if (found < 0) break;
/* Copy prefix up to match */
if (found > pos) {
JSValue sub = js_sub_string(ctx, sp, pos, found);
if (JS_IsException(sub)) goto fail;
if (string_buffer_concat_value_free(b, sub)) goto fail;
JSValue prefix = js_sub_string(ctx, sp, pos, found);
if (JS_IsException(prefix)) goto fail_rx;
if (string_buffer_concat_value_free(b, prefix)) goto fail_rx;
}
/* Match text for callback */
JSValue match = js_sub_string(ctx, sp, found, found + t_len);
if (JS_IsException(match)) goto fail;
JSValue rep = make_replacement(ctx, argc, argv, found, match);
if (JS_IsException(rep)) goto fail;
if (JS_IsException(rep)) goto fail_rx;
/* Count includes null matches */
count++;
if (!JS_IsNull(rep)) {
if (sb_concat_value_to_string_free(ctx, b, rep) < 0) goto fail;
if (sb_concat_value_to_string_free(ctx, b, rep) < 0) goto fail_rx;
} else {
JS_FreeValue(ctx, rep);
}
pos = found + t_len;
pos = found + match_len;
/* NOTE(review): on an empty match pos steps over the character at `found`
   without that character being appended to the buffer, so it looks like it
   is dropped from the output - confirm. */
if (match_len == 0) {
if (pos < len) pos++;
else break;
}
}
/* Copy remainder */
if (pos < len) {
JSValue sub = js_sub_string(ctx, sp, pos, len);
if (JS_IsException(sub)) goto fail;
if (string_buffer_concat_value_free(b, sub)) goto fail;
JSValue tail = js_sub_string(ctx, sp, pos, len);
if (JS_IsException(tail)) goto fail_rx;
if (string_buffer_concat_value_free(b, tail)) goto fail_rx;
}
/* Write back the lastIndex value that was read before the loop. */
if (have_orig_last_index) JS_SetPropertyStr(ctx, rx, "lastIndex", orig_last_index);
JS_FreeValue(ctx, str);
JS_FreeValue(ctx, target);
return string_buffer_end(b);
fail:
fail_rx:
string_buffer_free(b);
/* Error path: write lastIndex back only when a non-null value was read. */
if (!JS_IsNull(orig_last_index) && !JS_IsException(orig_last_index)) {
JS_SetPropertyStr(ctx, rx, "lastIndex", orig_last_index);
} else {
JS_FreeValue(ctx, orig_last_index);
}
JS_FreeValue(ctx, str);
JS_FreeValue(ctx, target);
return JS_EXCEPTION;
}
/* text.search(str, target, from) - find substring or regex match
 *
 * argv[0]  text to search; must be a string or rope
 * argv[1]  target: a string/rope for a literal search, or an object for
 *          which JS_IsRegExp() holds
 * argv[2]  optional start index; a negative value counts from the end of
 *          the text and is clamped to 0
 *
 * Returns the index of the first match at or after `from`.  JS_NULL is
 * returned when fewer than two arguments are given, when an argument has an
 * unsupported type, when `from` lies past the end of the text, or when there
 * is no match.  JS_EXCEPTION is returned when any conversion, property access
 * or the exec call fails.
 */
static JSValue js_cell_text_search(JSContext *ctx, JSValueConst this_val, int argc, JSValueConst *argv)
{
    if (argc < 2) return JS_NULL;

    int tag1 = JS_VALUE_GET_TAG(argv[0]);
    if (tag1 != JS_TAG_STRING && tag1 != JS_TAG_STRING_ROPE) return JS_NULL;

    int target_is_regex = 0;
    int tag2 = JS_VALUE_GET_TAG(argv[1]);
    if (tag2 != JS_TAG_STRING && tag2 != JS_TAG_STRING_ROPE) {
        if (!JS_IsObject(argv[1])) return JS_NULL;
        /* JS_IsRegExp() reports -1 when the probe itself threw; that must
           not be mistaken for "is a regex". */
        int is_rx = JS_IsRegExp(ctx, argv[1]);
        if (is_rx < 0) return JS_EXCEPTION;
        if (!is_rx) return JS_NULL;
        target_is_regex = 1;
    }

    JSValue str = JS_ToString(ctx, argv[0]);
    if (JS_IsException(str)) return str;

    JSString *p = JS_VALUE_GET_STRING(str);
    int len = (int)p->len;

    int from = 0;
    if (argc > 2 && !JS_IsNull(argv[2])) {
        if (JS_ToInt32(ctx, &from, argv[2])) {
            /* Propagate the conversion failure instead of answering
               "not found" while the error is left behind. */
            JS_FreeValue(ctx, str);
            return JS_EXCEPTION;
        }
        if (from < 0) from += len;
        if (from < 0) from = 0;
    }
    if (from > len) {
        JS_FreeValue(ctx, str);
        return JS_NULL;
    }

    if (!target_is_regex) {
        JSValue target = JS_ToString(ctx, argv[1]);
        if (JS_IsException(target)) {
            JS_FreeValue(ctx, str);
            return target;
        }
        JSString *t = JS_VALUE_GET_STRING(target);
        int t_len = (int)t->len;

        int result = -1;
        if (len >= t_len) {
            for (int i = from; i <= len - t_len; i++) {
                if (!string_cmp(p, t, i, 0, t_len)) {
                    result = i;
                    break;
                }
            }
        }

        JS_FreeValue(ctx, str);
        JS_FreeValue(ctx, target);
        if (result == -1) return JS_NULL;
        return JS_NewInt32(ctx, result);
    }

    /* Regex target: exec is run on the suffix starting at `from` with
       lastIndex forced to 0; the previous lastIndex is written back after. */
    JSValueConst rx = argv[1];
    JSValue orig_last_index = JS_GetPropertyStr(ctx, rx, "lastIndex");
    if (JS_IsException(orig_last_index)) {
        JS_FreeValue(ctx, str);
        return JS_EXCEPTION;
    }

    if (JS_SetPropertyStr(ctx, rx, "lastIndex", JS_NewInt32(ctx, 0)) < 0) goto fail_rx_search;

    JSValue sub_str = js_sub_string(ctx, p, from, len);
    if (JS_IsException(sub_str)) goto fail_rx_search;

    JSValue exec_res = JS_Invoke(ctx, rx, JS_ATOM_exec, 1, (JSValueConst *)&sub_str);
    JS_FreeValue(ctx, sub_str);
    if (JS_IsException(exec_res)) goto fail_rx_search;

    int matched = !JS_IsNull(exec_res);
    int32_t local_index = 0;
    if (matched) {
        JSValue idx_val = JS_GetPropertyStr(ctx, exec_res, "index");
        if (JS_IsException(idx_val)) {
            JS_FreeValue(ctx, exec_res);
            goto fail_rx_search;
        }
        int conv_failed = JS_ToInt32(ctx, &local_index, idx_val);
        JS_FreeValue(ctx, idx_val);
        if (conv_failed) {
            JS_FreeValue(ctx, exec_res);
            goto fail_rx_search;
        }
        if (local_index < 0) local_index = 0;
        /* `exec` is only duck-typed, so the index may be anything.  An index
           past the end of the text is treated as "no match"; comparing
           against len - from also keeps from + local_index from overflowing. */
        if (local_index > len - from) matched = 0;
    }
    JS_FreeValue(ctx, exec_res);

    /* JS_SetPropertyStr takes ownership of orig_last_index. */
    int restore_rc = JS_SetPropertyStr(ctx, rx, "lastIndex", orig_last_index);
    JS_FreeValue(ctx, str);
    if (restore_rc < 0) return JS_EXCEPTION;
    if (!matched) return JS_NULL;
    return JS_NewInt32(ctx, from + local_index);

fail_rx_search:
    /* Best-effort restore: only write lastIndex back when one was read. */
    if (!JS_IsNull(orig_last_index) && !JS_IsException(orig_last_index)) {
        JS_SetPropertyStr(ctx, rx, "lastIndex", orig_last_index);
    } else {
        JS_FreeValue(ctx, orig_last_index);
    }
    JS_FreeValue(ctx, str);
    return JS_EXCEPTION;
}
/* Read the code unit at `idx`, picking the 16-bit or the 8-bit storage
   according to the string's is_wide_char flag.  No bounds check is done. */
static inline uint16_t js_str_get(JSString *s, int idx) {
    if (s->is_wide_char)
        return s->u.str16[idx];
    return s->u.str8[idx];
}
/*
 * Find the first occurrence of `needle` that lies entirely inside
 * hay[from, to).  `from` and `to` are clamped to the haystack (negative
 * values become 0, `to` is capped at the haystack length).
 *
 * Returns the start index of the match, `from` for an empty needle, or -1
 * when the range is inverted, too short, or holds no match.
 */
static int js_str_find_range(JSString *hay, int from, int to, JSString *needle) {
    const int needle_len = (int)needle->len;
    const int hay_len = (int)hay->len;

    if (from < 0)
        from = 0;
    if (to < 0)
        to = 0;
    if (to > hay_len)
        to = hay_len;

    if (from > to)
        return -1;
    if (needle_len == 0)
        return from;
    if (to - from < needle_len)
        return -1;

    /* Last start position at which the needle still fits before `to`. */
    const int last_start = to - needle_len;
    int start = from;
    while (start <= last_start) {
        int k = 0;
        while (k < needle_len &&
               js_str_get(hay, start + k) == js_str_get(needle, k))
            k++;
        if (k == needle_len)
            return start;
        start++;
    }
    return -1;
}
/* extract(text, pattern, from?, to?) - extract match using regexp or literal text
 *
 * argv[0]  text; converted with JS_ToString
 * argv[1]  pattern: a regexp (per js_is_regexp) or a value converted to a
 *          literal string
 * argv[2]  optional start of the search range (default 0)
 * argv[3]  optional end of the search range (default: text length)
 *          Negative bounds count from the end; both are clamped to the text.
 *
 * RegExp pattern: returns whatever pattern.exec() returns for the selected
 * slice of the text.
 * Literal pattern: returns an array holding the matched text at index 0 plus
 * "index" (position in the full text) and "input" (the full text) properties.
 *
 * Returns JS_NULL when fewer than two arguments are given, when from > to,
 * or when a literal pattern is not found; JS_EXCEPTION on failure.
 */
static JSValue js_cell_text_extract(JSContext *ctx, JSValueConst this_val,
                                    int argc, JSValueConst *argv)
{
    if (argc < 2) return JS_NULL;

    JSValue str = JS_ToString(ctx, argv[0]);
    if (JS_IsException(str)) return JS_EXCEPTION;

    JSString *p = JS_VALUE_GET_STRING(str);
    int len = (int)p->len;
    JSValue ret = JS_EXCEPTION;

    int from = 0;
    if (argc >= 3 && !JS_IsNull(argv[2])) {
        if (JS_ToInt32(ctx, &from, argv[2])) goto done;
        if (from < 0) from += len;
        if (from < 0) from = 0;
        if (from > len) from = len;
    }

    int to = len;
    if (argc >= 4 && !JS_IsNull(argv[3])) {
        if (JS_ToInt32(ctx, &to, argv[3])) goto done;
        if (to < 0) to += len;
        if (to < 0) to = 0;
        if (to > len) to = len;
    }

    if (from > to) {
        ret = JS_NULL;
        goto done;
    }

    /* NOTE(review): a negative js_is_regexp() result is assumed to signal a
       pending exception, as in stock QuickJS - confirm for this tree. */
    int is_rx = js_is_regexp(ctx, argv[1]);
    if (is_rx < 0) goto done;

    /* RegExp path */
    if (is_rx) {
        JSValue substr;
        if (from == 0 && to == len) {
            substr = JS_DupValue(ctx, str);
        } else {
            substr = js_sub_string(ctx, p, from, to);
            if (JS_IsException(substr)) goto done;
        }

        JSValue exec_func = JS_GetPropertyStr(ctx, argv[1], "exec");
        if (JS_IsException(exec_func)) {
            JS_FreeValue(ctx, substr);
            goto done;
        }

        ret = JS_Call(ctx, exec_func, argv[1], 1, &substr);
        JS_FreeValue(ctx, exec_func);
        JS_FreeValue(ctx, substr);
        goto done;
    }

    /* Literal text path */
    JSValue needle_val = JS_ToString(ctx, argv[1]);
    if (JS_IsException(needle_val)) goto done;

    JSString *needle = JS_VALUE_GET_STRING(needle_val);
    /* Capture the length before releasing needle_val: when argv[1] was not
       already a string, the conversion result may be destroyed by the free. */
    int needle_len = (int)needle->len;
    int pos = js_str_find_range(p, from, to, needle);
    JS_FreeValue(ctx, needle_val);

    if (pos < 0) {
        ret = JS_NULL;
        goto done;
    }

    JSValue arr = JS_NewArray(ctx);
    if (JS_IsException(arr)) goto done;

    JSValue match = js_sub_string(ctx, p, pos, pos + needle_len);
    if (JS_IsException(match)) {
        JS_FreeValue(ctx, arr);
        goto done;
    }

    /* The define helpers take ownership of the value passed in. */
    if (JS_DefinePropertyValueUint32(ctx, arr, 0, match, JS_PROP_C_W_E) < 0 ||
        JS_DefinePropertyValueStr(ctx, arr, "index", JS_NewInt32(ctx, pos), JS_PROP_C_W_E) < 0 ||
        JS_DefinePropertyValueStr(ctx, arr, "input", JS_DupValue(ctx, str), JS_PROP_C_W_E) < 0) {
        JS_FreeValue(ctx, arr);
        goto done;
    }
    ret = arr;

done:
    JS_FreeValue(ctx, str);
    return ret;
}
/* ----------------------------------------------------------------------------
* array function and sub-functions
* ---------------------------------------------------------------------------- */
@@ -34833,7 +35097,8 @@ static JSValue js_cell_array(JSContext *ctx, JSValueConst this_val,
return result;
}
int tag2 = JS_VALUE_GET_TAG(argv[1]);
int tag2 = JS_VALUE_GET_TAG(argv[1]);
if (tag2 == JS_TAG_STRING || tag2 == JS_TAG_STRING_ROPE) {
/* Split by separator */
const char *cstr = JS_ToCString(ctx, str);
@@ -34859,7 +35124,6 @@ static JSValue js_cell_array(JSContext *ctx, JSValueConst this_val,
const char *found;
if (sep_len == 0) {
/* Split into characters */
for (int i = 0; i < len; i++) {
JSValue ch = js_sub_string(ctx, p, i, i + 1);
JS_SetPropertyInt64(ctx, result, idx++, ch);
@@ -34880,6 +35144,131 @@ static JSValue js_cell_array(JSContext *ctx, JSValueConst this_val,
return result;
}
if (JS_IsObject(argv[1]) && JS_IsRegExp(ctx, argv[1])) {
/* Split by regex (manual "global" iteration; ignore g flag semantics) */
JSValue rx = argv[1];
JSValue result = JS_NewArray(ctx);
if (JS_IsException(result)) {
JS_FreeValue(ctx, str);
return result;
}
/* Save & restore lastIndex to avoid mutating caller-visible state */
JSValue orig_last_index = JS_GetPropertyStr(ctx, rx, "lastIndex");
if (JS_IsException(orig_last_index)) {
JS_FreeValue(ctx, result);
JS_FreeValue(ctx, str);
return JS_EXCEPTION;
}
int pos = 0;
int64_t out_idx = 0;
while (pos <= len) {
/* force lastIndex = 0 so flags don't matter and we fully control iteration */
if (JS_SetPropertyStr(ctx, rx, "lastIndex", JS_NewInt32(ctx, 0)) < 0) goto fail_rx_split;
JSValue sub_str = js_sub_string(ctx, p, pos, len);
if (JS_IsException(sub_str)) goto fail_rx_split;
JSValue exec_res = JS_Invoke(ctx, rx, JS_ATOM_exec, 1, (JSValueConst *)&sub_str);
JS_FreeValue(ctx, sub_str);
if (JS_IsException(exec_res)) goto fail_rx_split;
if (JS_IsNull(exec_res)) {
JS_FreeValue(ctx, exec_res);
/* remainder */
JSValue tail = js_sub_string(ctx, p, pos, len);
if (JS_IsException(tail)) goto fail_rx_split;
JS_SetPropertyInt64(ctx, result, out_idx++, tail);
break;
}
/* local match index within sub_str */
JSValue idx_val = JS_GetPropertyStr(ctx, exec_res, "index");
if (JS_IsException(idx_val)) {
JS_FreeValue(ctx, exec_res);
goto fail_rx_split;
}
int32_t local_index = 0;
if (JS_ToInt32(ctx, &local_index, idx_val)) {
JS_FreeValue(ctx, idx_val);
JS_FreeValue(ctx, exec_res);
goto fail_rx_split;
}
JS_FreeValue(ctx, idx_val);
if (local_index < 0) local_index = 0;
int found = pos + local_index;
if (found < pos) found = pos;
if (found > len) {
/* treat as no more matches */
JS_FreeValue(ctx, exec_res);
JSValue tail = js_sub_string(ctx, p, pos, len);
if (JS_IsException(tail)) goto fail_rx_split;
JS_SetPropertyInt64(ctx, result, out_idx++, tail);
break;
}
/* match text is exec_res[0] */
JSValue match = JS_GetPropertyUint32(ctx, exec_res, 0);
JS_FreeValue(ctx, exec_res);
if (JS_IsException(match)) goto fail_rx_split;
/* compute match length in code units */
int match_len = 0;
{
JSValue mstr = JS_ToString(ctx, match);
if (JS_IsException(mstr)) {
JS_FreeValue(ctx, match);
goto fail_rx_split;
}
JSString *mp = JS_VALUE_GET_STRING(mstr);
match_len = (int)mp->len;
JS_FreeValue(ctx, mstr);
}
JS_FreeValue(ctx, match);
/* emit piece before match */
JSValue part = js_sub_string(ctx, p, pos, found);
if (JS_IsException(part)) goto fail_rx_split;
JS_SetPropertyInt64(ctx, result, out_idx++, part);
/* advance past match; ensure progress on empty matches */
pos = found + match_len;
if (match_len == 0) {
if (found >= len) {
/* match at end: add trailing empty field and stop */
JSValue empty = JS_NewStringLen(ctx, "", 0);
if (JS_IsException(empty)) goto fail_rx_split;
JS_SetPropertyInt64(ctx, result, out_idx++, empty);
break;
}
pos = found + 1;
}
}
/* restore lastIndex */
JS_SetPropertyStr(ctx, rx, "lastIndex", orig_last_index);
JS_FreeValue(ctx, str);
return result;
fail_rx_split:
/* best-effort restore lastIndex */
if (!JS_IsException(orig_last_index)) {
JS_SetPropertyStr(ctx, rx, "lastIndex", orig_last_index);
} else {
JS_FreeValue(ctx, orig_last_index);
}
JS_FreeValue(ctx, result);
JS_FreeValue(ctx, str);
return JS_EXCEPTION;
}
if (tag2 == JS_TAG_INT || tag2 == JS_TAG_FLOAT64) {
/* Dice into chunks */
int chunk_len;
@@ -37126,6 +37515,9 @@ void JS_AddIntrinsicBaseObjects(JSContext *ctx)
JS_DefinePropertyValueStr(ctx, ctx->global_obj, "search",
JS_NewCFunction(ctx, js_cell_text_search, "search", 3),
JS_PROP_WRITABLE | JS_PROP_CONFIGURABLE);
JS_DefinePropertyValueStr(ctx, ctx->global_obj, "extract",
JS_NewCFunction(ctx, js_cell_text_extract, "extract", 3),
JS_PROP_WRITABLE | JS_PROP_CONFIGURABLE);
JS_DefinePropertyValueStr(ctx, ctx->global_obj, "reduce",
JS_NewCFunction(ctx, js_cell_array_reduce, "reduce", 4),
JS_PROP_WRITABLE | JS_PROP_CONFIGURABLE);