Files
cell/source/qjs_utf8.c

211 lines
4.8 KiB
C

#include "qjs_utf8.h"
#include "qjs_blob.h"
#include "jsffi.h"
#include <string.h>
#include <stdlib.h>
#include "kim.h"
// Decode a UTF-8 string into a JS array holding one integer codepoint per rune.
// argv[0] is converted to a C string; a failed conversion yields JS_EXCEPTION.
JSC_CCALL(utf8_codepoints,
  const char *text = JS_ToCString(js, argv[0]);
  if (!text) return JS_EXCEPTION;

  JSValue codepoints = JS_NewArray(js);

  // decode_utf8 receives the cursor by address and is relied upon to move it
  // forward, which is why the loop itself only bumps the array slot.
  char *cursor = (char*)text;
  for (int slot = 0; *cursor; slot++) {
    int cp = decode_utf8(&cursor);
    JS_SetPropertyUint32(js, codepoints, slot, JS_NewInt32(js, cp));
  }

  JS_FreeCString(js, text);
  ret = codepoints;
)
// Create UTF-8 string from an array of integer codepoints (argv[0]).
// Throws RangeError for a codepoint outside 0..0x10FFFF and an out-of-memory
// error when the scratch buffer cannot be allocated. When an element cannot be
// converted to an integer the pending exception is propagated.
JSC_CCALL(utf8_from_codepoints,
  int len = JS_ArrayLength(js, argv[0]);
  // NOTE(review): a negative length is treated as an empty array here; confirm
  // what JS_ArrayLength reports for non-array arguments.
  if (len < 0) len = 0;

  // Worst case is 4 bytes per codepoint. The size is computed in size_t so a
  // long array cannot overflow int arithmetic; the extra byte keeps the
  // request non-zero for an empty array.
  char *buffer = malloc((size_t)len * 4 + 1);
  if (!buffer) return JS_ThrowOutOfMemory(js);

  char *ptr = buffer;
  for (int i = 0; i < len; i++) {
    JSValue val = JS_GetPropertyUint32(js, argv[0], i);
    int codepoint = 0;
    int failed = JS_ToInt32(js, &codepoint, val);
    JS_FreeValue(js, val);
    if (failed) {
      free(buffer);
      return JS_EXCEPTION;
    }
    // The 4-byte budget above is only guaranteed for real Unicode codepoints
    if (codepoint < 0 || codepoint > 0x10FFFF) {
      free(buffer);
      return JS_ThrowRangeError(js, "Invalid codepoint %d", codepoint);
    }
    encode_utf8(&ptr, codepoint);
  }

  // Pass the written length explicitly so a 0 codepoint does not cut the
  // result short the way a strlen-based constructor would.
  ret = JS_NewStringLen(js, buffer, (size_t)(ptr - buffer));
  free(buffer);
)
// Count UTF-8 characters (runes) in a string
JSC_SCALL(utf8_length,
int count = utf8_count(str);
ret = JS_NewInt32(js, count);
)
// Validate UTF-8 string.
// Produces a JS boolean: true when every decoded codepoint lies in
// 0..0x10FFFF outside the surrogate range 0xD800..0xDFFF and was encoded in
// the minimal number of bytes; false as soon as one sequence fails.
// `str` is not declared in this body; it comes from the JSC_SCALL wrapper.
// NOTE(review): detection of malformed lead or continuation bytes depends on
// what decode_utf8 returns for them; confirm against its implementation.
JSC_SCALL(utf8_validate,
  char *ptr = (char*)str;
  int valid = 1;
  while (*ptr) {
    const char *seq_start = ptr;
    int codepoint = decode_utf8(&ptr);

    // Check for values outside the Unicode range and for surrogates
    if (codepoint < 0 || codepoint > 0x10FFFF ||
        (codepoint >= 0xD800 && codepoint <= 0xDFFF)) {
      valid = 0;
      break;
    }

    // Check for overlong encodings. Each codepoint has exactly one legal
    // length, so pick it from the smallest range that contains the value and
    // compare once. Testing the ranges independently would match several of
    // them for small codepoints and reject plain ASCII.
    int bytes_used = (int)(ptr - seq_start);
    int bytes_expected = codepoint <= 0x7F   ? 1
                       : codepoint <= 0x7FF  ? 2
                       : codepoint <= 0xFFFF ? 3
                       : 4;
    if (bytes_used != bytes_expected) {
      valid = 0;
      break;
    }
  }
  ret = JS_NewBool(js, valid);
)
// Get byte length of UTF-8 string
JSC_SCALL(utf8_byte_length,
ret = JS_NewInt32(js, strlen(str));
)
// Encode string to UTF-8 bytes
JSC_SCALL(utf8_encode,
size_t len = strlen(str);
ret = js_new_blob_stoned_copy(js, str, len);
)
// Decode UTF-8 bytes to string.
// argv[0] is read through js_get_blob_data; when that yields no data a
// TypeError("Expected blob") is thrown.
JSC_CCALL(utf8_decode,
  size_t len = 0;
  void *data = js_get_blob_data(js, &len, argv[0]);
  if (!data) return JS_ThrowTypeError(js, "Expected blob");

  // Build the string from pointer + length. This needs no temporary
  // null-terminated copy (and so no allocation that could fail), and a 0 byte
  // inside the blob no longer truncates the result.
  ret = JS_NewStringLen(js, data, len);
)
// Slice UTF-8 string by character indices (not byte indices).
//   argv[0]  source string
//   argv[1]  optional start index, defaults to 0
//   argv[2]  optional end index (exclusive), defaults to the character count
// Negative indices count back from the end of the string. Both indices are
// clamped to the string, and an empty string results when start >= end.
// A failed index conversion propagates the pending exception.
JSC_CCALL(utf8_slice,
  const char *str = JS_ToCString(js, argv[0]);
  if (!str) return JS_EXCEPTION;

  // The real character count must be captured before the caller's end index
  // is read; negative-index handling and clamping are both relative to it.
  int total = utf8_count(str);
  int start = 0;
  int end = total;

  if (argc > 1 && JS_ToInt32(js, &start, argv[1])) {
    JS_FreeCString(js, str);
    return JS_EXCEPTION;
  }
  if (argc > 2 && JS_ToInt32(js, &end, argv[2])) {
    JS_FreeCString(js, str);
    return JS_EXCEPTION;
  }

  // Handle negative indices
  if (start < 0) start += total;
  if (end < 0) end += total;

  // Clamp values
  if (start < 0) start = 0;
  if (end > total) end = total;

  if (start >= end) {
    JS_FreeCString(js, str);
    return JS_NewString(js, "");
  }

  // Find start position
  char *ptr = (char*)str;
  for (int i = 0; i < start && *ptr; i++) {
    decode_utf8(&ptr);
  }
  char *start_ptr = ptr;

  // Find end position
  for (int i = start; i < end && *ptr; i++) {
    decode_utf8(&ptr);
  }

  // Create the substring straight from the byte range; no scratch copy needed
  ret = JS_NewStringLen(js, start_ptr, (size_t)(ptr - start_ptr));
  JS_FreeCString(js, str);
)
// Get character at index.
//   argv[0]  source string
//   argv[1]  zero-based character index (not a byte index)
// Yields a one-character string, or undefined when the index is negative or
// past the last character. A failed index conversion propagates the pending
// exception.
JSC_CCALL(utf8_char_at,
  const char *str = JS_ToCString(js, argv[0]);
  if (!str) return JS_EXCEPTION;

  // Initialized so a failed conversion can never leave it indeterminate
  int index = 0;
  if (argc > 1 && JS_ToInt32(js, &index, argv[1])) {
    JS_FreeCString(js, str);
    return JS_EXCEPTION;
  }

  char *ptr = (char*)str;
  int count = 0;

  // Skip to index
  while (*ptr && count < index) {
    decode_utf8(&ptr);
    count++;
  }

  // Ran off the end, or the index was negative and the loop never ran
  if (!*ptr || count != index) {
    JS_FreeCString(js, str);
    return JS_UNDEFINED;
  }

  // Get the character: decode once more to learn how many bytes it spans
  char *char_start = ptr;
  decode_utf8(&ptr);
  ret = JS_NewStringLen(js, char_start, (size_t)(ptr - char_start));
  JS_FreeCString(js, str);
)
// Function table installed on the utf8 module object by js_utf8_use.
// Each entry names one of the functions declared in this file with
// JSC_CCALL / JSC_SCALL.
// NOTE(review): the trailing number looks like the declared argument count
// for the JS function; confirm against the MIST_FUNC_DEF macro definition.
static const JSCFunctionListEntry js_utf8_funcs[] = {
MIST_FUNC_DEF(utf8, codepoints, 1),
MIST_FUNC_DEF(utf8, from_codepoints, 1),
MIST_FUNC_DEF(utf8, length, 1),
MIST_FUNC_DEF(utf8, validate, 1),
MIST_FUNC_DEF(utf8, byte_length, 1),
MIST_FUNC_DEF(utf8, encode, 1),
MIST_FUNC_DEF(utf8, decode, 1),
MIST_FUNC_DEF(utf8, slice, 3),
MIST_FUNC_DEF(utf8, char_at, 2),
};
// Build the utf8 module: a fresh JS object carrying every function listed in
// js_utf8_funcs. The new object is returned to the caller.
JSValue js_utf8_use(JSContext *js)
{
  JSValue exports = JS_NewObject(js);

  JS_SetPropertyFunctionList(js, exports, js_utf8_funcs,
                             countof(js_utf8_funcs));

  return exports;
}