211 lines
4.8 KiB
C
211 lines
4.8 KiB
C
#include "qjs_utf8.h"
|
|
#include "qjs_blob.h"
|
|
#include "jsffi.h"
|
|
#include <string.h>
|
|
#include <stdlib.h>
|
|
|
|
#include "kim.h"
|
|
|
|
// Get codepoints from a UTF-8 string
|
|
JSC_CCALL(utf8_codepoints,
|
|
const char *str = JS_ToCString(js, argv[0]);
|
|
if (!str) return JS_EXCEPTION;
|
|
|
|
JSValue arr = JS_NewArray(js);
|
|
int idx = 0;
|
|
|
|
char *ptr = (char*)str;
|
|
while (*ptr) {
|
|
int codepoint = decode_utf8(&ptr);
|
|
JS_SetPropertyUint32(js, arr, idx++, JS_NewInt32(js, codepoint));
|
|
}
|
|
|
|
JS_FreeCString(js, str);
|
|
ret = arr;
|
|
)
|
|
|
|
// Create UTF-8 string from codepoints
|
|
JSC_CCALL(utf8_from_codepoints,
|
|
int len = JS_ArrayLength(js, argv[0]);
|
|
|
|
// Allocate buffer (worst case: 4 bytes per codepoint + null)
|
|
char *buffer = malloc(len * 4 + 1);
|
|
char *ptr = buffer;
|
|
|
|
for (int i = 0; i < len; i++) {
|
|
JSValue val = JS_GetPropertyUint32(js, argv[0], i);
|
|
int codepoint;
|
|
JS_ToInt32(js, &codepoint, val);
|
|
JS_FreeValue(js, val);
|
|
|
|
encode_utf8(&ptr, codepoint);
|
|
}
|
|
|
|
*ptr = '\0';
|
|
ret = JS_NewString(js, buffer);
|
|
free(buffer);
|
|
)
|
|
|
|
// Count UTF-8 characters (runes) in a string
|
|
JSC_SCALL(utf8_length,
|
|
int count = utf8_count(str);
|
|
ret = JS_NewInt32(js, count);
|
|
)
|
|
|
|
// Validate UTF-8 string
|
|
JSC_SCALL(utf8_validate,
|
|
char *ptr = (char*)str;
|
|
int valid = 1;
|
|
|
|
while (*ptr) {
|
|
int start_pos = ptr - str;
|
|
int codepoint = decode_utf8(&ptr);
|
|
|
|
// Check for invalid sequences
|
|
if (codepoint < 0 || codepoint > 0x10FFFF ||
|
|
(codepoint >= 0xD800 && codepoint <= 0xDFFF)) {
|
|
valid = 0;
|
|
break;
|
|
}
|
|
|
|
// Check for overlong encodings
|
|
int bytes_used = ptr - (str + start_pos);
|
|
if ((codepoint <= 0x7F && bytes_used != 1) ||
|
|
(codepoint <= 0x7FF && bytes_used != 2) ||
|
|
(codepoint <= 0xFFFF && bytes_used != 3) ||
|
|
(codepoint <= 0x10FFFF && bytes_used != 4)) {
|
|
valid = 0;
|
|
break;
|
|
}
|
|
}
|
|
|
|
ret = JS_NewBool(js, valid);
|
|
)
|
|
|
|
// Get byte length of UTF-8 string
|
|
JSC_SCALL(utf8_byte_length,
|
|
ret = JS_NewInt32(js, strlen(str));
|
|
)
|
|
|
|
// Encode string to UTF-8 bytes
|
|
JSC_SCALL(utf8_encode,
|
|
size_t len = strlen(str);
|
|
ret = js_new_blob_stoned_copy(js, str, len);
|
|
)
|
|
|
|
// Decode UTF-8 bytes to string
|
|
JSC_CCALL(utf8_decode,
|
|
size_t len;
|
|
void *data = js_get_blob_data(js, &len, argv[0]);
|
|
if (!data) return JS_ThrowTypeError(js, "Expected blob");
|
|
|
|
// Create null-terminated string
|
|
char *str = malloc(len + 1);
|
|
memcpy(str, data, len);
|
|
str[len] = '\0';
|
|
|
|
ret = JS_NewString(js, str);
|
|
free(str);
|
|
)
|
|
|
|
// Slice UTF-8 string by character indices (not byte indices)
|
|
JSC_CCALL(utf8_slice,
|
|
const char *str = JS_ToCString(js, argv[0]);
|
|
if (!str) return JS_EXCEPTION;
|
|
|
|
int start = 0;
|
|
int end = utf8_count(str);
|
|
|
|
if (argc > 1) JS_ToInt32(js, &start, argv[1]);
|
|
if (argc > 2) JS_ToInt32(js, &end, argv[2]);
|
|
|
|
// Handle negative indices
|
|
int total = end;
|
|
if (start < 0) start = total + start;
|
|
if (end < 0) end = total + end;
|
|
|
|
// Clamp values
|
|
if (start < 0) start = 0;
|
|
if (end > total) end = total;
|
|
if (start >= end) {
|
|
JS_FreeCString(js, str);
|
|
return JS_NewString(js, "");
|
|
}
|
|
|
|
// Find start position
|
|
char *ptr = (char*)str;
|
|
for (int i = 0; i < start && *ptr; i++) {
|
|
decode_utf8(&ptr);
|
|
}
|
|
char *start_ptr = ptr;
|
|
|
|
// Find end position
|
|
for (int i = start; i < end && *ptr; i++) {
|
|
decode_utf8(&ptr);
|
|
}
|
|
|
|
// Create substring
|
|
size_t slice_len = ptr - start_ptr;
|
|
char *slice = malloc(slice_len + 1);
|
|
memcpy(slice, start_ptr, slice_len);
|
|
slice[slice_len] = '\0';
|
|
|
|
ret = JS_NewString(js, slice);
|
|
free(slice);
|
|
JS_FreeCString(js, str);
|
|
)
|
|
|
|
// Get character at index
|
|
JSC_CCALL(utf8_char_at,
|
|
const char *str = JS_ToCString(js, argv[0]);
|
|
if (!str) return JS_EXCEPTION;
|
|
|
|
int index;
|
|
JS_ToInt32(js, &index, argv[1]);
|
|
|
|
char *ptr = (char*)str;
|
|
int count = 0;
|
|
|
|
// Skip to index
|
|
while (*ptr && count < index) {
|
|
decode_utf8(&ptr);
|
|
count++;
|
|
}
|
|
|
|
if (!*ptr || count != index) {
|
|
JS_FreeCString(js, str);
|
|
return JS_NULL;
|
|
}
|
|
|
|
// Get the character
|
|
char *char_start = ptr;
|
|
decode_utf8(&ptr);
|
|
|
|
size_t char_len = ptr - char_start;
|
|
char *result = malloc(char_len + 1);
|
|
memcpy(result, char_start, char_len);
|
|
result[char_len] = '\0';
|
|
|
|
ret = JS_NewString(js, result);
|
|
free(result);
|
|
JS_FreeCString(js, str);
|
|
)
|
|
|
|
static const JSCFunctionListEntry js_utf8_funcs[] = {
|
|
MIST_FUNC_DEF(utf8, codepoints, 1),
|
|
MIST_FUNC_DEF(utf8, from_codepoints, 1),
|
|
MIST_FUNC_DEF(utf8, length, 1),
|
|
MIST_FUNC_DEF(utf8, validate, 1),
|
|
MIST_FUNC_DEF(utf8, byte_length, 1),
|
|
MIST_FUNC_DEF(utf8, encode, 1),
|
|
MIST_FUNC_DEF(utf8, decode, 1),
|
|
MIST_FUNC_DEF(utf8, slice, 3),
|
|
MIST_FUNC_DEF(utf8, char_at, 2),
|
|
};
|
|
|
|
// Create the utf8 module: a new object populated with every entry of
// js_utf8_funcs. The object is returned to the caller.
JSValue js_utf8_use(JSContext *js)
{
  JSValue exports = JS_NewObject(js);
  JS_SetPropertyFunctionList(js, exports, js_utf8_funcs, countof(js_utf8_funcs));
  return exports;
}