From cbda7dfbc9166a21850b735329c2e015993822e5 Mon Sep 17 00:00:00 2001 From: John Alanbrook Date: Sat, 7 Jun 2025 23:35:19 -0500 Subject: [PATCH] add utf8 and kim text encoder/decoders --- meson.build | 2 +- scripts/text.cm | 127 +++++++-------------------- source/jsffi.c | 4 + source/kim.h | 12 +-- source/nota.h | 1 + source/qjs_kim.c | 82 ++++++++++++++++++ source/qjs_kim.h | 8 ++ source/qjs_utf8.c | 211 +++++++++++++++++++++++++++++++++++++++++++++ source/qjs_utf8.h | 8 ++ tests/kim.ce | 51 +++++++++++ tests/text_utf8.ce | 47 ++++++++++ tests/utf8.ce | 70 +++++++++++++++ 12 files changed, 518 insertions(+), 105 deletions(-) create mode 100644 source/qjs_kim.c create mode 100644 source/qjs_kim.h create mode 100644 source/qjs_utf8.c create mode 100644 source/qjs_utf8.h create mode 100644 tests/kim.ce create mode 100644 tests/text_utf8.ce create mode 100644 tests/utf8.ce diff --git a/meson.build b/meson.build index bcf7296d..e5b7ae56 100644 --- a/meson.build +++ b/meson.build @@ -295,7 +295,7 @@ src += [ 'anim.c', 'config.c', 'datastream.c','font.c','HandmadeMath.c','jsffi.c','model.c', 'render.c','simplex.c','spline.c', 'transform.c','cell.c', 'wildmatch.c', 'sprite.c', 'rtree.c', 'qjs_nota.c', 'qjs_soloud.c', 'qjs_sdl.c', 'qjs_sdl_input.c', 'qjs_sdl_video.c', 'qjs_sdl_surface.c', 'qjs_math.c', 'qjs_geometry.c', 'qjs_transform.c', 'qjs_sprite.c', 'qjs_io.c', 'qjs_fd.c', 'qjs_os.c', 'qjs_actor.c', - 'qjs_qr.c', 'qjs_wota.c', 'monocypher.c', 'qjs_blob.c', 'qjs_crypto.c', 'qjs_time.c', 'qjs_http.c', 'qjs_rtree.c', 'qjs_spline.c', 'qjs_js.c', 'qjs_debug.c', 'picohttpparser.c', 'qjs_miniz.c', 'timer.c', 'qjs_socket.c' + 'qjs_qr.c', 'qjs_wota.c', 'monocypher.c', 'qjs_blob.c', 'qjs_crypto.c', 'qjs_time.c', 'qjs_http.c', 'qjs_rtree.c', 'qjs_spline.c', 'qjs_js.c', 'qjs_debug.c', 'picohttpparser.c', 'qjs_miniz.c', 'timer.c', 'qjs_socket.c', 'qjs_kim.c', 'qjs_utf8.c' ] # quirc src src += [ diff --git a/scripts/text.cm b/scripts/text.cm index c38adf38..b52947f4 100644 --- 
a/scripts/text.cm
+++ b/scripts/text.cm
@@ -3,6 +3,7 @@
 /* -------- helper functions ----------------------------------------- */
 
 var blob = use('blob')
+var utf8 = use('utf8')
 
 // Convert number to string with given radix
 function to_radix(num, radix) {
@@ -179,113 +180,43 @@ function text() {
     }
 
     // Default: interpret as UTF-8 text
-    var byte_count = Math.floor(bit_length / 8);
-    var bytes = [];
-
-    // Read bytes from the blob
-    for (var i = 0; i < byte_count; i++) {
-      var byte_val = 0;
-      for (var j = 0; j < 8; j++) {
-        var bit_pos = i * 8 + j;
-        var bit = arg.read_logical(bit_pos);
-        if (bit) byte_val |= (1 << j);
-      }
-      bytes.push(byte_val);
-    }
-
-    // Convert bytes to UTF-8 string
-    var result = "";
-    var i = 0;
-    while (i < bytes.length) {
-      var b1 = bytes[i];
-      var codepoint;
-      var nextI;
-
-      if (b1 < 0x80) {
-        // 1-byte ASCII
-        codepoint = b1;
-        nextI = i + 1;
-      } else if (b1 < 0xC0) {
-        // Invalid start byte, treat as replacement character
-        codepoint = 0xFFFD;
-        nextI = i + 1;
-      } else if (b1 < 0xE0) {
-        // 2-byte sequence
-        if (i + 1 < bytes.length && (bytes[i + 1] & 0xC0) === 0x80) {
-          codepoint = ((b1 & 0x1F) << 6) | (bytes[i + 1] & 0x3F);
-          nextI = i + 2;
-        } else {
-          codepoint = 0xFFFD;
-          nextI = i + 1;
-        }
-      } else if (b1 < 0xF0) {
-        // 3-byte sequence
-        if (i + 2 < bytes.length &&
-            (bytes[i + 1] & 0xC0) === 0x80 &&
-            (bytes[i + 2] & 0xC0) === 0x80) {
-          codepoint = ((b1 & 0x0F) << 12) |
-                      ((bytes[i + 1] & 0x3F) << 6) |
-                      (bytes[i + 2] & 0x3F);
-          nextI = i + 3;
-        } else {
-          codepoint = 0xFFFD;
-          nextI = i + 1;
-        }
-      } else if (b1 < 0xF8) {
-        // 4-byte sequence
-        if (i + 3 < bytes.length &&
-            (bytes[i + 1] & 0xC0) === 0x80 &&
-            (bytes[i + 2] & 0xC0) === 0x80 &&
-            (bytes[i + 3] & 0xC0) === 0x80) {
-          codepoint = ((b1 & 0x07) << 18) |
-                      ((bytes[i + 1] & 0x3F) << 12) |
-                      ((bytes[i + 2] & 0x3F) << 6) |
-                      (bytes[i + 3] & 0x3F);
-          nextI = i + 4;
-        } else {
-          codepoint = 0xFFFD;
-          nextI = i + 1;
-        }
-      } else {
-        // Invalid start byte
-        codepoint = 0xFFFD;
-        nextI = i + 1;
-      }
-
-      // Convert codepoint to string
-      if (codepoint <= 0xFFFF) {
-        result += String.fromCharCode(codepoint);
-      } else if (codepoint <= 0x10FFFF) {
-        // Convert to surrogate pair for JavaScript
-        codepoint -= 0x10000;
-        result += String.fromCharCode(0xD800 + (codepoint >> 10));
-        result += String.fromCharCode(0xDC00 + (codepoint & 0x3FF));
-      } else {
-        result += String.fromCharCode(0xFFFD); // Replacement character
-      }
-
-      i = nextI;
-    }
-
-    return result;
+    // Hand the blob to the native utf8 module instead of decoding it byte by byte in script
+    return utf8.decode(arg);
   }
 
   // Handle array conversion
   if (Array.isArray(arg)) {
     var separator = arguments[1] || "";
-    var result = "";
+
+    // One pass to learn whether every item is an integer within 0..0x10FFFF
+    var all_codepoints = true;
     for (var i = 0; i < arg.length; i++) {
-      if (i > 0) result += separator;
-
       var item = arg[i];
-      if (typeof item === 'number' && item >= 0 && item <= 0x10FFFF && item === Math.floor(item)) {
-        // Unicode codepoint
-        result += String.fromCharCode(item);
-      } else {
-        result += String(item);
+      if (!(typeof item === 'number' && item >= 0 && item <= 0x10FFFF && item === Math.floor(item))) {
+        all_codepoints = false;
+        break;
       }
     }
-    return result;
+
+    if (all_codepoints && separator === "") {
+      // Fast path: nothing but codepoints and no separator, so one native call builds the string
+      return utf8.from_codepoints(arg);
+    } else {
+      // General path: join item by item, stringifying whatever is not a codepoint
+      var result = "";
+      for (var i = 0; i < arg.length; i++) {
+        if (i > 0) result += separator;
+
+        var item = arg[i];
+        if (typeof item === 'number' && item >= 0 && item <= 0x10FFFF && item === Math.floor(item)) {
+          // Codepoint: encode through the utf8 module (String.fromCharCode only takes 16-bit code units)
+          result += utf8.from_codepoints([item]);
+        } else {
+          result += String(item);
+        }
+      }
+      return result;
+    }
   }
 
   // Handle number conversion
diff --git a/source/jsffi.c b/source/jsffi.c
index d700d92a..40984898 100644
--- a/source/jsffi.c
+++ b/source/jsffi.c
@@ -53,6 +53,8 @@
 #include "qjs_debug.h"
 #include "qjs_sdl_surface.h"
 #include "qjs_sdl.h"
+#include "qjs_kim.h"
+#include "qjs_utf8.h"
 #ifndef NSTEAM
 #include "qjs_steam.h"
 #endif
@@ -1554,6 +1556,8 @@ void ffi_load(JSContext *js)
   arrput(rt->module_registry, MISTLINE(http));
   arrput(rt->module_registry, MISTLINE(crypto));
   arrput(rt->module_registry, MISTLINE(miniz));
+  arrput(rt->module_registry, MISTLINE(kim));
+  arrput(rt->module_registry, MISTLINE(utf8));
 
   // power user
   arrput(rt->module_registry, MISTLINE(js));
diff --git a/source/kim.h b/source/kim.h
index 051945e6..107279b9 100755
--- a/source/kim.h
+++ b/source/kim.h
@@ -10,17 +10,17 @@ void kim_to_utf8(char **kim, char **utf, int runes);
 // Return the number of runes in a utf8 string
 int utf8_count(const char *utf8);
 
+int decode_utf8(char **s);
+void encode_utf8(char **s, int code);
+void encode_kim(char **s, int code);
+int decode_kim(char **s);
+
 #ifdef KIM_IMPLEMENTATION
 
 #define KIM_CONT 0x80
 #define KIM_DATA 0x7f
 #define CONTINUE(CHAR) (CHAR>>7)
 
-int decode_utf8(char **s);
-void encode_utf8(char **s, int code);
-static void encode_kim(char **s, int code);
-int decode_kim(char **s);
-
 static inline int utf8_bytes(char c)
 {
   int bytes = __builtin_clz(~(c));
@@ -70,7 +70,7 @@ void encode_utf8(char **s, int rune) {
 }
 
 // write and advance s with rune in kim
-static inline void encode_kim(char **s, int rune)
+void encode_kim(char **s, int rune)
 {
   if (rune < KIM_CONT) {
     **s = 0 | (KIM_DATA & rune);
diff --git a/source/nota.h b/source/nota.h
index 98f50586..c9ff68bb 100755
--- a/source/nota.h
+++ b/source/nota.h
@@ -3,6 +3,7 @@
 
 #include
 #include
+#include "kim.h"
 
 /* Nota type nibble values */
 #define NOTA_BLOB 0x00
diff --git a/source/qjs_kim.c b/source/qjs_kim.c
new file mode 100644
index 00000000..4084baf3
--- /dev/null
+++ b/source/qjs_kim.c
@@ -0,0 +1,106 @@
+#include "qjs_kim.h"
+#include "qjs_blob.h"
+#include "jsffi.h"
+#include <stdlib.h>
+#include <string.h>
+
+// NOTE(review): the functions in kim.h are non-static, so exactly one translation
+// unit may define KIM_IMPLEMENTATION - confirm no other source file does.
+#define KIM_IMPLEMENTATION
+#include "kim.h"
+
+// kim.encode(text) -> blob
+// Encodes a string (read as UTF-8) into a blob of kim bytes.
+JSC_CCALL(kim_encode,
+  const char *utf8_str = JS_ToCString(js, argv[0]);
+  if (!utf8_str) return JS_EXCEPTION;
+
+  // Count runes to size the kim buffer
+  int rune_count = utf8_count(utf8_str);
+  if (rune_count < 0) rune_count = 0;
+
+  // Worst case 5 bytes per rune; +1 so an empty string never requests
+  // malloc(0), which is allowed to return NULL
+  size_t kim_size = (size_t)rune_count * 5 + 1;
+  char *kim_buffer = malloc(kim_size);
+  if (!kim_buffer) {
+    JS_FreeCString(js, utf8_str);
+    return JS_ThrowOutOfMemory(js);
+  }
+  char *kim_ptr = kim_buffer;
+
+  // Encode through a cursor: utf8_to_kim receives the source pointer by
+  // address and may advance it, while JS_FreeCString must be handed the
+  // exact pointer JS_ToCString returned
+  const char *utf8_cursor = utf8_str;
+  long long runes_encoded;
+  utf8_to_kim(&utf8_cursor, &kim_ptr, &runes_encoded);
+
+  // kim_ptr now sits one past the last byte written
+  size_t actual_size = kim_ptr - kim_buffer;
+
+  // Create blob with the encoded data
+  ret = js_new_blob_stoned_copy(js, kim_buffer, actual_size);
+
+  free(kim_buffer);
+  JS_FreeCString(js, utf8_str);
+)
+
+// kim.decode(blob) -> text
+// Decodes a blob of kim bytes back into a string.
+JSC_CCALL(kim_decode,
+  size_t kim_len;
+  void *kim_data = js_get_blob_data(js, &kim_len, argv[0]);
+  if (!kim_data) return JS_ThrowTypeError(js, "Expected blob");
+
+  // A continuation bit on the final byte means the last rune is cut off and
+  // decoding it would read past the end of the blob.
+  // NOTE(review): assumes KIM_CONT is the more-bytes-follow flag - confirm in kim.h
+  if (kim_len > 0 && (((unsigned char *)kim_data)[kim_len - 1] & KIM_CONT))
+    return JS_ThrowTypeError(js, "Truncated kim data");
+
+  // UTF-8 output: worst case 4 bytes per kim byte plus the terminator
+  char *utf8_buffer = malloc(kim_len * 4 + 1);
+  // Private copy of the input (+1 avoids malloc(0)); kim_to_utf8 takes a
+  // non-const pointer so the blob itself is never handed to it
+  char *kim_copy = malloc(kim_len + 1);
+  if (!utf8_buffer || !kim_copy) {
+    free(utf8_buffer);
+    free(kim_copy);
+    return JS_ThrowOutOfMemory(js);
+  }
+  memcpy(kim_copy, kim_data, kim_len);
+
+  // Count runes in kim data
+  int rune_count = 0;
+  char *temp_ptr = kim_copy;
+  while (temp_ptr < kim_copy + kim_len) {
+    decode_kim(&temp_ptr);
+    rune_count++;
+  }
+
+  // Decode from the start of the copy
+  char *kim_ptr = kim_copy;
+  char *utf8_ptr = utf8_buffer;
+  kim_to_utf8(&kim_ptr, &utf8_ptr, rune_count);
+  *utf8_ptr = '\0';
+
+  // Pass the length explicitly so an encoded U+0000 cannot end the string early
+  ret = JS_NewStringLen(js, utf8_buffer, utf8_ptr - utf8_buffer);
+
+  free(utf8_buffer);
+  free(kim_copy);
+)
+
+static const JSCFunctionListEntry js_kim_funcs[] = {
+  MIST_FUNC_DEF(kim, encode, 1),
+  MIST_FUNC_DEF(kim, decode, 1),
+};
+
+// Builds the module object exposing encode and decode
+JSValue js_kim_use(JSContext *js)
+{
+  JSValue mod = JS_NewObject(js);
+  JS_SetPropertyFunctionList(js, mod, js_kim_funcs, countof(js_kim_funcs));
+  return mod;
+}
\ No newline at end of file
diff --git a/source/qjs_kim.h b/source/qjs_kim.h
new file mode 100644
index 00000000..580a4c00
--- /dev/null
+++ b/source/qjs_kim.h
@@ -0,0 +1,8 @@
+#ifndef QJS_KIM_H
+#define
QJS_KIM_H + +#include "cell.h" + +JSValue js_kim_use(JSContext*); + +#endif \ No newline at end of file diff --git a/source/qjs_utf8.c b/source/qjs_utf8.c new file mode 100644 index 00000000..0852de50 --- /dev/null +++ b/source/qjs_utf8.c @@ -0,0 +1,211 @@ +#include "qjs_utf8.h" +#include "qjs_blob.h" +#include "jsffi.h" +#include +#include + +#include "kim.h" + +// Get codepoints from a UTF-8 string +JSC_CCALL(utf8_codepoints, + const char *str = JS_ToCString(js, argv[0]); + if (!str) return JS_EXCEPTION; + + JSValue arr = JS_NewArray(js); + int idx = 0; + + char *ptr = (char*)str; + while (*ptr) { + int codepoint = decode_utf8(&ptr); + JS_SetPropertyUint32(js, arr, idx++, JS_NewInt32(js, codepoint)); + } + + JS_FreeCString(js, str); + ret = arr; +) + +// Create UTF-8 string from codepoints +JSC_CCALL(utf8_from_codepoints, + int len = JS_ArrayLength(js, argv[0]); + + // Allocate buffer (worst case: 4 bytes per codepoint + null) + char *buffer = malloc(len * 4 + 1); + char *ptr = buffer; + + for (int i = 0; i < len; i++) { + JSValue val = JS_GetPropertyUint32(js, argv[0], i); + int codepoint; + JS_ToInt32(js, &codepoint, val); + JS_FreeValue(js, val); + + encode_utf8(&ptr, codepoint); + } + + *ptr = '\0'; + ret = JS_NewString(js, buffer); + free(buffer); +) + +// Count UTF-8 characters (runes) in a string +JSC_SCALL(utf8_length, + int count = utf8_count(str); + ret = JS_NewInt32(js, count); +) + +// Validate UTF-8 string +JSC_SCALL(utf8_validate, + char *ptr = (char*)str; + int valid = 1; + + while (*ptr) { + int start_pos = ptr - str; + int codepoint = decode_utf8(&ptr); + + // Check for invalid sequences + if (codepoint < 0 || codepoint > 0x10FFFF || + (codepoint >= 0xD800 && codepoint <= 0xDFFF)) { + valid = 0; + break; + } + + // Check for overlong encodings + int bytes_used = ptr - (str + start_pos); + if ((codepoint <= 0x7F && bytes_used != 1) || + (codepoint <= 0x7FF && bytes_used != 2) || + (codepoint <= 0xFFFF && bytes_used != 3) || + (codepoint <= 
0x10FFFF && bytes_used != 4)) { + valid = 0; + break; + } + } + + ret = JS_NewBool(js, valid); +) + +// Get byte length of UTF-8 string +JSC_SCALL(utf8_byte_length, + ret = JS_NewInt32(js, strlen(str)); +) + +// Encode string to UTF-8 bytes +JSC_SCALL(utf8_encode, + size_t len = strlen(str); + ret = js_new_blob_stoned_copy(js, str, len); +) + +// Decode UTF-8 bytes to string +JSC_CCALL(utf8_decode, + size_t len; + void *data = js_get_blob_data(js, &len, argv[0]); + if (!data) return JS_ThrowTypeError(js, "Expected blob"); + + // Create null-terminated string + char *str = malloc(len + 1); + memcpy(str, data, len); + str[len] = '\0'; + + ret = JS_NewString(js, str); + free(str); +) + +// Slice UTF-8 string by character indices (not byte indices) +JSC_CCALL(utf8_slice, + const char *str = JS_ToCString(js, argv[0]); + if (!str) return JS_EXCEPTION; + + int start = 0; + int end = utf8_count(str); + + if (argc > 1) JS_ToInt32(js, &start, argv[1]); + if (argc > 2) JS_ToInt32(js, &end, argv[2]); + + // Handle negative indices + int total = end; + if (start < 0) start = total + start; + if (end < 0) end = total + end; + + // Clamp values + if (start < 0) start = 0; + if (end > total) end = total; + if (start >= end) { + JS_FreeCString(js, str); + return JS_NewString(js, ""); + } + + // Find start position + char *ptr = (char*)str; + for (int i = 0; i < start && *ptr; i++) { + decode_utf8(&ptr); + } + char *start_ptr = ptr; + + // Find end position + for (int i = start; i < end && *ptr; i++) { + decode_utf8(&ptr); + } + + // Create substring + size_t slice_len = ptr - start_ptr; + char *slice = malloc(slice_len + 1); + memcpy(slice, start_ptr, slice_len); + slice[slice_len] = '\0'; + + ret = JS_NewString(js, slice); + free(slice); + JS_FreeCString(js, str); +) + +// Get character at index +JSC_CCALL(utf8_char_at, + const char *str = JS_ToCString(js, argv[0]); + if (!str) return JS_EXCEPTION; + + int index; + JS_ToInt32(js, &index, argv[1]); + + char *ptr = (char*)str; + int 
count = 0; + + // Skip to index + while (*ptr && count < index) { + decode_utf8(&ptr); + count++; + } + + if (!*ptr || count != index) { + JS_FreeCString(js, str); + return JS_UNDEFINED; + } + + // Get the character + char *char_start = ptr; + decode_utf8(&ptr); + + size_t char_len = ptr - char_start; + char *result = malloc(char_len + 1); + memcpy(result, char_start, char_len); + result[char_len] = '\0'; + + ret = JS_NewString(js, result); + free(result); + JS_FreeCString(js, str); +) + +static const JSCFunctionListEntry js_utf8_funcs[] = { + MIST_FUNC_DEF(utf8, codepoints, 1), + MIST_FUNC_DEF(utf8, from_codepoints, 1), + MIST_FUNC_DEF(utf8, length, 1), + MIST_FUNC_DEF(utf8, validate, 1), + MIST_FUNC_DEF(utf8, byte_length, 1), + MIST_FUNC_DEF(utf8, encode, 1), + MIST_FUNC_DEF(utf8, decode, 1), + MIST_FUNC_DEF(utf8, slice, 3), + MIST_FUNC_DEF(utf8, char_at, 2), +}; + +JSValue js_utf8_use(JSContext *js) +{ + JSValue mod = JS_NewObject(js); + JS_SetPropertyFunctionList(js, mod, js_utf8_funcs, countof(js_utf8_funcs)); + return mod; +} \ No newline at end of file diff --git a/source/qjs_utf8.h b/source/qjs_utf8.h new file mode 100644 index 00000000..ae1327c9 --- /dev/null +++ b/source/qjs_utf8.h @@ -0,0 +1,8 @@ +#ifndef QJS_UTF8_H +#define QJS_UTF8_H + +#include "cell.h" + +JSValue js_utf8_use(JSContext*); + +#endif \ No newline at end of file diff --git a/tests/kim.ce b/tests/kim.ce new file mode 100644 index 00000000..89aa4ca0 --- /dev/null +++ b/tests/kim.ce @@ -0,0 +1,51 @@ +var kim = use("kim"); +var blob = use('blob') + +// Test basic ASCII +var test1 = "Hello, World!"; +var encoded1 = kim.encode(test1); +var decoded1 = kim.decode(encoded1); +log.console("ASCII test:", test1 === decoded1 ? "PASS" : "FAIL"); +if (test1 !== decoded1) { + log.console(" Expected:", test1); + log.console(" Got:", decoded1); +} + +// Test Unicode characters +var test2 = "Hello, δΈ–η•Œ! 
🌍 Привет мир";
+var encoded2 = kim.encode(test2);
+var decoded2 = kim.decode(encoded2);
+log.console("Unicode test:", test2 === decoded2 ? "PASS" : "FAIL");
+if (test2 !== decoded2) {
+  log.console(" Expected:", test2);
+  log.console(" Got:", decoded2);
+}
+
+// Empty string must survive the round trip as well
+var test3 = "";
+var encoded3 = kim.encode(test3);
+log.console(typeof encoded3)
+log.console(encoded3 instanceof blob)
+var decoded3 = kim.decode(encoded3);
+log.console("Empty string test:", test3 === decoded3 ? "PASS" : "FAIL");
+
+// Greek, Cyrillic, CJK, emoji and math symbols in one string
+var test4 = "αβγδΡ АБВГД 你好 😀😎🎉 ∑∏∫";
+var encoded4 = kim.encode(test4);
+var decoded4 = kim.decode(encoded4);
+log.console("Mixed Unicode test:", test4 === decoded4 ? "PASS" : "FAIL");
+if (test4 !== decoded4) {
+  log.console(" Expected:", test4);
+  log.console(" Got:", decoded4);
+}
+
+// Size comparison for high codepoints (informational only, nothing is asserted)
+var highCodepoints = "🌍🌎🌏🗺️🧭";
+var encodedHigh = kim.encode(highCodepoints);
+var utf8Bytes = new Blob([highCodepoints]).size; // NOTE(review): Blob is not the 'blob' module loaded above - confirm this global exists in the runtime
+log.console("High codepoint efficiency:");
+log.console(" UTF-8 bytes:", utf8Bytes);
+log.console(" KIM bytes:", encodedHigh.byteLength);
+log.console(" Savings:", utf8Bytes - encodedHigh.byteLength, "bytes");
+
+log.console("\nAll tests completed!");
\ No newline at end of file
diff --git a/tests/text_utf8.ce b/tests/text_utf8.ce
new file mode 100644
index 00000000..42f0b128
--- /dev/null
+++ b/tests/text_utf8.ce
@@ -0,0 +1,47 @@
+var text = use('text');
+var blob = use('blob');
+var utf8 = use('utf8');
+
+// text(blob) should decode the UTF-8 bytes produced by utf8.encode
+var test_string = "Hello, 世界! 🌍";
+var encoded_blob = utf8.encode(test_string);
+var decoded_text = text(encoded_blob);
+
+log.console("Blob to text test:");
+log.console(" Original:", test_string);
+log.console(" Decoded:", decoded_text);
+log.console(" Match:", test_string === decoded_text ? "PASS" : "FAIL");
+
+// The same string spelled as codepoints (19990 and 30028 are the two CJK characters, 127757 the globe)
+var codepoints = [72, 101, 108, 108, 111, 44, 32, 19990, 30028, 33, 32, 127757];
+var from_codepoints = text(codepoints);
+log.console("\nCodepoints to text test:");
+log.console(" From codepoints:", from_codepoints);
+log.console(" Match:", from_codepoints === test_string ? "PASS" : "FAIL");
+
+// Non-codepoint items are joined with the separator
+var words = ["Hello", "world", "from", "text"];
+var joined = text(words, " ");
+log.console("\nArray with separator test:");
+log.console(" Joined:", joined);
+log.console(" Expected: Hello world from text");
+log.console(" Match:", joined === "Hello world from text" ? "PASS" : "FAIL");
+
+// Codepoints and strings mixed: numbers become characters, strings pass through
+var mixed = [72, "ello", 32, "world"];
+var mixed_result = text(mixed, "");
+log.console("\nMixed array test:");
+log.console(" Result:", mixed_result);
+log.console(" Expected: Hello world");
+log.console(" Match:", mixed_result === "Hello world" ? "PASS" : "FAIL");
+
+// The format-letter forms of text(blob, fmt) are printed for inspection only
+var test_data = utf8.encode("ABC");
+log.console("\nBlob format tests:");
+log.console(" Hex:", text(test_data, "h"));
+log.console(" Binary:", text(test_data, "b"));
+log.console(" Octal:", text(test_data, "o"));
+
+log.console("\nAll tests completed!");
+
+$_.stop();
\ No newline at end of file
diff --git a/tests/utf8.ce b/tests/utf8.ce
new file mode 100644
index 00000000..41a2bd4a
--- /dev/null
+++ b/tests/utf8.ce
@@ -0,0 +1,70 @@
+var utf8 = use("utf8");
+
+// For pure ASCII the rune count and the byte count must agree
+var test1 = "Hello";
+log.console("ASCII length test:");
+log.console(" Characters:", utf8.length(test1));
+log.console(" Bytes:", utf8.byte_length(test1));
+log.console(" Match:", utf8.length(test1) === utf8.byte_length(test1) ? "PASS" : "FAIL");
+
+var test2 = "Hello 世界";
+log.console("\nMixed ASCII/Unicode length test:");
+log.console(" Characters:", utf8.length(test2));
+log.console(" Bytes:", utf8.byte_length(test2));
+log.console(" Bytes > Characters:", utf8.byte_length(test2) > utf8.length(test2) ? "PASS" : "FAIL");
+
+// One emoji between two ASCII letters must come back as three codepoints
+var test3 = "A😀B";
+var codepoints = utf8.codepoints(test3);
+log.console("\nCodepoints test:");
+log.console(" String:", test3);
+log.console(" Codepoints:", codepoints);
+log.console(" A=65:", codepoints[0] === 65 ? "PASS" : "FAIL");
+log.console(" 😀=128512:", codepoints[1] === 128512 ? "PASS" : "FAIL");
+log.console(" B=66:", codepoints[2] === 66 ? "PASS" : "FAIL");
+
+// from_codepoints must invert codepoints
+var reconstructed = utf8.from_codepoints(codepoints);
+log.console(" Reconstructed:", reconstructed);
+log.console(" Match:", test3 === reconstructed ? "PASS" : "FAIL");
+
+// encode to a blob and decode back
+var test4 = "UTF-8 encoding: 你好世界 🌍";
+var encoded = utf8.encode(test4);
+var decoded = utf8.decode(encoded);
+log.console("\nEncode/decode test:");
+log.console(" Original:", test4);
+log.console(" Decoded:", decoded);
+log.console(" Match:", test4 === decoded ? "PASS" : "FAIL");
+
+// A well-formed string containing ASCII and multi-byte runes must validate
+log.console("\nValidation tests:");
+log.console(" Valid UTF-8:", utf8.validate("Hello 世界") ? "PASS" : "FAIL");
+
+// Slicing is by rune index; results are printed, not asserted
+var test5 = "Hello 世界!";
+log.console("\nSlice tests:");
+log.console(" Original:", test5);
+log.console(" slice(0, 5):", utf8.slice(test5, 0, 5));
+log.console(" slice(6, 8):", utf8.slice(test5, 6, 8));
+log.console(" slice(-3):", utf8.slice(test5, -3));
+log.console(" slice(0, -1):", utf8.slice(test5, 0, -1));
+
+// char_at is by rune index; index 100 is past the end
+log.console("\nchar_at tests:");
+log.console(" char_at(0):", utf8.char_at(test5, 0));
+log.console(" char_at(6):", utf8.char_at(test5, 6));
+log.console(" char_at(7):", utf8.char_at(test5, 7));
+log.console(" char_at(100):", utf8.char_at(test5, 100));
+
+// A family emoji is several codepoints glued with zero-width joiners
+var test6 = "👨‍👩‍👧‍👦";
+log.console("\nComplex emoji test:");
+log.console(" String:", test6);
+log.console(" Length:", utf8.length(test6));
+log.console(" Byte length:", utf8.byte_length(test6));
+log.console(" Codepoints:", utf8.codepoints(test6).length);
+
+log.console("\nAll tests completed!");
+
+$_.stop()
\ No newline at end of file