From cbda7dfbc9166a21850b735329c2e015993822e5 Mon Sep 17 00:00:00 2001
From: John Alanbrook <john@pockle.world>
Date: Sat, 7 Jun 2025 23:35:19 -0500
Subject: [PATCH] add utf8 and kim text encoder/decoders

---
 meson.build        |   2 +-
 scripts/text.cm    | 127 +++++++--------------------
 source/jsffi.c     |   4 +
 source/kim.h       |  12 +--
 source/nota.h      |   1 +
 source/qjs_kim.c   |  82 ++++++++++++++++++
 source/qjs_kim.h   |   8 ++
 source/qjs_utf8.c  | 211 +++++++++++++++++++++++++++++++++++++++++++++
 source/qjs_utf8.h  |   8 ++
 tests/kim.ce       |  51 +++++++++++
 tests/text_utf8.ce |  47 ++++++++++
 tests/utf8.ce      |  70 +++++++++++++++
 12 files changed, 518 insertions(+), 105 deletions(-)
 create mode 100644 source/qjs_kim.c
 create mode 100644 source/qjs_kim.h
 create mode 100644 source/qjs_utf8.c
 create mode 100644 source/qjs_utf8.h
 create mode 100644 tests/kim.ce
 create mode 100644 tests/text_utf8.ce
 create mode 100644 tests/utf8.ce

diff --git a/meson.build b/meson.build
index bcf7296d..e5b7ae56 100644
--- a/meson.build
+++ b/meson.build
@@ -295,7 +295,7 @@ src += [
   'anim.c', 'config.c', 'datastream.c','font.c','HandmadeMath.c','jsffi.c','model.c',
   'render.c','simplex.c','spline.c', 'transform.c','cell.c',  'wildmatch.c',
   'sprite.c', 'rtree.c', 'qjs_nota.c', 'qjs_soloud.c', 'qjs_sdl.c', 'qjs_sdl_input.c', 'qjs_sdl_video.c', 'qjs_sdl_surface.c', 'qjs_math.c', 'qjs_geometry.c', 'qjs_transform.c', 'qjs_sprite.c', 'qjs_io.c', 'qjs_fd.c', 'qjs_os.c', 'qjs_actor.c',
-  'qjs_qr.c', 'qjs_wota.c', 'monocypher.c', 'qjs_blob.c', 'qjs_crypto.c', 'qjs_time.c', 'qjs_http.c', 'qjs_rtree.c', 'qjs_spline.c', 'qjs_js.c', 'qjs_debug.c', 'picohttpparser.c', 'qjs_miniz.c', 'timer.c', 'qjs_socket.c'
+  'qjs_qr.c', 'qjs_wota.c', 'monocypher.c', 'qjs_blob.c', 'qjs_crypto.c', 'qjs_time.c', 'qjs_http.c', 'qjs_rtree.c', 'qjs_spline.c', 'qjs_js.c', 'qjs_debug.c', 'picohttpparser.c', 'qjs_miniz.c', 'timer.c', 'qjs_socket.c', 'qjs_kim.c', 'qjs_utf8.c'
 ]
 # quirc src
 src += [
diff --git a/scripts/text.cm b/scripts/text.cm
index c38adf38..b52947f4 100644
--- a/scripts/text.cm
+++ b/scripts/text.cm
@@ -3,6 +3,7 @@
 /* -------- helper functions ----------------------------------------- */
 
 var blob = use('blob')
+var utf8 = use('utf8')
 
 // Convert number to string with given radix
 function to_radix(num, radix) {
@@ -179,113 +180,43 @@ function text() {
     }
     
     // Default: interpret as UTF-8 text
-    var byte_count = Math.floor(bit_length / 8);
-    var bytes = [];
-    
-    // Read bytes from the blob
-    for (var i = 0; i < byte_count; i++) {
-      var byte_val = 0;
-      for (var j = 0; j < 8; j++) {
-        var bit_pos = i * 8 + j;
-        var bit = arg.read_logical(bit_pos);
-        if (bit) byte_val |= (1 << j);
-      }
-      bytes.push(byte_val);
-    }
-    
-    // Convert bytes to UTF-8 string
-    var result = "";
-    var i = 0;
-    while (i < bytes.length) {
-      var b1 = bytes[i];
-      var codepoint;
-      var nextI;
-      
-      if (b1 < 0x80) {
-        // 1-byte ASCII
-        codepoint = b1;
-        nextI = i + 1;
-      } else if (b1 < 0xC0) {
-        // Invalid start byte, treat as replacement character
-        codepoint = 0xFFFD;
-        nextI = i + 1;
-      } else if (b1 < 0xE0) {
-        // 2-byte sequence
-        if (i + 1 < bytes.length && (bytes[i + 1] & 0xC0) === 0x80) {
-          codepoint = ((b1 & 0x1F) << 6) | (bytes[i + 1] & 0x3F);
-          nextI = i + 2;
-        } else {
-          codepoint = 0xFFFD;
-          nextI = i + 1;
-        }
-      } else if (b1 < 0xF0) {
-        // 3-byte sequence
-        if (i + 2 < bytes.length && 
-            (bytes[i + 1] & 0xC0) === 0x80 && 
-            (bytes[i + 2] & 0xC0) === 0x80) {
-          codepoint = ((b1 & 0x0F) << 12) | 
-                      ((bytes[i + 1] & 0x3F) << 6) | 
-                      (bytes[i + 2] & 0x3F);
-          nextI = i + 3;
-        } else {
-          codepoint = 0xFFFD;
-          nextI = i + 1;
-        }
-      } else if (b1 < 0xF8) {
-        // 4-byte sequence
-        if (i + 3 < bytes.length && 
-            (bytes[i + 1] & 0xC0) === 0x80 && 
-            (bytes[i + 2] & 0xC0) === 0x80 && 
-            (bytes[i + 3] & 0xC0) === 0x80) {
-          codepoint = ((b1 & 0x07) << 18) | 
-                      ((bytes[i + 1] & 0x3F) << 12) | 
-                      ((bytes[i + 2] & 0x3F) << 6) | 
-                      (bytes[i + 3] & 0x3F);
-          nextI = i + 4;
-        } else {
-          codepoint = 0xFFFD;
-          nextI = i + 1;
-        }
-      } else {
-        // Invalid start byte
-        codepoint = 0xFFFD;
-        nextI = i + 1;
-      }
-      
-      // Convert codepoint to string
-      if (codepoint <= 0xFFFF) {
-        result += String.fromCharCode(codepoint);
-      } else if (codepoint <= 0x10FFFF) {
-        // Convert to surrogate pair for JavaScript
-        codepoint -= 0x10000;
-        result += String.fromCharCode(0xD800 + (codepoint >> 10));
-        result += String.fromCharCode(0xDC00 + (codepoint & 0x3FF));
-      } else {
-        result += String.fromCharCode(0xFFFD); // Replacement character
-      }
-      
-      i = nextI;
-    }
-    
-    return result;
+    // Use the utf8 module to decode the blob
+    return utf8.decode(arg);
   }
   
   // Handle array conversion
   if (Array.isArray(arg)) {
     var separator = arguments[1] || "";
-    var result = "";
+    
+    // Check if all items are valid codepoints
+    var all_codepoints = true;
     for (var i = 0; i < arg.length; i++) {
-      if (i > 0) result += separator;
-      
       var item = arg[i];
-      if (typeof item === 'number' && item >= 0 && item <= 0x10FFFF && item === Math.floor(item)) {
-        // Unicode codepoint
-        result += String.fromCharCode(item);
-      } else {
-        result += String(item);
+      if (!(typeof item === 'number' && item >= 0 && item <= 0x10FFFF && item === Math.floor(item))) {
+        all_codepoints = false;
+        break;
       }
     }
-    return result;
+    
+    if (all_codepoints && separator === "") {
+      // Use utf8 module to convert codepoints to string
+      return utf8.from_codepoints(arg);
+    } else {
+      // General array to string conversion
+      var result = "";
+      for (var i = 0; i < arg.length; i++) {
+        if (i > 0) result += separator;
+        
+        var item = arg[i];
+        if (typeof item === 'number' && item >= 0 && item <= 0x10FFFF && item === Math.floor(item)) {
+          // Single codepoint - use utf8 module
+          result += utf8.from_codepoints([item]);
+        } else {
+          result += String(item);
+        }
+      }
+      return result;
+    }
   }
   
   // Handle number conversion
diff --git a/source/jsffi.c b/source/jsffi.c
index d700d92a..40984898 100644
--- a/source/jsffi.c
+++ b/source/jsffi.c
@@ -53,6 +53,8 @@
 #include "qjs_debug.h"
 #include "qjs_sdl_surface.h"
 #include "qjs_sdl.h"
+#include "qjs_kim.h"
+#include "qjs_utf8.h"
 #ifndef NSTEAM
 #include "qjs_steam.h"
 #endif
@@ -1554,6 +1556,8 @@ void ffi_load(JSContext *js)
   arrput(rt->module_registry, MISTLINE(http));
   arrput(rt->module_registry, MISTLINE(crypto));
   arrput(rt->module_registry, MISTLINE(miniz));
+  arrput(rt->module_registry, MISTLINE(kim));
+  arrput(rt->module_registry, MISTLINE(utf8));
 
   // power user
   arrput(rt->module_registry, MISTLINE(js));
diff --git a/source/kim.h b/source/kim.h
index 051945e6..107279b9 100755
--- a/source/kim.h
+++ b/source/kim.h
@@ -10,17 +10,17 @@ void kim_to_utf8(char **kim, char **utf, int runes);
 // Return the number of runes in a utf8 string
 int utf8_count(const char *utf8);
 
+int decode_utf8(char **s);
+void encode_utf8(char **s, int code);
+void encode_kim(char **s, int code);
+int decode_kim(char **s);
+
 #ifdef KIM_IMPLEMENTATION
 
 #define KIM_CONT 0x80
 #define KIM_DATA 0x7f
 #define CONTINUE(CHAR) (CHAR>>7)
 
-int decode_utf8(char **s);
-void encode_utf8(char **s, int code);
-static void encode_kim(char **s, int code);
-int decode_kim(char **s);
-
 static inline int utf8_bytes(char c)
 {
   int bytes = __builtin_clz(~(c));
@@ -70,7 +70,7 @@ void encode_utf8(char **s, int rune) {
 }
 
 // write and advance s with rune in kim
-static inline void encode_kim(char **s, int rune)
+void encode_kim(char **s, int rune)
 {
   if (rune < KIM_CONT) {
     **s = 0 | (KIM_DATA & rune);
diff --git a/source/nota.h b/source/nota.h
index 98f50586..c9ff68bb 100755
--- a/source/nota.h
+++ b/source/nota.h
@@ -3,6 +3,7 @@
 
 #include <stddef.h>
 #include <stdint.h>
+#include "kim.h"
 
 /* Nota type nibble values */
 #define NOTA_BLOB  0x00
diff --git a/source/qjs_kim.c b/source/qjs_kim.c
new file mode 100644
index 00000000..4084baf3
--- /dev/null
+++ b/source/qjs_kim.c
@@ -0,0 +1,82 @@
+#include "qjs_kim.h"
+#include "qjs_blob.h"
+#include "jsffi.h"
+#include <string.h>
+#include <stdlib.h>
+
+#define KIM_IMPLEMENTATION
+#include "kim.h"
+
+JSC_CCALL(kim_encode,
+  const char *utf8_str = JS_ToCString(js, argv[0]);
+  if (!utf8_str) return JS_EXCEPTION;
+  // kim.encode(text): returns a blob holding the kim encoding of the string.
+  // Count runes to estimate kim buffer size
+  int rune_count = utf8_count(utf8_str);
+  // Allocate kim buffer (worst case: 5 bytes per rune); the +1 keeps the request non-zero for an empty string, where malloc(0) may return NULL
+  size_t kim_size = (size_t)rune_count * 5 + 1;
+  char *kim_buffer = malloc(kim_size);
+  if (!kim_buffer) { JS_FreeCString(js, utf8_str); return JS_ThrowOutOfMemory(js); }
+  char *kim_ptr = kim_buffer;
+  // Encode through a separate cursor so JS_FreeCString below always receives the exact pointer JS_ToCString returned, whether or not utf8_to_kim advances its input
+  const char *utf8_cursor = utf8_str;
+  long long runes_encoded;
+  utf8_to_kim(&utf8_cursor, &kim_ptr, &runes_encoded);
+  
+  // Calculate actual size used
+  size_t actual_size = kim_ptr - kim_buffer;
+  
+  // Create blob with the encoded data
+  ret = js_new_blob_stoned_copy(js, kim_buffer, actual_size);
+  
+  free(kim_buffer);
+  JS_FreeCString(js, utf8_str);
+)
+
+JSC_CCALL(kim_decode,
+  size_t kim_len;
+  void *kim_data = js_get_blob_data(js, &kim_len, argv[0]);
+  if (!kim_data) return JS_ThrowTypeError(js, "Expected blob");
+  // kim.decode(blob): returns the string. A kim byte with the high bit (0x80, KIM_CONT) set presumably announces a following byte, so a blob ending on one would walk the rune-counting loop past the buffer - reject it up front.
+  if (kim_len > 0 && (((unsigned char *)kim_data)[kim_len - 1] & 0x80)) return JS_ThrowTypeError(js, "Truncated kim data");
+  
+  // Allocate UTF-8 buffer (worst case: 4 bytes per kim byte)
+  size_t utf8_size = kim_len * 4;
+  char *utf8_buffer = malloc(utf8_size + 1); // +1 for null terminator
+  // Copy kim data since kim_to_utf8 takes a mutable pointer; +1 keeps the request non-zero for an empty blob
+  char *kim_copy = malloc(kim_len + 1);
+  if (!utf8_buffer || !kim_copy) { free(utf8_buffer); free(kim_copy); return JS_ThrowOutOfMemory(js); }
+  memcpy(kim_copy, kim_data, kim_len);
+  char *utf8_ptr = utf8_buffer;
+  char *kim_ptr = kim_copy;
+  
+  // Count runes in kim data
+  int rune_count = 0;
+  char *temp_ptr = kim_copy;
+  while (temp_ptr < kim_copy + kim_len) {
+    decode_kim(&temp_ptr);
+    rune_count++;
+  }
+  
+  // Decode from the start of the copy, then null terminate
+  kim_to_utf8(&kim_ptr, &utf8_ptr, rune_count);
+  *utf8_ptr = '\0';
+  
+  // Pass the written length explicitly so a decoded U+0000 does not truncate the string
+  ret = JS_NewStringLen(js, utf8_buffer, utf8_ptr - utf8_buffer);
+  
+  free(utf8_buffer);
+  free(kim_copy);
+)
+
+static const JSCFunctionListEntry js_kim_funcs[] = { /* entries js_kim_use installs on the module object */
+  MIST_FUNC_DEF(kim, encode, 1),
+  MIST_FUNC_DEF(kim, decode, 1),
+};
+
+JSValue js_kim_use(JSContext *js)
+{ /* Returns a new object populated with the js_kim_funcs entries. */
+  JSValue exports = JS_NewObject(js);
+  JS_SetPropertyFunctionList(js, exports, js_kim_funcs, countof(js_kim_funcs));
+  return exports;
+}
\ No newline at end of file
diff --git a/source/qjs_kim.h b/source/qjs_kim.h
new file mode 100644
index 00000000..580a4c00
--- /dev/null
+++ b/source/qjs_kim.h
@@ -0,0 +1,8 @@
+#ifndef QJS_KIM_H
+#define QJS_KIM_H
+
+#include "cell.h"
+
+JSValue js_kim_use(JSContext*); /* defined in qjs_kim.c; returns the kim module object */
+
+#endif /* QJS_KIM_H */
\ No newline at end of file
diff --git a/source/qjs_utf8.c b/source/qjs_utf8.c
new file mode 100644
index 00000000..0852de50
--- /dev/null
+++ b/source/qjs_utf8.c
@@ -0,0 +1,211 @@
+#include "qjs_utf8.h"
+#include "qjs_blob.h"
+#include "jsffi.h"
+#include <string.h>
+#include <stdlib.h>
+
+#include "kim.h"
+
+// utf8.codepoints(text): array of the values decode_utf8 yields, in string order
+JSC_CCALL(utf8_codepoints,
+  const char *text = JS_ToCString(js, argv[0]);
+  if (text == NULL) return JS_EXCEPTION;
+  
+  // decode_utf8 advances a mutable cursor, hence the cast away from const.
+  char *cursor = (char*)text;
+  ret = JS_NewArray(js);
+  
+  for (int slot = 0; *cursor != '\0'; slot++) {
+    int rune = decode_utf8(&cursor);
+    JS_SetPropertyUint32(js, ret, slot, JS_NewInt32(js, rune));
+  }
+  
+  // ret already holds the array; only the C string is left to release.
+  JS_FreeCString(js, text);
+)
+
+// utf8.from_codepoints(array): build a string from an array of code point numbers
+JSC_CCALL(utf8_from_codepoints,
+  int len = JS_ArrayLength(js, argv[0]);
+  // Allocate buffer (worst case: 4 bytes per codepoint + null)
+  char *buffer = malloc((size_t)len * 4 + 1);
+  if (!buffer) return JS_ThrowOutOfMemory(js);
+  char *ptr = buffer;
+  for (int i = 0; i < len; i++) {
+    JSValue val = JS_GetPropertyUint32(js, argv[0], i);
+    int codepoint = 0xFFFD;
+    int failed = JS_ToInt32(js, &codepoint, val) < 0;
+    JS_FreeValue(js, val);
+    if (failed) { free(buffer); return JS_EXCEPTION; }
+    // The 4-byte budget above only holds for U+0000..U+10FFFF; anything outside becomes U+FFFD (replacement character).
+    if (codepoint < 0 || codepoint > 0x10FFFF) codepoint = 0xFFFD;
+    encode_utf8(&ptr, codepoint);
+  }
+  // Pass the written length explicitly so a U+0000 entry does not truncate the result as a strlen-based constructor would.
+  ret = JS_NewStringLen(js, buffer, ptr - buffer);
+  free(buffer);
+)
+
+// utf8.length(text): number of runes utf8_count reports for the string
+JSC_SCALL(utf8_length,
+  // `str` is not declared here; it comes from the JSC_SCALL wrapper.
+  ret = JS_NewInt32(js, utf8_count(str));
+)
+
+// utf8.validate(text): true when every decoded rune is a Unicode scalar value stored in its shortest UTF-8 form
+JSC_SCALL(utf8_validate,
+  char *ptr = (char*)str;
+  int valid = 1;
+  
+  while (*ptr) {
+    int start_pos = ptr - str;
+    int codepoint = decode_utf8(&ptr);
+    
+    // Check for invalid sequences
+    if (codepoint < 0 || codepoint > 0x10FFFF || 
+        (codepoint >= 0xD800 && codepoint <= 0xDFFF)) {
+      valid = 0;
+      break;
+    }
+    
+    // Check for overlong encodings. The ranges are nested (0x41 is also
+    // <= 0x7FF and <= 0xFFFF), so select the single shortest length for
+    // this code point first and compare against that alone.
+    int bytes_used = ptr - (str + start_pos);
+    int shortest = codepoint <= 0x7F ? 1 : codepoint <= 0x7FF ? 2 : codepoint <= 0xFFFF ? 3 : 4;
+    if (bytes_used != shortest) {
+      valid = 0;
+      break;
+    }
+  }
+  
+  ret = JS_NewBool(js, valid);
+)
+
+// utf8.byte_length(text): strlen of the string's C representation, i.e. bytes up to the first NUL
+JSC_SCALL(utf8_byte_length,
+  ret = JS_NewInt32(js, strlen(str));
+)
+
+// utf8.encode(text): copy the strlen(str) bytes of the string's C representation into a new blob
+JSC_SCALL(utf8_encode,
+  size_t len = strlen(str);
+  ret = js_new_blob_stoned_copy(js, str, len);
+)
+
+// utf8.decode(blob): interpret the blob's bytes as UTF-8 and return them as a string
+JSC_CCALL(utf8_decode,
+  size_t len;
+  void *data = js_get_blob_data(js, &len, argv[0]);
+  if (!data) return JS_ThrowTypeError(js, "Expected blob");
+  
+  // Hand the bytes to the engine with an explicit length instead of
+  // copying them into a NUL-terminated scratch buffer. That removes an
+  // unchecked malloc and keeps any bytes that follow an embedded NUL,
+  // which a strlen-based constructor would silently drop.
+  //
+  // NOTE(review): assumes len is the blob size in bytes - confirm against js_get_blob_data.
+  ret = JS_NewStringLen(js, (const char *)data, len);
+)
+
+// utf8.slice(text, start, end): substring by rune index (not byte index); negative indices count back from the end
+JSC_CCALL(utf8_slice,
+  const char *str = JS_ToCString(js, argv[0]);
+  if (!str) return JS_EXCEPTION;
+  
+  // Capture the rune count before `end` can be overwritten by the caller's argument: negative indices and clamping are both relative to it.
+  int total = utf8_count(str);
+  int start = 0;
+  int end = total;
+  if (argc > 1 && JS_ToInt32(js, &start, argv[1]) < 0) { JS_FreeCString(js, str); return JS_EXCEPTION; }
+  if (argc > 2 && JS_ToInt32(js, &end, argv[2]) < 0) { JS_FreeCString(js, str); return JS_EXCEPTION; }
+  
+  // Handle negative indices
+  if (start < 0) start = total + start;
+  if (end < 0) end = total + end;
+  
+  // Clamp values
+  if (start < 0) start = 0;
+  if (end > total) end = total;
+  if (start >= end) {
+    JS_FreeCString(js, str);
+    return JS_NewString(js, "");
+  }
+  
+  // Find start position
+  char *ptr = (char*)str;
+  for (int i = 0; i < start && *ptr; i++) {
+    decode_utf8(&ptr);
+  }
+  char *start_ptr = ptr;
+  
+  // Find end position
+  for (int i = start; i < end && *ptr; i++) {
+    decode_utf8(&ptr);
+  }
+  
+  // Create substring
+  // The slice is a contiguous byte range of the source string, so build
+  // the result directly from it with an explicit length; no scratch
+  // copy (and no unchecked malloc) is needed.
+  size_t slice_len = ptr - start_ptr;
+  
+  ret = JS_NewStringLen(js, start_ptr, slice_len);
+  
+  JS_FreeCString(js, str);
+)
+
+// utf8.char_at(text, index): the rune at the given rune index as a string, or undefined when the index is out of range
+JSC_CCALL(utf8_char_at,
+  const char *str = JS_ToCString(js, argv[0]);
+  if (!str) return JS_EXCEPTION;
+  
+  int index = 0;
+  if (JS_ToInt32(js, &index, argv[1]) < 0) { JS_FreeCString(js, str); return JS_EXCEPTION; }
+  
+  char *ptr = (char*)str;
+  int count = 0;
+  
+  // Skip to index
+  while (*ptr && count < index) {
+    decode_utf8(&ptr);
+    count++;
+  }
+  
+  if (!*ptr || count != index) {
+    JS_FreeCString(js, str);
+    return JS_UNDEFINED;
+  }
+  
+  // Get the character
+  char *char_start = ptr;
+  decode_utf8(&ptr);
+  
+  // The character is a contiguous byte range of the source string, so
+  // build the result from it directly with an explicit length; this
+  // avoids a scratch copy and its unchecked malloc.
+  size_t char_len = ptr - char_start;
+  
+  ret = JS_NewStringLen(js, char_start, char_len);
+  
+  JS_FreeCString(js, str);
+)
+
+static const JSCFunctionListEntry js_utf8_funcs[] = { /* entries js_utf8_use installs on the module object */
+  MIST_FUNC_DEF(utf8, codepoints, 1),
+  MIST_FUNC_DEF(utf8, from_codepoints, 1),
+  MIST_FUNC_DEF(utf8, length, 1),
+  MIST_FUNC_DEF(utf8, validate, 1),
+  MIST_FUNC_DEF(utf8, byte_length, 1),
+  MIST_FUNC_DEF(utf8, encode, 1),
+  MIST_FUNC_DEF(utf8, decode, 1),
+  MIST_FUNC_DEF(utf8, slice, 3),
+  MIST_FUNC_DEF(utf8, char_at, 2),
+};
+
+JSValue js_utf8_use(JSContext *js)
+{ /* Returns a new object populated with the js_utf8_funcs entries. */
+  JSValue exports = JS_NewObject(js);
+  JS_SetPropertyFunctionList(js, exports, js_utf8_funcs, countof(js_utf8_funcs));
+  return exports;
+}
\ No newline at end of file
diff --git a/source/qjs_utf8.h b/source/qjs_utf8.h
new file mode 100644
index 00000000..ae1327c9
--- /dev/null
+++ b/source/qjs_utf8.h
@@ -0,0 +1,8 @@
+#ifndef QJS_UTF8_H
+#define QJS_UTF8_H
+
+#include "cell.h"
+
+JSValue js_utf8_use(JSContext*); /* defined in qjs_utf8.c; returns the utf8 module object */
+
+#endif /* QJS_UTF8_H */
\ No newline at end of file
diff --git a/tests/kim.ce b/tests/kim.ce
new file mode 100644
index 00000000..89aa4ca0
--- /dev/null
+++ b/tests/kim.ce
@@ -0,0 +1,51 @@
+var kim = use("kim");
+var blob = use('blob')
+
+// Test basic ASCII
+var test1 = "Hello, World!";
+var encoded1 = kim.encode(test1);
+var decoded1 = kim.decode(encoded1);
+log.console("ASCII test:", test1 === decoded1 ? "PASS" : "FAIL");
+if (test1 !== decoded1) {
+  log.console("  Expected:", test1);
+  log.console("  Got:", decoded1);
+}
+
+// Test Unicode characters
+var test2 = "Hello, 世界! 🌍 Привет мир";
+var encoded2 = kim.encode(test2);
+var decoded2 = kim.decode(encoded2);
+log.console("Unicode test:", test2 === decoded2 ? "PASS" : "FAIL");
+if (test2 !== decoded2) {
+  log.console("  Expected:", test2);
+  log.console("  Got:", decoded2);
+}
+
+// Test empty string
+var test3 = "";
+var encoded3 = kim.encode(test3);
+log.console(typeof encoded3)
+log.console(encoded3 instanceof blob)
+var decoded3 = kim.decode(encoded3);
+log.console("Empty string test:", test3 === decoded3 ? "PASS" : "FAIL");
+
+// Test various Unicode ranges
+var test4 = "αβγδε АБВГД 你好 😀😎🎉 ∑∏∫";
+var encoded4 = kim.encode(test4);
+var decoded4 = kim.decode(encoded4);
+log.console("Mixed Unicode test:", test4 === decoded4 ? "PASS" : "FAIL");
+if (test4 !== decoded4) {
+  log.console("  Expected:", test4);
+  log.console("  Got:", decoded4);
+}
+
+// Test efficiency - KIM should be smaller for high codepoints
+var highCodepoints = "🌍🌎🌏🗺️🧭";
+var encodedHigh = kim.encode(highCodepoints);
+var utf8Bytes = new Blob([highCodepoints]).size; // NOTE(review): relies on a global `Blob` constructor, distinct from the `blob` module loaded at the top of this script - confirm the runtime provides one
+log.console("High codepoint efficiency:");
+log.console("  UTF-8 bytes:", utf8Bytes);
+log.console("  KIM bytes:", encodedHigh.byteLength); // NOTE(review): assumes the blob returned by kim.encode exposes `byteLength` - confirm
+log.console("  Savings:", utf8Bytes - encodedHigh.byteLength, "bytes");
+
+log.console("\nAll tests completed!"); // NOTE(review): unlike tests/text_utf8.ce and tests/utf8.ce, this script never calls $_.stop() - confirm that is intended
\ No newline at end of file
diff --git a/tests/text_utf8.ce b/tests/text_utf8.ce
new file mode 100644
index 00000000..42f0b128
--- /dev/null
+++ b/tests/text_utf8.ce
@@ -0,0 +1,47 @@
+var text = use('text');
+var blob = use('blob'); // not referenced again in this script
+var utf8 = use('utf8');
+
+// Blob -> text: text() on a blob with no format argument should decode it as UTF-8
+var test_string = "Hello, 世界! 🌍";
+var encoded_blob = utf8.encode(test_string);
+var decoded_text = text(encoded_blob);
+
+log.console("Blob to text test:");
+log.console("  Original:", test_string);
+log.console("  Decoded:", decoded_text);
+log.console("  Match:", test_string === decoded_text ? "PASS" : "FAIL");
+
+// Array of code points -> text; the list spells test_string and ends with 127757, a code point above 0xFFFF
+var codepoints = [72, 101, 108, 108, 111, 44, 32, 19990, 30028, 33, 32, 127757];
+var from_codepoints = text(codepoints);
+log.console("\nCodepoints to text test:");
+log.console("  From codepoints:", from_codepoints);
+log.console("  Match:", from_codepoints === test_string ? "PASS" : "FAIL");
+
+// Array of strings joined with an explicit separator
+var words = ["Hello", "world", "from", "text"];
+var joined = text(words, " ");
+log.console("\nArray with separator test:");
+log.console("  Joined:", joined);
+log.console("  Expected: Hello world from text");
+log.console("  Match:", joined === "Hello world from text" ? "PASS" : "FAIL");
+
+// Mixed array: integer items are treated as code points (72 is 'H', 32 is a space), strings are appended as-is
+var mixed = [72, "ello", 32, "world"];
+var mixed_result = text(mixed, "");
+log.console("\nMixed array test:");
+log.console("  Result:", mixed_result);
+log.console("  Expected: Hello world");
+log.console("  Match:", mixed_result === "Hello world" ? "PASS" : "FAIL");
+
+// Blob formatting with a format letter ("h", "b", "o"); output is only logged, not compared
+var test_data = utf8.encode("ABC");
+log.console("\nBlob format tests:");
+log.console("  Hex:", text(test_data, "h"));
+log.console("  Binary:", text(test_data, "b"));
+log.console("  Octal:", text(test_data, "o"));
+
+log.console("\nAll tests completed!");
+
+$_.stop();
\ No newline at end of file
diff --git a/tests/utf8.ce b/tests/utf8.ce
new file mode 100644
index 00000000..41a2bd4a
--- /dev/null
+++ b/tests/utf8.ce
@@ -0,0 +1,70 @@
+var utf8 = use("utf8");
+
+// Test character counting vs byte counting
+var test1 = "Hello";
+log.console("ASCII length test:");
+log.console("  Characters:", utf8.length(test1));
+log.console("  Bytes:", utf8.byte_length(test1));
+log.console("  Match:", utf8.length(test1) === utf8.byte_length(test1) ? "PASS" : "FAIL");
+
+var test2 = "Hello 世界";
+log.console("\nMixed ASCII/Unicode length test:");
+log.console("  Characters:", utf8.length(test2));
+log.console("  Bytes:", utf8.byte_length(test2));
+log.console("  Bytes > Characters:", utf8.byte_length(test2) > utf8.length(test2) ? "PASS" : "FAIL");
+
+// Test codepoints
+var test3 = "A😀B";
+var codepoints = utf8.codepoints(test3);
+log.console("\nCodepoints test:");
+log.console("  String:", test3);
+log.console("  Codepoints:", codepoints);
+log.console("  A=65:", codepoints[0] === 65 ? "PASS" : "FAIL");
+log.console("  😀=128512:", codepoints[1] === 128512 ? "PASS" : "FAIL");
+log.console("  B=66:", codepoints[2] === 66 ? "PASS" : "FAIL");
+
+// Test from_codepoints
+var reconstructed = utf8.from_codepoints(codepoints);
+log.console("  Reconstructed:", reconstructed);
+log.console("  Match:", test3 === reconstructed ? "PASS" : "FAIL");
+
+// Test encode/decode
+var test4 = "UTF-8 encoding: 你好世界 🌍";
+var encoded = utf8.encode(test4);
+var decoded = utf8.decode(encoded);
+log.console("\nEncode/decode test:");
+log.console("  Original:", test4);
+log.console("  Decoded:", decoded);
+log.console("  Match:", test4 === decoded ? "PASS" : "FAIL");
+
+// Test validation
+log.console("\nValidation tests:");
+log.console("  Valid UTF-8:", utf8.validate("Hello 世界") ? "PASS" : "FAIL");
+
+// Test slicing by rune index; results are only logged, not compared against expected values
+var test5 = "Hello 世界!";
+log.console("\nSlice tests:");
+log.console("  Original:", test5);
+log.console("  slice(0, 5):", utf8.slice(test5, 0, 5));
+log.console("  slice(6, 8):", utf8.slice(test5, 6, 8));
+log.console("  slice(-3):", utf8.slice(test5, -3)); // negative start, no end argument
+log.console("  slice(0, -1):", utf8.slice(test5, 0, -1)); // negative end; intended to drop the last rune
+
+// Test char_at
+log.console("\nchar_at tests:");
+log.console("  char_at(0):", utf8.char_at(test5, 0));
+log.console("  char_at(6):", utf8.char_at(test5, 6));
+log.console("  char_at(7):", utf8.char_at(test5, 7));
+log.console("  char_at(100):", utf8.char_at(test5, 100));
+
+// Test with emoji sequences
+var test6 = "👨‍👩‍👧‍👦";
+log.console("\nComplex emoji test:");
+log.console("  String:", test6);
+log.console("  Length:", utf8.length(test6));
+log.console("  Byte length:", utf8.byte_length(test6));
+log.console("  Codepoints:", utf8.codepoints(test6).length);
+
+log.console("\nAll tests completed!");
+
+$_.stop()
\ No newline at end of file