Add UTF-8 and Kim text encoders/decoders
This commit is contained in:
@@ -295,7 +295,7 @@ src += [
|
||||
'anim.c', 'config.c', 'datastream.c','font.c','HandmadeMath.c','jsffi.c','model.c',
|
||||
'render.c','simplex.c','spline.c', 'transform.c','cell.c', 'wildmatch.c',
|
||||
'sprite.c', 'rtree.c', 'qjs_nota.c', 'qjs_soloud.c', 'qjs_sdl.c', 'qjs_sdl_input.c', 'qjs_sdl_video.c', 'qjs_sdl_surface.c', 'qjs_math.c', 'qjs_geometry.c', 'qjs_transform.c', 'qjs_sprite.c', 'qjs_io.c', 'qjs_fd.c', 'qjs_os.c', 'qjs_actor.c',
|
||||
'qjs_qr.c', 'qjs_wota.c', 'monocypher.c', 'qjs_blob.c', 'qjs_crypto.c', 'qjs_time.c', 'qjs_http.c', 'qjs_rtree.c', 'qjs_spline.c', 'qjs_js.c', 'qjs_debug.c', 'picohttpparser.c', 'qjs_miniz.c', 'timer.c', 'qjs_socket.c'
|
||||
'qjs_qr.c', 'qjs_wota.c', 'monocypher.c', 'qjs_blob.c', 'qjs_crypto.c', 'qjs_time.c', 'qjs_http.c', 'qjs_rtree.c', 'qjs_spline.c', 'qjs_js.c', 'qjs_debug.c', 'picohttpparser.c', 'qjs_miniz.c', 'timer.c', 'qjs_socket.c', 'qjs_kim.c', 'qjs_utf8.c'
|
||||
]
|
||||
# quirc src
|
||||
src += [
|
||||
|
||||
113
scripts/text.cm
113
scripts/text.cm
@@ -3,6 +3,7 @@
|
||||
/* -------- helper functions ----------------------------------------- */
|
||||
|
||||
var blob = use('blob')
|
||||
var utf8 = use('utf8')
|
||||
|
||||
// Convert number to string with given radix
|
||||
function to_radix(num, radix) {
|
||||
@@ -179,114 +180,44 @@ function text() {
|
||||
}
|
||||
|
||||
// Default: interpret as UTF-8 text
|
||||
var byte_count = Math.floor(bit_length / 8);
|
||||
var bytes = [];
|
||||
|
||||
// Read bytes from the blob
|
||||
for (var i = 0; i < byte_count; i++) {
|
||||
var byte_val = 0;
|
||||
for (var j = 0; j < 8; j++) {
|
||||
var bit_pos = i * 8 + j;
|
||||
var bit = arg.read_logical(bit_pos);
|
||||
if (bit) byte_val |= (1 << j);
|
||||
}
|
||||
bytes.push(byte_val);
|
||||
}
|
||||
|
||||
// Convert bytes to UTF-8 string
|
||||
var result = "";
|
||||
var i = 0;
|
||||
while (i < bytes.length) {
|
||||
var b1 = bytes[i];
|
||||
var codepoint;
|
||||
var nextI;
|
||||
|
||||
if (b1 < 0x80) {
|
||||
// 1-byte ASCII
|
||||
codepoint = b1;
|
||||
nextI = i + 1;
|
||||
} else if (b1 < 0xC0) {
|
||||
// Invalid start byte, treat as replacement character
|
||||
codepoint = 0xFFFD;
|
||||
nextI = i + 1;
|
||||
} else if (b1 < 0xE0) {
|
||||
// 2-byte sequence
|
||||
if (i + 1 < bytes.length && (bytes[i + 1] & 0xC0) === 0x80) {
|
||||
codepoint = ((b1 & 0x1F) << 6) | (bytes[i + 1] & 0x3F);
|
||||
nextI = i + 2;
|
||||
} else {
|
||||
codepoint = 0xFFFD;
|
||||
nextI = i + 1;
|
||||
}
|
||||
} else if (b1 < 0xF0) {
|
||||
// 3-byte sequence
|
||||
if (i + 2 < bytes.length &&
|
||||
(bytes[i + 1] & 0xC0) === 0x80 &&
|
||||
(bytes[i + 2] & 0xC0) === 0x80) {
|
||||
codepoint = ((b1 & 0x0F) << 12) |
|
||||
((bytes[i + 1] & 0x3F) << 6) |
|
||||
(bytes[i + 2] & 0x3F);
|
||||
nextI = i + 3;
|
||||
} else {
|
||||
codepoint = 0xFFFD;
|
||||
nextI = i + 1;
|
||||
}
|
||||
} else if (b1 < 0xF8) {
|
||||
// 4-byte sequence
|
||||
if (i + 3 < bytes.length &&
|
||||
(bytes[i + 1] & 0xC0) === 0x80 &&
|
||||
(bytes[i + 2] & 0xC0) === 0x80 &&
|
||||
(bytes[i + 3] & 0xC0) === 0x80) {
|
||||
codepoint = ((b1 & 0x07) << 18) |
|
||||
((bytes[i + 1] & 0x3F) << 12) |
|
||||
((bytes[i + 2] & 0x3F) << 6) |
|
||||
(bytes[i + 3] & 0x3F);
|
||||
nextI = i + 4;
|
||||
} else {
|
||||
codepoint = 0xFFFD;
|
||||
nextI = i + 1;
|
||||
}
|
||||
} else {
|
||||
// Invalid start byte
|
||||
codepoint = 0xFFFD;
|
||||
nextI = i + 1;
|
||||
}
|
||||
|
||||
// Convert codepoint to string
|
||||
if (codepoint <= 0xFFFF) {
|
||||
result += String.fromCharCode(codepoint);
|
||||
} else if (codepoint <= 0x10FFFF) {
|
||||
// Convert to surrogate pair for JavaScript
|
||||
codepoint -= 0x10000;
|
||||
result += String.fromCharCode(0xD800 + (codepoint >> 10));
|
||||
result += String.fromCharCode(0xDC00 + (codepoint & 0x3FF));
|
||||
} else {
|
||||
result += String.fromCharCode(0xFFFD); // Replacement character
|
||||
}
|
||||
|
||||
i = nextI;
|
||||
}
|
||||
|
||||
return result;
|
||||
// Use the utf8 module to decode the blob
|
||||
return utf8.decode(arg);
|
||||
}
|
||||
|
||||
// Handle array conversion
|
||||
if (Array.isArray(arg)) {
|
||||
var separator = arguments[1] || "";
|
||||
|
||||
// Check if all items are valid codepoints
|
||||
var all_codepoints = true;
|
||||
for (var i = 0; i < arg.length; i++) {
|
||||
var item = arg[i];
|
||||
if (!(typeof item === 'number' && item >= 0 && item <= 0x10FFFF && item === Math.floor(item))) {
|
||||
all_codepoints = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (all_codepoints && separator === "") {
|
||||
// Use utf8 module to convert codepoints to string
|
||||
return utf8.from_codepoints(arg);
|
||||
} else {
|
||||
// General array to string conversion
|
||||
var result = "";
|
||||
for (var i = 0; i < arg.length; i++) {
|
||||
if (i > 0) result += separator;
|
||||
|
||||
var item = arg[i];
|
||||
if (typeof item === 'number' && item >= 0 && item <= 0x10FFFF && item === Math.floor(item)) {
|
||||
// Unicode codepoint
|
||||
result += String.fromCharCode(item);
|
||||
// Single codepoint - use utf8 module
|
||||
result += utf8.from_codepoints([item]);
|
||||
} else {
|
||||
result += String(item);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
// Handle number conversion
|
||||
if (typeof arg === 'number') {
|
||||
|
||||
@@ -53,6 +53,8 @@
|
||||
#include "qjs_debug.h"
|
||||
#include "qjs_sdl_surface.h"
|
||||
#include "qjs_sdl.h"
|
||||
#include "qjs_kim.h"
|
||||
#include "qjs_utf8.h"
|
||||
#ifndef NSTEAM
|
||||
#include "qjs_steam.h"
|
||||
#endif
|
||||
@@ -1554,6 +1556,8 @@ void ffi_load(JSContext *js)
|
||||
arrput(rt->module_registry, MISTLINE(http));
|
||||
arrput(rt->module_registry, MISTLINE(crypto));
|
||||
arrput(rt->module_registry, MISTLINE(miniz));
|
||||
arrput(rt->module_registry, MISTLINE(kim));
|
||||
arrput(rt->module_registry, MISTLINE(utf8));
|
||||
|
||||
// power user
|
||||
arrput(rt->module_registry, MISTLINE(js));
|
||||
|
||||
12
source/kim.h
12
source/kim.h
@@ -10,17 +10,17 @@ void kim_to_utf8(char **kim, char **utf, int runes);
|
||||
// Return the number of runes in a utf8 string
|
||||
int utf8_count(const char *utf8);
|
||||
|
||||
int decode_utf8(char **s);
|
||||
void encode_utf8(char **s, int code);
|
||||
void encode_kim(char **s, int code);
|
||||
int decode_kim(char **s);
|
||||
|
||||
#ifdef KIM_IMPLEMENTATION
|
||||
|
||||
#define KIM_CONT 0x80
|
||||
#define KIM_DATA 0x7f
|
||||
#define CONTINUE(CHAR) (CHAR>>7)
|
||||
|
||||
int decode_utf8(char **s);
|
||||
void encode_utf8(char **s, int code);
|
||||
static void encode_kim(char **s, int code);
|
||||
int decode_kim(char **s);
|
||||
|
||||
static inline int utf8_bytes(char c)
|
||||
{
|
||||
int bytes = __builtin_clz(~(c));
|
||||
@@ -70,7 +70,7 @@ void encode_utf8(char **s, int rune) {
|
||||
}
|
||||
|
||||
// write and advance s with rune in kim
|
||||
static inline void encode_kim(char **s, int rune)
|
||||
void encode_kim(char **s, int rune)
|
||||
{
|
||||
if (rune < KIM_CONT) {
|
||||
**s = 0 | (KIM_DATA & rune);
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include "kim.h"
|
||||
|
||||
/* Nota type nibble values */
|
||||
#define NOTA_BLOB 0x00
|
||||
|
||||
82
source/qjs_kim.c
Normal file
82
source/qjs_kim.c
Normal file
@@ -0,0 +1,82 @@
|
||||
#include "qjs_kim.h"
|
||||
#include "qjs_blob.h"
|
||||
#include "jsffi.h"
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#define KIM_IMPLEMENTATION
|
||||
#include "kim.h"
|
||||
|
||||
JSC_CCALL(kim_encode,
|
||||
const char *utf8_str = JS_ToCString(js, argv[0]);
|
||||
if (!utf8_str) return JS_EXCEPTION;
|
||||
|
||||
// Count runes to estimate kim buffer size
|
||||
int rune_count = utf8_count(utf8_str);
|
||||
|
||||
// Allocate kim buffer (worst case: 5 bytes per rune)
|
||||
size_t kim_size = rune_count * 5;
|
||||
char *kim_buffer = malloc(kim_size);
|
||||
char *kim_ptr = kim_buffer;
|
||||
|
||||
// Encode utf8 to kim
|
||||
long long runes_encoded;
|
||||
utf8_to_kim(&utf8_str, &kim_ptr, &runes_encoded);
|
||||
|
||||
// Calculate actual size used
|
||||
size_t actual_size = kim_ptr - kim_buffer;
|
||||
|
||||
// Create blob with the encoded data
|
||||
ret = js_new_blob_stoned_copy(js, kim_buffer, actual_size);
|
||||
|
||||
free(kim_buffer);
|
||||
JS_FreeCString(js, utf8_str);
|
||||
)
|
||||
|
||||
JSC_CCALL(kim_decode,
|
||||
size_t kim_len;
|
||||
void *kim_data = js_get_blob_data(js, &kim_len, argv[0]);
|
||||
if (!kim_data) return JS_ThrowTypeError(js, "Expected blob");
|
||||
|
||||
// Allocate UTF-8 buffer (worst case: 4 bytes per kim byte)
|
||||
size_t utf8_size = kim_len * 4;
|
||||
char *utf8_buffer = malloc(utf8_size + 1); // +1 for null terminator
|
||||
char *utf8_ptr = utf8_buffer;
|
||||
|
||||
// Copy kim data since kim_to_utf8 modifies the pointer
|
||||
char *kim_copy = malloc(kim_len);
|
||||
memcpy(kim_copy, kim_data, kim_len);
|
||||
char *kim_ptr = kim_copy;
|
||||
|
||||
// Count runes in kim data
|
||||
int rune_count = 0;
|
||||
char *temp_ptr = kim_copy;
|
||||
while (temp_ptr < kim_copy + kim_len) {
|
||||
decode_kim(&temp_ptr);
|
||||
rune_count++;
|
||||
}
|
||||
|
||||
// Reset pointer and decode
|
||||
kim_ptr = kim_copy;
|
||||
kim_to_utf8(&kim_ptr, &utf8_ptr, rune_count);
|
||||
|
||||
// Null terminate
|
||||
*utf8_ptr = '\0';
|
||||
|
||||
ret = JS_NewString(js, utf8_buffer);
|
||||
|
||||
free(utf8_buffer);
|
||||
free(kim_copy);
|
||||
)
|
||||
|
||||
static const JSCFunctionListEntry js_kim_funcs[] = {
|
||||
MIST_FUNC_DEF(kim, encode, 1),
|
||||
MIST_FUNC_DEF(kim, decode, 1),
|
||||
};
|
||||
|
||||
JSValue js_kim_use(JSContext *js)
|
||||
{
|
||||
JSValue mod = JS_NewObject(js);
|
||||
JS_SetPropertyFunctionList(js, mod, js_kim_funcs, countof(js_kim_funcs));
|
||||
return mod;
|
||||
}
|
||||
8
source/qjs_kim.h
Normal file
8
source/qjs_kim.h
Normal file
@@ -0,0 +1,8 @@
|
||||
#ifndef QJS_KIM_H
|
||||
#define QJS_KIM_H
|
||||
|
||||
#include "cell.h"
|
||||
|
||||
JSValue js_kim_use(JSContext*);
|
||||
|
||||
#endif
|
||||
211
source/qjs_utf8.c
Normal file
211
source/qjs_utf8.c
Normal file
@@ -0,0 +1,211 @@
|
||||
#include "qjs_utf8.h"
|
||||
#include "qjs_blob.h"
|
||||
#include "jsffi.h"
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "kim.h"
|
||||
|
||||
// Get codepoints from a UTF-8 string
|
||||
JSC_CCALL(utf8_codepoints,
|
||||
const char *str = JS_ToCString(js, argv[0]);
|
||||
if (!str) return JS_EXCEPTION;
|
||||
|
||||
JSValue arr = JS_NewArray(js);
|
||||
int idx = 0;
|
||||
|
||||
char *ptr = (char*)str;
|
||||
while (*ptr) {
|
||||
int codepoint = decode_utf8(&ptr);
|
||||
JS_SetPropertyUint32(js, arr, idx++, JS_NewInt32(js, codepoint));
|
||||
}
|
||||
|
||||
JS_FreeCString(js, str);
|
||||
ret = arr;
|
||||
)
|
||||
|
||||
// Create UTF-8 string from codepoints
|
||||
JSC_CCALL(utf8_from_codepoints,
|
||||
int len = JS_ArrayLength(js, argv[0]);
|
||||
|
||||
// Allocate buffer (worst case: 4 bytes per codepoint + null)
|
||||
char *buffer = malloc(len * 4 + 1);
|
||||
char *ptr = buffer;
|
||||
|
||||
for (int i = 0; i < len; i++) {
|
||||
JSValue val = JS_GetPropertyUint32(js, argv[0], i);
|
||||
int codepoint;
|
||||
JS_ToInt32(js, &codepoint, val);
|
||||
JS_FreeValue(js, val);
|
||||
|
||||
encode_utf8(&ptr, codepoint);
|
||||
}
|
||||
|
||||
*ptr = '\0';
|
||||
ret = JS_NewString(js, buffer);
|
||||
free(buffer);
|
||||
)
|
||||
|
||||
// Count UTF-8 characters (runes) in a string
|
||||
JSC_SCALL(utf8_length,
|
||||
int count = utf8_count(str);
|
||||
ret = JS_NewInt32(js, count);
|
||||
)
|
||||
|
||||
// Validate UTF-8 string
|
||||
JSC_SCALL(utf8_validate,
|
||||
char *ptr = (char*)str;
|
||||
int valid = 1;
|
||||
|
||||
while (*ptr) {
|
||||
int start_pos = ptr - str;
|
||||
int codepoint = decode_utf8(&ptr);
|
||||
|
||||
// Check for invalid sequences
|
||||
if (codepoint < 0 || codepoint > 0x10FFFF ||
|
||||
(codepoint >= 0xD800 && codepoint <= 0xDFFF)) {
|
||||
valid = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
// Check for overlong encodings
|
||||
int bytes_used = ptr - (str + start_pos);
|
||||
if ((codepoint <= 0x7F && bytes_used != 1) ||
|
||||
(codepoint <= 0x7FF && bytes_used != 2) ||
|
||||
(codepoint <= 0xFFFF && bytes_used != 3) ||
|
||||
(codepoint <= 0x10FFFF && bytes_used != 4)) {
|
||||
valid = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
ret = JS_NewBool(js, valid);
|
||||
)
|
||||
|
||||
// Get byte length of UTF-8 string
|
||||
JSC_SCALL(utf8_byte_length,
|
||||
ret = JS_NewInt32(js, strlen(str));
|
||||
)
|
||||
|
||||
// Encode string to UTF-8 bytes
|
||||
JSC_SCALL(utf8_encode,
|
||||
size_t len = strlen(str);
|
||||
ret = js_new_blob_stoned_copy(js, str, len);
|
||||
)
|
||||
|
||||
// Decode UTF-8 bytes to string
|
||||
JSC_CCALL(utf8_decode,
|
||||
size_t len;
|
||||
void *data = js_get_blob_data(js, &len, argv[0]);
|
||||
if (!data) return JS_ThrowTypeError(js, "Expected blob");
|
||||
|
||||
// Create null-terminated string
|
||||
char *str = malloc(len + 1);
|
||||
memcpy(str, data, len);
|
||||
str[len] = '\0';
|
||||
|
||||
ret = JS_NewString(js, str);
|
||||
free(str);
|
||||
)
|
||||
|
||||
// Slice UTF-8 string by character indices (not byte indices)
|
||||
JSC_CCALL(utf8_slice,
|
||||
const char *str = JS_ToCString(js, argv[0]);
|
||||
if (!str) return JS_EXCEPTION;
|
||||
|
||||
int start = 0;
|
||||
int end = utf8_count(str);
|
||||
|
||||
if (argc > 1) JS_ToInt32(js, &start, argv[1]);
|
||||
if (argc > 2) JS_ToInt32(js, &end, argv[2]);
|
||||
|
||||
// Handle negative indices
|
||||
int total = end;
|
||||
if (start < 0) start = total + start;
|
||||
if (end < 0) end = total + end;
|
||||
|
||||
// Clamp values
|
||||
if (start < 0) start = 0;
|
||||
if (end > total) end = total;
|
||||
if (start >= end) {
|
||||
JS_FreeCString(js, str);
|
||||
return JS_NewString(js, "");
|
||||
}
|
||||
|
||||
// Find start position
|
||||
char *ptr = (char*)str;
|
||||
for (int i = 0; i < start && *ptr; i++) {
|
||||
decode_utf8(&ptr);
|
||||
}
|
||||
char *start_ptr = ptr;
|
||||
|
||||
// Find end position
|
||||
for (int i = start; i < end && *ptr; i++) {
|
||||
decode_utf8(&ptr);
|
||||
}
|
||||
|
||||
// Create substring
|
||||
size_t slice_len = ptr - start_ptr;
|
||||
char *slice = malloc(slice_len + 1);
|
||||
memcpy(slice, start_ptr, slice_len);
|
||||
slice[slice_len] = '\0';
|
||||
|
||||
ret = JS_NewString(js, slice);
|
||||
free(slice);
|
||||
JS_FreeCString(js, str);
|
||||
)
|
||||
|
||||
// Get character at index
|
||||
JSC_CCALL(utf8_char_at,
|
||||
const char *str = JS_ToCString(js, argv[0]);
|
||||
if (!str) return JS_EXCEPTION;
|
||||
|
||||
int index;
|
||||
JS_ToInt32(js, &index, argv[1]);
|
||||
|
||||
char *ptr = (char*)str;
|
||||
int count = 0;
|
||||
|
||||
// Skip to index
|
||||
while (*ptr && count < index) {
|
||||
decode_utf8(&ptr);
|
||||
count++;
|
||||
}
|
||||
|
||||
if (!*ptr || count != index) {
|
||||
JS_FreeCString(js, str);
|
||||
return JS_UNDEFINED;
|
||||
}
|
||||
|
||||
// Get the character
|
||||
char *char_start = ptr;
|
||||
decode_utf8(&ptr);
|
||||
|
||||
size_t char_len = ptr - char_start;
|
||||
char *result = malloc(char_len + 1);
|
||||
memcpy(result, char_start, char_len);
|
||||
result[char_len] = '\0';
|
||||
|
||||
ret = JS_NewString(js, result);
|
||||
free(result);
|
||||
JS_FreeCString(js, str);
|
||||
)
|
||||
|
||||
static const JSCFunctionListEntry js_utf8_funcs[] = {
|
||||
MIST_FUNC_DEF(utf8, codepoints, 1),
|
||||
MIST_FUNC_DEF(utf8, from_codepoints, 1),
|
||||
MIST_FUNC_DEF(utf8, length, 1),
|
||||
MIST_FUNC_DEF(utf8, validate, 1),
|
||||
MIST_FUNC_DEF(utf8, byte_length, 1),
|
||||
MIST_FUNC_DEF(utf8, encode, 1),
|
||||
MIST_FUNC_DEF(utf8, decode, 1),
|
||||
MIST_FUNC_DEF(utf8, slice, 3),
|
||||
MIST_FUNC_DEF(utf8, char_at, 2),
|
||||
};
|
||||
|
||||
JSValue js_utf8_use(JSContext *js)
|
||||
{
|
||||
JSValue mod = JS_NewObject(js);
|
||||
JS_SetPropertyFunctionList(js, mod, js_utf8_funcs, countof(js_utf8_funcs));
|
||||
return mod;
|
||||
}
|
||||
8
source/qjs_utf8.h
Normal file
8
source/qjs_utf8.h
Normal file
@@ -0,0 +1,8 @@
|
||||
#ifndef QJS_UTF8_H
|
||||
#define QJS_UTF8_H
|
||||
|
||||
#include "cell.h"
|
||||
|
||||
JSValue js_utf8_use(JSContext*);
|
||||
|
||||
#endif
|
||||
51
tests/kim.ce
Normal file
51
tests/kim.ce
Normal file
@@ -0,0 +1,51 @@
|
||||
// Round-trip tests for the kim module: every string is encoded with
// kim.encode and must come back unchanged from kim.decode.
var kim = use("kim");
var blob = use('blob')
var utf8 = use("utf8");

// Test basic ASCII
var test1 = "Hello, World!";
var encoded1 = kim.encode(test1);
var decoded1 = kim.decode(encoded1);
log.console("ASCII test:", test1 === decoded1 ? "PASS" : "FAIL");
if (test1 !== decoded1) {
  log.console(" Expected:", test1);
  log.console(" Got:", decoded1);
}

// Test Unicode characters
var test2 = "Hello, 世界! 🌍 Привет мир";
var encoded2 = kim.encode(test2);
var decoded2 = kim.decode(encoded2);
log.console("Unicode test:", test2 === decoded2 ? "PASS" : "FAIL");
if (test2 !== decoded2) {
  log.console(" Expected:", test2);
  log.console(" Got:", decoded2);
}

// Test empty string; also report what kind of value encode returns
var test3 = "";
var encoded3 = kim.encode(test3);
log.console(typeof encoded3)
log.console(encoded3 instanceof blob)
var decoded3 = kim.decode(encoded3);
log.console("Empty string test:", test3 === decoded3 ? "PASS" : "FAIL");

// Test various Unicode ranges
var test4 = "αβγδε АБВГД 你好 😀😎🎉 ∑∏∫";
var encoded4 = kim.encode(test4);
var decoded4 = kim.decode(encoded4);
log.console("Mixed Unicode test:", test4 === decoded4 ? "PASS" : "FAIL");
if (test4 !== decoded4) {
  log.console(" Expected:", test4);
  log.console(" Got:", decoded4);
}

// Test efficiency - KIM should be smaller for high codepoints
var highCodepoints = "🌍🌎🌏🗺️🧭";
var encodedHigh = kim.encode(highCodepoints);
// Use the utf8 module for the UTF-8 byte count instead of the web Blob
// constructor, which nothing in this project provides.
var utf8Bytes = utf8.byte_length(highCodepoints);
log.console("High codepoint efficiency:");
log.console(" UTF-8 bytes:", utf8Bytes);
// NOTE(review): byteLength is assumed to exist on the blob returned by
// kim.encode - confirm against the blob module.
log.console(" KIM bytes:", encodedHigh.byteLength);
log.console(" Savings:", utf8Bytes - encodedHigh.byteLength, "bytes");

log.console("\nAll tests completed!");

// End the program the same way the other test scripts do
$_.stop();
|
||||
47
tests/text_utf8.ce
Normal file
47
tests/text_utf8.ce
Normal file
@@ -0,0 +1,47 @@
|
||||
// Exercises text() on blobs and arrays, using the utf8 module to build the
// input blobs. Each section logs PASS/FAIL or the raw output for inspection.
var text = use('text');
var blob = use('blob');
var utf8 = use('utf8');

// Blob -> text: a UTF-8 encoded blob passed to text() should give back the
// original string
var test_string = "Hello, 世界! 🌍";
var encoded_blob = utf8.encode(test_string);
var decoded_text = text(encoded_blob);

log.console("Blob to text test:");
log.console(" Original:", test_string);
log.console(" Decoded:", decoded_text);
log.console(" Match:", test_string === decoded_text ? "PASS" : "FAIL");

// Array of codepoints -> text: these are the codepoints of test_string,
// including one above U+FFFF (127757)
var codepoints = [72, 101, 108, 108, 111, 44, 32, 19990, 30028, 33, 32, 127757];
var from_codepoints = text(codepoints);
log.console("\nCodepoints to text test:");
log.console(" From codepoints:", from_codepoints);
log.console(" Match:", from_codepoints === test_string ? "PASS" : "FAIL");

// Array of strings joined with a separator
var words = ["Hello", "world", "from", "text"];
var joined = text(words, " ");
log.console("\nArray with separator test:");
log.console(" Joined:", joined);
log.console(" Expected: Hello world from text");
log.console(" Match:", joined === "Hello world from text" ? "PASS" : "FAIL");

// Mixed array: numbers are treated as codepoints (72 = "H", 32 = " "),
// strings are appended as-is
var mixed = [72, "ello", 32, "world"];
var mixed_result = text(mixed, "");
log.console("\nMixed array test:");
log.console(" Result:", mixed_result);
log.console(" Expected: Hello world");
log.console(" Match:", mixed_result === "Hello world" ? "PASS" : "FAIL");

// Blob format arguments ("h", "b", "o") should still work; output is only
// logged here, not asserted
var test_data = utf8.encode("ABC");
log.console("\nBlob format tests:");
log.console(" Hex:", text(test_data, "h"));
log.console(" Binary:", text(test_data, "b"));
log.console(" Octal:", text(test_data, "o"));

log.console("\nAll tests completed!");

// End the program
$_.stop();
|
||||
70
tests/utf8.ce
Normal file
70
tests/utf8.ce
Normal file
@@ -0,0 +1,70 @@
|
||||
// Exercises the utf8 module: length vs byte_length, codepoints and
// from_codepoints, encode/decode, validate, slice and char_at. Results are
// logged as PASS/FAIL or as raw values for inspection.
var utf8 = use("utf8");

// Character count vs byte count: identical for pure ASCII
var test1 = "Hello";
log.console("ASCII length test:");
log.console(" Characters:", utf8.length(test1));
log.console(" Bytes:", utf8.byte_length(test1));
log.console(" Match:", utf8.length(test1) === utf8.byte_length(test1) ? "PASS" : "FAIL");

// With non-ASCII characters the byte count must exceed the character count
var test2 = "Hello 世界";
log.console("\nMixed ASCII/Unicode length test:");
log.console(" Characters:", utf8.length(test2));
log.console(" Bytes:", utf8.byte_length(test2));
log.console(" Bytes > Characters:", utf8.byte_length(test2) > utf8.length(test2) ? "PASS" : "FAIL");

// Codepoints: the emoji in the middle is a single codepoint above U+FFFF
var test3 = "A😀B";
var codepoints = utf8.codepoints(test3);
log.console("\nCodepoints test:");
log.console(" String:", test3);
log.console(" Codepoints:", codepoints);
log.console(" A=65:", codepoints[0] === 65 ? "PASS" : "FAIL");
log.console(" 😀=128512:", codepoints[1] === 128512 ? "PASS" : "FAIL");
log.console(" B=66:", codepoints[2] === 66 ? "PASS" : "FAIL");

// from_codepoints should invert codepoints
var reconstructed = utf8.from_codepoints(codepoints);
log.console(" Reconstructed:", reconstructed);
log.console(" Match:", test3 === reconstructed ? "PASS" : "FAIL");

// encode followed by decode should give back the original string
var test4 = "UTF-8 encoding: 你好世界 🌍";
var encoded = utf8.encode(test4);
var decoded = utf8.decode(encoded);
log.console("\nEncode/decode test:");
log.console(" Original:", test4);
log.console(" Decoded:", decoded);
log.console(" Match:", test4 === decoded ? "PASS" : "FAIL");

// Validation of a well-formed string
log.console("\nValidation tests:");
log.console(" Valid UTF-8:", utf8.validate("Hello 世界") ? "PASS" : "FAIL");

// Slicing by character index, including negative indices; output is only
// logged here, not asserted
var test5 = "Hello 世界!";
log.console("\nSlice tests:");
log.console(" Original:", test5);
log.console(" slice(0, 5):", utf8.slice(test5, 0, 5));
log.console(" slice(6, 8):", utf8.slice(test5, 6, 8));
log.console(" slice(-3):", utf8.slice(test5, -3));
log.console(" slice(0, -1):", utf8.slice(test5, 0, -1));

// char_at by character index; index 100 is past the end of test5
log.console("\nchar_at tests:");
log.console(" char_at(0):", utf8.char_at(test5, 0));
log.console(" char_at(6):", utf8.char_at(test5, 6));
log.console(" char_at(7):", utf8.char_at(test5, 7));
log.console(" char_at(100):", utf8.char_at(test5, 100));

// A string made of several emoji codepoints; counts are only logged
var test6 = "👨👩👧👦";
log.console("\nComplex emoji test:");
log.console(" String:", test6);
log.console(" Length:", utf8.length(test6));
log.console(" Byte length:", utf8.byte_length(test6));
log.console(" Codepoints:", utf8.codepoints(test6).length);

log.console("\nAll tests completed!");

// End the program
$_.stop()
|
||||
Reference in New Issue
Block a user