faster wota encoding

2025-06-08 08:35:12 -05:00
parent 3176e6775d
commit c1d341eecd
11 changed files with 193 additions and 268 deletions
--- a/source/wota.h
+++ b/source/wota.h
@@ -73,6 +73,7 @@ void wota_buffer_free(WotaBuffer *wb);
 /* Writing function prototypes */
 void wota_write_blob   (WotaBuffer *wb, unsigned long long nbits, const char *data);
 void wota_write_text   (WotaBuffer *wb, const char *utf8);
+void wota_write_text_len(WotaBuffer *wb, const char *utf8, size_t len);
 void wota_write_array  (WotaBuffer *wb, unsigned long long count);
 void wota_write_record (WotaBuffer *wb, unsigned long long count);
 /* We'll store numbers as either 56-bit integers or raw double */
@@ -396,10 +397,9 @@ char *wota_read_blob(long long *byte_len, char **blob, char *wota)

 /*
  TEXT:
-    preamble => top 56 bits = #characters, LSB=0x05
-    then floor((nchars+1)/2) 64-bit words
-       each word has 2 UTF-32 codepoints: top 32 bits = codepoint1,
-                                         low 32 bits = codepoint2
+    preamble => top 56 bits = length of the text in UTF-8 bytes, LSB=0x05
+    then ceil(nbytes/8) 64-bit words, i.e. (nbytes + 7)/8 rounded down,
+       holding the UTF-8 bytes 8 per word, first byte most significant, last word zero-padded
 */
 char *wota_read_text(char **text_utf8, char *wota)
 {
@@ -412,68 +412,26 @@ char *wota_read_text(char **text_utf8, char *wota)
        return wota_skip1(wota);
    }

-    uint64_t nchars = (first >> 8);
-    long long nwords = (long long)((nchars + 1ULL) >> 1);
+    uint64_t nbytes = (first >> 8);
+    long long nwords = (long long)((nbytes + 7ULL) >> 3);

    uint64_t *data_words = p + 1;
-    /*
-      We'll convert them to a UTF-8 string. Each codepoint can
-      become up to 4 bytes. So we need up to 4*nchars + 1.
-    */
-    size_t max_utf8 = (size_t)(4 * nchars + 1);
-    char *out = (char *)malloc(max_utf8);
+    
+    char *out = (char *)malloc((size_t)(nbytes + 1));
    if (!out) {
        fprintf(stderr, "malloc failed in wota_read_text\n");
        abort();
    }
-    size_t out_len = 0;

+    /* Unpack each 64-bit word most-significant byte first; stop at nbytes so padding is not copied */
    for (long long i = 0; i < nwords; i++) {
        uint64_t wval = data_words[i];
-        uint32_t c1 = (uint32_t)(wval >> 32);
-        uint32_t c2 = (uint32_t)(wval & 0xffffffffULL);
-
-        // If we haven't exceeded nchars, convert c1 -> UTF-8
-        if ((i * 2) + 0 < (long long)nchars) {
-            uint32_t c = c1;
-            if (c < 0x80) {
-                out[out_len++] = (char)c;
-            } else if (c < 0x800) {
-                out[out_len++] = (char)(0xC0 | (c >> 6));
-                out[out_len++] = (char)(0x80 | (c & 0x3F));
-            } else if (c < 0x10000) {
-                out[out_len++] = (char)(0xE0 | (c >> 12));
-                out[out_len++] = (char)(0x80 | ((c >> 6) & 0x3F));
-                out[out_len++] = (char)(0x80 | (c & 0x3F));
-            } else {
-                out[out_len++] = (char)(0xF0 | (c >> 18));
-                out[out_len++] = (char)(0x80 | ((c >> 12) & 0x3F));
-                out[out_len++] = (char)(0x80 | ((c >> 6) & 0x3F));
-                out[out_len++] = (char)(0x80 | (c & 0x3F));
-            }
-        }
-        // Similarly for c2:
-        if ((i * 2) + 1 < (long long)nchars) {
-            uint32_t c = c2;
-            if (c < 0x80) {
-                out[out_len++] = (char)c;
-            } else if (c < 0x800) {
-                out[out_len++] = (char)(0xC0 | (c >> 6));
-                out[out_len++] = (char)(0x80 | (c & 0x3F));
-            } else if (c < 0x10000) {
-                out[out_len++] = (char)(0xE0 | (c >> 12));
-                out[out_len++] = (char)(0x80 | ((c >> 6) & 0x3F));
-                out[out_len++] = (char)(0x80 | (c & 0x3F));
-            } else {
-                out[out_len++] = (char)(0xF0 | (c >> 18));
-                out[out_len++] = (char)(0x80 | ((c >> 12) & 0x3F));
-                out[out_len++] = (char)(0x80 | ((c >> 6) & 0x3F));
-                out[out_len++] = (char)(0x80 | (c & 0x3F));
-            }
+        for (int j = 0; j < 8 && (i * 8 + j) < (long long)nbytes; j++) {
+            out[i * 8 + j] = (char)((wval >> (56 - j * 8)) & 0xff);
        }
    }

-    out[out_len] = '\0';
+    out[nbytes] = '\0';
    *text_utf8 = out;

    return (char *)(data_words + nwords);
@@ -625,70 +583,37 @@ void wota_write_blob(WotaBuffer *wb, unsigned long long nbits, const char *data)
    }
 }

-void wota_write_text(WotaBuffer *wb, const char *utf8)
+void wota_write_text_len(WotaBuffer *wb, const char *utf8, size_t nbytes)
 {
    if (!utf8) utf8 = "";

-    /* Convert the utf8 string to an array of UTF-32 codepoints. */
-    size_t len = strlen(utf8);
-    const unsigned char *uc = (const unsigned char *)utf8;
-    /* In worst case, every single byte might form a codepoint, so we allocate enough: */
-    uint32_t *codepoints = (uint32_t *)malloc(sizeof(uint32_t)*(len+1));
-    if (!codepoints) {
-        fprintf(stderr, "malloc failed in wota_write_text\n");
-        abort();
-    }
-    size_t ccount = 0;
-
-    while (*uc) {
-        uint32_t c;
-        if ((uc[0] & 0x80) == 0) {
-            c = uc[0];
-            uc += 1;
-        } else if ((uc[0] & 0xe0) == 0xc0 && (uc[1] != 0)) {
-            c = ((uc[0] & 0x1f) << 6) | (uc[1] & 0x3f);
-            uc += 2;
-        } else if ((uc[0] & 0xf0) == 0xe0 && (uc[1] != 0) && (uc[2] != 0)) {
-            c = ((uc[0] & 0x0f) << 12) | ((uc[1] & 0x3f) << 6) | (uc[2] & 0x3f);
-            uc += 3;
-        } else if ((uc[0] & 0xf8) == 0xf0 && (uc[1] != 0) && (uc[2] != 0) && (uc[3] != 0)) {
-            c = ((uc[0] & 0x07) << 18) | ((uc[1] & 0x3f) << 12)
-                | ((uc[2] & 0x3f) << 6) | (uc[3] & 0x3f);
-            uc += 4;
-        } else {
-            /* invalid sequence => skip 1 byte */
-            c = uc[0];
-            uc++;
-        }
-        codepoints[ccount++] = c;
-    }
-
-    /* preamble => top 56 bits = ccount, LSB=0x05 */
-    uint64_t preamble = ((uint64_t)ccount << 8) | (uint64_t)WOTA_TEXT;
+    /* preamble => top 56 bits = nbytes, LSB=0x05 */
+    uint64_t preamble = ((uint64_t)nbytes << 8) | (uint64_t)WOTA_TEXT;
    uint64_t *pw = wota_buffer_alloc(wb, 1);
    pw[0] = preamble;

-    /* store pairs of 32-bit codepoints in 64-bit words */
-    size_t nwords = (ccount + 1) / 2;
+    /* pack UTF-8 bytes into 64-bit words, 8 per word, first byte most significant; last word zero-padded */
+    size_t nwords = (nbytes + 7) / 8;
    if (nwords == 0) {
-        free(codepoints);
        return;
    }

    uint64_t *blocks = wota_buffer_alloc(wb, nwords);
-    size_t idx = 0;
-    for (size_t i = 0; i < nwords; i++) {
-        uint64_t hi = 0, lo = 0;
-        if (idx < ccount) {
-            hi = codepoints[idx++];
-        }
-        if (idx < ccount) {
-            lo = codepoints[idx++];
-        }
-        blocks[i] = ((hi & 0xffffffffULL) << 32) | (lo & 0xffffffffULL);
-    }
+    memset(blocks, 0, nwords * sizeof(uint64_t));

-    free(codepoints);
+    for (size_t i = 0; i < nwords; i++) {
+        uint64_t wval = 0;
+        for (int j = 0; j < 8 && (i * 8 + j) < nbytes; j++) {
+            wval |= ((uint64_t)(unsigned char)utf8[i * 8 + j]) << (56 - j * 8);
+        }
+        blocks[i] = wval;
+    }
+}
+
+void wota_write_text(WotaBuffer *wb, const char *utf8)
+{
+    if (!utf8) utf8 = "";
+    wota_write_text_len(wb, utf8, strlen(utf8));
 }

 void wota_write_array(WotaBuffer *wb, unsigned long long count)