faster wota encoding

This commit is contained in:
2025-06-08 08:35:12 -05:00
parent 3176e6775d
commit c1d341eecd
11 changed files with 193 additions and 268 deletions

View File

@@ -73,6 +73,7 @@ void wota_buffer_free(WotaBuffer *wb);
/* Writing function prototypes */
void wota_write_blob (WotaBuffer *wb, unsigned long long nbits, const char *data);
void wota_write_text (WotaBuffer *wb, const char *utf8);
/* Text writer taking an explicit byte count (len) instead of measuring
   the string with strlen; wota_write_text forwards to it. */
void wota_write_text_len(WotaBuffer *wb, const char *utf8, size_t len);
void wota_write_array (WotaBuffer *wb, unsigned long long count);
void wota_write_record (WotaBuffer *wb, unsigned long long count);
/* We'll store numbers as either 56-bit integers or raw double */
@@ -396,10 +397,9 @@ char *wota_read_blob(long long *byte_len, char **blob, char *wota)
/*
TEXT:
preamble => top 56 bits = #characters, LSB=0x05
then floor((nchars+1)/2) 64-bit words
each word has 2 UTF-32 codepoints: top 32 bits = codepoint1,
low 32 bits = codepoint2
preamble => top 56 bits = #bytes in UTF-8, LSB=0x05
then floor((nbytes + 7)/8) 64-bit words
containing the UTF-8 bytes, packed 8 bytes per word
*/
char *wota_read_text(char **text_utf8, char *wota)
{
@@ -412,68 +412,26 @@ char *wota_read_text(char **text_utf8, char *wota)
return wota_skip1(wota);
}
uint64_t nchars = (first >> 8);
long long nwords = (long long)((nchars + 1ULL) >> 1);
uint64_t nbytes = (first >> 8);
long long nwords = (long long)((nbytes + 7ULL) >> 3);
uint64_t *data_words = p + 1;
/*
We'll convert them to a UTF-8 string. Each codepoint can
become up to 4 bytes. So we need up to 4*nchars + 1.
*/
size_t max_utf8 = (size_t)(4 * nchars + 1);
char *out = (char *)malloc(max_utf8);
char *out = (char *)malloc((size_t)(nbytes + 1));
if (!out) {
fprintf(stderr, "malloc failed in wota_read_text\n");
abort();
}
size_t out_len = 0;
/* Copy bytes from the packed 64-bit words */
for (long long i = 0; i < nwords; i++) {
uint64_t wval = data_words[i];
uint32_t c1 = (uint32_t)(wval >> 32);
uint32_t c2 = (uint32_t)(wval & 0xffffffffULL);
// If we haven't exceeded nchars, convert c1 -> UTF-8
if ((i * 2) + 0 < (long long)nchars) {
uint32_t c = c1;
if (c < 0x80) {
out[out_len++] = (char)c;
} else if (c < 0x800) {
out[out_len++] = (char)(0xC0 | (c >> 6));
out[out_len++] = (char)(0x80 | (c & 0x3F));
} else if (c < 0x10000) {
out[out_len++] = (char)(0xE0 | (c >> 12));
out[out_len++] = (char)(0x80 | ((c >> 6) & 0x3F));
out[out_len++] = (char)(0x80 | (c & 0x3F));
} else {
out[out_len++] = (char)(0xF0 | (c >> 18));
out[out_len++] = (char)(0x80 | ((c >> 12) & 0x3F));
out[out_len++] = (char)(0x80 | ((c >> 6) & 0x3F));
out[out_len++] = (char)(0x80 | (c & 0x3F));
}
}
// Similarly for c2:
if ((i * 2) + 1 < (long long)nchars) {
uint32_t c = c2;
if (c < 0x80) {
out[out_len++] = (char)c;
} else if (c < 0x800) {
out[out_len++] = (char)(0xC0 | (c >> 6));
out[out_len++] = (char)(0x80 | (c & 0x3F));
} else if (c < 0x10000) {
out[out_len++] = (char)(0xE0 | (c >> 12));
out[out_len++] = (char)(0x80 | ((c >> 6) & 0x3F));
out[out_len++] = (char)(0x80 | (c & 0x3F));
} else {
out[out_len++] = (char)(0xF0 | (c >> 18));
out[out_len++] = (char)(0x80 | ((c >> 12) & 0x3F));
out[out_len++] = (char)(0x80 | ((c >> 6) & 0x3F));
out[out_len++] = (char)(0x80 | (c & 0x3F));
}
for (int j = 0; j < 8 && (i * 8 + j) < (long long)nbytes; j++) {
out[i * 8 + j] = (char)((wval >> (56 - j * 8)) & 0xff);
}
}
out[out_len] = '\0';
out[nbytes] = '\0';
*text_utf8 = out;
return (char *)(data_words + nwords);
@@ -625,70 +583,37 @@ void wota_write_blob(WotaBuffer *wb, unsigned long long nbits, const char *data)
}
}
/*
NOTE(review): this span appears to interleave two versions of the text
writer, as rendered from a diff with its +/- markers lost:
  - one that decodes the UTF-8 input into UTF-32 codepoints (codepoints,
    ccount) and stores two codepoints per 64-bit word;
  - one (wota_write_text_len) that stores the raw UTF-8 bytes, eight per
    64-bit word, with the byte count in the preamble.
Both function headers and both "preamble" declarations are present, so the
span is not compilable as shown -- confirm which lines are live.
*/
void wota_write_text(WotaBuffer *wb, const char *utf8)
void wota_write_text_len(WotaBuffer *wb, const char *utf8, size_t nbytes)
{
/* NULL input is replaced by the empty string literal.
   NOTE(review): if this line belongs to wota_write_text_len, a NULL utf8
   combined with nbytes > 0 makes the packing loop below index past the
   one-byte "" literal -- confirm callers never do that. */
if (!utf8) utf8 = "";
/* Convert the utf8 string to an array of UTF-32 codepoints. */
size_t len = strlen(utf8);
const unsigned char *uc = (const unsigned char *)utf8;
/* In worst case, every single byte might form a codepoint, so we allocate enough: */
uint32_t *codepoints = (uint32_t *)malloc(sizeof(uint32_t)*(len+1));
if (!codepoints) {
fprintf(stderr, "malloc failed in wota_write_text\n");
abort();
}
size_t ccount = 0;
/* Decode one codepoint per iteration. The (uc[k] != 0) checks stop a
   multi-byte sequence from stepping over the terminating NUL. */
while (*uc) {
uint32_t c;
if ((uc[0] & 0x80) == 0) {
c = uc[0];
uc += 1;
} else if ((uc[0] & 0xe0) == 0xc0 && (uc[1] != 0)) {
c = ((uc[0] & 0x1f) << 6) | (uc[1] & 0x3f);
uc += 2;
} else if ((uc[0] & 0xf0) == 0xe0 && (uc[1] != 0) && (uc[2] != 0)) {
c = ((uc[0] & 0x0f) << 12) | ((uc[1] & 0x3f) << 6) | (uc[2] & 0x3f);
uc += 3;
} else if ((uc[0] & 0xf8) == 0xf0 && (uc[1] != 0) && (uc[2] != 0) && (uc[3] != 0)) {
c = ((uc[0] & 0x07) << 18) | ((uc[1] & 0x3f) << 12)
| ((uc[2] & 0x3f) << 6) | (uc[3] & 0x3f);
uc += 4;
} else {
/* invalid sequence => skip 1 byte */
c = uc[0];
uc++;
}
codepoints[ccount++] = c;
}
/* preamble => top 56 bits = ccount, LSB=0x05 */
uint64_t preamble = ((uint64_t)ccount << 8) | (uint64_t)WOTA_TEXT;
/* preamble => top 56 bits = nbytes, LSB=0x05 */
uint64_t preamble = ((uint64_t)nbytes << 8) | (uint64_t)WOTA_TEXT;
/* The preamble occupies one word of its own, allocated from the buffer. */
uint64_t *pw = wota_buffer_alloc(wb, 1);
pw[0] = preamble;
/* store pairs of 32-bit codepoints in 64-bit words */
size_t nwords = (ccount + 1) / 2;
/* pack UTF-8 bytes into 64-bit words, 8 bytes per word */
size_t nwords = (nbytes + 7) / 8;
/* Empty text: only the preamble word is written. */
if (nwords == 0) {
free(codepoints);
return;
}
uint64_t *blocks = wota_buffer_alloc(wb, nwords);
size_t idx = 0;
/* Codepoint packing: first codepoint of a pair goes in the high 32 bits,
   the second in the low 32 bits; a missing second codepoint stays 0. */
for (size_t i = 0; i < nwords; i++) {
uint64_t hi = 0, lo = 0;
if (idx < ccount) {
hi = codepoints[idx++];
}
if (idx < ccount) {
lo = codepoints[idx++];
}
blocks[i] = ((hi & 0xffffffffULL) << 32) | (lo & 0xffffffffULL);
}
/* Zero the payload words up front. NOTE(review): the byte-packing loop
   below assigns every blocks[i], so this looks redundant for that loop. */
memset(blocks, 0, nwords * sizeof(uint64_t));
free(codepoints);
/* Byte packing: byte j of each 8-byte group is shifted to bit offset
   56 - 8*j, so the first byte of a group lands in the most significant
   byte of the word. wval starts at 0, which leaves the unused low bytes
   of a final partial word zero. */
for (size_t i = 0; i < nwords; i++) {
uint64_t wval = 0;
for (int j = 0; j < 8 && (i * 8 + j) < nbytes; j++) {
wval |= ((uint64_t)(unsigned char)utf8[i * 8 + j]) << (56 - j * 8);
}
blocks[i] = wval;
}
}
/*
  Write a NUL-terminated UTF-8 string as a text value.
  A NULL pointer is treated as the empty string. The byte length is
  measured with strlen and the write is delegated to wota_write_text_len.
*/
void wota_write_text(WotaBuffer *wb, const char *utf8)
{
    const char *src = utf8 ? utf8 : "";
    size_t src_len = strlen(src);

    wota_write_text_len(wb, src, src_len);
}
void wota_write_array(WotaBuffer *wb, unsigned long long count)