add wota
This commit is contained in:
727
source/wota.h
Normal file
727
source/wota.h
Normal file
@@ -0,0 +1,727 @@
|
||||
#ifndef WOTA_H
|
||||
#define WOTA_H
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
/* ----------------------------------------------------------------
|
||||
Wota Type Codes (LSB of a 64-bit word)
|
||||
---------------------------------------------------------------- */
|
||||
#define WOTA_INT   0x00 /* signed integer carried in the upper 56 bits          */
#define WOTA_FLOAT 0x01 /* header word followed by one word of raw double bits  */
#define WOTA_ARR   0x02 /* array header; upper 56 bits hold the count           */
#define WOTA_REC   0x03 /* record header; upper 56 bits hold the count          */
#define WOTA_BLOB  0x04 /* blob header; upper 56 bits hold the length in bits   */
#define WOTA_TEXT  0x05 /* text header; upper 56 bits hold the codepoint count  */
#define WOTA_SYM   0x07 /* symbol; upper 56 bits hold the symbol code           */
|
||||
|
||||
/* ----------------------------------------------------------------
|
||||
Wota Symbol Codes (stored in top 56 bits)
|
||||
e.g. word = ((uint64_t)sym_code << 8) | WOTA_SYM
|
||||
---------------------------------------------------------------- */
|
||||
/* Values placed in the upper 56 bits of a WOTA_SYM word. */
#define WOTA_NULL    0x00
#define WOTA_FALSE   0x02
#define WOTA_TRUE    0x03
#define WOTA_PRIVATE 0x08
#define WOTA_SYSTEM  0x09
|
||||
|
||||
/*
|
||||
We store all data in 64-bit words. The least significant byte
|
||||
is the type code. The top 56 bits are used differently depending
|
||||
on type.
|
||||
|
||||
This version (non-standard) stores floating-point values
|
||||
as *raw 64-bit IEEE 754 doubles* in a second 64-bit word.
|
||||
*/
|
||||
|
||||
/* ----------------------------------------------------------------
|
||||
Accessor: return the Wota type code from the LSB of a 64-bit word
|
||||
---------------------------------------------------------------- */
|
||||
/* Returns the type tag (one of the WOTA_* type codes) held in the
   least significant byte of the word that `w` points to. */
static inline int wota_type(const uint64_t *w)
{
    const uint64_t tag = w[0] & (uint64_t)0xff;
    return (int)tag;
}
|
||||
|
||||
/* ----------------------------------------------------------------
|
||||
Reading function prototypes. Each consumes some number of 64-bit
|
||||
words and returns a pointer to the next Wota data. If you pass
|
||||
a NULL for the output pointer, the function will skip the data.
|
||||
---------------------------------------------------------------- */
|
||||
/* Blob: on success *blob receives a malloc'd buffer of *byte_len bytes
   (the caller frees it). */
char *wota_read_blob(long long *byte_len, char **blob, char *wota);

/* Text: on success *text_utf8 receives a malloc'd, NUL-terminated UTF-8
   string (the caller frees it). */
char *wota_read_text(char **text_utf8, char *wota);

/* Array / record headers: *count receives the value stored in the upper
   56 bits of the header word. */
char *wota_read_array(long long *count, char *wota);
char *wota_read_record(long long *count, char *wota);

/* Float: *d receives the double stored in the word after the header. */
char *wota_read_float(double *d, char *wota);

/* Integer: *n receives the sign-extended 56-bit value. */
char *wota_read_int(long long *n, char *wota);

/* Symbol: *sym_code receives the code (WOTA_NULL, WOTA_TRUE, ...). */
char *wota_read_sym(int *sym_code, char *wota);
|
||||
|
||||
/* ----------------------------------------------------------------
|
||||
WotaBuffer: dynamic array of 64-bit words for building a Wota
|
||||
message in memory.
|
||||
---------------------------------------------------------------- */
|
||||
typedef struct WotaBuffer {
    /* Heap array of 64-bit words; NULL while nothing is allocated. */
    uint64_t *data;
    /* Number of words currently in use. */
    size_t size;
    /* Number of words the allocation can hold. */
    size_t capacity;
} WotaBuffer;
|
||||
|
||||
/* Buffer management */
|
||||
/* Prepares `wb` for use, optionally pre-allocating room for
   `initial_capacity_in_words` words. Aborts if the allocation fails. */
void wota_buffer_init(WotaBuffer *wb, size_t initial_capacity_in_words);

/* Releases the storage owned by `wb` and resets it to the empty state. */
void wota_buffer_free(WotaBuffer *wb);

/* Writing function prototypes.
   Each call appends one encoded value (or header) to the end of `wb`. */

/* Appends a blob of `nbits` bits taken from `data`. */
void wota_write_blob(WotaBuffer *wb, unsigned long long nbits, const char *data);

/* Appends a NUL-terminated UTF-8 string; a NULL pointer is written as
   empty text. */
void wota_write_text(WotaBuffer *wb, const char *utf8);

/* Append an array / record header carrying `count`. */
void wota_write_array(WotaBuffer *wb, unsigned long long count);
void wota_write_record(WotaBuffer *wb, unsigned long long count);

/* Appends a number: integral values that fit in 56 signed bits are
   stored as WOTA_INT, everything else as a raw double (WOTA_FLOAT). */
void wota_write_number(WotaBuffer *wb, double n);

/* Appends a symbol word for `sym_code` (WOTA_NULL, WOTA_FALSE, etc.). */
void wota_write_sym(WotaBuffer *wb, int sym_code);
|
||||
|
||||
|
||||
#ifdef WOTA_IMPLEMENTATION
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include <limits.h>
|
||||
|
||||
/* ================================================================
|
||||
Detect endianness. We'll use this to do 64-bit byte-swaps if needed.
|
||||
If you know you only run on little-endian, you can hard-code that.
|
||||
================================================================ */
|
||||
/* Only a compiler that positively reports big-endian byte order selects
   WOTA_BIG_ENDIAN. Every other case (reported little-endian, MSVC, or an
   unknown toolchain) is treated as little-endian; adjust by hand if your
   platform is otherwise. */
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#define WOTA_BIG_ENDIAN 1
#else
#define WOTA_LITTLE_ENDIAN 1
#endif
|
||||
|
||||
/* 64-bit byte-swap helper (for bit-level blob ordering) */
|
||||
/* Reverses the byte order of a 64-bit value. */
static inline uint64_t wota_bswap64(uint64_t x)
{
#if defined(__GNUC__) || defined(__clang__)
    /* Use the compiler intrinsic when one is available. */
    return __builtin_bswap64(x);
#else
    /* Portable fallback: peel bytes off the low end of `x` and push
       them onto the low end of the result. */
    uint64_t reversed = 0;
    for (int i = 0; i < 8; i++) {
        reversed = (reversed << 8) | (x & 0xffULL);
        x >>= 8;
    }
    return reversed;
#endif
}
|
||||
|
||||
/* ================================================================
|
||||
Helper: Grow the buffer to fit 'min_add' more 64-bit words
|
||||
================================================================ */
|
||||
/* Ensures `wb` has room for `min_add` more words beyond wb->size.
   Capacity starts at 8 words and doubles until it is large enough.
   Aborts on allocation failure, or when the requested size cannot be
   represented as a byte count in size_t. */
static void wota_buffer_grow(WotaBuffer *wb, size_t min_add)
{
    /* Largest word count whose byte size still fits in size_t. */
    const size_t max_words = SIZE_MAX / sizeof(uint64_t);

    if (wb->size > max_words || min_add > max_words - wb->size) {
        fprintf(stderr, "size overflow in wota_buffer_grow\n");
        abort();
    }

    const size_t needed = wb->size + min_add;
    if (needed <= wb->capacity) {
        return;
    }

    /* Double the capacity, saturating at max_words so the doubling can
       neither wrap around nor loop forever. */
    size_t new_cap = wb->capacity;
    do {
        if (new_cap == 0) {
            new_cap = 8;
        } else if (new_cap > max_words / 2) {
            new_cap = max_words;
        } else {
            new_cap *= 2;
        }
    } while (new_cap < needed);

    /* Keep the old pointer until realloc is known to have succeeded. */
    uint64_t *new_data = (uint64_t *)realloc(wb->data, new_cap * sizeof(uint64_t));
    if (!new_data) {
        fprintf(stderr, "realloc failed in wota_buffer_grow\n");
        abort();
    }
    wb->data = new_data;
    wb->capacity = new_cap;
}
|
||||
|
||||
/* Initialises `wb` as an empty buffer. When `initial_capacity_in_words`
   is non-zero, that many words are allocated up front.
   Aborts if the allocation fails or the byte size would overflow size_t. */
void wota_buffer_init(WotaBuffer *wb, size_t initial_capacity_in_words)
{
    wb->data = NULL;
    wb->size = 0;
    wb->capacity = 0;

    if (initial_capacity_in_words == 0) {
        return; /* storage is allocated lazily on the first write */
    }

    /* Reject word counts whose byte size would wrap around. */
    if (initial_capacity_in_words > SIZE_MAX / sizeof(uint64_t)) {
        fprintf(stderr, "capacity overflow in wota_buffer_init\n");
        abort();
    }

    wb->data = (uint64_t *)malloc(initial_capacity_in_words * sizeof(uint64_t));
    if (!wb->data) {
        fprintf(stderr, "malloc failed in wota_buffer_init\n");
        abort();
    }
    wb->capacity = initial_capacity_in_words;
}
|
||||
|
||||
/* Frees the word array owned by `wb` and returns the buffer to the
   empty state, so it can be reused or freed again safely. */
void wota_buffer_free(WotaBuffer *wb)
{
    free(wb->data); /* free(NULL) is a no-op, so no guard is needed */
    wb->capacity = 0;
    wb->size = 0;
    wb->data = NULL;
}
|
||||
|
||||
/* Alloc 'count' 64-bit words in the buffer, return pointer to them */
|
||||
/* Reserves `count` words at the end of `wb` and returns a pointer to the
   first of them. The pointer is only valid until the next call that may
   grow the buffer, because growing can move the storage. */
static uint64_t *wota_buffer_alloc(WotaBuffer *wb, size_t count)
{
    wota_buffer_grow(wb, count);

    const size_t start = wb->size;
    wb->size = start + count;
    return wb->data + start;
}
|
||||
|
||||
/* ================================================================
|
||||
READING
|
||||
================================================================ */
|
||||
|
||||
/* We skip 1 word if we do not want to interpret it. */
|
||||
/* Returns the address one 64-bit word (8 bytes) past `wota`. */
static inline char *wota_skip1(char *wota)
{
    return &wota[8];
}
|
||||
|
||||
char *wota_read_int(long long *n, char *wota)
|
||||
{
|
||||
/* WOTA_INT => single 64-bit word: top 56 bits is a signed integer, LSB=0. */
|
||||
if (!n) return wota_skip1(wota);
|
||||
|
||||
uint64_t *p = (uint64_t *)wota;
|
||||
uint64_t first = p[0];
|
||||
int type = (int)(first & 0xffU);
|
||||
if (type != WOTA_INT) {
|
||||
/* not an int; skip one word */
|
||||
return wota_skip1(wota);
|
||||
}
|
||||
/* sign-extend top 56 bits into a 64-bit signed integer */
|
||||
int64_t val = (int64_t)first;
|
||||
val >>= 8; /* arithmetic shift right 8 bits to keep sign */
|
||||
*n = val;
|
||||
return wota + 8;
|
||||
}
|
||||
|
||||
/*
|
||||
We store a double as:
|
||||
- first 64-bit word => type code (LSB=1), top 56 bits unused
|
||||
- second 64-bit word => raw IEEE 754 bits
|
||||
*/
|
||||
char *wota_read_float(double *out, char *wota)
|
||||
{
|
||||
if (!out) return wota + 16; /* skip 2 words if no pointer */
|
||||
|
||||
uint64_t *p = (uint64_t *)wota;
|
||||
uint64_t first = p[0];
|
||||
int type = (int)(first & 0xffU);
|
||||
if (type != WOTA_FLOAT) {
|
||||
/* skip if not float */
|
||||
return wota + 8;
|
||||
}
|
||||
/* second word has the raw double bits */
|
||||
uint64_t bits = p[1];
|
||||
union {
|
||||
uint64_t u;
|
||||
double d;
|
||||
} converter;
|
||||
converter.u = bits;
|
||||
*out = converter.d;
|
||||
return (char *)(p + 2);
|
||||
}
|
||||
|
||||
char *wota_read_sym(int *sym_code, char *wota)
|
||||
{
|
||||
if (!sym_code) return wota_skip1(wota);
|
||||
|
||||
uint64_t *p = (uint64_t *)wota;
|
||||
uint64_t first = p[0];
|
||||
int type = (int)(first & 0xffU);
|
||||
if (type != WOTA_SYM) {
|
||||
return wota_skip1(wota);
|
||||
}
|
||||
uint64_t top56 = (first >> 8); /* symbol code in top 56 bits */
|
||||
*sym_code = (int)top56;
|
||||
return wota + 8;
|
||||
}
|
||||
|
||||
char *wota_read_array(long long *count, char *wota)
|
||||
{
|
||||
if (!count) return wota_skip1(wota);
|
||||
|
||||
uint64_t *p = (uint64_t *)wota;
|
||||
uint64_t first = p[0];
|
||||
int type = (int)(first & 0xffU);
|
||||
if (type != WOTA_ARR) {
|
||||
return wota_skip1(wota);
|
||||
}
|
||||
uint64_t c = (first >> 8);
|
||||
*count = (long long)c;
|
||||
return wota + 8;
|
||||
}
|
||||
|
||||
char *wota_read_record(long long *count, char *wota)
|
||||
{
|
||||
if (!count) return wota_skip1(wota);
|
||||
|
||||
uint64_t *p = (uint64_t *)wota;
|
||||
uint64_t first = p[0];
|
||||
int type = (int)(first & 0xffU);
|
||||
if (type != WOTA_REC) {
|
||||
return wota_skip1(wota);
|
||||
}
|
||||
uint64_t c = (first >> 8);
|
||||
*count = (long long)c;
|
||||
return wota + 8;
|
||||
}
|
||||
|
||||
/*
|
||||
BLOB:
|
||||
preamble => top 56 bits = #bits, LSB=0x04
|
||||
then floor((nbits + 63)/64) 64-bit words of data
|
||||
The first bit is the MSB of the first data word.
|
||||
|
||||
Faster approach:
|
||||
- If nbits is a multiple of 8, do fast 8-byte copying (with endianness fix).
|
||||
- If partial bits remain, handle them with old bit-by-bit logic.
|
||||
*/
|
||||
char *wota_read_blob(long long *byte_len, char **blob, char *wota)
|
||||
{
|
||||
if (!byte_len || !blob) {
|
||||
return wota_skip1(wota);
|
||||
}
|
||||
|
||||
uint64_t *p = (uint64_t *)wota;
|
||||
uint64_t first = p[0];
|
||||
int type = (int)(first & 0xffU);
|
||||
if (type != WOTA_BLOB) {
|
||||
return wota_skip1(wota);
|
||||
}
|
||||
|
||||
uint64_t nbits = (first >> 8);
|
||||
long long nwords = (long long)((nbits + 63ULL) >> 6); /* # of 64-bit blocks */
|
||||
|
||||
*byte_len = (long long)((nbits + 7ULL) >> 3);
|
||||
*blob = (char *)malloc((size_t)(*byte_len));
|
||||
if (!(*blob)) {
|
||||
fprintf(stderr, "malloc failed in wota_read_blob\n");
|
||||
abort();
|
||||
}
|
||||
memset(*blob, 0, (size_t)(*byte_len));
|
||||
|
||||
uint64_t *data_words = p + 1;
|
||||
long long bits_remaining = (long long)nbits;
|
||||
size_t byte_i = 0;
|
||||
int bit_in_byte = 0;
|
||||
|
||||
/* If nbits is multiple of 8, we can do a bulk copy in 64-bit chunks, then do leftover if any. */
|
||||
long long full_bytes = (long long)(nbits / 8ULL); /* how many full bytes total */
|
||||
long long leftover_bits = (long long)(nbits % 8ULL);
|
||||
|
||||
/* We'll process 8 bytes at a time from each 64-bit block in big-endian format. */
|
||||
long long full_64_chunks = full_bytes / 8; /* # of full 8-byte chunks we can copy. */
|
||||
long long remainder_bytes = full_bytes % 8; /* leftover bytes after those chunks. */
|
||||
|
||||
size_t chunk_index = 0;
|
||||
|
||||
/* Bulk 64-bit copy for each full 8-byte chunk. */
|
||||
for (long long i = 0; i < full_64_chunks; i++) {
|
||||
uint64_t block = data_words[i];
|
||||
/* If we are on a little-endian system, we must swap to get the "first bit in MSB" ordering. */
|
||||
#if defined(WOTA_LITTLE_ENDIAN)
|
||||
block = wota_bswap64(block);
|
||||
#endif
|
||||
/* Copy the 8 bytes from `block` into the output. */
|
||||
memcpy((*blob) + (i * 8), &block, 8);
|
||||
chunk_index = i + 1;
|
||||
}
|
||||
|
||||
/* Now we handle leftover bytes (0..7) in the next block, if any. */
|
||||
if (remainder_bytes > 0) {
|
||||
uint64_t block = data_words[chunk_index];
|
||||
#if defined(WOTA_LITTLE_ENDIAN)
|
||||
block = wota_bswap64(block);
|
||||
#endif
|
||||
memcpy((*blob) + (chunk_index * 8), &block, (size_t)remainder_bytes);
|
||||
/* The chunk_index used up one block if there's leftover bytes. */
|
||||
chunk_index++;
|
||||
}
|
||||
|
||||
/* If leftover_bits != 0, we still have some partial bits at the end to decode. */
|
||||
if (leftover_bits != 0) {
|
||||
/* We'll handle that partial chunk bit-by-bit (since we only have up to 7 bits). */
|
||||
/* The next block is data_words[chunk_index-1] if remainder_bytes > 0,
|
||||
or data_words[chunk_index] if remainder_bytes == 0, depending on how we count. */
|
||||
long long block_idx;
|
||||
if (remainder_bytes > 0) {
|
||||
/* We partially used data_words[chunk_index-1]. So let's re-read it bit by bit. */
|
||||
block_idx = (long long)(chunk_index - 1);
|
||||
} else {
|
||||
/* We haven't used data_words[chunk_index] yet. */
|
||||
block_idx = (long long)chunk_index;
|
||||
}
|
||||
|
||||
uint64_t partial_block = data_words[block_idx];
|
||||
#if defined(WOTA_LITTLE_ENDIAN)
|
||||
partial_block = wota_bswap64(partial_block);
|
||||
#endif
|
||||
/* We used up remainder_bytes * 8 bits from this partial_block if remainder_bytes>0. */
|
||||
int start_bit = 63 - (int)(remainder_bytes * 8);
|
||||
if (start_bit < 0) start_bit = 63; /* if we used the entire block, clamp it */
|
||||
|
||||
/* Now decode leftover_bits from partial_block, from MSB to LSB. */
|
||||
for (int b = start_bit; b >= 0 && leftover_bits > 0; b--) {
|
||||
int bitval = (int)((partial_block >> b) & 1ULL);
|
||||
(*blob)[full_bytes] |= (char)(bitval << bit_in_byte);
|
||||
bit_in_byte++;
|
||||
leftover_bits--;
|
||||
if (bit_in_byte == 8) {
|
||||
bit_in_byte = 0;
|
||||
full_bytes++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* If the total # of blocks was more than chunk_index, skip them if necessary. */
|
||||
return (char *)(data_words + nwords);
|
||||
}
|
||||
|
||||
/*
|
||||
TEXT:
|
||||
preamble => top 56 bits = #characters, LSB=0x05
|
||||
then floor((nchars+1)/2) 64-bit words
|
||||
each word has 2 UTF-32 codepoints: top 32 bits = codepoint1,
|
||||
low 32 bits = codepoint2
|
||||
*/
|
||||
char *wota_read_text(char **text_utf8, char *wota)
|
||||
{
|
||||
if (!text_utf8) return wota_skip1(wota);
|
||||
|
||||
uint64_t *p = (uint64_t *)wota;
|
||||
uint64_t first = p[0];
|
||||
int type = (int)(first & 0xffU);
|
||||
if (type != WOTA_TEXT) {
|
||||
return wota_skip1(wota);
|
||||
}
|
||||
|
||||
uint64_t nchars = (first >> 8);
|
||||
long long nwords = (long long)((nchars + 1ULL) >> 1);
|
||||
|
||||
uint64_t *data_words = p + 1;
|
||||
/*
|
||||
We'll convert them to a UTF-8 string. Each codepoint can
|
||||
become up to 4 bytes. So we need up to 4*nchars + 1.
|
||||
*/
|
||||
size_t max_utf8 = (size_t)(4 * nchars + 1);
|
||||
char *out = (char *)malloc(max_utf8);
|
||||
if (!out) {
|
||||
fprintf(stderr, "malloc failed in wota_read_text\n");
|
||||
abort();
|
||||
}
|
||||
size_t out_len = 0;
|
||||
|
||||
for (long long i = 0; i < nwords; i++) {
|
||||
uint64_t wval = data_words[i];
|
||||
uint32_t c1 = (uint32_t)(wval >> 32);
|
||||
uint32_t c2 = (uint32_t)(wval & 0xffffffffULL);
|
||||
|
||||
// If we haven't exceeded nchars, convert c1 -> UTF-8
|
||||
if ((i * 2) + 0 < (long long)nchars) {
|
||||
uint32_t c = c1;
|
||||
if (c < 0x80) {
|
||||
out[out_len++] = (char)c;
|
||||
} else if (c < 0x800) {
|
||||
out[out_len++] = (char)(0xC0 | (c >> 6));
|
||||
out[out_len++] = (char)(0x80 | (c & 0x3F));
|
||||
} else if (c < 0x10000) {
|
||||
out[out_len++] = (char)(0xE0 | (c >> 12));
|
||||
out[out_len++] = (char)(0x80 | ((c >> 6) & 0x3F));
|
||||
out[out_len++] = (char)(0x80 | (c & 0x3F));
|
||||
} else {
|
||||
out[out_len++] = (char)(0xF0 | (c >> 18));
|
||||
out[out_len++] = (char)(0x80 | ((c >> 12) & 0x3F));
|
||||
out[out_len++] = (char)(0x80 | ((c >> 6) & 0x3F));
|
||||
out[out_len++] = (char)(0x80 | (c & 0x3F));
|
||||
}
|
||||
}
|
||||
// Similarly for c2:
|
||||
if ((i * 2) + 1 < (long long)nchars) {
|
||||
uint32_t c = c2;
|
||||
if (c < 0x80) {
|
||||
out[out_len++] = (char)c;
|
||||
} else if (c < 0x800) {
|
||||
out[out_len++] = (char)(0xC0 | (c >> 6));
|
||||
out[out_len++] = (char)(0x80 | (c & 0x3F));
|
||||
} else if (c < 0x10000) {
|
||||
out[out_len++] = (char)(0xE0 | (c >> 12));
|
||||
out[out_len++] = (char)(0x80 | ((c >> 6) & 0x3F));
|
||||
out[out_len++] = (char)(0x80 | (c & 0x3F));
|
||||
} else {
|
||||
out[out_len++] = (char)(0xF0 | (c >> 18));
|
||||
out[out_len++] = (char)(0x80 | ((c >> 12) & 0x3F));
|
||||
out[out_len++] = (char)(0x80 | ((c >> 6) & 0x3F));
|
||||
out[out_len++] = (char)(0x80 | (c & 0x3F));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
out[out_len] = '\0';
|
||||
*text_utf8 = out;
|
||||
|
||||
return (char *)(data_words + nwords);
|
||||
}
|
||||
|
||||
/* ================================================================
|
||||
WRITING
|
||||
================================================================ */
|
||||
|
||||
/*
  Helper: does the signed integer x fit in 56 bits?
  Range: -2^55 <= x <= 2^55 - 1
*/
|
||||
/* Returns non-zero when -2^55 <= x < 2^55. */
static int fits_in_56_bits(long long x)
{
    const long long limit = 1LL << 55;
    return -limit <= x && x < limit;
}
|
||||
|
||||
/*
|
||||
Write a WOTA_INT (single 64-bit word):
|
||||
top 56 bits = signed integer (arithmetic shift), LSB=0x00
|
||||
*/
|
||||
/* Appends one WOTA_INT word: `val` occupies the upper 56 bits and the
   type tag the low byte. The shift is done on the unsigned value, so the
   top 8 bits of `val` are simply discarded. */
static void wota_write_int_word(WotaBuffer *wb, long long val)
{
    const uint64_t payload = (uint64_t)((int64_t)val) << 8;
    uint64_t *slot = wota_buffer_alloc(wb, 1);
    *slot = payload | WOTA_INT;
}
|
||||
|
||||
/*
|
||||
Write a WOTA_FLOAT (2 words):
|
||||
first word => type=0x01 in LSB, top 56 bits=0
|
||||
second word => raw IEEE 754 double bits
|
||||
*/
|
||||
/* Appends a WOTA_FLOAT: a header word carrying only the type tag,
   followed by one word holding the raw bit pattern of `val`. */
static void wota_write_float_word(WotaBuffer *wb, double val)
{
    /* Copy the bit pattern of the double rather than converting its value. */
    uint64_t raw;
    memcpy(&raw, &val, sizeof raw);

    uint64_t *slot = wota_buffer_alloc(wb, 2);
    slot[0] = (uint64_t)WOTA_FLOAT; /* upper 56 bits stay zero */
    slot[1] = raw;
}
|
||||
|
||||
/* Appends one WOTA_SYM word: `sym_code` in the upper 56 bits, the type
   tag in the low byte. */
void wota_write_sym(WotaBuffer *wb, int sym_code)
{
    uint64_t *slot = wota_buffer_alloc(wb, 1);
    *slot = ((uint64_t)(sym_code) << 8) | WOTA_SYM;
}
|
||||
|
||||
/*
|
||||
BLOB:
|
||||
preamble word => top 56 bits= nbits, LSB=0x04
|
||||
then floor((nbits + 63)/64) 64-bit words
|
||||
If nbits is multiple of 8, we do a fast copy in 64-bit chunks
|
||||
(with a byte-swap if on little-endian) to place the first bit
|
||||
in the MSB of the first word.
|
||||
If partial bits remain, we handle them bit-by-bit at the end.
|
||||
*/
|
||||
/* Appends a WOTA_BLOB: a preamble word (`nbits` in the upper 56 bits)
   followed by ceil(nbits / 64) zero-padded data words filled from `data`.

   Whole bytes are packed eight to a word, with the first byte in the
   most significant position of the word. Any final 1-7 loose bits are
   read from data[nbits / 8], least significant bit first.

   NOTE(review): on little-endian hosts the loose bits are set before the
   final byte swap, so in the stored word they do not sit directly below
   the whole bytes. wota_read_blob mirrors this exactly, so round trips
   work, but confirm the layout against the format specification. */
void wota_write_blob(WotaBuffer *wb, unsigned long long nbits, const char *data)
{
    uint64_t *header = wota_buffer_alloc(wb, 1);
    header[0] = ((uint64_t)nbits << 8) | (uint64_t)WOTA_BLOB;

    const unsigned long long word_count = (nbits + 63ULL) >> 6;
    if (word_count == 0) {
        return; /* empty blob: the preamble is everything */
    }

    uint64_t *words = wota_buffer_alloc(wb, (size_t)word_count);
    memset(words, 0, (size_t)(word_count * sizeof(uint64_t)));

    const unsigned long long byte_count = nbits / 8ULL;  /* whole payload bytes   */
    const unsigned long long tail_bits = nbits % 8ULL;   /* loose bits after them */
    const unsigned long long whole_words = byte_count / 8ULL;
    const unsigned long long tail_bytes = byte_count % 8ULL;

    /* 1) Words made up entirely of payload bytes. */
    for (unsigned long long w = 0; w < whole_words; w++) {
        uint64_t chunk = 0;
        memcpy(&chunk, data + (w * 8), 8);
#if defined(WOTA_LITTLE_ENDIAN)
        chunk = wota_bswap64(chunk);
#endif
        words[w] = chunk;
    }

    if (tail_bytes == 0 && tail_bits == 0) {
        return;
    }

    /* 2) The final partial word is assembled in host memory order and
          byte-swapped once at the end. */
    uint64_t last = 0;
    if (tail_bytes > 0) {
        memcpy(&last, data + ((size_t)whole_words * 8), (size_t)tail_bytes);
    }

    /* Loose bits start just below the bits reserved for tail_bytes and
       proceed downwards. */
    int bitpos = 63 - (int)(tail_bytes * 8ULL);
    for (unsigned long long b = 0; b < tail_bits; b++, bitpos--) {
        const int bitval = ((unsigned char)data[byte_count] >> b) & 1;
        last |= (uint64_t)bitval << bitpos;
    }

#if defined(WOTA_LITTLE_ENDIAN)
    last = wota_bswap64(last);
#endif
    words[whole_words] = last;
}
|
||||
|
||||
/* Appends a WOTA_TEXT: a preamble word (codepoint count in the upper
   56 bits) followed by ceil(count / 2) words, each packing two 32-bit
   codepoints (first one in the high half, unused half zero).

   `utf8` is a NUL-terminated string; NULL is treated as "". Decoding is
   lenient: a lead byte whose continuation bytes are cut short by the
   terminator, or any byte that is not a valid lead byte, is emitted as a
   single codepoint equal to that byte. Aborts if the temporary
   allocation fails. */
void wota_write_text(WotaBuffer *wb, const char *utf8)
{
    if (utf8 == NULL) {
        utf8 = "";
    }

    /* A string of N bytes decodes to at most N codepoints. */
    const size_t byte_len = strlen(utf8);
    uint32_t *cps = (uint32_t *)malloc(sizeof(uint32_t) * (byte_len + 1));
    if (cps == NULL) {
        fprintf(stderr, "malloc failed in wota_write_text\n");
        abort();
    }

    /* Pass 1: decode UTF-8 into codepoints. */
    size_t n = 0;
    const unsigned char *cur = (const unsigned char *)utf8;
    while (cur[0] != 0) {
        const unsigned char lead = cur[0];
        uint32_t cp;
        size_t used;

        if ((lead & 0x80) == 0) {
            cp = lead;
            used = 1;
        } else if ((lead & 0xe0) == 0xc0 && cur[1] != 0) {
            cp = ((lead & 0x1f) << 6) | (cur[1] & 0x3f);
            used = 2;
        } else if ((lead & 0xf0) == 0xe0 && cur[1] != 0 && cur[2] != 0) {
            cp = ((lead & 0x0f) << 12) | ((cur[1] & 0x3f) << 6) | (cur[2] & 0x3f);
            used = 3;
        } else if ((lead & 0xf8) == 0xf0 && cur[1] != 0 && cur[2] != 0 && cur[3] != 0) {
            cp = ((lead & 0x07) << 18) | ((cur[1] & 0x3f) << 12)
               | ((cur[2] & 0x3f) << 6) | (cur[3] & 0x3f);
            used = 4;
        } else {
            /* invalid or truncated sequence: pass the byte through */
            cp = lead;
            used = 1;
        }

        cps[n++] = cp;
        cur += used;
    }

    /* Pass 2: emit the preamble, then the packed codepoint pairs. */
    uint64_t *header = wota_buffer_alloc(wb, 1);
    header[0] = ((uint64_t)n << 8) | (uint64_t)WOTA_TEXT;

    const size_t word_count = (n + 1) / 2;
    if (word_count > 0) {
        uint64_t *words = wota_buffer_alloc(wb, word_count);
        for (size_t w = 0; w < word_count; w++) {
            const size_t first = 2 * w;
            const uint64_t high = cps[first];
            const uint64_t low = (first + 1 < n) ? cps[first + 1] : 0;
            words[w] = (high << 32) | low;
        }
    }

    free(cps);
}
|
||||
|
||||
/* Appends a WOTA_ARR header word: `count` in the upper 56 bits, the type
   tag in the low byte. Only the header is written. */
void wota_write_array(WotaBuffer *wb, unsigned long long count)
{
    uint64_t *slot = wota_buffer_alloc(wb, 1);
    *slot = ((uint64_t)count << 8) | (uint64_t)WOTA_ARR;
}
|
||||
|
||||
/* Appends a WOTA_REC header word: `count` in the upper 56 bits, the type
   tag in the low byte. Only the header is written. */
void wota_write_record(WotaBuffer *wb, unsigned long long count)
{
    uint64_t *slot = wota_buffer_alloc(wb, 1);
    *slot = ((uint64_t)count << 8) | (uint64_t)WOTA_REC;
}
|
||||
|
||||
/*
|
||||
wota_write_number:
|
||||
If n is an integer (within 2^53 range) you might store as int,
|
||||
or specifically check if it fits in 56 bits. If it does, store
|
||||
as WOTA_INT. Otherwise store as WOTA_FLOAT (raw double).
|
||||
*/
|
||||
/* Appends `n` in the most compact form that represents it exactly:
   - no fractional part and within -2^55 .. 2^55 - 1: one WOTA_INT word;
   - anything else (fractions, large magnitudes, infinities, NaN):
     a two-word WOTA_FLOAT holding the raw double. */
void wota_write_number(WotaBuffer *wb, double n)
{
    /* 2^55, exactly representable as a double. */
    const double limit = 36028797018963968.0;

    double ip;
    const double frac = modf(n, &ip);

    /* The range test must come before the cast: converting a double
       whose value does not fit in long long (including +/-infinity) is
       undefined behaviour. NaN fails `frac == 0.0` and never gets here. */
    if (frac == 0.0 && ip >= -limit && ip < limit) {
        const long long i = (long long)ip;
        if ((double)i == ip && fits_in_56_bits(i)) {
            wota_write_int_word(wb, i);
            return;
        }
    }

    /* fallback: store as double */
    wota_write_float_word(wb, n);
}
|
||||
|
||||
#endif /* WOTA_IMPLEMENTATION */
|
||||
|
||||
#endif /* WOTA_H */
|
||||
Reference in New Issue
Block a user