#ifndef WOTA_H
#define WOTA_H

#include <stddef.h>
#include <stdint.h>

/* ----------------------------------------------------------------
   Wota Type Codes (LSB of a 64-bit word)
   ---------------------------------------------------------------- */
#define WOTA_INT   0x00
#define WOTA_FLOAT 0x01
#define WOTA_ARR   0x02
#define WOTA_REC   0x03
#define WOTA_BLOB  0x04
#define WOTA_TEXT  0x05
#define WOTA_SYM   0x07

/* ----------------------------------------------------------------
   Wota Symbol Codes (stored in top 56 bits)
   e.g. word = ((uint64_t)sym_code << 8) | WOTA_SYM
   ---------------------------------------------------------------- */
#define WOTA_NULL    0x00
#define WOTA_FALSE   0x02
#define WOTA_TRUE    0x03
#define WOTA_PRIVATE 0x08
#define WOTA_SYSTEM  0x09

/*
   We store all data in 64-bit words. The least significant byte
   is the type code. The top 56 bits are used differently depending
   on type.

   This version (non-standard) stores floating-point values
   as *raw 64-bit IEEE 754 doubles* in a second 64-bit word.
*/

/* ----------------------------------------------------------------
   wota_type: extract the Wota type code, which occupies the least
   significant byte of the 64-bit word that `w` points at.
   ---------------------------------------------------------------- */
static inline int wota_type(const uint64_t *w) {
    const uint64_t low_byte = w[0] & UINT64_C(0xff);
    return (int)low_byte;
}

/* ----------------------------------------------------------------
   Reading function prototypes. Each consumes some number of 64-bit
   words and returns a pointer to the next Wota data. If you pass
   a NULL for the output pointer, the function will skip the data.
   ---------------------------------------------------------------- */
/*
   Common behaviour of the readers below:
     - `wota` points at the first 64-bit word of the item to read.
     - If that word does not carry the expected type code, the outputs are
       left untouched and the returned pointer is `wota` plus one word.
     - wota_read_blob, wota_read_text and wota_read_text_len hand back
       buffers obtained from malloc(); the caller releases them with free().
       Text buffers are NUL-terminated.
*/
char *wota_read_blob   (long long *byte_len, char **blob,    char *wota);
char *wota_read_text   (char **text_utf8,                   char *wota);
char *wota_read_text_len(size_t *byte_len, char **text_utf8, char *wota);
char *wota_read_array  (long long *count,                   char *wota);
char *wota_read_record (long long *count,                   char *wota);
char *wota_read_float  (double *d,                          char *wota);
char *wota_read_int    (long long *n,                       char *wota);
char *wota_read_sym    (int *sym_code,                      char *wota);

/* ----------------------------------------------------------------
   WotaBuffer: growable array of 64-bit words used to build a Wota
   message in memory. Set it up with wota_buffer_init(), append with
   the wota_write_* functions, and release it with wota_buffer_free().
   All three fields count 64-bit words, not bytes.
   ---------------------------------------------------------------- */
typedef struct WotaBuffer {
    uint64_t *data;     /* heap storage for the words; NULL while nothing is allocated */
    size_t    size;     /* number of words written so far */
    size_t    capacity; /* number of words `data` has room for */
} WotaBuffer;

/* Return a malloc()'d copy of the words currently used in `wb`
   (wb->size 64-bit words). The caller owns the copy and frees it. */
void *wota_dup_data(struct WotaBuffer *wb);

/* Buffer management */
void wota_buffer_init(WotaBuffer *wb, size_t initial_capacity_in_words);
void wota_buffer_free(WotaBuffer *wb);

/* Writing function prototypes */
void wota_write_blob   (WotaBuffer *wb, unsigned long long nbits, const char *data);
void wota_write_text   (WotaBuffer *wb, const char *utf8);
void wota_write_text_len(WotaBuffer *wb, const char *utf8, size_t len);
void wota_write_array  (WotaBuffer *wb, unsigned long long count);
void wota_write_record (WotaBuffer *wb, unsigned long long count);
/* We'll store numbers as either 56-bit integers or raw double */
void wota_write_number (WotaBuffer *wb, double n);
/* Symbol codes (WOTA_NULL, WOTA_FALSE, etc.) */
void wota_write_sym    (WotaBuffer *wb, int sym_code);
void wota_write_int_word(WotaBuffer *wb, long long val);
void wota_write_float_word(WotaBuffer *wb, double val);


#ifdef WOTA_IMPLEMENTATION

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <limits.h>

/* ================================================================
   Detect endianness. We'll use this to do 64-bit byte-swaps if needed.
   If you know you only run on little-endian, you can hard-code that.
   ================================================================ */
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
 #define WOTA_BIG_ENDIAN 1
#elif defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
 #define WOTA_LITTLE_ENDIAN 1
#elif defined(_MSC_VER)
/* MSVC on x86/x64 is little-endian */
 #define WOTA_LITTLE_ENDIAN 1
#else
/* Fallback: assume little-endian if unknown. Adjust if your platform is otherwise. */
 #define WOTA_LITTLE_ENDIAN 1
#endif

/* Reverse the byte order of a 64-bit value (used for blob bit ordering). */
static inline uint64_t wota_bswap64(uint64_t x)
{
#if defined(__GNUC__) || defined(__clang__)
    /* GCC and Clang provide a dedicated builtin. */
    return __builtin_bswap64(x);
#else
    /* Portable fallback: peel bytes off the low end of x and push them
       onto the low end of the result, which reverses their order. */
    uint64_t swapped = 0;
    for (int i = 0; i < 8; i++) {
        swapped = (swapped << 8) | (x & 0xffU);
        x >>= 8;
    }
    return swapped;
#endif
}

/*
   Return a heap-allocated copy of the words currently used in 'wb'
   (wb->size * sizeof(uint64_t) bytes). The caller owns the returned
   pointer and releases it with free().

   Allocation failure aborts, like the other allocation sites in this file.
   An empty buffer still yields a valid, free()-able pointer: one byte is
   requested so that a NULL result always means out-of-memory, and memcpy
   is never called with a NULL source.
*/
void *wota_dup_data(WotaBuffer *wb)
{
    size_t nbytes = wb->size * sizeof(uint64_t);
    void *copy = malloc(nbytes > 0 ? nbytes : 1);
    if (!copy) {
        fprintf(stderr, "malloc failed in wota_dup_data\n");
        abort();
    }
    if (nbytes > 0) {
        memcpy(copy, wb->data, nbytes);
    }
    return copy;
}

/* ================================================================
   Helper: make sure the buffer has room for 'min_add' more 64-bit
   words beyond wb->size, reallocating if necessary.

   Capacity starts at 8 words and doubles until the request fits.
   Every size computation is checked against the largest word count
   whose byte size fits in size_t, so neither the doubling loop nor
   the realloc size can wrap around. Overflow and allocation failure
   both abort.
   ================================================================ */
static void wota_buffer_grow(WotaBuffer *wb, size_t min_add)
{
    const size_t max_words = SIZE_MAX / sizeof(uint64_t);

    if (wb->size > max_words || min_add > max_words - wb->size) {
        fprintf(stderr, "size overflow in wota_buffer_grow\n");
        abort();
    }

    size_t needed = wb->size + min_add;
    if (needed <= wb->capacity) return;

    /* needed > capacity here, so a non-empty buffer doubles at least once. */
    size_t new_cap = (wb->capacity == 0 ? 8 : wb->capacity);
    while (new_cap < needed) {
        /* Clamp instead of wrapping; needed <= max_words ends the loop. */
        new_cap = (new_cap > max_words / 2) ? max_words : new_cap * 2;
    }

    /* Keep the old pointer until realloc has succeeded. */
    uint64_t *new_data = (uint64_t *)realloc(wb->data, new_cap * sizeof(uint64_t));
    if (!new_data) {
        fprintf(stderr, "realloc failed in wota_buffer_grow\n");
        abort();
    }
    wb->data     = new_data;
    wb->capacity = new_cap;
}

/*
   Initialise 'wb' as an empty buffer, optionally pre-allocating room for
   'initial_capacity_in_words' 64-bit words. A capacity of 0 allocates
   nothing; storage is then obtained on the first write.

   Aborts if the allocation fails or if the requested capacity is so large
   that its byte size does not fit in size_t (which would otherwise yield
   a buffer smaller than the recorded capacity).
*/
void wota_buffer_init(WotaBuffer *wb, size_t initial_capacity_in_words)
{
    wb->data     = NULL;
    wb->size     = 0;
    wb->capacity = 0;

    if (initial_capacity_in_words == 0) {
        return;
    }

    uint64_t *words = NULL;
    if (initial_capacity_in_words <= SIZE_MAX / sizeof(uint64_t)) {
        words = (uint64_t *)malloc(initial_capacity_in_words * sizeof(uint64_t));
    }
    if (!words) {
        fprintf(stderr, "malloc failed in wota_buffer_init\n");
        abort();
    }
    wb->data     = words;
    wb->capacity = initial_capacity_in_words;
}

/* Release the buffer's storage and reset 'wb' to the empty state. */
void wota_buffer_free(WotaBuffer *wb)
{
    free(wb->data); /* free(NULL) is a no-op, so no guard is needed */
    wb->data     = NULL;
    wb->capacity = 0;
    wb->size     = 0;
}

/* Reserve 'count' fresh 64-bit words at the end of the buffer and return
   a pointer to the first of them. The pointer is taken after growing,
   because growing may move wb->data. */
static uint64_t *wota_buffer_alloc(WotaBuffer *wb, size_t count)
{
    const size_t first_new = wb->size;

    wota_buffer_grow(wb, count);
    wb->size = first_new + count;
    return wb->data + first_new;
}

/* ================================================================
   READING
   ================================================================ */

/* Step over exactly one 64-bit word without interpreting it. */
static inline char *wota_skip1(char *wota)
{
    return &wota[sizeof(uint64_t)];
}

/*
   Read a WOTA_INT: a single word whose low byte is the type code and whose
   upper 56 bits hold a signed integer.

   If 'n' is NULL, or the word is not a WOTA_INT, nothing is stored.
   Every path returns the address one word past 'wota'.
*/
char *wota_read_int(long long *n, char *wota)
{
    char *next = wota_skip1(wota);

    if (n != NULL) {
        uint64_t word = *(uint64_t *)wota;
        if ((int)(word & 0xffU) == WOTA_INT) {
            /* Reinterpret as signed, then shift the type byte out; the
               signed right shift carries the sign down. */
            int64_t value = (int64_t)word;
            value >>= 8;
            *n = value;
        }
    }
    return next;
}

/*
   Read a WOTA_FLOAT, which occupies two words:
     - word 0: type code in the low byte (upper 56 bits unused)
     - word 1: the raw IEEE 754 bit pattern of the double

   With a NULL 'out' the function steps over two words without looking at
   the type code. With a non-NULL 'out' and a non-float word, it stores
   nothing and steps over a single word.
*/
char *wota_read_float(double *out, char *wota)
{
    if (out == NULL) {
        return wota + 2 * sizeof(uint64_t);
    }

    uint64_t *words = (uint64_t *)wota;
    uint64_t header = words[0];
    if ((int)(header & 0xffU) != WOTA_FLOAT) {
        return wota + sizeof(uint64_t);
    }

    /* Reinterpret the payload bits as a double through a union. */
    union {
        uint64_t u;
        double   d;
    } pun;
    pun.u = words[1];
    *out = pun.d;

    return (char *)&words[2];
}

/*
   Read a WOTA_SYM: one word with the symbol code in the upper 56 bits.
   Nothing is stored when 'sym_code' is NULL or the word is not a symbol.
   Always returns the address one word past 'wota'.
*/
char *wota_read_sym(int *sym_code, char *wota)
{
    char *next = wota_skip1(wota);

    if (sym_code) {
        uint64_t word = ((uint64_t *)wota)[0];
        if ((int)(word & 0xffU) == WOTA_SYM) {
            uint64_t code = word >> 8;
            *sym_code = (int)code;
        }
    }
    return next;
}

/*
   Read a WOTA_ARR header: one word with the element count in the upper
   56 bits. Only the header is consumed; the elements follow it.
   Nothing is stored when 'count' is NULL or the word is not an array
   header. Always returns the address one word past 'wota'.
*/
char *wota_read_array(long long *count, char *wota)
{
    if (count != NULL) {
        const uint64_t header = *(uint64_t *)wota;
        const int code = (int)(header & 0xffU);
        if (code == WOTA_ARR) {
            *count = (long long)(header >> 8);
        }
    }
    return wota_skip1(wota);
}

/*
   Read a WOTA_REC header: one word with the count in the upper 56 bits.
   Only the header is consumed. Nothing is stored when 'count' is NULL or
   the word is not a record header. Always returns the address one word
   past 'wota'.
*/
char *wota_read_record(long long *count, char *wota)
{
    char *after_header = wota_skip1(wota);

    if (!count) {
        return after_header;
    }

    uint64_t header = ((uint64_t *)wota)[0];
    if ((int)(header & 0xffU) == WOTA_REC) {
        uint64_t fields = header >> 8;
        *count = (long long)fields;
    }
    return after_header;
}

/*
  BLOB:
    preamble => top 56 bits = #bits, LSB=0x04
    then floor((nbits + 63)/64) 64-bit words of data

  wota_read_blob decodes the blob at 'wota' into a freshly malloc'd buffer.

    byte_len  receives the number of bytes in *blob, i.e. ceil(nbits / 8).
    blob      receives the buffer; the caller frees it with free(). It is
              never NULL on success, even for an empty blob.
    returns   the address just past the blob (preamble + data words).

  If the word at 'wota' is not a WOTA_BLOB, nothing is stored and one word
  is skipped. If 'byte_len' or 'blob' is NULL, nothing is stored and the
  whole blob, payload included, is skipped.

  Decoding:
    - whole 8-byte groups are copied one data word at a time, byte-swapped
      on little-endian hosts;
    - the remaining whole bytes (0..7) come from the front of the next
      data word;
    - the remaining bits (0..6 used out of a final partial byte) are taken
      from that same word, highest position first, and stored into the
      last output byte starting at its least significant bit.

  NOTE(review): on little-endian hosts the trailing bits are located
  relative to the byte-swapped word, mirroring wota_write_blob. Round-trips
  on one host are consistent, but the position of those bits inside the
  stored word appears to differ between little- and big-endian hosts.
  Confirm before exchanging non-byte-aligned blobs across architectures.
*/
char *wota_read_blob(long long *byte_len, char **blob, char *wota)
{
    uint64_t *p = (uint64_t *)wota;
    uint64_t first = p[0];
    if ((int)(first & 0xffU) != WOTA_BLOB) {
        return wota_skip1(wota);
    }

    uint64_t nbits = (first >> 8);
    long long nwords = (long long)((nbits + 63ULL) >> 6); /* # of 64-bit blocks */
    uint64_t *data_words = p + 1;
    char *next = (char *)(data_words + nwords);

    /* Skipping: step over the payload too, not just the preamble. */
    if (!byte_len || !blob) {
        return next;
    }

    size_t nbytes = (size_t)((nbits + 7ULL) >> 3);

    /* Ask for at least one byte so an empty blob cannot be mistaken for an
       allocation failure on platforms where a zero-size request yields NULL.
       calloc also provides the zero fill the bit loop below relies on. */
    char *out = (char *)calloc(nbytes > 0 ? nbytes : 1, 1);
    if (!out) {
        fprintf(stderr, "malloc failed in wota_read_blob\n");
        abort();
    }

    long long full_bytes      = (long long)(nbits / 8ULL); /* whole bytes in the blob */
    long long leftover_bits   = (long long)(nbits % 8ULL); /* bits in the final partial byte */
    long long full_64_chunks  = full_bytes / 8;            /* data words that are entirely payload */
    long long remainder_bytes = full_bytes % 8;            /* whole bytes in the last, partial word */

    /* 1) Whole 8-byte groups. */
    for (long long i = 0; i < full_64_chunks; i++) {
        uint64_t block = data_words[i];
#if defined(WOTA_LITTLE_ENDIAN)
        block = wota_bswap64(block);
#endif
        memcpy(out + (i * 8), &block, 8);
    }

    /* 2) The final partial word holds both the leftover whole bytes and
          the leftover bits, so read it once. It is only touched when
          something is actually stored in it. */
    if (remainder_bytes > 0 || leftover_bits > 0) {
        uint64_t tail = data_words[full_64_chunks];
#if defined(WOTA_LITTLE_ENDIAN)
        tail = wota_bswap64(tail);
#endif
        memcpy(out + (full_64_chunks * 8), &tail, (size_t)remainder_bytes);

        if (leftover_bits > 0) {
            /* Bits start right after the positions accounted to the
               leftover bytes and run downwards. */
            int top = 63 - (int)(remainder_bytes * 8);
            unsigned char last = 0;
            for (int k = 0; k < (int)leftover_bits; k++) {
                unsigned bit = (unsigned)((tail >> (top - k)) & 1ULL);
                last |= (unsigned char)(bit << k);
            }
            out[full_bytes] = (char)last;
        }
    }

    *byte_len = (long long)nbytes;
    *blob = out;
    return next;
}

/*
  TEXT:
    preamble => top 56 bits = #bytes in UTF-8, LSB=0x05
    then floor((nbytes + 7)/8) 64-bit words
       containing the UTF-8 bytes, packed 8 bytes per word,
       first byte in the most significant byte of each word

  wota_read_text_len decodes the text at 'wota'.

    byte_len   optional; receives the byte count from the preamble
               (excluding the terminator) whenever the item is a text.
    text_utf8  receives a malloc'd, NUL-terminated copy of the bytes; the
               caller frees it. If NULL, no copy is made and the whole
               text item, payload included, is skipped.
    returns    the address just past the text (preamble + data words), or
               'wota' plus one word if the item is not a WOTA_TEXT, in
               which case nothing is stored.
*/
char *wota_read_text_len(size_t *byte_len, char **text_utf8, char *wota)
{
    uint64_t *p = (uint64_t *)wota;
    uint64_t first = p[0];
    if ((int)(first & 0xffU) != WOTA_TEXT) {
        return wota_skip1(wota);
    }

    uint64_t nbytes = (first >> 8);
    long long nwords = (long long)((nbytes + 7ULL) >> 3);
    uint64_t *data_words = p + 1;
    char *next = (char *)(data_words + nwords);

    if (byte_len) {
        *byte_len = (size_t)nbytes;
    }

    /* Skipping: step over the payload too, not just the preamble. */
    if (!text_utf8) {
        return next;
    }

    char *out = (char *)malloc((size_t)(nbytes + 1));
    if (!out) {
        fprintf(stderr, "malloc failed in wota_read_text_len\n");
        abort();
    }

    /* Byte k lives in word k/8, at byte position (7 - k%8) counted from
       the least significant byte. Shifts make this endian-independent. */
    for (size_t k = 0; k < (size_t)nbytes; k++) {
        uint64_t wval = data_words[k / 8];
        out[k] = (char)((wval >> (56 - 8 * (int)(k % 8))) & 0xff);
    }
    out[nbytes] = '\0';

    *text_utf8 = out;
    return next;
}

/* Convenience wrapper around wota_read_text_len for callers that do not
   need the byte length. */
char *wota_read_text(char **text_utf8, char *wota)
{
    size_t *no_length = NULL;
    return wota_read_text_len(no_length, text_utf8, wota);
}

/* ================================================================
   WRITING
   ================================================================ */

/*
   Return 1 if the integer x can be stored in a signed 56-bit field,
   i.e. -2^55 <= x <= 2^55 - 1, and 0 otherwise.
*/
static int fits_in_56_bits(long long x)
{
    const long long limit = 1LL << 55;

    if (x < -limit) {
        return 0;
    }
    if (x >= limit) {
        return 0;
    }
    return 1;
}

/*
   Append a WOTA_INT (one 64-bit word): the value goes into the upper
   56 bits and the low byte carries the type code 0x00. The shift is done
   on the unsigned representation; bits of 'val' above the 56-bit field
   are shifted out.
*/
void wota_write_int_word(WotaBuffer *wb, long long val)
{
    uint64_t *slot = wota_buffer_alloc(wb, 1);
    uint64_t shifted = (uint64_t)((int64_t)val) << 8;

    *slot = shifted | WOTA_INT;
}

/*
   Append a WOTA_FLOAT (two 64-bit words):
     word 0 => type code 0x01 in the low byte, upper 56 bits zero
     word 1 => the raw IEEE 754 bit pattern of 'val'
*/
void wota_write_float_word(WotaBuffer *wb, double val)
{
    /* Reinterpret the double's bits as an integer through a union. */
    union {
        double   d;
        uint64_t u;
    } pun;
    pun.d = val;

    uint64_t *pair = wota_buffer_alloc(wb, 2);
    pair[0] = (uint64_t)WOTA_FLOAT;
    pair[1] = pun.u;
}

/* Append a WOTA_SYM (one word): symbol code in the upper 56 bits,
   type code 0x07 in the low byte. */
void wota_write_sym(WotaBuffer *wb, int sym_code)
{
    uint64_t *slot = wota_buffer_alloc(wb, 1);
    uint64_t code_field = (uint64_t)(sym_code) << 8;

    slot[0] = code_field | WOTA_SYM;
}

/*
  BLOB:
    preamble word => top 56 bits= nbits, LSB=0x04
    then floor((nbits + 63)/64) 64-bit words

  wota_write_blob appends a blob of 'nbits' bits taken from 'data'.

    nbits  length in bits. It is shifted left by 8 into the preamble, so
           any bits above the low 56 are lost.
    data   source bytes; ceil(nbits / 8) bytes are read. When nbits is not
           a multiple of 8, only the low (nbits % 8) bits of the final
           byte are used, least significant bit first.

  Encoding proceeds in three steps: whole 8-byte groups (byte-swapped on
  little-endian hosts so the first byte lands in the most significant byte
  of the word), then the remaining whole bytes, then the remaining bits.

  NOTE(review): on little-endian hosts step 3 places the trailing bits in
  the un-swapped form of the word and swaps afterwards, which
  wota_read_blob mirrors. Round-trips on one host are consistent, but the
  position of those bits inside the stored word appears to differ between
  little- and big-endian hosts. Confirm before exchanging non-byte-aligned
  blobs across architectures.
*/
void wota_write_blob(WotaBuffer *wb, unsigned long long nbits, const char *data)
{
    /* Preamble: bit count in the upper 56 bits, type code in the low byte. */
    uint64_t preamble = ((uint64_t)nbits << 8) | (uint64_t)WOTA_BLOB;
    uint64_t *p = wota_buffer_alloc(wb, 1);
    p[0] = preamble;

    unsigned long long nwords = (nbits + 63ULL) >> 6; /* number of 64-bit data words */
    if (nwords == 0) {
        return; /* empty blob: the preamble is the whole item */
    }
    /* Reserve and zero the data words; step 3 ORs bits into them. */
    uint64_t *blocks = wota_buffer_alloc(wb, (size_t)nwords);
    memset(blocks, 0, (size_t)(nwords * sizeof(uint64_t)));

    /* Split the bit count into whole bytes and a sub-byte remainder. */
    unsigned long long full_bytes = (nbits / 8ULL);   /* whole bytes in the blob */
    unsigned long long leftover_bits = (nbits % 8ULL);/* bits in the final partial byte (0..7) */

    size_t block_index = 0;
    unsigned long long num_full_64_chunks = full_bytes / 8ULL; /* data words filled entirely by whole bytes */
    unsigned long long remainder_bytes = full_bytes % 8ULL;

    /* 1) Whole 8-byte groups, one data word each. */
    for (unsigned long long i = 0; i < num_full_64_chunks; i++) {
        /* Load 8 source bytes in memory order. */
        uint64_t tmp = 0;
        memcpy(&tmp, data + (i * 8), 8);
        /* On little-endian hosts, swap so the first source byte becomes the most significant byte. */
#if defined(WOTA_LITTLE_ENDIAN)
        tmp = wota_bswap64(tmp);
#endif
        blocks[i] = tmp;
        block_index = (size_t)(i + 1);
    }

    /* 2) Remaining whole bytes (1..7) go at the front of the next word; the rest stays zero. */
    if (remainder_bytes > 0) {
        uint64_t tmp = 0;
        memcpy(&tmp, data + (block_index * 8), (size_t)remainder_bytes);
        /* Same byte-order adjustment as in step 1. */
#if defined(WOTA_LITTLE_ENDIAN)
        tmp = wota_bswap64(tmp);
#endif
        blocks[block_index] = tmp;
        block_index++;
    }

    /* 3) Remaining bits (1..7) of the final source byte. */
    if (leftover_bits != 0) {
        /* They are read from data[full_bytes]. */
        /* They share a word with the step-2 bytes if there were any; */
        /* otherwise they start a word of their own. */
        size_t partial_idx;
        if (remainder_bytes > 0) {
            partial_idx = block_index - 1;
        } else {
            partial_idx = block_index;
        }

        uint64_t outword = blocks[partial_idx];
#if defined(WOTA_LITTLE_ENDIAN)
        /* Undo the step-2 swap, set the bits, then swap again below. */
        /* The bit positions used in the loop therefore refer to the un-swapped word. */
        outword = wota_bswap64(outword);
#endif
        unsigned long long bits_used = remainder_bytes * 8ULL; /* bit positions accounted to the step-2 bytes */
        int bitpos = 63 - (int)bits_used;  /* first free position, counting down from bit 63 */

        /* Source bits are consumed least significant first and written to descending positions. */
        for (unsigned long long b = 0; b < leftover_bits; b++) {
            int bitval = ( (unsigned char)data[full_bytes] >> b ) & 1;
            outword |= ((uint64_t)bitval << (bitpos));
            bitpos--;
        }

#if defined(WOTA_LITTLE_ENDIAN)
        outword = wota_bswap64(outword);
#endif
        blocks[partial_idx] = outword;
    }
}

/*
   Append a WOTA_TEXT item holding 'nbytes' bytes of UTF-8 from 'utf8'.

     preamble => top 56 bits = nbytes, LSB=0x05
     payload  => (nbytes + 7) / 8 words, 8 bytes per word, first byte in the
                 most significant byte; unused trailing bytes are zero.

   The bytes are copied as-is, so embedded NULs are preserved. A NULL
   'utf8' is written as an empty text whatever 'nbytes' says, so a missing
   string is never read out of bounds.
*/
void wota_write_text_len(WotaBuffer *wb, const char *utf8, size_t nbytes)
{
    if (!utf8) {
        utf8 = "";
        nbytes = 0; /* there are no bytes to read behind a NULL pointer */
    }

    /* preamble => top 56 bits = nbytes, LSB=0x05 */
    uint64_t *pw = wota_buffer_alloc(wb, 1);
    pw[0] = ((uint64_t)nbytes << 8) | (uint64_t)WOTA_TEXT;

    size_t nwords = (nbytes + 7) / 8;
    if (nwords == 0) {
        return;
    }

    /* Zero first so padding bytes in the last word are well defined. */
    uint64_t *blocks = wota_buffer_alloc(wb, nwords);
    memset(blocks, 0, nwords * sizeof(uint64_t));

    /* Byte k goes into word k/8, shifted so that byte 0 of each group of
       eight lands in the most significant byte. */
    for (size_t k = 0; k < nbytes; k++) {
        blocks[k / 8] |= ((uint64_t)(unsigned char)utf8[k]) << (56 - 8 * (int)(k % 8));
    }
}

/* Append a NUL-terminated UTF-8 string as a WOTA_TEXT item.
   A NULL pointer is written as the empty string. */
void wota_write_text(WotaBuffer *wb, const char *utf8)
{
    const char *text = utf8 ? utf8 : "";
    size_t length = strlen(text);

    wota_write_text_len(wb, text, length);
}

/* Append a WOTA_ARR header (one word): count in the upper 56 bits,
   type code 0x02 in the low byte. Only the header is written. */
void wota_write_array(WotaBuffer *wb, unsigned long long count)
{
    uint64_t *slot = wota_buffer_alloc(wb, 1);
    uint64_t count_field = (uint64_t)count << 8;

    *slot = count_field | (uint64_t)WOTA_ARR;
}

/* Append a WOTA_REC header (one word): count in the upper 56 bits,
   type code 0x03 in the low byte. Only the header is written. */
void wota_write_record(WotaBuffer *wb, unsigned long long count)
{
    const uint64_t header = (uint64_t)WOTA_REC | ((uint64_t)count << 8);
    uint64_t *slot = wota_buffer_alloc(wb, 1);

    slot[0] = header;
}

/*
   wota_write_number:
     Append 'n' in the most compact form available. If n has no
     fractional part and fits in a signed 56-bit field, it is written as a
     WOTA_INT; anything else (fractions, large magnitudes, infinities,
     NaN) is written as a WOTA_FLOAT holding the raw double.

     The value is range-checked as a double before it is converted to an
     integer, because converting an out-of-range or infinite double to
     long long is undefined behaviour.
*/
void wota_write_number(WotaBuffer *wb, double n)
{
    /* 2^63 is exactly representable as a double; integral doubles in
       [-2^63, 2^63) convert to long long without overflow. */
    const double two_pow_63 = 9223372036854775808.0;

    double ip;
    double frac = modf(n, &ip);

    /* NaN fails frac == 0.0; infinities fail the range test. */
    if (frac == 0.0 && ip >= -two_pow_63 && ip < two_pow_63) {
        long long i = (long long)ip;
        if (fits_in_56_bits(i)) {
            /* store as a 56-bit integer */
            wota_write_int_word(wb, i);
            return;
        }
    }
    /* fallback: store as double */
    wota_write_float_word(wb, n);
}

#endif /* WOTA_IMPLEMENTATION */

#endif /* WOTA_H */