diff --git a/meson.build b/meson.build
index 5f0226ab..9d3a1b30 100644
--- a/meson.build
+++ b/meson.build
@@ -1,13 +1,14 @@
-project('prosperon', ['c', 'cpp'], default_options : [ 'cpp_std=c++11'])
+project('prosperon', ['c', 'cpp'],
+  version: '0.9.2',
+  meson_version: '>=1.4',
+  default_options : [ 'cpp_std=c++11'])
 
 libtype = get_option('default_library') 
 
 link = []
 src = []
 
-if not get_option('editor')
-#  add_project_arguments('-DNEDITOR', language:'c')
-endif
+add_project_arguments('-pedantic', language: ['c'])
 
 git_tag_cmd = run_command('git', 'describe', '--tags', '--abbrev=0', check: false)
 prosperon_version = 'unknown'
@@ -46,12 +47,13 @@ endif
 
 cmake = import('cmake')
 
-sdl3_cmake_options = [
-  '-DSDL_STATIC=ON',
-  '-DSDL_SHARED=OFF',
-  '-DSDL_TEST=OFF',
-  '-DCMAKE_BUILD_TYPE=Release',
-]
+sdl3_opts = cmake.subproject_options()
+sdl3_opts.add_cmake_defines({
+  'SDL_STATIC': 'ON',
+  'SDL_SHARED': 'OFF',
+  'SDL_TEST': 'OFF',
+  'CMAKE_BUILD_TYPE': 'Release'
+})
 
 cc = meson.get_compiler('c')
 
@@ -75,7 +77,7 @@ if host_machine.system() == 'windows'
   deps += cc.find_library('imm32')
   deps += cc.find_library('version')
   deps += cc.find_library('cfgmgr32')
-  sdl3_cmake_options += '-DHAVE_ISINF=1' # TODO: A hack to get this to compile on MSYS2; otherwise it doesn't link correctly
+  sdl3_opts.add_cmake_defines({'HAVE_ISINF': '1'}) # TODO: A hack to get this to compile on MSYS2; otherwise it doesn't link correctly
   link += '-static'
 endif
 
@@ -83,7 +85,7 @@ if host_machine.system() == 'emscripten'
   link += '-sUSE_WEBGPU'
 endif
 
-sdl3_proj = cmake.subproject('sdl3', cmake_options: sdl3_cmake_options)
+sdl3_proj = cmake.subproject('sdl3', options: sdl3_opts)
 
 deps += sdl3_proj.dependency('SDL3-static')
 
@@ -125,7 +127,7 @@ if get_option('enet')
 endif
 
 sources = []
-src += ['anim.c', 'config.c', 'datastream.c','font.c','gameobject.c','HandmadeMath.c','jsffi.c','model.c','render.c','script.c','simplex.c','spline.c', 'timer.c', 'transform.c','prosperon.c',  'wildmatch.c', 'sprite.c', 'quadtree.c', 'aabb.c', 'rtree.c']
+src += ['anim.c', 'config.c', 'datastream.c','font.c','HandmadeMath.c','jsffi.c','model.c','render.c','script.c','simplex.c','spline.c', 'timer.c', 'transform.c','prosperon.c',  'wildmatch.c', 'sprite.c', 'rtree.c']
 
 imsrc = ['GraphEditor.cpp','ImCurveEdit.cpp','ImGradient.cpp','imgui_draw.cpp','imgui_tables.cpp','imgui_widgets.cpp','imgui.cpp','ImGuizmo.cpp','imnodes.cpp','implot_items.cpp','implot.cpp', 'imgui_impl_sdlrenderer3.cpp', 'imgui_impl_sdl3.cpp', 'imgui_impl_sdlgpu3.cpp']
 
@@ -167,7 +169,7 @@ core = custom_target('core.zip',
     ' && echo "Rebuilding core.zip" && rm -f ' + meson.current_build_dir() + '/core.zip && ' +
     'zip -r ' + meson.current_build_dir() + '/core.zip scripts fonts icons shaders'
   ],
-  build_always: true,
+  build_always_stale: true,
   build_by_default: true
 )
 
@@ -196,7 +198,7 @@ prosperon = custom_target('prosperon',
     '@INPUT1@',
     '@OUTPUT@'
   ],
-  build_always: true,
+  build_always_stale: true,
   build_by_default: true
 )
 
@@ -209,10 +211,10 @@ copy_tests = custom_target(
   output: 'tests',
   command: [
     'cp', '-rf',
-    join_paths(meson.source_root(), 'tests'),
-    meson.build_root()
+    join_paths(meson.project_source_root(), 'tests'),
+    meson.project_build_root()
   ],
-  build_always: true,
+  build_always_stale: true,
   build_by_default: true
 )
 
diff --git a/scripts/modules/imgui.js b/scripts/modules/imgui.js
index 6431fc7c..3a2318a3 100644
--- a/scripts/modules/imgui.js
+++ b/scripts/modules/imgui.js
@@ -322,16 +322,6 @@ imgui.barplot[prosperon.DOC] = `Plot a bar chart in the current ImPlot.
 :return: None
 `;
 
-imgui.pieplot[prosperon.DOC] = `Plot a pie chart in the current ImPlot.
-
-:param labels: An array of label strings for each slice.
-:param values: An array of numeric values corresponding to each slice.
-:param x: The x position of the pie’s center.
-:param y: The y position of the pie’s center.
-:param radius: The radius of the pie chart.
-:return: None
-`;
-
 imgui.textplot[prosperon.DOC] = `Render text at the specified coordinates in plot space.
 
 :param text: The string to render.
diff --git a/scripts/modules/io.js b/scripts/modules/io.js
index 084e2faf..644387fe 100644
--- a/scripts/modules/io.js
+++ b/scripts/modules/io.js
@@ -103,8 +103,10 @@ io.basedir[prosperon.DOC] = `Return the application's base directory (where the
 :return: A string with the base directory path.
 `
 
-io.userdir[prosperon.DOC] = `Return the user's directory, often used for saving data.
+io.prefdir[prosperon.DOC] = `Get the user-and-app-specific path where files can be written.
 
+:param org: The name of your organization.
+:param app: The name of your application.
 :return: A string with the user's directory path.
 `
 
diff --git a/scripts/modules/render.js b/scripts/modules/render.js
index 522e5584..9450b3d6 100644
--- a/scripts/modules/render.js
+++ b/scripts/modules/render.js
@@ -361,7 +361,6 @@ function make_shader(sh_file) {
     num_uniform_buffers: refl.ubos ? refl.ubos.length : 0,
     entrypoint: shader_type === "msl" ? "main0" : "main"
   }
-  console.log(`making shader ${sh_file} of format ${shader_type}`)
 
   shader.gpu = render._main.make_shader(shader)
   shader.reflection = refl;
diff --git a/source/aabb.c b/source/aabb.c
deleted file mode 100644
index 0b09b377..00000000
--- a/source/aabb.c
+++ /dev/null
@@ -1,34 +0,0 @@
-#include <stdlib.h>
-#include <math.h>
-
-#include "aabb.h"
-
-aabb*
-aabb_new(float x, float y, float hW, float hH) {
-	aabb* a = malloc(sizeof(aabb));
-	a->center.x = x;
-	a->center.y = y;
-	a->dims.w = hW;
-	a->dims.h = hH;
-	return a;
-}
-
-void
-aabb_free(aabb *a) {
-	free(a);
-}
-
-int
-aabb_contains(aabb *a, float x, float y) {
-	return (x >= a->center.x-a->dims.w &&
-			x <= a->center.x+a->dims.w) &&
-		   (y >= a->center.y-a->dims.h &&
-			y <= a->center.y+a->dims.h);
-}
-
-int
-aabb_intersects(aabb *a, aabb *b) {
-	return (abs(a->center.x - b->center.x) < (a->dims.w + b->dims.w)) &&
-		   (abs(a->center.y - b->center.y) < (a->dims.h + b->dims.h));
-}
-
diff --git a/source/aabb.h b/source/aabb.h
deleted file mode 100644
index 963222b0..00000000
--- a/source/aabb.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
-  aabb.h
-  2014 JSK (kutani@projectkutani.com)
-
-  Simple (2D) axis-aligned bounding box implementation. Part of the Panic
-  Panic project.
-
-  Released to the public domain. See LICENSE for details.
-*/
-#ifndef _AABB_H
- #define _AABB_H
-
-/** \brief axis-aligned bounding box
-
-	Simple struct of four floats, divided into two sub-structs.
-
-	center {x, y} - The center point of the bounding box
-	dims {w, h} - The half-width and half-height of the box
-*/
-typedef struct aabb {
-	struct {
-		float x;
-		float y;
-	} center;
-	struct {
-		float w;
-		float h;
-	} dims;
-} aabb;
-
-/// Malloc's a new aabb struct
-/*!
-  Mallocs a new aabb struct and sets center and dims to the passed
-  x, y, hW, and hH values.
-*/
-aabb* aabb_new(float x, float y, float hW, float hH);
-
-
-/// Frees the passed aabb.
-void aabb_free(aabb *a);
-
-/// Checks if the point x,y lies within the passed aabb
-int aabb_contains(aabb *a, float x, float y);
-
-/// Checks if the two passed aabb's intersect
-int aabb_intersects(aabb *a, aabb *b);
-
-#endif
diff --git a/source/anim.c b/source/anim.c
index b80a17dc..5c106094 100644
--- a/source/anim.c
+++ b/source/anim.c
@@ -16,7 +16,10 @@ void animation_run(struct animation *anim, float now)
 
 HMM_Vec4 sample_cubicspline(sampler *sampler, float t, int prev, int next)
 {
-  return (HMM_Vec4)HMM_SLerp(HMM_QV4(sampler->data[prev]), t, HMM_QV4(sampler->data[next]));
+  HMM_Vec4 ret;
+  HMM_Quat qv = HMM_SLerp(HMM_QV4(sampler->data[prev]), t, HMM_QV4(sampler->data[next]));
+  memcpy(ret.e, qv.e, sizeof(ret.e));
+  return ret;
 }
 
 HMM_Vec4 sample_sampler(sampler *sampler, float time)
@@ -37,6 +40,9 @@ HMM_Vec4 sample_sampler(sampler *sampler, float time)
   float td = sampler->times[next_time]-sampler->times[previous_time];
   float t = (time - sampler->times[previous_time])/td;
 
+  HMM_Vec4 ret;
+  HMM_Quat qv;
+
   switch(sampler->type) {
     case LINEAR:
       return HMM_LerpV4(sampler->data[previous_time],time,sampler->data[next_time]);
@@ -48,7 +54,9 @@ HMM_Vec4 sample_sampler(sampler *sampler, float time)
       return sample_cubicspline(sampler,t, previous_time, next_time);
       break;
     case SLERP:
-      return (HMM_Vec4)HMM_SLerp(sampler->data[previous_time].quat, time, sampler->data[next_time].quat);
+      qv = HMM_SLerp(sampler->data[previous_time].quat, time, sampler->data[next_time].quat);
+      memcpy(ret.e,qv.e,sizeof(ret.e));
+      return ret;
       break;
   }
   return sample_cubicspline(sampler,t, previous_time, next_time);  
diff --git a/source/config.c b/source/config.c
index 3451d647..5ae39455 100644
--- a/source/config.c
+++ b/source/config.c
@@ -24,11 +24,12 @@
 #define STBI_NO_STDIO
 #include "stb_image.h"
 
+#define STBIR_DEFAULT_FILTER_DOWNSAMPLE STBIR_FILTER_BOX
+
 #define STB_IMAGE_RESIZE_IMPLEMENTATION
 #include "stb_image_resize2.h"
 
 #define STB_IMAGE_WRITE_IMPLEMENTATION
-#define STBIR_DEFAULT_FILTER_DOWNSAMPLE STBIR_FILTER_BOX
 #include "stb_image_write.h"
 
 #define PL_MPEG_IMPLEMENTATION
diff --git a/source/cute_aseprite.h b/source/cute_aseprite.h
index 63ca7cae..08e582a1 100644
--- a/source/cute_aseprite.h
+++ b/source/cute_aseprite.h
@@ -306,24 +306,7 @@ struct ase_t
 	void* mem_ctx;
 };
 
-#define ASEPRITE_ERROR_MAX 256
-static char aseprite_error[ASEPRITE_ERROR_MAX] = {0};
-
-static const char *aseprite_GetError() {
-  return aseprite_error;
-}
-
-static void aseprite_clear_error() {
-  aseprite_error[0] = 0;
-}
-
-static void aseprite_set_error(const char *msg) {
-  if (msg) {
-    strncpy(aseprite_error, msg, ASEPRITE_ERROR_MAX-1);
-    aseprite_error[ASEPRITE_ERROR_MAX-1] = 0;
-  } else
-    aseprite_error[0] = 0;
-}
+const char *aseprite_GetError();
 
 #endif // CUTE_ASEPRITE_H
 
@@ -331,6 +314,24 @@ static void aseprite_set_error(const char *msg) {
 #ifndef CUTE_ASEPRITE_IMPLEMENTATION_ONCE
 #define CUTE_ASEPRITE_IMPLEMENTATION_ONCE
 
+#define ASEPRITE_ERROR_MAX 256
+char aseprite_error[ASEPRITE_ERROR_MAX] = {0};
+
+const char *aseprite_GetError() {
+  return aseprite_error;
+}
+
+void aseprite_clear_error() {
+  aseprite_error[0] = 0;
+}
+
+void aseprite_set_error(const char *msg) {
+  if (msg) {
+    strncpy(aseprite_error, msg, ASEPRITE_ERROR_MAX-1);
+    aseprite_error[ASEPRITE_ERROR_MAX-1] = 0;
+  } else
+    aseprite_error[0] = 0;
+}
 
 #ifndef _CRT_SECURE_NO_WARNINGS
 	#define _CRT_SECURE_NO_WARNINGS
diff --git a/source/datastream.c b/source/datastream.c
index 1f2b1acf..1ef4bd5a 100644
--- a/source/datastream.c
+++ b/source/datastream.c
@@ -15,11 +15,6 @@ void datastream_free(JSRuntime *rt,datastream *ds)
   free(ds);
 }
 
-static void render_audio(plm_t *mpeg, plm_samples_t *samples, struct datastream *ds) {
-//  for (int i = 0; i < samples->count * CHANNELS; i++)
-//    ringpush(ds->ring, samples->interleaved[i]);
-}
-
 struct datastream *ds_openvideo(void *raw, size_t rawlen)
 {
   struct datastream *ds = malloc(sizeof(*ds));
diff --git a/source/font.c b/source/font.c
index 686af696..0391c8c3 100644
--- a/source/font.c
+++ b/source/font.c
@@ -21,31 +21,6 @@ void font_free(JSRuntime *rt, font *f)
   free(f);
 }
 
-struct sFont *MakeSDFFont(const char *fontfile, int height)
-{
-  int packsize = 1024;
-  struct sFont *newfont = calloc(1, sizeof(struct sFont));
-  newfont->height = height;
-
-  char fontpath[256];
-  snprintf(fontpath, 256, "fonts/%s", fontfile);
-
-//  unsigned char *ttf_buffer = slurp_file(fontpath, NULL);
-  unsigned char *bitmap = malloc(packsize * packsize);
-
-  stbtt_fontinfo fontinfo;
-//  if (!stbtt_InitFont(&fontinfo, ttf_buffer, stbtt_GetFontOffsetForIndex(ttf_buffer, 0))) {
-//    YughError("Failed to make font %s", fontfile);
-//  }
-
-  for (int i = 32; i < 95; i++) {
-    int w, h, xoff, yoff;
-//    unsigned char *stbtt_GetGlyphSDF(&fontinfo, height, i, 1, 0, 1, &w, &h, &xoff, &yoff);
-  }
-
-  return newfont;
-}
-
 struct sFont *MakeFont(void *ttf_buffer, size_t len, int height) {
   if (!ttf_buffer)
     return NULL;
@@ -111,29 +86,6 @@ struct sFont *MakeFont(void *ttf_buffer, size_t len, int height) {
   return newfont;
 }
 
-int text_flush() {
-/*  if (arrlen(text_buffer) ==  0) return 0;
-
-  sg_range verts;
-  verts.ptr = text_buffer;
-  verts.size = sizeof(struct text_vert) * arrlen(text_buffer);
-  if (sg_query_buffer_will_overflow(*buf, verts.size)) {
-    sg_destroy_buffer(*buf);
-    *buf = sg_make_buffer(&(sg_buffer_desc){
-      .size = verts.size,
-      .type = SG_BUFFERTYPE_STORAGEBUFFER,
-      .usage = SG_USAGE_STREAM,
-      .label = "text buffer"
-    });
-  }
-    
-  sg_append_buffer(*buf, &verts);
-  int n = arrlen(text_buffer);
-  arrsetlen(text_buffer, 0);
-  return n;
-*/
-}
-
 void sdrawCharacter(struct text_vert **buffer, stbtt_packedchar c, HMM_Vec2 cursor, float scale, struct rgba color) {
   struct text_vert vert;
 
@@ -185,7 +137,7 @@ const char *esc_color(const char *c, struct rgba *color, struct rgba defc)
 {
   struct rgba d;
   if (!color) color = &d;
-  if (*c != '\e') return c;
+  if (*c != '\033') return c;
   c++;
   if (*c != '[') return c;
   c++;
@@ -235,7 +187,7 @@ HMM_Vec2 measure_text(const char *text, font *f, float size, float letterSpacing
             continue;
         }
 
-        float charWidth = f->Characters[*c].advance + letterSpacing;
+        float charWidth = f->Characters[(unsigned char)*c].advance + letterSpacing;
 
         // Handle wrapping
         if (wrap > 0 && lineWidth + charWidth > wrap) {
@@ -283,15 +235,13 @@ HMM_Vec2 measure_text(const char *text, font *f, float size, float letterSpacing
 }
 /* pos given in screen coordinates */
 struct text_vert *renderText(const char *text, HMM_Vec2 pos, font *f, float scale, colorf color, float wrap) {
-  int wrapAtWord = 1;
   text_vert *buffer = NULL;
-  int len = strlen(text);
 
   HMM_Vec2 cursor = pos;
   float lineHeight = f->ascent - f->descent;
   float lineWidth = 0;
 
-  for (char *c = text; *c != 0; c++) {
+  for (const char *c = text; *c != 0; c++) {
     if (*c == '\n') {
       cursor.x = pos.x;
       cursor.y -= lineHeight + f->linegap;
@@ -299,7 +249,7 @@ struct text_vert *renderText(const char *text, HMM_Vec2 pos, font *f, float scal
       continue;
     }
 
-    struct character chara = f->Characters[*c];
+    struct character chara = f->Characters[(unsigned char)*c];
    
     if (wrap > 0 && lineWidth + chara.advance > wrap) {
       cursor.x = pos.x;
diff --git a/source/font.h b/source/font.h
index f4e36289..fe8fc757 100644
--- a/source/font.h
+++ b/source/font.h
@@ -60,7 +60,4 @@ struct sFont *MakeFont(void *data, size_t len, int height);
 struct text_vert *renderText(const char *text, HMM_Vec2 pos, font *f, float scale, colorf color, float wrap);
 HMM_Vec2 measure_text(const char *text, font *f, float scale, float letterSpacing, float wrap);
 
-// Flushes all letters from renderText calls into the provided buffer
-int text_flush();
-
 #endif
diff --git a/source/gameobject.c b/source/gameobject.c
deleted file mode 100644
index ebb1fc96..00000000
--- a/source/gameobject.c
+++ /dev/null
@@ -1,33 +0,0 @@
-#include "gameobject.h"
-
-#include "math.h"
-#include <chipmunk/chipmunk.h>
-
-#include "stb_ds.h"
-
-static void velocityFn(cpBody *body, cpVect gravity, cpFloat damping, cpFloat dt)
-{
-/*  gameobject *go = body2go(body);
-  gameobject_apply(go);  
-  cpVect pos = cpBodyGetPosition(body);  
-  HMM_Vec2 g = warp_force((HMM_Vec3){pos.x, pos.y, 0}, go->warp_mask).xy;
-  if (!go) {
-    cpBodyUpdateVelocity(body,g.cp,damping,dt);
-    return;
-  }
-
-//  cpFloat d = isfinite(go->damping) ? go->damping : damping;
-  cpFloat d = damping;
-  
-  cpBodyUpdateVelocity(body,g.cp,d,dt*go->timescale);
-
-  if (isfinite(go->maxvelocity))
-    cpBodySetVelocity(body, cpvclamp(cpBodyGetVelocity(body), go->maxvelocity));
-
-  if (isfinite(go->maxangularvelocity)) {
-    float av = cpBodyGetAngularVelocity(body);
-    if (fabs(av) > go->maxangularvelocity)
-      cpBodySetAngularVelocity(body, copysignf(go->maxangularvelocity, av));
-  }
-*/
-}
diff --git a/source/gameobject.h b/source/gameobject.h
deleted file mode 100644
index 668ecb22..00000000
--- a/source/gameobject.h
+++ /dev/null
@@ -1,49 +0,0 @@
-#ifndef GAMEOBJECT_H
-#define GAMEOBJECT_H
-
-#define dag_rm(p,c) do{\
- for (int i = arrlen(p->children)-1; i--; i >=0) {\
-  if (p->children[i] == c) { \
-  arrdelswap(p->children,i);\
-  c->parent=NULL;\
-  break;\
-}}}while(0)
-
-#define dag_set(p,c) do{\
-  arrpush(p->children,c);\
-  if(c->parent) dag_rm(c->parent,c);\
-  c->parent=p;\
-}while(0)
-
-#define dag_clip(p) do{\
-  if (p->parent)\
-    dag_rm(p->parent,p);\
-}while(0)
-
-struct gameobject {
-  float damping;
-  float timescale;
-  float maxvelocity;
-  float maxangularvelocity;
-  unsigned int layer;
-  unsigned int warp_mask;
-};
-
-/*
-  Friction uses coulomb model. When shapes collide, their friction is multiplied. Some example values:
-  Steel on steel: 0.0005
-  Wood on steel: 0.0012
-  Wood on wood: 0.0015
-  => steel = 0.025
-  => wood = 0.04
-  => hardrubber = 0.31
-  => concrete = 0.05
-  => rubber = 0.5
-  Hardrubber on steel: 0.0077
-  Hardrubber on concrete: 0.015
-  Rubber on concrete: 0.025
-*/
-
-typedef struct gameobject gameobject;
-
-#endif
diff --git a/source/jsffi.c b/source/jsffi.c
index 88b3776e..7dcf9bdf 100644
--- a/source/jsffi.c
+++ b/source/jsffi.c
@@ -62,8 +62,6 @@ typedef struct rtree rtree;
 //#include <cblas.h>
 #endif
 
-
-
 #define STATE_VECTOR_LENGTH 624
 #define STATE_VECTOR_M      397 /* changes to STATE_VECTOR_LENGTH also require changes to this */
 
@@ -1038,7 +1036,7 @@ static const char *vals_SDL_GPUTextureFormat[] = {
   "astc 12x12 float"
 };
 
-JS2ENUM(SDL_GPUTextureFormat, rets_SDL_GPUTextureFormat, vals_SDL_GPUTextureFormat);
+JS2ENUM(SDL_GPUTextureFormat, rets_SDL_GPUTextureFormat, vals_SDL_GPUTextureFormat)
 
 SDL_GPUColorTargetBlendState js2SDL_GPUColorTargetBlendState(JSContext *js, JSValue v)
 {
@@ -1606,7 +1604,7 @@ int point2segindex(HMM_Vec2 p, HMM_Vec2 *segs, double slop) {
   return best;
 }
 
-static JSValue idx_buffer = JS_UNDEFINED;
+static JSValue idx_buffer;
 static int idx_count = 0;
 
 JSValue make_quad_indices_buffer(JSContext *js, int quads)
@@ -1828,7 +1826,7 @@ JSValue js_math_dot(JSContext *js, JSValue self, int argc, JSValue *argv) {
   free(a);
   free(b);
   return number2js(js,dot);
-};
+}
 
 JSValue js_math_project(JSContext *js, JSValue self, int argc, JSValue *argv) {
   size_t alen, blen;
@@ -2572,10 +2570,10 @@ JSC_CCALL(os_engine_start,
   JS_SDL_PROP(js, p, SDL_PROP_APP_METADATA_URL_STRING, url)  
   JS_SDL_PROP(js, p, SDL_PROP_APP_METADATA_TYPE_STRING, type)  
   
-  if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_CAMERA) < 0)
-  return JS_ThrowReferenceError(js, "Couldn't initialize SDL: %s\n", SDL_GetError());
+  if (!SDL_Init(SDL_INIT_VIDEO | SDL_INIT_CAMERA))
+    return JS_ThrowReferenceError(js, "Couldn't initialize SDL: %s\n", SDL_GetError());
 
-  char *title;
+  const char *title;
   JS_GETPROP(js,title,argv[0],title,cstring)
   
   SDL_Window *new = SDL_CreateWindow(title, js2number(js, js_getproperty(js,argv[0], width_atom)), js2number(js,js_getproperty(js,argv[0], height_atom)), SDL_WINDOW_RESIZABLE);
@@ -3533,7 +3531,6 @@ JSC_CCALL(renderer_make_sprite_mesh,
   for (int i = 0; i < quads; i++) {
     JSValue sub = JS_GetPropertyUint32(js,sprites,i);
     JSValue jstransform = JS_GetProperty(js,sub,transform_atom);
-    transform *tr = js2transform(js,jstransform);
 
     JSValue jssrc = JS_GetProperty(js,sub,src_atom);
     JSValue jscolor = JS_GetProperty(js,sub,color_atom);
@@ -3552,24 +3549,12 @@ JSC_CCALL(renderer_make_sprite_mesh,
 
     // Calculate the base index for the current quad
     size_t base = i * 4;
-    
-//    HMM_Mat3 trmat = transform2mat3_global(tr);
-
-    HMM_Vec3 base_quad[4] = {
-      {0.0,0.0,1.0},
-      {1.0,0.0,1.0},
-      {0.0,1.0,1.0},
-      {1.0,1.0,1.0}
-    };
-
-//    for (int j = 0; j < 4; j++)
-//      posdata[base+j] = HMM_MulM3V3(trmat, base_quad[j]).xy;
 
     // Define the UV coordinates based on the source rectangle
-    uvdata[base + 0] = (HMM_Vec2){ src.x,                  src.y + src.h };
-    uvdata[base + 1] = (HMM_Vec2){ src.x + src.w,      src.y + src.h };    
-    uvdata[base + 2] = (HMM_Vec2){ src.x,                  src.y };
-    uvdata[base + 3] = (HMM_Vec2){ src.x + src.w,      src.y };
+    uvdata[base + 0] = (HMM_Vec2){ src.x,            src.y + src.h };
+    uvdata[base + 1] = (HMM_Vec2){ src.x + src.w,    src.y + src.h };    
+    uvdata[base + 2] = (HMM_Vec2){ src.x,            src.y };
+    uvdata[base + 3] = (HMM_Vec2){ src.x + src.w,    src.y };
 
     colordata[base] = color;
     colordata[base+1] = color;
@@ -4156,18 +4141,12 @@ JSC_CCALL(gpu_texture,
   return jstex;
 )
 
-static     HMM_Vec3 base_quad[4] = {
-      {0.0,0.0,1.0},
-      {1.0,0.0,1.0},
-      {0.0,1.0,1.0},
-      {1.0,1.0,1.0}
-    };
-    static HMM_Vec4 base_quad_4[4] = {
-    { 0.0,0.0, 1.0f, 1.0f },
-    { 1,0,0.0, 1.0f, 1.0f },
-    { 0.0,1.0, 1.0f, 1.0f },
-    { 1.0,1.0, 1.0f, 1.0f }
-    };
+static HMM_Vec3 base_quad[4] = {
+  {0.0,0.0,1.0},
+  {1.0,0.0,1.0},
+  {0.0,1.0,1.0},
+  {1.0,1.0,1.0}
+};
     
 static inline void add_quad(text_vert **verts, rect *restrict src, rect *restrict dst)
 {
@@ -4644,7 +4623,6 @@ JSC_CCALL(gpu_acquire_cmd_buffer,
    2: an optional transfer buffer to use; if undefined a temporary one is used
 */
 JSC_CCALL(gpu_upload,
-  Uint64 ss = SDL_GetTicksNS();
   JSValue js_cmd = argv[0];
   JSValue js_buffers = argv[1];
   JSValue js_transfer = argv[2];
@@ -4724,16 +4702,14 @@ JSC_CCALL(gpu_upload,
         return JS_ThrowReferenceError(js, "Failed to map transfer buffer: %s", SDL_GetError());
     }
 
-   Uint64 sy = SDL_GetTicksNS();
     // Copy all data into the mapped transfer buffer
     size_t current_offset = 0;
     for (size_t i = 0; i < len; i++) {
-        memcpy(mapped_data + current_offset, items[i].data, items[i].size);
+        memcpy((char*)mapped_data + current_offset, items[i].data, items[i].size);
         current_offset += items[i].size;
     }
     SDL_UnmapGPUTransferBuffer(gpu, transfer);
 
-
     // Issue uploads for each item
     current_offset = 0;
     for (size_t i = 0; i < len; i++) {
@@ -5120,14 +5096,6 @@ JSC_CCALL(gpu_tile,
   arrfree(verts);
 )
 
-static const JSCFunctionListEntry js_SDL_GPUCopyPass_funcs[] = {};
-static const JSCFunctionListEntry js_SDL_GPUFence_funcs[] = {};
-static const JSCFunctionListEntry js_SDL_GPUTransferBuffer_funcs[] = {};
-static const JSCFunctionListEntry js_SDL_GPUShader_funcs[] = {};
-static const JSCFunctionListEntry js_SDL_GPUSampler_funcs[] = {};
-static const JSCFunctionListEntry js_SDL_GPUGraphicsPipeline_funcs[] = {};
-static const JSCFunctionListEntry js_SDL_GPUComputePipeline_funcs[] = {};
-
 static const JSCFunctionListEntry js_SDL_GPUDevice_funcs[] = {
   MIST_FUNC_DEF(gpu, claim_window, 1),
   MIST_FUNC_DEF(gpu, make_pipeline, 1), // loads pipeline state into an object
@@ -5707,8 +5675,6 @@ static const JSCFunctionListEntry js_SDL_Camera_funcs[] =
   MIST_FUNC_DEF(camera, release_frame, 1),
 };
 
-static const JSCFunctionListEntry js_SDL_Cursor_funcs[] = {};
-
 JSC_CCALL(texture_mode,
   SDL_Texture *tex = js2SDL_Texture(js,self);
   SDL_SetTextureScaleMode(tex,js2number(js,argv[0]));
@@ -5832,42 +5798,14 @@ static const JSCFunctionListEntry js_console_funcs[] = {
   MIST_FUNC_DEF(console,print,1),
 };
 
-JSC_CCALL(profile_gather_rate,
-  JS_SetInterruptRate(js2number(js,argv[0]));
-)
-
-JSC_CCALL(profile_gather_stop,
-  JS_SetInterruptHandler(JS_GetRuntime(js),NULL,NULL);
-)
-
-JSC_CCALL(profile_best_t,
-  char* result[50];
-  double seconds = js2number(js,argv[0]);
-  if (seconds < 1e-6)
-    snprintf(result, 50, "%.2f ns", seconds * 1e9);
-  else if (seconds < 1e-3)
-    snprintf(result, 50, "%.2f µs", seconds * 1e6);
-  else if (seconds < 1)
-    snprintf(result, 50, "%.2f ms", seconds * 1e3);
-  else
-    snprintf(result, 50, "%.2f s", seconds);
-
-  return JS_NewString(js,result);
-)
-
-static const JSCFunctionListEntry js_profile_funcs[] = {
-  MIST_FUNC_DEF(profile,best_t, 1),
-  MIST_FUNC_DEF(profile,gather_rate,1),
-  MIST_FUNC_DEF(profile,gather_stop,0),
-};
-
 JSC_CCALL(debug_stack_depth, return number2js(js,js_debugger_stack_depth(js)))
 JSC_CCALL(debug_build_backtrace, return js_debugger_build_backtrace(js,NULL))
 JSC_CCALL(debug_closure_vars, return js_debugger_closure_variables(js,argv[0]))
 JSC_CCALL(debug_local_vars, return js_debugger_local_variables(js, js2number(js,argv[0])))
-JSC_CCALL(debug_fn_info, return js_debugger_fn_info(js, argv[0]));
-JSC_CCALL(debug_backtrace_fns, return js_debugger_backtrace_fns(js,NULL));
-JSC_CCALL(debug_dump_obj, return js_dump_value(js, argv[0]));
+JSC_CCALL(debug_fn_info, return js_debugger_fn_info(js, argv[0]))
+JSC_CCALL(debug_backtrace_fns, return js_debugger_backtrace_fns(js,NULL))
+JSC_CCALL(debug_dump_obj, return js_dump_value(js, argv[0]))
+
 static const JSCFunctionListEntry js_debug_funcs[] = {
   MIST_FUNC_DEF(debug, stack_depth, 0),
   MIST_FUNC_DEF(debug, build_backtrace, 0),
@@ -5932,22 +5870,33 @@ JSC_SCALL(io_slurp,
   END:
 )
 
+size_t js_physfs_write(JSContext *js, PHYSFS_File *f, JSValue val)
+{
+  size_t len;
+  size_t wrote;
+  if (JS_IsString(val)) {
+    const char *data = JS_ToCStringLen(js,&len,val);
+    wrote = PHYSFS_writeBytes(f,data,len);
+    JS_FreeCString(js,data);
+  } else {
+    unsigned char *data = JS_GetArrayBuffer(js,&len,val);
+    wrote = PHYSFS_writeBytes(f,data,len);
+  }
+
+  if (wrote < len) wrote = -1;
+  return wrote;
+}
+
 JSC_SCALL(io_slurpwrite,
   PHYSFS_File *f = PHYSFS_openWrite(str);
   if (!f) {
     ret = JS_ThrowReferenceError(js,"could not write to %s: %s", str, PHYSFS_getErrorByCode(PHYSFS_getLastErrorCode()));
     goto END;
   }
-  size_t len;
-  unsigned char *data;
-  if (JS_IsString(argv[1]))
-    data = JS_ToCStringLen(js,&len,argv[1]);
-   else
-    data = JS_GetArrayBuffer(js,&len, argv[1]);
+  size_t wrote = js_physfs_write(js,f,argv[1]);
 
-  size_t wrote = PHYSFS_writeBytes(f,data, len);
   PHYSFS_close(f);  
-  if (wrote == -1 || wrote < len)
+  if (wrote == -1)
     ret = JS_ThrowReferenceError(js,"%s", PHYSFS_getErrorByCode(PHYSFS_getLastErrorCode()));
   
   END:
@@ -5989,10 +5938,11 @@ int globfs_cb(struct globdata *data, char *dir, char *file)
   }
 
   char **glob = data->globs;
+
   while (*glob != NULL) {
     if (wildmatch(*glob, path, WM_WILDSTAR) == WM_MATCH)
       goto END;
-    *glob++;
+    glob++;
   }
 
   PHYSFS_Stat stat;
@@ -6023,18 +5973,18 @@ JSC_CCALL(io_globfs,
   data.arr = ret;
   data.idx = 0;
   int globs_len = js_arrlen(js,argv[0]);
-  char *globs[globs_len+1];
+  const char *globs[globs_len+1];
   for (int i = 0; i < globs_len; i++) {
     JSValue g = JS_GetPropertyUint32(js,argv[0],i);
     globs[i] = JS_ToCString(js,g);
     JS_FreeValue(js,g);
   }
+  
   globs[globs_len] = NULL;
   data.globs = globs;
 
   const char *path = NULL;
   if (!JS_IsUndefined(argv[1])) path = JS_ToCString(js,argv[1]);
-  printf("LOOKING INTO %s\n", path);
   PHYSFS_enumerate(path, globfs_cb, &data);
 
   for (int i = 0; i < globs_len; i++)
@@ -6100,7 +6050,7 @@ JSC_SCALL(io_enumerate,
 )
 
 JSC_CCALL(io_basedir, return JS_NewString(js,PHYSFS_getBaseDir()))
-JSC_CCALL(io_userdir, return JS_NewString(js,PHYSFS_getUserDir()))
+JSC_SSCALL(io_prefdir, const char *dir = PHYSFS_getPrefDir(str, str2); ret = dir ? JS_NewString(js,dir) : JS_ThrowReferenceError(js,"%s", PHYSFS_getErrorByCode(PHYSFS_getLastErrorCode()))) // assign ret rather than return so the macro epilogue can still release str/str2 (assumed to mirror the JSC_SCALL users in this file - confirm); PHYSFS_getPrefDir yields NULL on failure
 
 JSC_SCALL(io_open,
   PHYSFS_File *f = PHYSFS_openWrite(str);
@@ -6158,7 +6108,7 @@ static const JSCFunctionListEntry js_io_funcs[] = {
   MIST_FUNC_DEF(io,slurpwrite,2),
   MIST_FUNC_DEF(io,writepath, 1),
   MIST_FUNC_DEF(io,basedir, 0),
-  MIST_FUNC_DEF(io, userdir, 0),
+  MIST_FUNC_DEF(io, prefdir, 2),
   MIST_FUNC_DEF(io, realdir, 1),
   MIST_FUNC_DEF(io, open, 1),
   MIST_FUNC_DEF(io, searchpath, 0),
@@ -6174,15 +6124,8 @@ JSC_CCALL(file_close,
 
 JSC_CCALL(file_write,
   PHYSFS_File *f = js2PHYSFS_File(js,self);
-  size_t len;
-  unsigned char *data;
-  if (JS_IsString(argv[0]))
-    data = JS_ToCStringLen(js,&len,argv[0]);
-   else
-    data = JS_GetArrayBuffer(js,&len, argv[0]);
-
-  size_t wrote = PHYSFS_writeBytes(f,data,len);
-  if (wrote == -1 || wrote < len)
+  size_t wrote = js_physfs_write(js,f,argv[0]);
+  if (wrote == -1)
     return JS_ThrowReferenceError(js,"%s", PHYSFS_getErrorByCode(PHYSFS_getLastErrorCode()));
 )
 
@@ -6309,6 +6252,7 @@ static JSValue js_transform_set_change_hook(JSContext *js, JSValueConst self, JS
   if (!JS_IsUndefined(v) && !JS_IsFunction(js,v)) return JS_ThrowReferenceError(js, "Hook must be a function.");
   JS_FreeValue(js,t->change_hook);
   t->change_hook = JS_DupValue(js,v);
+  return JS_UNDEFINED;
 }
 
 static JSValue js_transform_get_parent(JSContext *js, JSValueConst self)
@@ -6590,17 +6534,6 @@ JSC_CCALL(os_gc_threshold, JS_SetGCThreshold(JS_GetRuntime(js), js2number(js,arg
 JSC_CCALL(os_max_stacksize, JS_SetMaxStackSize(JS_GetRuntime(js), js2number(js,argv[0])))
 JSC_CCALL(os_rt_info, return JS_GetRTInfo(JS_GetRuntime(js),js))
 
-static JSValue tmp2js(JSContext *js,FILE *tmp)
-{
-  size_t size = ftell(tmp);
-  rewind(tmp);
-  char *buffer = calloc(size+1, sizeof(char));
-  fread(buffer, sizeof(char),size, tmp);
-  JSValue ret = JS_NewString(js,buffer);
-  free(buffer);
-  return ret;
-}
-
 JSC_CCALL(os_dump_atoms,
   return js_dump_atoms(js);
 )
@@ -6641,8 +6574,8 @@ JSC_CCALL(os_mallinfo,
   JSJMEMRET(keepcost);*/
 )
 
-JSC_CCALL(os_rusage,
-  ret = JS_NewObject(js);
+JSValue js_os_rusage(JSContext *js, JSValue self, int argc, JSValue *argv) {
+  JSValue ret = JS_NewObject(js);
 
 #ifndef _WIN32
   struct rusage jsmem;
@@ -6662,7 +6595,9 @@ JSC_CCALL(os_rusage,
   JSJMEMRET(ru_nvcsw);
   JSJMEMRET(ru_nivcsw);
 #endif
-)
+
+  return ret;
+}
 
 JSC_CCALL(os_mem, return js_get_memory_usage(js))
 JSC_CCALL(os_value_id,
@@ -7135,7 +7070,7 @@ JSC_CCALL(os_hostname,
   return JS_NewString(js,"");
 )
 
-JSC_CCALL(os_freemem,
+JSValue js_os_freemem(JSContext *js, JSValue self, int argc, JSValue *argv) {
 #ifdef _WIN32
   MEMORYSTATUSEX statex;
   statex.dwLength = sizeof(statex);
@@ -7158,9 +7093,9 @@ JSC_CCALL(os_freemem,
   // Fallback: unknown
   return JS_NewInt64(js,0);
 #endif
-)
+}
 
-JSC_CCALL(os_arch,
+JSValue js_os_arch(JSContext *js, JSValue self, int argc, JSValue *argv) {
 #if defined(__x86_64__) || defined(_M_X64)
   return JS_NewString(js,"x64");
 #elif defined(__aarch64__) || defined(_M_ARM64)
@@ -7184,9 +7119,9 @@ JSC_CCALL(os_arch,
 #else
   return JS_NewString(js,"unknown");
 #endif
-)
+}
 
-JSC_CCALL(os_version,
+JSValue js_os_version(JSContext *js, JSValue self, int argc, JSValue *argv) {
 #ifdef _WIN32
   typedef LONG (WINAPI *RtlGetVersionPtr)(PRTL_OSVERSIONINFOW);
   HMODULE h = GetModuleHandleA("ntdll.dll");
@@ -7225,7 +7160,9 @@ JSC_CCALL(os_version,
   if (!uname(&info)) return JS_NewString(js, info.release);
   return JS_NewString(js, "");
 #endif
-)
+
+  return JS_UNDEFINED;
+}
 
 static const JSCFunctionListEntry js_os_funcs[] = {
   MIST_FUNC_DEF(os, make_transform, 0),
@@ -7269,7 +7206,7 @@ JSC_CCALL(js_dump_class, return js_get_object_class_distribution(js))
 JSC_CCALL(js_dump_type_overheads, return js_get_object_type_overheads(js))
 JSC_CCALL(js_dump_objects, return js_dump_objects(js))
 
-JSValue cycle_fn = JS_UNDEFINED;
+static JSValue cycle_fn;
 
 void cycle_hook_call(JSContext *js, JSValue v)
 {
@@ -7336,7 +7273,7 @@ static const JSCFunctionListEntry js_video_funcs[] = {
 
 void gui_input(SDL_Event *e);
 // Polls and handles all input events
-JSC_CCALL(os_engine_input,
+JSValue js_os_engine_input(JSContext *js, JSValue self, int argc, JSValue *argv) {
   SDL_Event event;
   while (SDL_PollEvent(&event)) {
 #ifndef NEDITOR
@@ -7346,7 +7283,8 @@ JSC_CCALL(os_engine_input,
     JSValue ret = JS_Call(js,argv[0], JS_UNDEFINED, 1, &e);
     uncaught_exception(js,ret);
   }
-)
+  return JS_UNDEFINED;
+}
 
 JSC_CCALL(os_push_event,
   SDL_UserEvent e;
@@ -7714,19 +7652,21 @@ void ffi_load(JSContext *js, int argc, char **argv) {
   QJSCLASSPREP_FUNCS(SDL_Texture)
   QJSCLASSPREP_FUNCS(SDL_Renderer)
   QJSCLASSPREP_FUNCS(SDL_Camera)
-  QJSCLASSPREP_FUNCS(SDL_Cursor)
   QJSCLASSPREP_FUNCS(SDL_GPUDevice)
   QJSCLASSPREP_FUNCS(SDL_GPUTexture)
   QJSCLASSPREP_FUNCS(SDL_GPUCommandBuffer)
   QJSCLASSPREP_FUNCS(SDL_GPURenderPass)
   QJSCLASSPREP_FUNCS(SDL_GPUComputePass)
-  QJSCLASSPREP_FUNCS(SDL_GPUCopyPass)
-  QJSCLASSPREP_FUNCS(SDL_GPUFence)
-  QJSCLASSPREP_FUNCS(SDL_GPUTransferBuffer)
-  QJSCLASSPREP_FUNCS(SDL_GPUShader)
-  QJSCLASSPREP_FUNCS(SDL_GPUSampler)
-  QJSCLASSPREP_FUNCS(SDL_GPUGraphicsPipeline)
-  QJSCLASSPREP_FUNCS(SDL_GPUComputePipeline)
+
+  QJSCLASSPREP_NO_FUNCS(SDL_Cursor)
+  QJSCLASSPREP_NO_FUNCS(SDL_GPUCopyPass)
+  QJSCLASSPREP_NO_FUNCS(SDL_GPUFence)
+  QJSCLASSPREP_NO_FUNCS(SDL_GPUTransferBuffer)
+  QJSCLASSPREP_NO_FUNCS(SDL_GPUShader)
+  QJSCLASSPREP_NO_FUNCS(SDL_GPUSampler)
+  QJSCLASSPREP_NO_FUNCS(SDL_GPUGraphicsPipeline)
+  QJSCLASSPREP_NO_FUNCS(SDL_GPUComputePipeline)
+  
   QJSCLASSPREP_FUNCS(sprite)
 //  QJSCLASSPREP_FUNCS(SDL_GPUGraphicsPipeline)
 //  QJSCLASSPREP_FUNCS(SDL_GPUSampler)
@@ -7874,5 +7814,8 @@ void ffi_load(JSContext *js, int argc, char **argv) {
 
   JS_SetPropertyStr(js,globalThis,"prosperon", prosp);
 
+  idx_buffer = JS_UNDEFINED;
+  cycle_fn = JS_UNDEFINED;
+
   JS_FreeValue(js,globalThis);  
 }
diff --git a/source/model.c b/source/model.c
index ef8f28cb..7b02b618 100644
--- a/source/model.c
+++ b/source/model.c
@@ -1,7 +1,6 @@
 #include "model.h"
 
 #include "stb_ds.h"
-#include "gameobject.h"
 
 #include "render.h"
 
@@ -18,59 +17,19 @@
 
 #include "jsffi.h"
 
-unsigned short pack_short_tex(float c) { return c * USHRT_MAX; }
-
 SDL_GPUBuffer *texcoord_floats(float *f, int n)
 {
-  unsigned short packed[n];
-  for (int i = 0; i < n; i++) {
-    float v = f[i];
-    if (v < 0) v = 0;
-    if (v > 1) v = 1;
-    packed[i] = pack_short_tex(v);
-  }
-
-/*  return sg_make_buffer(&(sg_buffer_desc){
-    .data = SG_RANGE(packed),
-    .label = "tex coord vert buffer",
-  });*/
-  return NULL;
-}
-
-SDL_GPUBuffer *par_idx_buffer(uint32_t *p, int v)
-{
-  uint16_t idx[v];
-  for (int i = 0; i < v; i++) idx[i] = p[i];
-  
-/*  return sg_make_buffer(&(sg_buffer_desc){
-    .data = SG_RANGE(idx),
-    .type = SG_BUFFERTYPE_INDEXBUFFER
-  });*/
   return NULL;
 }
 
 SDL_GPUBuffer *float_buffer(float *f, int v)
 {
   return NULL;
-/*  return sg_make_buffer(&(sg_buffer_desc){
-    .data = (sg_range){
-      .ptr = f,
-      .size = sizeof(*f)*v
-    }
-  });*/
 }
 
 SDL_GPUBuffer *index_buffer(float *f, int verts)
 {
   return NULL;
-/*  uint16_t idxs[verts];
-  for (int i = 0; i < verts; i++)
-    idxs[i] = f[i];
-  
-  return sg_make_buffer(&(sg_buffer_desc){
-    .data = SG_RANGE(idxs),
-    .type = SG_BUFFERTYPE_INDEXBUFFER,
-  });*/
 }
 
 uint32_t pack_int10_n2(float *norm)
@@ -87,71 +46,21 @@ uint32_t pack_int10_n2(float *norm)
 SDL_GPUBuffer *normal_floats(float *f, int n)
 {
   return float_buffer(f, n);
-/*  uint32_t packed_norms[n/3];
-  for (int v = 0, i = 0; v < n/3; v++, i+= 3)
-    packed_norms[v] = pack_int10_n2(f+i);
-
-  return sg_make_buffer(&(sg_buffer_desc){
-    .data = SG_RANGE(packed_norms),
-    .label = "normal vert buffer",
-  });*/
 }
 
 SDL_GPUBuffer *ubyten_buffer(float *f, int v)
 {
   return NULL;
-/*  unsigned char b[v];
-  for (int i = 0; i < (v); i++)
-    b[i] = f[i]*255;
-    
-  return sg_make_buffer(&(sg_buffer_desc){.data=SG_RANGE(b)});*/
 }
 
 SDL_GPUBuffer *ubyte_buffer(float *f, int v)
 {
   return NULL;
-/*  unsigned char b[v];
-  for (int i = 0; i < (v); i++)
-    b[i] = f[i];
-    
-  return sg_make_buffer(&(sg_buffer_desc){.data=SG_RANGE(b)});
-*/
 }
 
 SDL_GPUBuffer *accessor2buffer(cgltf_accessor *a, int type)
 {
   return NULL;
-/*  int n = cgltf_accessor_unpack_floats(a, NULL, 0);
-  float vs[n];
-  cgltf_accessor_unpack_floats(a, vs, n);
-
-  switch(type) {
-    case MAT_POS:
-      return sg_make_buffer(&(sg_buffer_desc){
-        .data.ptr = vs,
-	.data.size = sizeof(float)*n
-      });
-    case MAT_NORM:
-      return normal_floats(vs,n);
-    case MAT_TAN:
-      return normal_floats(vs,n); // TODO: MAKE A TANGENT READER
-    case MAT_COLOR:
-      return ubyten_buffer(vs,n);
-    case MAT_WEIGHT:
-      return ubyten_buffer(vs,n);
-    case MAT_BONE:
-      return ubyte_buffer(vs,n);
-    case MAT_UV:
-      return texcoord_floats(vs,n);
-    case MAT_INDEX:
-      return index_buffer(vs,n);
-  }
-
-  return sg_make_buffer(&(sg_buffer_desc) {
-  .data.size = 4,
-  .usage = SG_USAGE_STREAM
-  });
-*/
 }
 
 void packFloats(float *src, float *dest, int srcLength) {
diff --git a/source/model.h b/source/model.h
index 3313011e..8bb2dd67 100644
--- a/source/model.h
+++ b/source/model.h
@@ -3,7 +3,6 @@
 
 #include "HandmadeMath.h"
 #include "transform.h"
-#include "gameobject.h"
 #include "anim.h"
 #include "cgltf.h"
 
diff --git a/source/prosperon.c b/source/prosperon.c
index 0e45ae47..1836b097 100644
--- a/source/prosperon.c
+++ b/source/prosperon.c
@@ -96,7 +96,7 @@ int main(int argc, char **argv) {
   prosperon = argv[0];
 
   PHYSFS_init(argv[0]);
-  char *base = PHYSFS_getBaseDir();
+  const char *base = PHYSFS_getBaseDir();
   PHYSFS_setWriteDir(base);
 
   PHYSFS_mount(base, "/", 0);
diff --git a/source/qjs_imgui.cpp b/source/qjs_imgui.cpp
index 60dbcaf0..a336a3d6 100644
--- a/source/qjs_imgui.cpp
+++ b/source/qjs_imgui.cpp
@@ -3,6 +3,8 @@
 #include "imnodes.h"
 #include "quickjs.h"
 
+#include <stb_ds.h>
+
 #include <SDL3/SDL.h>
 #include <SDL3/SDL_gpu.h>
 #include "imgui_impl_sdl3.h"
@@ -163,11 +165,10 @@ JSC_SCALL(imgui_plot,
   fill_plotdata(js, argv[1], argv[3]); \
   bool shaded = JS_ToBool(js,argv[2]);\
   int flag = 0; \
+  if (shaded) flag = SHADED; \
+  ImPlot::FN(str, &plotdata[0].x, &plotdata[0].y, arrlen(plotdata), ADD flag, 0, sizeof(ImVec2)); \
 ) \
 
-//if (shaded) flag = SHADED;
-//  ImPlot::FN(str, &plotdata[0].x, &plotdata[0].y, arrlen(plotdata), ADD flag, 0, sizeof(ImVec2));
-
 static ImVec2 *plotdata = NULL;
 
 void fill_plotdata(JSContext *js, JSValue v, JSValue last)
@@ -204,40 +205,19 @@ PLOT_FN(digitalplot, PlotDigital,,0)
 
 JSC_SCALL(imgui_barplot,
   fill_plotdata(js, argv[1], JS_UNDEFINED);
-//  ImPlot::PlotBars(str, &plotdata[0].x, &plotdata[0].y, js_arrlen(js, argv[1]), js2number(js, argv[2]), 0, 0, sizeof(ImVec2));
+  ImPlot::PlotBars(str, &plotdata[0].x, &plotdata[0].y, js_arrlen(js, argv[1]), js2number(js, argv[2]), 0, 0, sizeof(ImVec2));
 )
 
 JSC_SCALL(imgui_histogramplot,
   size_t offset, len, per_e;
   JSValue typed = JS_GetTypedArrayBuffer(js, argv[1], &offset, &len, &per_e);
-//  ImPlot::PlotHistogram(str, JS_GetArrayBuffer(js, NULL, typed), js_arrlen(js, argv[1]));
+  ImPlot::PlotHistogram(str, JS_GetArrayBuffer(js, NULL, typed), js_arrlen(js, argv[1]));
   JS_FreeValue(js, typed);
 )
 
-JSC_SCALL(imgui_heatplot,
-  int rows = js2number(js, argv[2]);
-  int cols = js2number(js, argv[3]);
-//  if (rows*cols == (int)js_arrlen(js, argv[1]))
-//    ImPlot::PlotHeatmap(str, histodata, rows, cols);
-)
-
-JSC_CCALL(imgui_pieplot,
-/*  if (js_arrlen(js, argv[0]) != js_arrlen(js, argv[1])) return JS_UNDEFINED;
-  
-  const char *labels[js_arrlen(js, argv[0])];
-  for (int i = 0; i < js_arrlen(js, argv[0]); i++)
-    labels[i] = JS_ToCString(js, js_getpropidx(argv[0], i));
-
-  fill_histodata(argv[1]);
-  ImPlot::PlotPieChart(labels, histodata, js_arrlen(js, argv[1]), js2number(js, argv[2]), js2number(js, argv[3]), js2number(js, argv[4]));
-
-  for (int i = 0; i < js_arrlen(js, argv[0]); i++)
-    JS_FreeCString(js,labels[i]);*/
-)
-
 JSC_SCALL(imgui_textplot,
   ImVec2 c = js2vec2(js, argv[1]);
-//  ImPlot::PlotText(str, c.x, c.y);
+  ImPlot::PlotText(str, c.x, c.y);
 )
 
 JSC_CCALL(imgui_inplot,
@@ -275,7 +255,7 @@ JSC_SSCALL(imgui_textinput,
   if (JS_IsUndefined(argv[1]))
     buffer[0] = 0;
   else
-    strncpy(buffer, str2, 512);
+    strncpy(buffer, str2, sizeof(buffer)-1);
     
   ImGui::InputText(str, buffer, sizeof(buffer));
   if (strcmp(buffer, str2))
@@ -289,7 +269,7 @@ JSC_SSCALL(imgui_textbox,
   if (JS_IsUndefined(argv[1]))
     buffer[0] = 0;
   else
-    strncpy(buffer, str2, 512);
+    strncpy(buffer, str2, sizeof(buffer)-1);
     
   ImGui::InputTextMultiline(str, buffer, sizeof(buffer));
   if (strcmp(buffer, str2))
@@ -834,7 +814,6 @@ const JSCFunctionListEntry js_imgui_funcs[] = {
   MIST_FUNC_DEF(imgui, stairplot, 4),
   MIST_FUNC_DEF(imgui, digitalplot, 4),
   MIST_FUNC_DEF(imgui, barplot, 3),
-  MIST_FUNC_DEF(imgui, pieplot, 5),
   MIST_FUNC_DEF(imgui, textplot, 2),
   MIST_FUNC_DEF(imgui, histogramplot, 2),
   MIST_FUNC_DEF(imgui, plotaxes, 2),
diff --git a/source/qjs_macros.h b/source/qjs_macros.h
index 7586540e..47f05fb5 100644
--- a/source/qjs_macros.h
+++ b/source/qjs_macros.h
@@ -134,10 +134,13 @@ JS_SetPropertyStr(js, globalThis, #NAME, NAME); \
 
 /* Defines a class and uses its function list as its prototype */
 #define QJSCLASSPREP_FUNCS(TYPE) \
+QJSCLASSPREP_NO_FUNCS(TYPE) \
+JS_SetPropertyFunctionList(js, TYPE##_proto, js_##TYPE##_funcs, countof(js_##TYPE##_funcs)); \
+
+#define QJSCLASSPREP_NO_FUNCS(TYPE) \
 JS_NewClassID(&js_##TYPE##_id);\
 JS_NewClass(JS_GetRuntime(js), js_##TYPE##_id, &js_##TYPE##_class);\
 JSValue TYPE##_proto = JS_NewObject(js); \
-JS_SetPropertyFunctionList(js, TYPE##_proto, js_##TYPE##_funcs, countof(js_##TYPE##_funcs)); \
 JS_SetClassProto(js, js_##TYPE##_id, TYPE##_proto); \
 JS_SetPropertyStr(js, c_types, #TYPE, JS_DupValue(js,TYPE##_proto)); \
 
@@ -147,8 +150,6 @@ JSValue js_##NAME##_use(JSContext *js) { \
   JS_SetPropertyFunctionList(js,mod,js_##NAME##_funcs,countof(js_##NAME##_funcs)); \
   return mod; } \
 
-#define MISTLINE(NAME) (ModuleEntry){ #NAME, js_##NAME##_funcs, countof(js_##NAME##_funcs) }
-
 #define countof(x) (sizeof(x)/sizeof((x)[0]))
 
 
diff --git a/source/qjs_tracy.c b/source/qjs_tracy.c
index 082f1ab0..da9db3c0 100644
--- a/source/qjs_tracy.c
+++ b/source/qjs_tracy.c
@@ -37,6 +37,7 @@ static JSValue js_tracy_fiber_leave(JSContext *js, JSValue self, int argc, JSVal
   const char *str = JS_AtomToCString(js, atom);
   TracyCFiberLeave(str);
   JS_FreeAtom(js,atom);
+  return JS_UNDEFINED;
 }
 
 static JSValue js_tracy_plot(JSContext *js, JSValue self, int argc, JSValue *argv)
@@ -61,7 +62,15 @@ static JSValue js_tracy_plot_config(JSContext *js, JSValue self, int argc, JSVal
     return JS_UNDEFINED;
 #endif
 
-//  TracyCPlotConfig(str, js2number(js,argv[1]), JS_ToBool(js,argv[2]), JS_ToBool(js,argv[3]), js2number(js,argv[4]))
+  const char *str = JS_ToCString(js,argv[0]);
+
+  uint32_t type, color;
+  
+  JS_ToUint32(js,&type, argv[1]);
+  JS_ToUint32(js,&color,argv[4]);
+  TracyCPlotConfig(str, type, JS_ToBool(js,argv[2]), JS_ToBool(js,argv[3]), color);
+  JS_FreeCString(js,str);
+  
   return JS_UNDEFINED;
 }
 
@@ -73,6 +82,8 @@ static JSValue js_tracy_frame_mark(JSContext *js, JSValue self, int argc, JSValu
 #endif
 
   TracyCFrameMark
+
+  return JS_UNDEFINED;
 }
 
 static JSValue js_tracy_message(JSContext *js, JSValue self, int argc, JSValue *argv)
@@ -86,6 +97,7 @@ static JSValue js_tracy_message(JSContext *js, JSValue self, int argc, JSValue *
   const char *str = JS_ToCStringLen(js, &len, argv[0]);
   TracyCMessage(str,len);
   JS_FreeCString(js,str);
+  return JS_UNDEFINED;
 }
 
 static JSValue js_tracy_thread_name(JSContext *js, JSValue self, int argc, JSValue *argv)
@@ -98,6 +110,7 @@ static JSValue js_tracy_thread_name(JSContext *js, JSValue self, int argc, JSVal
   const char *str = JS_ToCString(js, argv[0]);
   TracyCSetThreadName(str);
   JS_FreeCString(js,str);
+  return JS_UNDEFINED;  
 }
 
 static JSValue js_tracy_zone_begin(JSContext *js, JSValue self, int argc, JSValue *argv)
@@ -121,6 +134,7 @@ static JSValue js_tracy_zone_begin(JSContext *js, JSValue self, int argc, JSValu
   
   JS_Call(js, argv[0], JS_UNDEFINED, 0, NULL);
   TracyCZoneEnd(TCTX);
+  return JS_UNDEFINED;  
 }
 
 #ifdef SOKOL_GLCORE
@@ -189,7 +203,6 @@ static JSValue js_tracy_gpu_zone_begin(JSContext *js, JSValue self, int argc, JS
   };
   ___tracy_emit_gpu_zone_end(enddata);
   qhead = (qhead+1)%query_count;
-
   
   return ret;
 }
@@ -477,8 +490,8 @@ static JSValue js_tracy_image(JSContext *js, JSValue self, int argc, JSValue *ar
 /*  SDL_Surface *img = js2SDL_Surface(js,argv[0]);
   SDL_Surface *scaled = SDL_ScaleSurface(img, 320,180,SDL_SCALEMODE_LINEAR);
   ___tracy_emit_frame_image(scaled->pixels, scaled->w,scaled->h, 0,0);
-  SDL_DestroySurface(scaled);
-  return JS_UNDEFINED;*/
+  SDL_DestroySurface(scaled);*/
+  return JS_UNDEFINED;
 }
 
 #endif
@@ -513,6 +526,8 @@ JSValue js_tracy_level(JSContext *js, JSValue selff, int argc, JSValue *argv)
     js_debug_sethook(js, tracy_call_hook, JS_HOOK_CALL);
     js_debug_sethook(js, tracy_end_hook, JS_HOOK_RET);
   }
+
+  return JS_UNDEFINED;
 }
 
 static const JSCFunctionListEntry js_tracy_funcs[] = {
@@ -523,11 +538,13 @@ static const JSCFunctionListEntry js_tracy_funcs[] = {
   JS_CFUNC_DEF("gpu_init", 0, js_tracy_gpu_init),
   JS_CFUNC_DEF("gpu_sync", 0, js_tracy_gpu_sync),
   JS_CFUNC_DEF("end_frame", 0, js_tracy_frame_mark),
+  JS_CFUNC_DEF("thread_name", 1, js_tracy_thread_name),
   JS_CFUNC_DEF("zone", 1, js_tracy_zone_begin),
   JS_CFUNC_DEF("message", 1, js_tracy_message),
   JS_CFUNC_DEF("plot", 2, js_tracy_plot),
   JS_CFUNC_DEF("image", 3, js_tracy_image),
   JS_CFUNC_DEF("level", 1, js_tracy_level),
+  JS_CFUNC_DEF("plot_config", 5, js_tracy_plot_config),
 };
 
 JSValue js_tracy_use(JSContext *js)
diff --git a/source/quadtree.c b/source/quadtree.c
deleted file mode 100644
index 0c635c4d..00000000
--- a/source/quadtree.c
+++ /dev/null
@@ -1,246 +0,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <stdint.h>
-
-#include "aabb.h"
-
-/// Default node size cap
-#define QTREE_STDCAP 4
-
-/// A function pointer def for determining if an element exists in a range
-typedef int (*qtree_fnc)(void *ptr, aabb *range);
-typedef int (*qtree_rm)(void *ptr, void *cmp);
-
-/// Quadtree node
-typedef struct qnode {
-	uint16_t cnt;     ///< Number of elements in this node
-	aabb bound;       ///< Area this node covers
-	void **elist;     ///< List of element pointers
-	struct qnode *nw; ///< NW quadrant of this node
-	struct qnode *ne; ///< NE quadrant of this node
-	struct qnode *sw; ///< SW quadrant of this node
-	struct qnode *se; ///< SE quadrant of this node
-} qnode;
-
-/// Quadtree container
-typedef struct _qtree {
-	uint16_t maxnodecap; ///< Maximum element count per node
-	qnode *root;         ///< Root node
-	qtree_fnc cmpfnc;    ///< Element range compare function pointer
-  qtree_rm rmfnc;
-} _qtree;
-
-typedef struct _qtree* qtree;
-
-/// Simple container for returning found elements
-typedef struct retlist {
-	uint32_t cnt; ///< Number of elements found
-	aabb range;   ///< Range to use for searching
-	void **list;  ///< Array of pointers to found elements
-} retlist;
-
-static void retlist_add(retlist *r, void *p) {
-	r->list = realloc(r->list, sizeof(void*)*(r->cnt+1));
-	r->list[r->cnt] = p;
-	r->cnt++;
-}
-
-static uint16_t qtree_getMaxNodeCnt(qtree q) {
-	uint16_t r;
-	r = q->maxnodecap;
-	return r;
-}
-
-static qnode* qnode_new(qtree p, float x, float y, float hW, float hH) {
-	qnode *q = malloc(sizeof(qnode));
-	memset(q, 0, sizeof(qnode));
-	q->bound.center.x = x;
-	q->bound.center.y = y;
-	q->bound.dims.w = hW;
-	q->bound.dims.h = hH;
-
-	return q;
-}
-
-static void qnode_free(qtree q, qnode *qn) {
-	if(qn->cnt)
-		free(qn->elist);
-
-	qn->cnt = 0;
-
-	if(qn->nw) {
-		qnode_free(q, qn->nw);
-		qnode_free(q, qn->ne);
-		qnode_free(q, qn->sw);
-		qnode_free(q, qn->se);
-	}
-
-	free(qn);
-}
-
-static void add(qnode *q, void *p) {
-	q->elist = realloc(q->elist, sizeof(void*)*(q->cnt+1));
-	q->elist[q->cnt] = p;
-	q->cnt++;
-}
-
-static void drop(qnode *q, uint16_t idx) {
-	void **narry = malloc(sizeof(void*)*(q->cnt-1));
-	
-	// This is a little (lot) ugly; a pair of memcpy's would be
-	// better, but I had some problems with it
-	for(uint16_t i=0,skip=0; i<q->cnt; i++) {
-		if(i == idx) { skip++; continue; }
-		narry[i-skip] = q->elist[i];
-	}
-	
-	void **old = q->elist;
-	q->elist = narry;
-	free(old);
-	q->cnt--;
-}
-
-static void subdivide(qtree p, qnode *q) {
-	float cx = q->bound.center.x;
-	float cy = q->bound.center.y;
-	float hw = q->bound.dims.w/2;
-	float hh = q->bound.dims.h/2;
-
-	q->nw = qnode_new(p, cx-hw, cy-hh, hw, hh);
-	q->ne = qnode_new(p, cx+hw, cy-hh, hw, hh);
-	q->sw = qnode_new(p, cx-hw, cy+hh, hw, hh);
-	q->se = qnode_new(p, cx+hw, cy+hh, hw, hh);
-}
-
-static int qnode_insert(qtree q, qnode *qn, void *ptr) {
-	int ret = 0;
-	
-	if(! (q->cmpfnc)(ptr, &qn->bound)) return 0;
-
-	if(qn->cnt < qtree_getMaxNodeCnt(q)) {
-		add(qn, ptr);
-    return 1;
-	}
-
-	if(! qn->nw)
-		subdivide(q, qn);
-
-	if(qnode_insert(q,qn->nw,ptr))
-		return 1;
-	else if(qnode_insert(q,qn->ne,ptr))
-		return 1;
-	else if(qnode_insert(q,qn->sw,ptr))
-		return 1;
-	else if(qnode_insert(q,qn->se,ptr))
-		return 1;
-}
-
-static void* qnode_remove(qtree q, qnode *qn, void *ptr) {
-	if(qn->cnt) {
-		for(uint16_t i=0; i<qn->cnt; i++) {
-			if(q->rmfnc(qn->elist[i], ptr)) {
-				drop(qn, i);
-				ptr = NULL;
-				goto QN_REM_EXIT;
-			}
-		}
-	}
-
-	if(! qn->nw)
-		return NULL;
-
-	if(qnode_remove(q, qn->nw, ptr)) return ptr;
-	if(qnode_remove(q, qn->ne, ptr)) return ptr;
-	if(qnode_remove(q, qn->sw, ptr)) return ptr;
-	if(qnode_remove(q, qn->se, ptr)) return ptr;
-
-	return NULL;
-	QN_REM_EXIT:
-	return ptr;
-}
-
-static void qnode_getInRange(qtree q, qnode *qn, retlist *r) {
-	if(qn->cnt) {
-		if(! aabb_intersects(&qn->bound, &r->range))
-			goto QN_GET_EXIT;
-
-		for(uint16_t i=0; i<qn->cnt; i++)
-			if((q->cmpfnc)(qn->elist[i], &r->range))
-				retlist_add(r, qn->elist[i]);
-	}
-
-	if(! qn->nw)
-		goto QN_GET_EXIT;
-
-	qnode_getInRange(q, qn->nw, r);
-	qnode_getInRange(q, qn->ne, r);
-	qnode_getInRange(q, qn->sw, r);
-	qnode_getInRange(q, qn->se, r);
-
-	QN_GET_EXIT:
-	return;
-}
-
-qtree qtree_new(float x, float y, float w, float h, qtree_fnc fnc, qtree_rm rm) {
-	qtree q = malloc(sizeof(_qtree));
-	memset(q, 0, sizeof(_qtree));
-
-	q->maxnodecap = QTREE_STDCAP;
-	q->cmpfnc = fnc;
-  q->rmfnc = rm;
-	q->root = qnode_new(q, x+(w/2),y+(h/2),w/2,h/2);
-
-	return q;
-}
-
-void qtree_destroy(qtree q) {
-	void *m;
-	if(q->root) qnode_free(q, q->root);
-	
-	memset(q, 0, sizeof(_qtree));
-	
-	free(q);
-}
-
-void qtree_insert(qtree q, void *ptr) {
-	qnode_insert(q, q->root, ptr);
-}
-
-void qtree_remove(qtree q, void *ptr) {
-	qnode_remove(q, q->root, ptr);
-}
-
-void qtree_setMaxNodeCnt(qtree q, uint16_t cnt) {
-	q->maxnodecap = cnt || 1;
-}
-
-void qtree_clear(qtree q) {
-	float x = q->root->bound.center.x;
-	float y = q->root->bound.center.y;
-	float w = q->root->bound.dims.w;
-	float h = q->root->bound.dims.h;
-	qnode *qn = q->root;
-	
-	q->root = qnode_new(q, x, y, w, h);
-
-	qnode_free(q, qn);
-}
-
-void** qtree_findInArea(qtree q, float x, float y, float w, float h, uint32_t *cnt) {
-	float hw = w/2;
-	float hh = h/2;
-
-	retlist ret;
-	memset(&ret, 0, sizeof(retlist));
-
-	ret.range.center.x = x+hw;
-	ret.range.center.y = y+hh;
-	ret.range.dims.w = hw;
-	ret.range.dims.h = hh;
-
-	qnode_getInRange(q, q->root, &ret);
-
-	*cnt = ret.cnt;
-	return ret.list;
-}
diff --git a/source/quadtree.h b/source/quadtree.h
deleted file mode 100644
index 35b28272..00000000
--- a/source/quadtree.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
-  quadtree.h
-  2014 JSK (kutani@projectkutani.com)
-
-  Part of the Panic Panic project.
-
-  Released to the public domain. See LICENSE for details.
-*/
-#ifndef _QUADTREE_H
- #define _QUADTREE_H
-
-#ifndef _AABB_H
- #include "aabb.h"
-#endif
-
-/// Opaque pointer to a quadtree data structure
-typedef struct _qtree* qtree;
-
-/// A function pointer def for determining if an element exists in a range
-typedef int (*qtree_fnc)(void *ptr, aabb *range);
-
-typedef int (*qtree_rm)(void *ptr, void *cmp);
-
-/// Create a new qtree
-/*!
-  Creates a new qtree with a bound of w,h size, centered at x,y.
-  
-  Uses the passed function pointer fnc to test elements against nodes
-  for insertion, and finding.
-
-  Returns a new qtree pointer.
-*/
-qtree qtree_new(float x, float y, float w, float h, qtree_fnc fnc, qtree_rm rm);
-
-void qtree_destroy(qtree q);
-
-/// Insert an element
-/*!
-  Inserts the passed element into quadtree q.
-
-  Uses the function passed to qtree_new() to determine where the
-  element should go.
-*/
-void qtree_insert(qtree q, void *ptr);
-
-/// Removes an element from the quadtree
-/*!
-  Performs a selective removal of the passed element.
-
-  Performs a naive pointer comparison and a depth-first search of the
-  tree, so this isn't very fast.
-*/
-void qtree_remove(qtree q, void *ptr);
-
-/// Set the maximum number of elements per node
-/*!
-  Sets the maximum elements per quadtree node.
-
-  The default is 4.
-*/
-void qtree_setMaxNodeCnt(qtree q, uint16_t cnt);
-
-/// Resets a quadtree
-/*!
-  Clears all nodes held by the quadtree and creates a fresh root node
-  with no elements assigned.
-*/
-void qtree_clear(qtree q);
-
-/// Find all elements within a rectangular bound
-/*!
-  Performs a search for any elements within the given x,y + w,h
-  bound. Returns an array of pointers to any elements (which should be
-  freed by the user), and places the number of elements in cnt.
-*/
-void** qtree_findInArea(qtree q, float x, float y, float w, float h, uint32_t *cnt);
-
-#endif
diff --git a/source/script.c b/source/script.c
index b2057728..bab86fe1 100644
--- a/source/script.c
+++ b/source/script.c
@@ -27,7 +27,7 @@ static JSRuntime *rt = NULL;
 
 JSContext *global_js = NULL;
 
-JSValue on_exception = JS_UNDEFINED;
+JSValue on_exception;
 
 #define ENGINE "scripts/core/engine.js"
 
@@ -158,6 +158,8 @@ void script_startup(int argc, char **argv) {
   JS_AddIntrinsicBigDecimal(js);
   JS_AddIntrinsicOperators(js);
 
+  on_exception = JS_UNDEFINED;
+
   ffi_load(js, argc, argv);
 
   PHYSFS_File *eng = PHYSFS_openRead(ENGINE);
diff --git a/source/spline.c b/source/spline.c
index ae9a64c3..1115f44d 100644
--- a/source/spline.c
+++ b/source/spline.c
@@ -3,318 +3,447 @@
 #include "transform.h"
 #include "math.h"
 
+/* -------------------------------------------------------------------------
+   Cubic Spline Basis Matrices
+   ------------------------------------------------------------------------- */
 static const HMM_Mat4 cubic_hermite_m = {
-  2, -2, 1, 1,
-  -3, 3, -2, -1,
-  0, 0, 1, 0,
-  1, 0, 0, 0
+    2,  -2,  1,  1,
+   -3,   3, -2, -1,
+    0,   0,  1,  0,
+    1,   0,  0,  0
 };
 
 static const HMM_Mat4 cubic_hermite_dm = {
-  0, 0, 0, 0,
-  6, -6, 3, 3,
-  -6, 6, -4, -2,
-  0, 0, 1, 0
+    0,   0,  0,  0,
+    6,  -6,  3,  3,
+   -6,   6, -4, -2,
+    0,   0,  1,  0
 };
 
 static const HMM_Mat4 cubic_hermite_ddm = {
-  0, 0, 0, 0,
-  0, 0, 0, 0,
-  12, -12, 6, 6,
-  -6, 6, -4, -2
+    0,   0,  0,  0,
+    0,   0,  0,  0,
+   12, -12,  6,  6,
+   -6,   6, -4, -2
 };
 
 static const HMM_Mat4 cubic_hermite_dddm = {
-  0, 0, 0, 0,
-  0, 0, 0, 0,
-  0, 0, 0, 0,
-  12, -12, 6, 6
+    0,   0,  0,  0,
+    0,   0,  0,  0,
+    0,   0,  0,  0,
+   12, -12,  6,  6
 };
 
 static const HMM_Mat4 b_spline_m = {
-  -1/6, 3/6, -3/6, 1,
-  3/6, -6/6, 3/6, 0,
-  -3/6, 0, 3/6, 0,
-  1/6, 4/6, 1/6, 0
+  -1.0f/6,  3.0f/6, -3.0f/6,  1.0f,
+   3.0f/6, -6.0f/6,  3.0f/6,  0.0f,
+  -3.0f/6,  0.0f,    3.0f/6,  0.0f,
+   1.0f/6,  4.0f/6,  1.0f/6,  0.0f
 };
 
 static const HMM_Mat4 b_spline_dm = {
-  0, 0, 0, 0,
-  -3/6, 9/6, -9/6, 3,
-  6/6, -12/6, 6/6, 0,
-  -3/6, 0, 3/6, 0
+   0,     0,     0,    0,
+  -3.0f/6, 9.0f/6, -9.0f/6, 3.0f,
+   6.0f/6, -12.0f/6, 6.0f/6, 0.0f,
+  -3.0f/6,  0.0f,   3.0f/6, 0.0f
 };
 
 static const HMM_Mat4 b_spline_ddm = {
-  0, 0, 0, 0,
-  0, 0, 0, 0,
-  -6/6, 18/6, -18/6, 6,
-  6/6, -12/6, 6/6, 0
+   0,    0,    0,    0,
+   0,    0,    0,    0,
+  -6.0f/6, 18.0f/6, -18.0f/6, 6.0f,
+   6.0f/6, -12.0f/6,  6.0f/6, 0.0f
 };
 
 static const HMM_Mat4 b_spline_dddm = {
-  0, 0, 0, 0,
-  0, 0, 0, 0,
-  0, 0, 0, 0,
-  -6/6, 18/6, -18/6, 6
+   0,    0,    0,    0,
+   0,    0,    0,    0,
+   0,    0,    0,    0,
+  -6.0f/6, 18.0f/6, -18.0f/6, 6.0f
 };
 
 static const HMM_Mat4 bezier_m = {
-  -1, 3, -3, 1,
-  3, -6, 3, 0,
-  -3, 3, 0, 0,
-  1, 0, 0, 0
+   -1,  3, -3,  1,
+    3, -6,  3,  0,
+   -3,  3,  0,  0,
+    1,  0,  0,  0
 };
 
 static const HMM_Mat4 bezier_dm = {
-  0, 0, 0, 0,
-  -3, 9, -9, 3,
-  6, -12, 6, 0,
-  -3, 3, 0, 0,
+    0,  0,  0,  0,
+   -3,  9, -9,  3,
+    6, -12,  6,  0,
+   -3,  3,  0,  0
 };
 
 static const HMM_Mat4 bezier_ddm = {
-  0, 0, 0, 0,
-  0, 0, 0, 0,
-  -6, 18, -18, 6,
-  6, -12, 6, 0
+    0,  0,   0,  0,
+    0,  0,   0,  0,
+   -6, 18, -18,  6,
+    6, -12,  6,  0
 };
 
 static const HMM_Mat4 bezier_dddm = {
-  0, 0, 0, 0,
-  0, 0, 0, 0,
-  0, 0, 0, 0,
-  -6, 18, -18, 6
+    0,  0,  0,  0,
+    0,  0,  0,  0,
+    0,  0,  0,  0,
+   -6, 18, -18,  6
 };
 
-#define CAT_S 0.5
+/* Catmull–Rom (with tension = 0.5 by default) */
+#define CAT_S 0.5f
 
-/* Position */
 static const HMM_Mat4 catmull_rom_m = {
-  -CAT_S, 2-CAT_S, CAT_S-2, CAT_S,
-  2*CAT_S, CAT_S-3, 3-2*CAT_S, -CAT_S,
-  -CAT_S, 0, CAT_S, 0,
-  0, 1, 0, 0
+  -CAT_S,       2-CAT_S,      CAT_S-2,  CAT_S,
+   2*CAT_S,     CAT_S-3,      3-2*CAT_S, -CAT_S,
+  -CAT_S,       0,            CAT_S,     0,
+   0,           1,            0,         0
 };
 
-/* Tangent */
 static const HMM_Mat4 catmull_rom_dm = {
-  0, 0, 0, 0,
-  -3*CAT_S, 9*CAT_S, -9*CAT_S, 3*CAT_S,
-  4*CAT_S, -10*CAT_S, 8*CAT_S, -2*CAT_S,
-  -CAT_S, 0, CAT_S, 0,
+    0,          0,          0,          0,
+  -3*CAT_S,   9*CAT_S,    -9*CAT_S,    3*CAT_S,
+   4*CAT_S,  -10*CAT_S,    8*CAT_S,   -2*CAT_S,
+  -CAT_S,      0,          CAT_S,      0
 };
 
-/* Curvature */
 static const HMM_Mat4 catmull_rom_ddm = {
-  0, 0, 0, 0,
-  0, 0, 0, 0,
-  -9*CAT_S, 18*CAT_S, -18*CAT_S, 6*CAT_S,
-  4*CAT_S, -10*CAT_S, 8*CAT_S, -2*CAT_S
+    0,        0,         0,         0,
+    0,        0,         0,         0,
+  -9*CAT_S, 18*CAT_S, -18*CAT_S,  6*CAT_S,
+   4*CAT_S, -10*CAT_S, 8*CAT_S,  -2*CAT_S
 };
 
-/* Wiggle */
 static const HMM_Mat4 catmull_rom_dddm = {
-  0, 0, 0, 0,
-  0, 0, 0, 0,
-  0, 0, 0, 0,
-  -9*CAT_S, 18*CAT_S, -18*CAT_S, 6*CAT_S
+    0,         0,         0,         0,
+    0,         0,         0,         0,
+    0,         0,         0,         0,
+  -9*CAT_S, 18*CAT_S, -18*CAT_S,  6*CAT_S
 };
 
-/*
-  [t3 t2 t1 1] B [p1
-      	          p2  that is, point 1, tangent at point 1, point 2, tan and point 2
-		  t1
-		  t2]
-
-*/
-
+/* -------------------------------------------------------------------------
+   Core "C*T" product: C times the column vector T = (t^3, t^2, t, 1)
+   ------------------------------------------------------------------------- */
 HMM_Vec4 spline_CT(HMM_Mat4 *C, float t)
 {
-  float t2 = t*t;
-  float t3 = t2*t;
-  HMM_Vec4 T = {t3, t2, t, 1};
-  return HMM_MulM4V4(*C, T);
+    float t2 = t * t;
+    float t3 = t2 * t;
+    HMM_Vec4 T = { t3, t2, t, 1.0f };
+    return HMM_MulM4V4(*C, T);
 }
 
+/* Construct the “geometry matrix” G from four 2D points, then multiply by B */
 HMM_Mat4 make_C(const HMM_Vec2 *p, const HMM_Mat4 *B)
 {
-  HMM_Mat4 G;
-  G.Columns[0].xy = p[0];
-  G.Columns[1].xy = p[1];
-  G.Columns[2].xy = p[2];
-  G.Columns[3].xy = p[3];
-  return HMM_MulM4(G, *B);
+    HMM_Mat4 G = HMM_M4(); // Zeroed out
+    // Only fill XY of each column; if you are storing 3D in HMM_Vec4, adapt as needed
+    G.Columns[0].XY = p[0];
+    G.Columns[1].XY = p[1];
+    G.Columns[2].XY = p[2];
+    G.Columns[3].XY = p[3];
+
+    return HMM_MulM4(G, *B);
 }
 
+/* Evaluate a single-segment cubic spline at parameter d in [0,1].
+   p must be 4 control points, m is the cubic basis matrix. */
 HMM_Vec2 cubic_spline_d(HMM_Vec2 *p, HMM_Mat4 *m, float d)
 {
-  HMM_Mat4 C = make_C(p, m);
-  return spline_CT(&C, d).xy;
+    HMM_Mat4 C = make_C(p, m);
+    HMM_Vec4 v4 = spline_CT(&C, d);
+    return v4.XY;
 }
 
+/* -------------------------------------------------------------------------
+   Convenience single-segment functions for each basis
+   (pos / tan / curv / wig all require the appropriate matrix)
+   Typically you pass p[0..3] as the 4 relevant control points.
+   ------------------------------------------------------------------------- */
+HMM_Vec2 cubic_hermite_pos(HMM_Vec2 *p, float d) { return cubic_spline_d(p, (HMM_Mat4 *)&cubic_hermite_m, d); }
+HMM_Vec2 cubic_hermite_tan(HMM_Vec2 *p, float d) { return cubic_spline_d(p, (HMM_Mat4 *)&cubic_hermite_dm, d); }
+HMM_Vec2 cubic_hermite_curv(HMM_Vec2 *p, float d){ return cubic_spline_d(p, (HMM_Mat4 *)&cubic_hermite_ddm, d); }
+HMM_Vec2 cubic_hermite_wig(HMM_Vec2 *p, float d) { return cubic_spline_d(p, (HMM_Mat4 *)&cubic_hermite_dddm, d); }
+
+HMM_Vec2 b_spline_pos(HMM_Vec2 *p, float d) { return cubic_spline_d(p, (HMM_Mat4 *)&b_spline_m, d); }
+HMM_Vec2 b_spline_tan(HMM_Vec2 *p, float d) { return cubic_spline_d(p, (HMM_Mat4 *)&b_spline_dm, d); }
+HMM_Vec2 b_spline_curv(HMM_Vec2 *p, float d){ return cubic_spline_d(p, (HMM_Mat4 *)&b_spline_ddm, d); }
+HMM_Vec2 b_spline_wig(HMM_Vec2 *p, float d) { return cubic_spline_d(p, (HMM_Mat4 *)&b_spline_dddm, d); }
+
+HMM_Vec2 bezier_pos(HMM_Vec2 *p, float d) { return cubic_spline_d(p, (HMM_Mat4 *)&bezier_m, d); }
+HMM_Vec2 bezier_tan(HMM_Vec2 *p, float d) { return cubic_spline_d(p, (HMM_Mat4 *)&bezier_dm, d); }
+HMM_Vec2 bezier_curv(HMM_Vec2 *p, float d){ return cubic_spline_d(p, (HMM_Mat4 *)&bezier_ddm, d); }
+HMM_Vec2 bezier_wig(HMM_Vec2 *p, float d) { return cubic_spline_d(p, (HMM_Mat4 *)&bezier_dddm, d); }
+
+/* -------------------------------------------------------------------------
+   Uniform sampling of one 4-point segment ("spline_v2")
+   ------------------------------------------------------------------------- */
 HMM_Vec2 *spline_v2(HMM_Vec2 *p, HMM_Mat4 *m, int segs)
 {
-  HMM_Vec2 *ret = NULL;
-  if (segs < 2) return NULL;
- 
-  HMM_Mat4 C = make_C(p, m);
-  float s = (float)1/segs;
+    // Sample one 4-point segment at exactly 'segs' uniform parameters t = i/segs,
+    // covering [0,1): the t = 1 endpoint is not emitted. Returns NULL if segs < 2.
+    HMM_Vec2 *ret = NULL;
+    if (segs < 2) return NULL;
 
-  for (float t = 0; t < 1; t += s)
-    arrput(ret, spline_CT(&C, t).xy);
-
-  return ret;
+    HMM_Mat4 C = make_C(p, m);
+    float s = 1.0f / (float)segs;
+    for (int i = 0; i < segs; i++)
+    {
+        float t = s * i;
+        arrput(ret, spline_CT(&C, t).XY);
+    }
+    return ret;
 }
 
+/* -------------------------------------------------------------------------
+   spline2d_min_seg: bisect [u0,u1] until each chord is <= min_seg, appending
+   chord end points. arrput may move the array, so use the returned pointer.
+   ------------------------------------------------------------------------- */
 HMM_Vec2 *spline2d_min_seg(float u0, float u1, float min_seg, HMM_Mat4 *C, HMM_Vec2 *ret)
 {
-  HMM_Vec2 a = spline_CT(C, u0).xy;
-  HMM_Vec2 b = spline_CT(C, u1).xy;
-  if (HMM_DistV2(a,b) > min_seg) {
-    float umid = (u0+u1)/2;
-    spline2d_min_seg(u0, umid, min_seg, C, ret);
-    spline2d_min_seg(umid, u1, min_seg, C, ret);
-  }
-  else
-    arrput(ret, b);
-
-  return ret;
+    HMM_Vec2 a = spline_CT(C, u0).XY;
+    HMM_Vec2 b = spline_CT(C, u1).XY;
+    if (HMM_DistV2(a, b) > min_seg)
+    {
+        float umid = 0.5f * (u0 + u1);
+        ret = spline2d_min_seg(u0,  umid, min_seg, C, ret);
+        ret = spline2d_min_seg(umid, u1,   min_seg, C, ret);
+    }
+    else
+    {
+        // Append only the end point 'b' so shared end points are not duplicated
+        arrput(ret, b);
+    }
+    return ret;
 }
 
+/* Example: catmull_rom_min_seg -> subdiv for catmull–rom over one segment
+   You would decide how to pick the 4 points from a,b,c,d, then run. */
 HMM_Vec2 *catmull_rom_min_seg(HMM_Vec2 *a, HMM_Vec2 *b, HMM_Vec2 *c, HMM_Vec2 *d, float min_seg)
 {
-  HMM_Vec2 *ret = NULL;
-  arrsetcap(ret, 1000);
-  arrput(ret, *b);
-//  spline2d_min_seg(0, 1, min_seg, &C, ret);
-  return ret;
+    HMM_Vec2 *ret = NULL;
+    arrsetcap(ret, 1000);
+
+    // Build the matrix for these four points
+    HMM_Vec2 p[4] = { *a, *b, *c, *d };
+    HMM_Mat4 C = make_C(p, &catmull_rom_m);
+
+    // Seed the output with the segment evaluated at parameter 0; the subdivision
+    // below only ever appends the end point of each accepted interval.
+    arrput(ret, cubic_spline_d(p, (HMM_Mat4*)&catmull_rom_m, 0.0f));
+
+    // Subdivide [0, 1]; keep the returned pointer because arrput may reallocate
+    ret = spline2d_min_seg(0.0f, 1.0f, min_seg, &C, ret);
+
+    return ret;
 }
 
-HMM_Vec2 *spline2d_min_angle_2(float u0, float u1, float max_angle, HMM_Mat4 *C, HMM_Vec2 *arr) 
+/* -------------------------------------------------------------------------
+   Adaptive subdivision by “max angle” proxy
+   (spline2d_min_angle_2) – for single 4-point segment
+   ------------------------------------------------------------------------- */
+HMM_Vec2 *spline2d_min_angle_2(float u0, float u1, float max_angle, HMM_Mat4 *C, HMM_Vec2 *arr)
 {
-  float ustep = (u1-u0)/4;
-  float um0 = u0+ustep;
-  float um1 = u0+(ustep*2);
-  float um2 = u0+(ustep*3);
-  
-  HMM_Vec2 m0 = spline_CT(C, um0)._2;
-  HMM_Vec2 m1 = spline_CT(C, um1)._2;
-  HMM_Vec2 m2 = spline_CT(C,um2)._2;
+    // Heuristic approach: sample midpoints, check “chord vs polyline” difference
+    float ustep = (u1 - u0) / 4.0f;
+    float um0 = u0 + ustep;
+    float um1 = u0 + 2.0f * ustep;
+    float um2 = u0 + 3.0f * ustep;
 
-  HMM_Vec2 a = spline_CT(C,u0)._2;
-  HMM_Vec2 b = spline_CT(C,u1)._2;
-  
-  float ab = HMM_DistV2(a,b);
-  float cdist = HMM_DistV2(a,m0) + HMM_DistV2(m0,m1) + HMM_DistV2(m1,m2) + HMM_DistV2(m2,b);
+    HMM_Vec2 m0 = spline_CT(C, um0).XY;
+    HMM_Vec2 m1 = spline_CT(C, um1).XY;
+    HMM_Vec2 m2 = spline_CT(C, um2).XY;
 
-  if (cdist-ab > max_angle) {
-    arr = spline2d_min_angle_2(u0,um1,max_angle,C,arr);
-    arr = spline2d_min_angle_2(um1,u1,max_angle,C,arr);    
-  } else
-    arrput(arr,b);
-  
-  return arr;
+    HMM_Vec2 a  = spline_CT(C, u0).XY;
+    HMM_Vec2 b  = spline_CT(C, u1).XY;
+
+    // Chord = distance from a to b
+    float chord   = HMM_DistV2(a, b);
+    // Polyline = a->m0->m1->m2->b
+    float cdist   = HMM_DistV2(a,  m0)
+                  + HMM_DistV2(m0, m1)
+                  + HMM_DistV2(m1, m2)
+                  + HMM_DistV2(m2, b);
+
+    // If the difference is bigger than some threshold (max_angle),
+    // subdivide. Otherwise, keep it.
+    if ((cdist - chord) > max_angle)
+    {
+        arr = spline2d_min_angle_2(u0,  um1, max_angle, C, arr);
+        arr = spline2d_min_angle_2(um1, u1,  max_angle, C, arr);
+    }
+    else
+    {
+        // We accept “b” as a new point
+        arrput(arr, b);
+    }
+    return arr;
 }
 
 HMM_Vec2 *spline_min_angle(HMM_Vec2 *p, const HMM_Mat4 *B, float min_angle, HMM_Vec2 *arr)
 {
-  HMM_Mat4 C = make_C(p, B);
-  arr = spline2d_min_angle_2(0,1,min_angle, &C, arr);
-  return arr;
+    HMM_Mat4 C = make_C(p, B);
+    // Subdivide from 0..1
+    float u0 = 0.0f, u1 = 1.0f;
+    // Usually we want to ensure the start point is in arr:
+    HMM_Vec2 startPt = spline_CT(&C, u0).XY;
+    if (arrlen(arr) == 0) {
+        arrput(arr, startPt);
+    }
+    // Now subdiv for angle
+    arr = spline2d_min_angle_2(u0, u1, min_angle, &C, arr);
+    return arr;
 }
 
+/* Example: catmull_rom_ma_v2 – uses “min_angle” over multiple segments 
+   Each 4 consecutive points is one segment. We do this for all segments. */
 HMM_Vec2 *catmull_rom_ma_v2(HMM_Vec2 *cp, float ma)
 {
-  if (arrlen(cp) < 4) return NULL;
-  HMM_Vec2 *ret = NULL;
+    int n = arrlen(cp);
+    if (n < 4) return NULL;
 
-  int segments = arrlen(cp)-3;
-  arrsetcap(ret,segments*(ma>=2 ? 3 : 7));  
-  arrput(ret, cp[1]); 
-  for (int i = 0; i < arrlen(cp)-3; i++)
-    ret = spline_min_angle(&cp[i], &catmull_rom_m, ma, ret);
+    HMM_Vec2 *ret = NULL;
+    // Pre-allocate some capacity
+    arrsetcap(ret, (n-3) * 8);
 
-  return ret;
+    // The first control point is not emitted here: it is not assumed to lie on the
+    // curve. spline_min_angle() pushes the curve's t = 0 point while the array is empty.
+
+    // For each window of four consecutive control points cp[i..i+3], adaptively
+    // sample that segment; the window advances by one point per iteration.
+    for (int i = 0; i < n - 3; i++)
+    {
+        // p[i..i+3]
+        ret = spline_min_angle(&cp[i], &catmull_rom_m, ma, ret);
+    }
+    return ret;
 }
 
+/* Example: do the same with Bezier in “cubic-bezier” style (control points in groups of 3 “handles”) */
 HMM_Vec2 *bezier_cb_ma_v2(HMM_Vec2 *cp, float ma)
 {
-  if (arrlen(cp) < 4) return NULL;
-  HMM_Vec2 *ret = NULL;
-  int segments = arrlen(cp)-3;
-  arrsetcap(ret,segments*(ma>=2?3:7));
-  arrput(ret,cp[0]);
-  for (int i = 0; i < arrlen(cp)-3; i+=3)
-    ret = spline_min_angle(&cp[i], &bezier_m, ma, ret);
+    int n = arrlen(cp);
+    // Segments are taken at i = 0, 3, 6, ... while i < n - 3, each starting at cp[i],
+    // so consecutive segments share a point (cp[3] ends the first and starts the second).
+    // Trailing points that do not complete another segment are ignored.
+    if (n < 4) return NULL;
 
-  return ret;
+    HMM_Vec2 *ret = NULL;
+    arrsetcap(ret, (n/3) * 8);
+
+    // First point
+    arrput(ret, cp[0]);
+
+    // For each cubic Bezier segment: i += 3
+    for (int i = 0; i < n - 3; i += 3)
+    {
+        ret = spline_min_angle(&cp[i], &bezier_m, ma, ret);
+    }
+    return ret;
 }
 
-HMM_Vec2 catmull_rom_query(HMM_Vec2 *cp, float d, const HMM_Mat4 *G)
+static HMM_Vec2 catmull_rom_query_internal(HMM_Vec2 *cp, float d, const HMM_Mat4 *M)
 {
-  if (arrlen(cp) < 4 || d < 0 || d > 1) return HMM_V2(0,0);
+    int n = arrlen(cp);
+    if (n < 4) return HMM_V2(0,0);
 
-  int segs = arrlen(cp)-3;
-  float d_per_seg = (float)1/segs;
-  float maxi = d_per_seg;
-  int p1 = 2;
-  while (maxi < d) {
-    maxi += d_per_seg;
-    p1++;
-  }
+    // Number of segments:
+    int seg_count = n - 3;
+    // Scale d in [0..1] -> which segment?
+    float segf = d * seg_count;
+    int seg_idx = (int) floorf(segf);
+    if (seg_idx >= seg_count) seg_idx = seg_count - 1;
+    if (seg_idx < 0) seg_idx = 0;
 
-   return cp[0];
-//  return cubic_spline_d(p0, cp[p1], cp[p1+1], p3, G, d);
+    // Local parameter in [0..1]
+    float u = segf - seg_idx;
+
+    // The control points for that segment are cp[ seg_idx .. seg_idx+3 ]
+    return cubic_spline_d(cp + seg_idx, (HMM_Mat4 *)M, u);
 }
 
-float spline_seglen(float t0, float t1, float max_angle, HMM_Mat4 *Cd, HMM_Mat4 *C)
+HMM_Vec2 catmull_rom_pos(HMM_Vec2 *cp, float d)
 {
-  float total = 0;
-  float step = 0.1;
-  for (float i = t0; i < t1; i += step)
-    total += HMM_LenV2(spline_CT(Cd, i).xy) * step;
-
-  return total;
-
-  /* Estimation via max angle */
-/*  float total = 0.0;
-  float tmid = (t0+t1)/2;
-  HMM_Vec2 a = spline_CT(C, t0).xy;
-  HMM_Vec2 b = spline_CT(C, t1).xy;
-  HMM_Vec2 m = spline_CT(C, tmid).xy;
-
-  if (HMM_AngleV2(m,b) > max_angle) {
-    total += spline_seglen(t0, tmid, max_angle, Cd, C);
-    total += spline_seglen(tmid, t1, max_angle, Cd, C);
-  } else
-   return HMM_LenV2(spline_CT(Cd, t0).xy)*(t1-t0);
-
-  return total;
-*/
+    return catmull_rom_query_internal(cp, d, &catmull_rom_m);
+}
+HMM_Vec2 catmull_rom_tan(HMM_Vec2 *cp, float d)
+{
+    return catmull_rom_query_internal(cp, d, &catmull_rom_dm);
+}
+HMM_Vec2 catmull_rom_curv(HMM_Vec2 *cp, float d)
+{
+    return catmull_rom_query_internal(cp, d, &catmull_rom_ddm);
+}
+HMM_Vec2 catmull_rom_wig(HMM_Vec2 *cp, float d)
+{
+    return catmull_rom_query_internal(cp, d, &catmull_rom_dddm);
 }
 
+/* -------------------------------------------------------------------------
+   Approximate length of a single 4-point cubic spline segment by
+   numeric integration (or sampling) of the velocity magnitude.
+   “spline_seglen” below does a quick sampling approach. 
+   ------------------------------------------------------------------------- */
+float spline_seglen(float t0, float t1, int steps, HMM_Mat4 *Cd, HMM_Mat4 *C)
+{
+    // Simple uniform sampling of the tangent magnitude
+    float total = 0.0f;
+    float dt = (t1 - t0) / (float) steps;
+    for (int i = 0; i < steps; i++)
+    {
+        float t = t0 + (i + 0.5f) * dt; // midpoint rule
+        // derivative at t
+        HMM_Vec2 vel = spline_CT(Cd, t).XY;
+        float speed = HMM_LenV2(vel);
+        total += speed * dt;
+    }
+    return total;
+}
+
+/* Summation of lengths across all Catmull–Rom segments. */
 float catmull_rom_len(HMM_Vec2 *cp)
 {
-  float len = 0.0;
-  int segs = arrlen(cp)-3;
-  float d_per_seg = (float)1/segs;
-  float maxi = d_per_seg;
-  for (int i = 0; i < arrlen(cp)-3; i++) {
-    HMM_Mat4 C = make_C(&cp[i], &catmull_rom_m);
-    HMM_Mat4 Cd = make_C(&cp[i], &catmull_rom_dm);
-    len += spline_seglen(0, 1, 0.1, &Cd, &C);
-  }
-  return len;
-}
+    int stepsPerSegment = 64;
+    float len = 0.0f;
+    int n = arrlen(cp);
+    if (n < 4) return 0.0f;
 
-/* d is from 0 to 1 for the entire spline */
-HMM_Vec2 catmull_rom_pos(HMM_Vec2 *cp, float d) { return catmull_rom_query(cp,d,&catmull_rom_m); }
-HMM_Vec2 catmull_rom_tan(HMM_Vec2 *cp, float d) { return catmull_rom_query(cp,d,&catmull_rom_dm); }
-HMM_Vec2 catmull_rom_curv(HMM_Vec2 *cp, float d) { return catmull_rom_query(cp,d,&catmull_rom_ddm); }
-HMM_Vec2 catmull_rom_wig(HMM_Vec2 *cp, float d) { return catmull_rom_query(cp,d,&catmull_rom_dddm); }
+    for (int i = 0; i < n - 3; i++)
+    {
+        // Build the position matrix & derivative matrix for this segment
+        HMM_Mat4 C  = make_C(&cp[i], &catmull_rom_m);
+        HMM_Mat4 Cd = make_C(&cp[i], &catmull_rom_dm);
+        // integrate from 0..1
+        len += spline_seglen(0.0f, 1.0f, stepsPerSegment, &Cd, &C);
+    }
+    return len;
+}
 
 HMM_Vec2 catmull_rom_closest(HMM_Vec2 *cp, HMM_Vec2 p)
 {
-  return p;
+    int n = arrlen(cp);
+    if (n < 4) return p;
+
+    float bestDist = FLT_MAX;
+    HMM_Vec2 bestPt = p;
+
+    int steps = 64; // more steps => more accurate
+    for (int seg = 0; seg < n - 3; seg++)
+    {
+        // Build a single-segment matrix
+        HMM_Vec2 segCP[4] = { cp[seg], cp[seg+1], cp[seg+2], cp[seg+3] };
+        HMM_Mat4 C  = make_C(segCP, &catmull_rom_m);
+        for (int i = 0; i <= steps; i++)
+        {
+            float t = (float)i / steps;
+            HMM_Vec2 pt = spline_CT(&C, t).XY;
+            float dist  = HMM_DistV2(p, pt);
+            if (dist < bestDist)
+            {
+                bestDist = dist;
+                bestPt   = pt;
+            }
+        }
+    }
+
+    return bestPt;
 }
diff --git a/source/spline.h b/source/spline.h
index 772843c2..4ee0e8c1 100644
--- a/source/spline.h
+++ b/source/spline.h
@@ -3,21 +3,83 @@
 
 #include "HandmadeMath.h"
 
-HMM_Vec2 *catmull_rom_ma_v2(HMM_Vec2 *cp, float ma);
-HMM_Vec3 *catmull_rom_ma_v3(HMM_Vec3 *cp, float ma);
-HMM_Vec4 *catmull_rom_ma_v4(HMM_Vec4 *cp, float ma);
+#ifdef __cplusplus
+extern "C" {
+#endif
 
+/*
+    Core spline API:
+*/
+
+// Adaptive Catmull–Rom in 2D / 3D / 4D (by minimum angle):
+HMM_Vec2 *catmull_rom_ma_v2(HMM_Vec2 *cp, float ma);
+HMM_Vec3 *catmull_rom_ma_v3(HMM_Vec3 *cp, float ma); /* not yet implemented in .c, placeholder */
+HMM_Vec4 *catmull_rom_ma_v4(HMM_Vec4 *cp, float ma); /* not yet implemented in .c, placeholder */
+
+// Adaptive Bezier in 2D (by minimum angle):
 HMM_Vec2 *bezier_cb_ma_v2(HMM_Vec2 *cp, float ma);
+
+// Generic “single-segment” query for 2D control points + basis matrix:
 HMM_Vec2 spline_query(HMM_Vec2 *cp, float d, HMM_Mat4 *basis);
 
-HMM_Vec2 catmull_rom_pos(HMM_Vec2 *cp, float d);
-HMM_Vec2 catmull_rom_tan(HMM_Vec2 *cp, float d);
-HMM_Vec2 catmull_rom_curv(HMM_Vec2 *cp, float d);
-HMM_Vec2 catmull_rom_wig(HMM_Vec2 *cp, float d);
+// Catmull–Rom “entire spline” queries:
+HMM_Vec2 catmull_rom_pos(HMM_Vec2 *cp, float d);   // position
+HMM_Vec2 catmull_rom_tan(HMM_Vec2 *cp, float d);   // tangent
+HMM_Vec2 catmull_rom_curv(HMM_Vec2 *cp, float d);  // 2nd derivative
+HMM_Vec2 catmull_rom_wig(HMM_Vec2 *cp, float d);   // 3rd derivative (“wiggle”)
 
+// Computes approximate length of a 2D Catmull–Rom spline:
 float catmull_rom_len(HMM_Vec2 *cp);
 
-/* Returns closest point on a curve given a point p */
+// Returns closest point on a 2D Catmull–Rom curve given an external 2D point `p`:
 HMM_Vec2 catmull_rom_closest(HMM_Vec2 *cp, HMM_Vec2 p);
 
+
+/*
+    Additional convenience functions for *single-segment* cubic splines:
+
+    Each of these expects exactly 4 control points in `p[0..3]`,
+    and a parameter d in [0..1]. They pick the appropriate matrix internally.
+*/
+
+// Hermite:
+HMM_Vec2 cubic_hermite_pos(HMM_Vec2 *p, float d);
+HMM_Vec2 cubic_hermite_tan(HMM_Vec2 *p, float d);
+HMM_Vec2 cubic_hermite_curv(HMM_Vec2 *p, float d);
+HMM_Vec2 cubic_hermite_wig(HMM_Vec2 *p, float d);
+
+// B-spline:
+HMM_Vec2 b_spline_pos(HMM_Vec2 *p, float d);
+HMM_Vec2 b_spline_tan(HMM_Vec2 *p, float d);
+HMM_Vec2 b_spline_curv(HMM_Vec2 *p, float d);
+HMM_Vec2 b_spline_wig(HMM_Vec2 *p, float d);
+
+// Bezier:
+HMM_Vec2 bezier_pos(HMM_Vec2 *p, float d);
+HMM_Vec2 bezier_tan(HMM_Vec2 *p, float d);
+HMM_Vec2 bezier_curv(HMM_Vec2 *p, float d);
+HMM_Vec2 bezier_wig(HMM_Vec2 *p, float d);
+
+
+/*
+    Uniform sampling of a *single* 4-point segment in 2D:
+    Returns an array of points (stb_ds dynamic array).
+*/
+HMM_Vec2 *spline_v2(HMM_Vec2 *p, HMM_Mat4 *m, int segs);
+
+/*
+    Adaptive subdivision routines (single-segment) in 2D:
+    - Subdivide by min segment length
+    - Subdivide by “max angle” proxy
+*/
+HMM_Vec2 *spline2d_min_seg(float u0, float u1, float min_seg, HMM_Mat4 *C, HMM_Vec2 *ret);
+HMM_Vec2 *catmull_rom_min_seg(HMM_Vec2 *a, HMM_Vec2 *b, HMM_Vec2 *c, HMM_Vec2 *d, float min_seg);
+
+HMM_Vec2 *spline2d_min_angle_2(float u0, float u1, float max_angle, HMM_Mat4 *C, HMM_Vec2 *arr);
+HMM_Vec2 *spline_min_angle(HMM_Vec2 *p, const HMM_Mat4 *B, float min_angle, HMM_Vec2 *arr);
+
+#ifdef __cplusplus
+}
 #endif
+
+#endif /* SPLINE_H */
diff --git a/source/sprite.c b/source/sprite.c
index e873b725..ad3ea069 100644
--- a/source/sprite.c
+++ b/source/sprite.c
@@ -1,17 +1,18 @@
 #include "sprite.h"
 
 static sprite model = {
-  .affine = {x:0,y:0,w:0,h:0},
-  .image = JS_UNDEFINED,
+  .affine = {.x = 0, .y = 0, .w = 0, .h = 0},
   .tex = NULL,
-  .uv = {x:0,y:0,w:1,h:1},
+  .uv = {.x = 0, .y = 0, .w = 1, .h = 1},
   .layer = 0,
-  .color = {1,1,1,1}
+  .color = {1, 1, 1, 1}
 };
+
 sprite *make_sprite()
 {
   sprite *sprite = malloc(sizeof(*sprite));
   *sprite = model;
+  sprite->image = JS_UNDEFINED;
   return sprite;
 }
 
diff --git a/source/thirdparty/stb/stb_c_lexer.h b/source/thirdparty/stb/stb_c_lexer.h
index bf89dca3..fd42f1c3 100644
--- a/source/thirdparty/stb/stb_c_lexer.h
+++ b/source/thirdparty/stb/stb_c_lexer.h
@@ -38,6 +38,7 @@
 // Contributors:
 //   Arpad Goretity (bugfix)
 //   Alan Hickman (hex floats)
+//   github:mundusnine (bugfix)
 //
 // LICENSE
 //
@@ -562,7 +563,6 @@ int stb_c_lexer_get_token(stb_lexer *lexer)
          {
             int n = 0;
             lexer->string = lexer->string_storage;
-            lexer->string_len = n;
             do {
                if (n+1 >= lexer->string_storage_len)
                   return stb__clex_token(lexer, CLEX_parse_error, p, p+n);
@@ -576,6 +576,7 @@ int stb_c_lexer_get_token(stb_lexer *lexer)
                 STB_C_LEX_DOLLAR_IDENTIFIER( || p[n] == '$' )
             );
             lexer->string[n] = 0;
+            lexer->string_len = n;
             return stb__clex_token(lexer, CLEX_id, p, p+n-1);
          }
 
diff --git a/source/thirdparty/stb/stb_image.h b/source/thirdparty/stb/stb_image.h
index a632d543..9eedabed 100644
--- a/source/thirdparty/stb/stb_image.h
+++ b/source/thirdparty/stb/stb_image.h
@@ -1,4 +1,4 @@
-/* stb_image - v2.29 - public domain image loader - http://nothings.org/stb
+/* stb_image - v2.30 - public domain image loader - http://nothings.org/stb
                                   no warranty implied; use at your own risk
 
    Do this:
@@ -48,6 +48,7 @@ LICENSE
 
 RECENT REVISION HISTORY:
 
+      2.30  (2024-05-31) avoid erroneous gcc warning
       2.29  (2023-05-xx) optimizations
       2.28  (2023-01-29) many error fixes, security errors, just tons of stuff
       2.27  (2021-07-11) document stbi_info better, 16-bit PNM support, bug fixes
@@ -5159,9 +5160,11 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
                // non-paletted with tRNS = constant alpha. if header-scanning, we can stop now.
                if (scan == STBI__SCAN_header) { ++s->img_n; return 1; }
                if (z->depth == 16) {
-                  for (k = 0; k < s->img_n; ++k) tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is
+                  for (k = 0; k < s->img_n && k < 3; ++k) // extra loop test to suppress false GCC warning
+                     tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is
                } else {
-                  for (k = 0; k < s->img_n; ++k) tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger
+                  for (k = 0; k < s->img_n && k < 3; ++k)
+                     tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger
                }
             }
             break;
diff --git a/source/thirdparty/stb/stb_image_resize2.h b/source/thirdparty/stb/stb_image_resize2.h
index 1cd379a7..2f262746 100644
--- a/source/thirdparty/stb/stb_image_resize2.h
+++ b/source/thirdparty/stb/stb_image_resize2.h
@@ -1,4 +1,4 @@
-/* stb_image_resize2 - v2.06 - public domain image resizing
+/* stb_image_resize2 - v2.12 - public domain image resizing
 
    by Jeff Roberts (v2) and Jorge L Rodriguez
    http://github.com/nothings/stb
@@ -11,35 +11,6 @@
          #define STB_IMAGE_RESIZE_IMPLEMENTATION
       before the #include. That will create the implementation in that file.
 
-   PORTING FROM VERSION 1
-
-      The API has changed. You can continue to use the old version of stb_image_resize.h,
-      which is available in the "deprecated/" directory.
-
-      If you're using the old simple-to-use API, porting is straightforward.
-      (For more advanced APIs, read the documentation.)
-
-        stbir_resize_uint8():
-          - call `stbir_resize_uint8_linear`, cast channel count to `stbir_pixel_layout`
-
-        stbir_resize_float():
-          - call `stbir_resize_float_linear`, cast channel count to `stbir_pixel_layout`
-
-        stbir_resize_uint8_srgb():
-          - function name is unchanged
-          - cast channel count to `stbir_pixel_layout`
-          - above is sufficient unless your image has alpha and it's not RGBA/BGRA
-            - in that case, follow the below instructions for stbir_resize_uint8_srgb_edgemode
-
-        stbir_resize_uint8_srgb_edgemode()
-          - switch to the "medium complexity" API
-          - stbir_resize(), very similar API but a few more parameters:
-            - pixel_layout: cast channel count to `stbir_pixel_layout`
-            - data_type:    STBIR_TYPE_UINT8_SRGB
-            - edge:         unchanged (STBIR_EDGE_WRAP, etc.)
-            - filter:       STBIR_FILTER_DEFAULT
-          - which channel is alpha is specified in stbir_pixel_layout, see enum for details
-
    EASY API CALLS:
      Easy API downsamples w/Mitchell filter, upsamples w/cubic interpolation, clamps to edge.
 
@@ -296,6 +267,34 @@
       ASSERT
          Define STBIR_ASSERT(boolval) to override assert() and not use assert.h
 
+     PORTING FROM VERSION 1
+        The API has changed. You can continue to use the old version of stb_image_resize.h,
+        which is available in the "deprecated/" directory.
+
+        If you're using the old simple-to-use API, porting is straightforward.
+        (For more advanced APIs, read the documentation.)
+
+          stbir_resize_uint8():
+            - call `stbir_resize_uint8_linear`, cast channel count to `stbir_pixel_layout`
+
+          stbir_resize_float():
+            - call `stbir_resize_float_linear`, cast channel count to `stbir_pixel_layout`
+
+          stbir_resize_uint8_srgb():
+            - function name is unchanged
+            - cast channel count to `stbir_pixel_layout`
+            - above is sufficient unless your image has alpha and it's not RGBA/BGRA
+              - in that case, follow the below instructions for stbir_resize_uint8_srgb_edgemode
+
+          stbir_resize_uint8_srgb_edgemode()
+            - switch to the "medium complexity" API
+            - stbir_resize(), very similar API but a few more parameters:
+              - pixel_layout: cast channel count to `stbir_pixel_layout`
+              - data_type:    STBIR_TYPE_UINT8_SRGB
+              - edge:         unchanged (STBIR_EDGE_WRAP, etc.)
+              - filter:       STBIR_FILTER_DEFAULT
+            - which channel is alpha is specified in stbir_pixel_layout, see enum for details
+
       FUTURE TODOS
         *  For polyphase integral filters, we just memcpy the coeffs to dupe
            them, but we should indirect and use the same coeff memory.
@@ -320,7 +319,7 @@
 
    CONTRIBUTORS
       Jeff Roberts: 2.0 implementation, optimizations, SIMD
-      Martins Mozeiko: NEON simd, WASM simd, clang and GCC whisperer.
+      Martins Mozeiko: NEON simd, WASM simd, clang and GCC whisperer
       Fabian Giesen: half float and srgb converters
       Sean Barrett: API design, optimizations
       Jorge L Rodriguez: Original 1.0 implementation
@@ -328,15 +327,30 @@
       Nathan Reed: warning fixes for 1.0
 
    REVISIONS
-      2.06 (2024-02-10) fix for indentical width/height 3x or more down-scaling 
-                          undersampling a single row on rare resize ratios (about 1%)
-      2.05 (2024-02-07) fix for 2 pixel to 1 pixel resizes with wrap (thanks Aras)
-                        fix for output callback (thanks Julien Koenen)
+      2.12 (2024-10-18) fix incorrect use of user_data with STBIR_FREE
+      2.11 (2024-09-08) fix harmless asan warnings in 2-channel and 3-channel mode
+                          with AVX-2, fix some weird scaling edge conditions with
+                          point sample mode.
+      2.10 (2024-07-27) fix the defines GCC and mingw for loop unroll control,
+                          fix MSVC 32-bit arm half float routines.
+      2.09 (2024-06-19) fix the defines for 32-bit ARM GCC builds (was selecting
+                          hardware half floats).
+      2.08 (2024-06-10) fix for RGB->BGR three channel flips and add SIMD (thanks
+                          to Ryan Salsbury), fix for sub-rect resizes, use the
+                          pragmas to control unrolling when they are available.
+      2.07 (2024-05-24) fix for slow final split during threaded conversions of very 
+                          wide scanlines when downsampling (caused by extra input 
+                          converting), fix for wide scanline resamples with many 
+                          splits (int overflow), fix GCC warning.
+      2.06 (2024-02-10) fix for identical width/height 3x or more down-scaling 
+                          undersampling a single row on rare resize ratios (about 1%).
+      2.05 (2024-02-07) fix for 2 pixel to 1 pixel resizes with wrap (thanks Aras),
+                        fix for output callback (thanks Julien Koenen).
       2.04 (2023-11-17) fix for rare AVX bug, shadowed symbol (thanks Nikola Smiljanic).
       2.03 (2023-11-01) ASAN and TSAN warnings fixed, minor tweaks.
       2.00 (2023-10-10) mostly new source: new api, optimizations, simd, vertical-first, etc
-                          (2x-5x faster without simd, 4x-12x faster with simd)
-                          (in some cases, 20x to 40x faster - resizing to very small for example)
+                          2x-5x faster without simd, 4x-12x faster with simd,
+                          in some cases, 20x to 40x faster esp resizing large to very small.
       0.96 (2019-03-04) fixed warnings
       0.95 (2017-07-23) fixed warnings
       0.94 (2017-03-18) fixed warnings
@@ -406,13 +420,13 @@ typedef uint64_t stbir_uint64;
   #endif
 #endif
 
-#if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || defined(_M_ARM) || (__ARM_NEON_FP & 4) != 0 &&  __ARM_FP16_FORMAT_IEEE != 0
+#if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || ((__ARM_NEON_FP & 4) != 0) || defined(__ARM_NEON__)
 #ifndef STBIR_NEON
 #define STBIR_NEON
 #endif
 #endif
 
-#if defined(_M_ARM)
+#if defined(_M_ARM) || defined(__arm__)
 #ifdef STBIR_USE_FMA
 #undef STBIR_USE_FMA // no FMA for 32-bit arm on MSVC
 #endif
@@ -1068,7 +1082,7 @@ struct stbir__info
   stbir__alpha_unweight_func * alpha_unweight;
   stbir__encode_pixels_func * encode_pixels;
 
-  int alloced_total;
+  int alloc_ring_buffer_num_entries;    // Number of entries in the ring buffer that will be allocated
   int splits; // count of splits
 
   stbir_internal_pixel_layout input_pixel_layout_internal;
@@ -1079,7 +1093,7 @@ struct stbir__info
   int vertical_first;
   int channels;
   int effective_channels; // same as channels, except on RGBA/ARGB (7), or XA/AX (3)
-  int alloc_ring_buffer_num_entries;    // Number of entries in the ring buffer that will be allocated
+  size_t alloced_total;
 };
 
 
@@ -1090,10 +1104,11 @@ struct stbir__info
 #define stbir__small_float ((float)1 / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20))
 
 // min/max friendly
-#define STBIR_CLAMP(x, xmin, xmax) do { \
+#define STBIR_CLAMP(x, xmin, xmax) for(;;) { \
   if ( (x) < (xmin) ) (x) = (xmin);     \
   if ( (x) > (xmax) ) (x) = (xmax);     \
-} while (0)
+  break;                                \
+}
 
 static stbir__inline int stbir__min(int a, int b)
 {
@@ -1190,19 +1205,40 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
 #define STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS 4 // when threading, what is the minimum number of scanlines for a split?
 #endif
 
-// restrict pointers for the output pointers
+// restrict pointers for the output pointers, other loop and unroll control
 #if defined( _MSC_VER ) && !defined(__clang__)
   #define STBIR_STREAMOUT_PTR( star ) star __restrict
   #define STBIR_NO_UNROLL( ptr ) __assume(ptr) // this oddly keeps msvc from unrolling a loop
-#elif defined(  __clang__ )
-  #define STBIR_STREAMOUT_PTR( star ) star __restrict__
-  #define STBIR_NO_UNROLL( ptr ) __asm__ (""::"r"(ptr))
-#elif defined(  __GNUC__ )
+  #if _MSC_VER >= 1900
+    #define STBIR_NO_UNROLL_LOOP_START __pragma(loop( no_vector )) 
+  #else
+    #define STBIR_NO_UNROLL_LOOP_START 
+  #endif
+#elif defined( __clang__ )
+  #define STBIR_STREAMOUT_PTR( star ) star __restrict__
+  #define STBIR_NO_UNROLL( ptr ) __asm__ (""::"r"(ptr)) 
+  #if ( __clang_major__ >= 4 ) || ( ( __clang_major__ >= 3 ) && ( __clang_minor__ >= 5 ) )
+    #define STBIR_NO_UNROLL_LOOP_START _Pragma("clang loop unroll(disable)") _Pragma("clang loop vectorize(disable)")
+  #else
+    #define STBIR_NO_UNROLL_LOOP_START
+  #endif 
+#elif defined( __GNUC__ )
   #define STBIR_STREAMOUT_PTR( star ) star __restrict__
   #define STBIR_NO_UNROLL( ptr ) __asm__ (""::"r"(ptr))
+  #if __GNUC__ >= 14
+    #define STBIR_NO_UNROLL_LOOP_START _Pragma("GCC unroll 0") _Pragma("GCC novector")
+  #else
+    #define STBIR_NO_UNROLL_LOOP_START
+  #endif
+  #define STBIR_NO_UNROLL_LOOP_START_INF_FOR
 #else
   #define STBIR_STREAMOUT_PTR( star ) star
   #define STBIR_NO_UNROLL( ptr )
+  #define STBIR_NO_UNROLL_LOOP_START
+#endif
+
+#ifndef STBIR_NO_UNROLL_LOOP_START_INF_FOR
+#define STBIR_NO_UNROLL_LOOP_START_INF_FOR STBIR_NO_UNROLL_LOOP_START
 #endif
 
 #ifdef STBIR_NO_SIMD // force simd off for whatever reason
@@ -1754,11 +1790,19 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
           ((stbir_uint64)(4*b+0)<<32) | ((stbir_uint64)(4*b+1)<<40) | ((stbir_uint64)(4*b+2)<<48) | ((stbir_uint64)(4*b+3)<<56)), \
         vcreate_u8( (4*c+0) | ((4*c+1)<<8) | ((4*c+2)<<16) | ((4*c+3)<<24) | \
           ((stbir_uint64)(4*d+0)<<32) | ((stbir_uint64)(4*d+1)<<40) | ((stbir_uint64)(4*d+2)<<48) | ((stbir_uint64)(4*d+3)<<56) ) )
+
+      static stbir__inline uint8x16x2_t stbir_make16x2(float32x4_t rega,float32x4_t regb)
+      {
+        uint8x16x2_t r = { vreinterpretq_u8_f32(rega), vreinterpretq_u8_f32(regb) };
+        return r;
+      }
     #else
       #define stbir_make16(a,b,c,d) (uint8x16_t){4*a+0,4*a+1,4*a+2,4*a+3,4*b+0,4*b+1,4*b+2,4*b+3,4*c+0,4*c+1,4*c+2,4*c+3,4*d+0,4*d+1,4*d+2,4*d+3}
+      #define stbir_make16x2(a,b) (uint8x16x2_t){{vreinterpretq_u8_f32(a),vreinterpretq_u8_f32(b)}}
     #endif
 
     #define stbir__simdf_swiz( reg, one, two, three, four ) vreinterpretq_f32_u8( vqtbl1q_u8( vreinterpretq_u8_f32(reg), stbir_make16(one, two, three, four) ) )
+    #define stbir__simdf_swiz2( rega, regb, one, two, three, four ) vreinterpretq_f32_u8( vqtbl2q_u8( stbir_make16x2(rega,regb), stbir_make16(one, two, three, four) ) )
 
     #define stbir__simdi_16madd( out, reg0, reg1 ) \
     { \
@@ -2142,7 +2186,7 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
 #endif
 
 
-#if defined(STBIR_NEON) && !defined(_M_ARM)
+#if defined(STBIR_NEON) && !defined(_M_ARM) && !defined(__arm__)
 
   #if defined( _MSC_VER ) && !defined(__clang__)
   typedef __int16 stbir__FP16;
@@ -2159,7 +2203,7 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
 
 #endif
 
-#if !defined(STBIR_NEON) && !defined(STBIR_FP16C) || defined(STBIR_NEON) && defined(_M_ARM)
+#if (!defined(STBIR_NEON) && !defined(STBIR_FP16C)) || (defined(STBIR_NEON) && defined(_M_ARM)) || (defined(STBIR_NEON) && defined(__arm__))
 
   // Fabian's half float routines, see: https://gist.github.com/rygorous/2156668
 
@@ -2386,24 +2430,6 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
     stbir__simdi_store( output,final );
   }
 
-#elif defined(STBIR_WASM) || (defined(STBIR_NEON) && defined(_MSC_VER) && defined(_M_ARM)) // WASM or 32-bit ARM on MSVC/clang
-
-  static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input)
-  {
-    for (int i=0; i<8; i++)
-    {
-      output[i] = stbir__half_to_float(input[i]);
-    }
-  }
-
-  static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input)
-  {
-    for (int i=0; i<8; i++)
-    {
-      output[i] = stbir__float_to_half(input[i]);
-    }
-  }
-
 #elif defined(STBIR_NEON) && defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) // 64-bit ARM on MSVC (not clang)
 
   static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input)
@@ -2432,7 +2458,7 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
     return vget_lane_f16(vcvt_f16_f32(vdupq_n_f32(f)), 0).n16_u16[0];
   }
 
-#elif defined(STBIR_NEON) // 64-bit ARM
+#elif defined(STBIR_NEON) && ( defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) ) // 64-bit ARM
 
   static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input)
   {
@@ -2458,6 +2484,23 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
     return vget_lane_f16(vcvt_f16_f32(vdupq_n_f32(f)), 0);
   }
 
+#elif defined(STBIR_WASM) || (defined(STBIR_NEON) && (defined(_MSC_VER) || defined(_M_ARM) || defined(__arm__))) // WASM or 32-bit ARM on MSVC/clang
+
+  static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input)
+  {
+    for (int i=0; i<8; i++)
+    {
+      output[i] = stbir__half_to_float(input[i]);
+    }
+  }
+  static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input)
+  {
+    for (int i=0; i<8; i++)
+    {
+      output[i] = stbir__float_to_half(input[i]);
+    }
+  }
+
 #endif
 
 
@@ -2510,11 +2553,13 @@ static const STBIR__SIMDI_CONST(STBIR_topscale,      0x02000000);
 //   Adding this switch saves about 5K on clang which is Captain Unroll the 3rd.
 #define STBIR_SIMD_STREAMOUT_PTR( star )  STBIR_STREAMOUT_PTR( star )
 #define STBIR_SIMD_NO_UNROLL(ptr) STBIR_NO_UNROLL(ptr)
+#define STBIR_SIMD_NO_UNROLL_LOOP_START STBIR_NO_UNROLL_LOOP_START
+#define STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR STBIR_NO_UNROLL_LOOP_START_INF_FOR
 
 #ifdef STBIR_MEMCPY
 #undef STBIR_MEMCPY
-#define STBIR_MEMCPY stbir_simd_memcpy
 #endif
+#define STBIR_MEMCPY stbir_simd_memcpy
 
 // override normal use of memcpy with much simpler copy (faster and smaller with our sized copies)
 static void stbir_simd_memcpy( void * dest, void const * src, size_t bytes )
@@ -2532,6 +2577,7 @@ static void stbir_simd_memcpy( void * dest, void const * src, size_t bytes )
     {
       if ( bytes )
       {
+        STBIR_SIMD_NO_UNROLL_LOOP_START
         do
         {
           STBIR_SIMD_NO_UNROLL(d);
@@ -2546,8 +2592,9 @@ static void stbir_simd_memcpy( void * dest, void const * src, size_t bytes )
       // do one unaligned to get us aligned for the stream out below
       stbir__simdf_load( x, ( d + ofs_to_src ) );
       stbir__simdf_store( d, x );
-      d = (char*)( ( ( (ptrdiff_t)d ) + 16 ) & ~15 );
+      d = (char*)( ( ( (size_t)d ) + 16 ) & ~15 );
 
+      STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
       for(;;)
       {
         STBIR_SIMD_NO_UNROLL(d);
@@ -2578,8 +2625,9 @@ static void stbir_simd_memcpy( void * dest, void const * src, size_t bytes )
     stbir__simdfX_store( d +  4*stbir__simdfX_float_count, x1 );
     stbir__simdfX_store( d +  8*stbir__simdfX_float_count, x2 );
     stbir__simdfX_store( d + 12*stbir__simdfX_float_count, x3 );
-    d = (char*)( ( ( (ptrdiff_t)d ) + (16*stbir__simdfX_float_count) ) & ~((16*stbir__simdfX_float_count)-1) );
+    d = (char*)( ( ( (size_t)d ) + (16*stbir__simdfX_float_count) ) & ~((16*stbir__simdfX_float_count)-1) );
 
+    STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
     for(;;)
     {
       STBIR_SIMD_NO_UNROLL(d);
@@ -2616,6 +2664,7 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte
   if ( ofs_to_dest >= 16 ) // is the overlap more than 16 away?
   {
     char STBIR_SIMD_STREAMOUT_PTR( * ) s_end16 = ((char*) src) + (bytes&~15);
+    STBIR_SIMD_NO_UNROLL_LOOP_START
     do
     {
       stbir__simdf x;
@@ -2642,12 +2691,16 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte
 // when in scalar mode, we let unrolling happen, so this macro just does the __restrict
 #define STBIR_SIMD_STREAMOUT_PTR( star ) STBIR_STREAMOUT_PTR( star )
 #define STBIR_SIMD_NO_UNROLL(ptr)
+#define STBIR_SIMD_NO_UNROLL_LOOP_START
+#define STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
 
 #endif // SSE2
 
 
 #ifdef STBIR_PROFILE
 
+#ifndef STBIR_PROFILE_FUNC
+
 #if defined(_x86_64) || defined( __x86_64__ ) || defined( _M_X64 ) || defined(__x86_64) || defined(__SSE2__) || defined(STBIR_SSE) || defined( _M_IX86_FP ) || defined(__i386) || defined( __i386__ ) || defined( _M_IX86 ) || defined( _X86_ )
 
 #ifdef _MSC_VER
@@ -2687,8 +2740,9 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte
 
 #error Unknown platform for profiling.
 
-#endif  //x64 and
+#endif  // x64, arm
 
+#endif // STBIR_PROFILE_FUNC
 
 #define STBIR_ONLY_PROFILE_GET_SPLIT_INFO ,stbir__per_split_info * split_info
 #define STBIR_ONLY_PROFILE_SET_SPLIT_INFO ,split_info
@@ -2753,7 +2807,7 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte
 
 #ifndef STBIR_SIMD
 
-// memcpy that is specically intentionally overlapping (src is smaller then dest, so can be
+// memcpy that is specifically intentionally overlapping (src is smaller than dest, so can be
 //   a normal forward copy, bytes is divisible by 4 and bytes is greater than or equal to
 //   the diff between dest and src)
 static void stbir_overlapping_memcpy( void * dest, void const * src, size_t bytes )
@@ -2765,6 +2819,7 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte
   if ( ofs_to_dest >= 8 ) // is the overlap more than 8 away?
   {
     char STBIR_SIMD_STREAMOUT_PTR( * ) s_end8 = ((char*) src) + (bytes&~7);
+    STBIR_NO_UNROLL_LOOP_START
     do
     {
       STBIR_NO_UNROLL(sd);
@@ -2776,6 +2831,7 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte
       return;
   }
 
+  STBIR_NO_UNROLL_LOOP_START
   do
   {
     STBIR_NO_UNROLL(sd);
@@ -2880,13 +2936,6 @@ static float stbir__filter_mitchell(float x, float s, void * user_data)
   return (0.0f);
 }
 
-static float stbir__support_zero(float s, void * user_data)
-{
-  STBIR__UNUSED(s);
-  STBIR__UNUSED(user_data);
-  return 0;
-}
-
 static float stbir__support_zeropoint5(float s, void * user_data)
 {
   STBIR__UNUSED(s);
@@ -3201,6 +3250,7 @@ static void stbir__calculate_in_pixel_range( int * first_pixel, int * last_pixel
 
   first = (int)(STBIR_FLOORF(in_pixel_influence_lowerbound + 0.5f));
   last = (int)(STBIR_FLOORF(in_pixel_influence_upperbound - 0.5f));
+  if ( last < first ) last = first; // point sample mode can span a value *right* at 0.5, and cause these to cross
 
   if ( edge == STBIR_EDGE_WRAP )
   {
@@ -3236,6 +3286,11 @@ static void stbir__calculate_coefficients_for_gather_upsample( float out_filter_
 
     stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, out_pixel_center, out_filter_radius, inv_scale, out_shift, input_size, edge );
 
+    // make sure we never generate a range larger than our precalculated coeff width
+    //   this only happens in point sample mode, but it's a good safe thing to do anyway
+    if ( ( in_last_pixel - in_first_pixel + 1 ) > coefficient_width )
+      in_last_pixel = in_first_pixel + coefficient_width - 1;
+
     last_non_zero = -1;
     for (i = 0; i <= in_last_pixel - in_first_pixel; i++)
     {
@@ -3271,19 +3326,22 @@ static void stbir__calculate_coefficients_for_gather_upsample( float out_filter_
   }
 }
 
-static void stbir__insert_coeff( stbir__contributors * contribs, float * coeffs, int new_pixel, float new_coeff )
+static void stbir__insert_coeff( stbir__contributors * contribs, float * coeffs, int new_pixel, float new_coeff, int max_width )
 {
   if ( new_pixel <= contribs->n1 )  // before the end
   {
     if ( new_pixel < contribs->n0 ) // before the front?
     {
-      int j, o = contribs->n0 - new_pixel;
-      for ( j = contribs->n1 - contribs->n0 ; j <= 0 ; j-- )
-        coeffs[ j + o ] = coeffs[ j ];
-      for ( j = 1 ; j < o ; j-- )
-        coeffs[ j ] = coeffs[ 0 ];
-      coeffs[ 0 ] = new_coeff;
-      contribs->n0 = new_pixel;
+      if ( ( contribs->n1 - new_pixel + 1 ) <= max_width )
+      { 
+        int j, o = contribs->n0 - new_pixel;
+        for ( j = contribs->n1 - contribs->n0 ; j <= 0 ; j-- )
+          coeffs[ j + o ] = coeffs[ j ];
+        for ( j = 1 ; j < o ; j-- )
+          coeffs[ j ] = coeffs[ 0 ];
+        coeffs[ 0 ] = new_coeff;
+        contribs->n0 = new_pixel;
+      }
     }
     else
     {
@@ -3292,12 +3350,15 @@ static void stbir__insert_coeff( stbir__contributors * contribs, float * coeffs,
   }
   else
   {
-    int j, e = new_pixel - contribs->n0;
-    for( j = ( contribs->n1 - contribs->n0 ) + 1 ; j < e ; j++ ) // clear in-betweens coeffs if there are any
-      coeffs[j] = 0;
+    if ( ( new_pixel - contribs->n0 + 1 ) <= max_width )
+    {
+      int j, e = new_pixel - contribs->n0;
+      for( j = ( contribs->n1 - contribs->n0 ) + 1 ; j < e ; j++ ) // clear in-betweens coeffs if there are any
+        coeffs[j] = 0;
 
-    coeffs[ e ] = new_coeff;
-    contribs->n1 = new_pixel;
+      coeffs[ e ] = new_coeff;
+      contribs->n1 = new_pixel;
+    }
   }
 }
 
@@ -3476,6 +3537,7 @@ static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter
 
   coeffs = coefficient_group;
   contribs = contributors;
+
   for (n = 0; n < num_contributors; n++)
   {
     int i;
@@ -3515,7 +3577,7 @@ static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter
         int endi = contribs->n1;
         contribs->n1 = input_last_n1;
         for( i = input_size; i <= endi; i++ )
-          stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( i, input_size ), coeffs[i-start] );
+          stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( i, input_size ), coeffs[i-start], coefficient_width );
       }
 
       // now check left hand edge
@@ -3527,7 +3589,7 @@ static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter
 
         // reinsert the coeffs with it reflected or clamped (insert accumulates, if the coeffs exist)
         for( i = -1 ; i > contribs->n0 ; i-- )
-          stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( i, input_size ), *c-- );
+          stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( i, input_size ), *c--, coefficient_width );
         save_n0 = contribs->n0;
         save_n0_coeff = c[0]; // save it, since we didn't do the final one (i==n0), because there might be too many coeffs to hold (before we resize)!
 
@@ -3537,7 +3599,7 @@ static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter
           coeffs[i] = coeffs[i-save_n0];
 
         // now that we have shrunk down the contribs, we insert the first one safely
-        stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( save_n0, input_size ), save_n0_coeff );
+        stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( save_n0, input_size ), save_n0_coeff, coefficient_width );
       }
     }
 
@@ -3546,6 +3608,7 @@ static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter
       int diff = contribs->n1 - contribs->n0 + 1;
       while ( diff && ( coeffs[ diff-1 ] == 0.0f ) )
         --diff;
+
       contribs->n1 = contribs->n0 + diff - 1;
 
       if ( contribs->n0 <= contribs->n1 )
@@ -3594,6 +3657,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
     switch( widest )
     {
       case 1:
+        STBIR_NO_UNROLL_LOOP_START
         do {
           STBIR_MOVE_1( pc, coeffs );
           ++pc;
@@ -3601,6 +3665,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
         } while ( pc < pc_end );
         break;
       case 2:
+        STBIR_NO_UNROLL_LOOP_START
         do {
           STBIR_MOVE_2( pc, coeffs );
           pc += 2;
@@ -3608,6 +3673,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
         } while ( pc < pc_end );
         break;
       case 3:
+        STBIR_NO_UNROLL_LOOP_START
         do {
           STBIR_MOVE_2( pc, coeffs );
           STBIR_MOVE_1( pc+2, coeffs+2 );
@@ -3616,6 +3682,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
         } while ( pc < pc_end );
         break;
       case 4:
+        STBIR_NO_UNROLL_LOOP_START
         do {
           STBIR_MOVE_4( pc, coeffs );
           pc += 4;
@@ -3623,6 +3690,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
         } while ( pc < pc_end );
         break;
       case 5:
+        STBIR_NO_UNROLL_LOOP_START
         do {
           STBIR_MOVE_4( pc, coeffs );
           STBIR_MOVE_1( pc+4, coeffs+4 );
@@ -3631,6 +3699,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
         } while ( pc < pc_end );
         break;
       case 6:
+        STBIR_NO_UNROLL_LOOP_START
         do {
           STBIR_MOVE_4( pc, coeffs );
           STBIR_MOVE_2( pc+4, coeffs+4 );
@@ -3639,6 +3708,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
         } while ( pc < pc_end );
         break;
       case 7:
+        STBIR_NO_UNROLL_LOOP_START
         do {
           STBIR_MOVE_4( pc, coeffs );
           STBIR_MOVE_2( pc+4, coeffs+4 );
@@ -3648,6 +3718,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
         } while ( pc < pc_end );
         break;
       case 8:
+        STBIR_NO_UNROLL_LOOP_START
         do {
           STBIR_MOVE_4( pc, coeffs );
           STBIR_MOVE_4( pc+4, coeffs+4 );
@@ -3656,6 +3727,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
         } while ( pc < pc_end );
         break;
       case 9:
+        STBIR_NO_UNROLL_LOOP_START
         do {
           STBIR_MOVE_4( pc, coeffs );
           STBIR_MOVE_4( pc+4, coeffs+4 );
@@ -3665,6 +3737,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
         } while ( pc < pc_end );
         break;
       case 10:
+        STBIR_NO_UNROLL_LOOP_START
         do {
           STBIR_MOVE_4( pc, coeffs );
           STBIR_MOVE_4( pc+4, coeffs+4 );
@@ -3674,6 +3747,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
         } while ( pc < pc_end );
         break;
       case 11:
+        STBIR_NO_UNROLL_LOOP_START
         do {
           STBIR_MOVE_4( pc, coeffs );
           STBIR_MOVE_4( pc+4, coeffs+4 );
@@ -3684,6 +3758,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
         } while ( pc < pc_end );
         break;
       case 12:
+        STBIR_NO_UNROLL_LOOP_START
         do {
           STBIR_MOVE_4( pc, coeffs );
           STBIR_MOVE_4( pc+4, coeffs+4 );
@@ -3693,6 +3768,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
         } while ( pc < pc_end );
         break;
       default:
+        STBIR_NO_UNROLL_LOOP_START
         do {
           float * copy_end = pc + widest - 4;
           float * c = coeffs;
@@ -3703,6 +3779,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
             c += 4;
           } while ( pc <= copy_end );
           copy_end += 4;
+          STBIR_NO_UNROLL_LOOP_START
           while ( pc < copy_end )
           {
             STBIR_MOVE_1( pc, c );
@@ -3904,7 +3981,7 @@ static void stbir__calculate_filters( stbir__sampler * samp, stbir__sampler * ot
               }
               else
               {
-                stbir__insert_coeff( scatter_contributors, scatter_coeffs, n, gc );
+                stbir__insert_coeff( scatter_contributors, scatter_coeffs, n, gc, scatter_coefficient_width );
               }
               STBIR_ASSERT( ( scatter_contributors->n1 - scatter_contributors->n0 + 1 ) <= scatter_coefficient_width );
             }
@@ -4013,6 +4090,7 @@ static void stbir__fancy_alpha_weight_4ch( float * out_buffer, int width_times_c
 
   #ifdef STBIR_SIMD8
   decode += 16;
+  STBIR_NO_UNROLL_LOOP_START
   while ( decode <= end_decode )
   {
     stbir__simdf8 d0,d1,a0,a1,p0,p1;
@@ -4037,6 +4115,7 @@ static void stbir__fancy_alpha_weight_4ch( float * out_buffer, int width_times_c
   decode -= 16;
   #else
   decode += 8;
+  STBIR_NO_UNROLL_LOOP_START
   while ( decode <= end_decode )
   {
     stbir__simdf d0,a0,d1,a1,p0,p1;
@@ -4059,12 +4138,14 @@ static void stbir__fancy_alpha_weight_4ch( float * out_buffer, int width_times_c
 
   // might be one last odd pixel
   #ifdef STBIR_SIMD8
+  STBIR_NO_UNROLL_LOOP_START
   while ( decode < end_decode )
   #else
   if ( decode < end_decode )
   #endif
   {
     stbir__simdf d,a,p;
+    STBIR_NO_UNROLL(decode);
     stbir__simdf_load( d, decode );
     stbir__simdf_0123to3333( a, d );
     stbir__simdf_mult( p, a, d );
@@ -4106,6 +4187,7 @@ static void stbir__fancy_alpha_weight_2ch( float * out_buffer, int width_times_c
   decode += 8;
   if ( decode <= end_decode )
   {
+    STBIR_NO_UNROLL_LOOP_START
     do {
       #ifdef STBIR_SIMD8
       stbir__simdf8 d0,a0,p0;
@@ -4149,6 +4231,7 @@ static void stbir__fancy_alpha_weight_2ch( float * out_buffer, int width_times_c
   decode -= 8;
   #endif
 
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   while( decode < end_decode )
   {
     float x = decode[0], y = decode[1];
@@ -4169,6 +4252,7 @@ static void stbir__fancy_alpha_unweight_4ch( float * encode_buffer, int width_ti
 
   // fancy RGBA is stored internally as R G B A Rpm Gpm Bpm
 
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float alpha = input[3];
 #ifdef STBIR_SIMD
@@ -4236,6 +4320,7 @@ static void stbir__simple_alpha_weight_4ch( float * decode_buffer, int width_tim
   #ifdef STBIR_SIMD
   {
     decode += 2 * stbir__simdfX_float_count;
+    STBIR_NO_UNROLL_LOOP_START
     while ( decode <= end_decode )
     {
       stbir__simdfX d0,a0,d1,a1;
@@ -4254,6 +4339,7 @@ static void stbir__simple_alpha_weight_4ch( float * decode_buffer, int width_tim
 
     // few last pixels remnants
     #ifdef STBIR_SIMD8
+    STBIR_NO_UNROLL_LOOP_START
     while ( decode < end_decode )
     #else
     if ( decode < end_decode )
@@ -4289,6 +4375,7 @@ static void stbir__simple_alpha_weight_2ch( float * decode_buffer, int width_tim
 
   #ifdef STBIR_SIMD
   decode += 2 * stbir__simdfX_float_count;
+  STBIR_NO_UNROLL_LOOP_START
   while ( decode <= end_decode )
   {
     stbir__simdfX d0,a0,d1,a1;
@@ -4306,6 +4393,7 @@ static void stbir__simple_alpha_weight_2ch( float * decode_buffer, int width_tim
   decode -= 2 * stbir__simdfX_float_count;
   #endif
 
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   while( decode < end_decode )
   {
     float alpha = decode[1];
@@ -4320,6 +4408,7 @@ static void stbir__simple_alpha_unweight_4ch( float * encode_buffer, int width_t
   float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer;
   float const * end_output = encode_buffer + width_times_channels;
 
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float alpha = encode[3];
 
@@ -4367,9 +4456,77 @@ static void stbir__simple_flip_3ch( float * decode_buffer, int width_times_chann
   float STBIR_STREAMOUT_PTR(*) decode = decode_buffer;
   float const * end_decode = decode_buffer + width_times_channels;
 
-  decode += 12;
+#ifdef STBIR_SIMD
+    #ifdef stbir__simdf_swiz2 // do we have two argument swizzles?
+      end_decode -= 12; 
+      STBIR_NO_UNROLL_LOOP_START
+      while( decode <= end_decode )
+      {
+        // on arm64 8 instructions, no overlapping stores
+        stbir__simdf a,b,c,na,nb;
+        STBIR_SIMD_NO_UNROLL(decode);
+        stbir__simdf_load( a, decode );
+        stbir__simdf_load( b, decode+4 );
+        stbir__simdf_load( c, decode+8 );
+
+        na = stbir__simdf_swiz2( a, b, 2, 1, 0, 5 );   
+        b  = stbir__simdf_swiz2( a, b, 4, 3, 6, 7 );   
+        nb = stbir__simdf_swiz2( b, c, 0, 1, 4, 3 );   
+        c  = stbir__simdf_swiz2( b, c, 2, 7, 6, 5 );   
+
+        stbir__simdf_store( decode, na );
+        stbir__simdf_store( decode+4, nb ); 
+        stbir__simdf_store( decode+8, c );
+        decode += 12;
+      }
+      end_decode += 12;
+    #else
+      end_decode -= 24;
+      STBIR_NO_UNROLL_LOOP_START
+      while( decode <= end_decode )
+      {
+        // 26 instructions on x64
+        stbir__simdf a,b,c,d,e,f,g;
+        float i21, i23;
+        STBIR_SIMD_NO_UNROLL(decode);
+        stbir__simdf_load( a, decode );
+        stbir__simdf_load( b, decode+3 );
+        stbir__simdf_load( c, decode+6 );
+        stbir__simdf_load( d, decode+9 );
+        stbir__simdf_load( e, decode+12 );
+        stbir__simdf_load( f, decode+15 );
+        stbir__simdf_load( g, decode+18 );
+
+        a = stbir__simdf_swiz( a, 2, 1, 0, 3 );   
+        b = stbir__simdf_swiz( b, 2, 1, 0, 3 );   
+        c = stbir__simdf_swiz( c, 2, 1, 0, 3 );   
+        d = stbir__simdf_swiz( d, 2, 1, 0, 3 );   
+        e = stbir__simdf_swiz( e, 2, 1, 0, 3 );   
+        f = stbir__simdf_swiz( f, 2, 1, 0, 3 );   
+        g = stbir__simdf_swiz( g, 2, 1, 0, 3 );   
+
+        // each store overlaps the next by one float, so they need to be done in order
+        stbir__simdf_store( decode,    a );
+        i21 = decode[21];
+        stbir__simdf_store( decode+3,  b ); 
+        i23 = decode[23];
+        stbir__simdf_store( decode+6,  c );
+        stbir__simdf_store( decode+9,  d );
+        stbir__simdf_store( decode+12, e );
+        stbir__simdf_store( decode+15, f );
+        stbir__simdf_store( decode+18, g );
+        decode[21] = i23;
+        decode[23] = i21;
+        decode += 24;
+      }
+      end_decode += 24;
+    #endif
+#else
+  end_decode -= 12;
+  STBIR_NO_UNROLL_LOOP_START
   while( decode <= end_decode )
   {
+    // 16 instructions
     float t0,t1,t2,t3;
     STBIR_NO_UNROLL(decode);
     t0 = decode[0]; t1 = decode[3]; t2 = decode[6]; t3 = decode[9];
@@ -4377,8 +4534,10 @@ static void stbir__simple_flip_3ch( float * decode_buffer, int width_times_chann
     decode[2] = t0; decode[5] = t1; decode[8] = t2; decode[11] = t3;
     decode += 12;
   }
-  decode -= 12;
+  end_decode += 12;
+#endif
 
+  STBIR_NO_UNROLL_LOOP_START
   while( decode < end_decode )
   {
     float t = decode[0];
@@ -4399,7 +4558,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
   stbir_edge edge_horizontal = stbir_info->horizontal.edge;
   stbir_edge edge_vertical = stbir_info->vertical.edge;
   int row = stbir__edge_wrap(edge_vertical, n, stbir_info->vertical.scale_info.input_full_size);
-  const void* input_plane_data = ( (char *) stbir_info->input_data ) + (ptrdiff_t)row * (ptrdiff_t) stbir_info->input_stride_bytes;
+  const void* input_plane_data = ( (char *) stbir_info->input_data ) + (size_t)row * (size_t) stbir_info->input_stride_bytes;
   stbir__span const * spans = stbir_info->scanline_extents.spans;
   float* full_decode_buffer = output_buffer - stbir_info->scanline_extents.conservative.n0 * effective_channels;
 
@@ -4668,12 +4827,13 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*2 );
 
 #define stbir__1_coeff_remnant( ofs )                \
-    { stbir__simdf t;                                \
+    { stbir__simdf t,d;                              \
     stbir__simdf_load1z( t, hc + (ofs) );            \
+    stbir__simdf_load2( d, decode + (ofs) * 2 );     \
     stbir__simdf_0123to0011( t, t );                 \
-    stbir__simdf_mult_mem( t, t, decode+(ofs)*2 );   \
+    stbir__simdf_mult( t, t, d );                    \
     stbir__simdf8_add4( tot0, tot0, t ); }
-
+ 
 #define stbir__2_coeff_remnant( ofs )                \
     { stbir__simdf t;                                \
     stbir__simdf_load2( t, hc + (ofs) );             \
@@ -6052,7 +6212,7 @@ static void stbir__resample_vertical_gather(stbir__info const * stbir_info, stbi
     stbir__resample_horizontal_gather(stbir_info, encode_buffer, decode_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
   }
 
-  stbir__encode_scanline( stbir_info, ( (char *) stbir_info->output_data ) + ((ptrdiff_t)n * (ptrdiff_t)stbir_info->output_stride_bytes),
+  stbir__encode_scanline( stbir_info, ( (char *) stbir_info->output_data ) + ((size_t)n * (size_t)stbir_info->output_stride_bytes),
                           encode_buffer, n  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
 }
 
@@ -6093,7 +6253,7 @@ static void stbir__vertical_gather_loop( stbir__info const * stbir_info, stbir__
 
   // initialize the ring buffer for gathering
   split_info->ring_buffer_begin_index = 0;
-  split_info->ring_buffer_first_scanline = stbir_info->vertical.extent_info.lowest;
+  split_info->ring_buffer_first_scanline = vertical_contributors->n0;
   split_info->ring_buffer_last_scanline = split_info->ring_buffer_first_scanline - 1; // means "empty"
 
   for (y = start_output_y; y < end_output_y; y++)
@@ -6147,7 +6307,7 @@ static void stbir__encode_first_scanline_from_scatter(stbir__info const * stbir_
   float* ring_buffer_entry = stbir__get_ring_buffer_entry(stbir_info, split_info, split_info->ring_buffer_begin_index );
 
   // dump the scanline out
-  stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (ptrdiff_t)split_info->ring_buffer_first_scanline * (ptrdiff_t)stbir_info->output_stride_bytes ), ring_buffer_entry, split_info->ring_buffer_first_scanline  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
+  stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (size_t)split_info->ring_buffer_first_scanline * (size_t)stbir_info->output_stride_bytes ), ring_buffer_entry, split_info->ring_buffer_first_scanline  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
 
   // mark it as empty
   ring_buffer_entry[ 0 ] = STBIR__FLOAT_EMPTY_MARKER;
@@ -6168,7 +6328,7 @@ static void stbir__horizontal_resample_and_encode_first_scanline_from_scatter(st
   stbir__resample_horizontal_gather( stbir_info, split_info->vertical_buffer, ring_buffer_entry  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
 
   // dump the scanline out
-  stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (ptrdiff_t)split_info->ring_buffer_first_scanline * (ptrdiff_t)stbir_info->output_stride_bytes ), split_info->vertical_buffer, split_info->ring_buffer_first_scanline  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
+  stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (size_t)split_info->ring_buffer_first_scanline * (size_t)stbir_info->output_stride_bytes ), split_info->vertical_buffer, split_info->ring_buffer_first_scanline  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
 
   // mark it as empty
   ring_buffer_entry[ 0 ] = STBIR__FLOAT_EMPTY_MARKER;
@@ -6572,7 +6732,7 @@ static void stbir__free_internal_mem( stbir__info *info )
     STBIR__FREE_AND_CLEAR( info->horizontal.coefficients );
     STBIR__FREE_AND_CLEAR( info->horizontal.contributors );
     STBIR__FREE_AND_CLEAR( info->alloced_mem );
-    STBIR__FREE_AND_CLEAR( info );
+    STBIR_FREE( info, info->user_data );
   #endif
   }
 
@@ -6765,7 +6925,7 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample
 
   stbir__info * info = 0;
   void * alloced = 0;
-  int alloced_total = 0;
+  size_t alloced_total = 0;
   int vertical_first;
   int decode_buffer_size, ring_buffer_length_bytes, ring_buffer_size, vertical_buffer_size, alloc_ring_buffer_num_entries;
 
@@ -6970,6 +7130,11 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample
 
 #ifdef STBIR__SEPARATE_ALLOCATIONS
       temp_mem_amt = decode_buffer_size;
+
+      #ifdef STBIR_SIMD8
+      if ( effective_channels == 3 )
+        --temp_mem_amt; // avx in 3 channel mode needs one float at the start of the buffer
+      #endif
 #else
       temp_mem_amt = ( decode_buffer_size + ring_buffer_size + vertical_buffer_size ) * splits;
 #endif
@@ -7067,36 +7232,33 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample
         info->ring_buffer_num_entries = conservative_split_output_size;
       STBIR_ASSERT( info->ring_buffer_num_entries <= info->alloc_ring_buffer_num_entries );
 
-      // a few of the horizontal gather functions read one dword past the end (but mask it out), so put in a normal value so no snans or denormals accidentally sneak in
+      // a few of the horizontal gather functions read past the end of the decode buffer (but mask
+      //   it out), so fill the tail with normal values so that no SNaNs or denormals accidentally
+      //   sneak in (the same is done for the ring buffer entries when filtering vertical first)
       for( i = 0 ; i < splits ; i++ )
       {
-        int width, ofs;
+        int t, ofs, start;
 
-        // find the right most span
-        if ( info->scanline_extents.spans[0].n1 > info->scanline_extents.spans[1].n1 )
-          width = info->scanline_extents.spans[0].n1 - info->scanline_extents.spans[0].n0;
-        else
-          width = info->scanline_extents.spans[1].n1 - info->scanline_extents.spans[1].n0;
+        ofs = decode_buffer_size / 4;
 
-        // this calc finds the exact end of the decoded scanline for all filter modes.
-        //   usually this is just the width * effective channels.  But we have to account
-        //   for the area to the left of the scanline for wrap filtering and alignment, this
-        //   is stored as a negative value in info->scanline_extents.conservative.n0. Next,
-        //   we need to skip the exact size of the right hand size filter area (again for
-        //   wrap mode), this is in info->scanline_extents.edge_sizes[1]).
-        ofs = ( width + 1 - info->scanline_extents.conservative.n0 + info->scanline_extents.edge_sizes[1] ) * effective_channels;
+        #if defined( STBIR__SEPARATE_ALLOCATIONS ) && defined(STBIR_SIMD8)
+        if ( effective_channels == 3 ) 
+          --ofs; // avx in 3 channel mode needs one float at the start of the buffer, so we snap back for clearing
+        #endif
 
-        // place a known, but numerically valid value in the decode buffer
-        info->split_info[i].decode_buffer[ ofs ] = 9999.0f;
+        start = ofs - 4;
+        if ( start < 0 ) start = 0;
+
+        for( t = start ; t < ofs; t++ )
+          info->split_info[i].decode_buffer[ t ] = 9999.0f;
 
-        // if vertical filtering first, place a known, but numerically valid value in the all
-        //   of the ring buffer accumulators
         if ( vertical_first )
         {
           int j;
           for( j = 0; j < info->ring_buffer_num_entries ; j++ )
           {
-            stbir__get_ring_buffer_entry( info, info->split_info + i, j )[ ofs ] = 9999.0f;
+            for( t = start ; t < ofs; t++ )
+              stbir__get_ring_buffer_entry( info, info->split_info + i, j )[ t ] = 9999.0f;
           }
         }
       }
@@ -7108,7 +7270,7 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample
     // is this the first time through loop?
     if ( info == 0 )
     {
-      alloced_total = (int) ( 15 + (size_t)advance_mem );
+      alloced_total = ( 15 + (size_t)advance_mem );
       alloced = STBIR_MALLOC( alloced_total, user_data );
       if ( alloced == 0 )
         return 0;
@@ -7225,7 +7387,7 @@ static void stbir__update_info_from_resize( stbir__info * info, STBIR_RESIZE * r
     info->output_stride_bytes = info->channels * info->horizontal.scale_info.output_sub_size * stbir__type_size[output_type];
 
   // calc offset
-  info->output_data = ( (char*) resize->output_pixels ) + ( (ptrdiff_t) info->offset_y * (ptrdiff_t) resize->output_stride_in_bytes ) + ( info->offset_x * info->channels * stbir__type_size[output_type] );
+  info->output_data = ( (char*) resize->output_pixels ) + ( (size_t) info->offset_y * (size_t) resize->output_stride_in_bytes ) + ( info->offset_x * info->channels * stbir__type_size[output_type] );
 
   info->in_pixels_cb = resize->input_cb;
   info->user_data = resize->user_data;
@@ -7797,7 +7959,7 @@ static int stbir__check_output_stuff( void ** ret_ptr, int * ret_pitch, void * o
   if ( output_stride_in_bytes < pitch )
     return 0;
 
-  size = output_stride_in_bytes * output_h;
+  size = (size_t)output_stride_in_bytes * (size_t)output_h;
   if ( size == 0 )
     return 0;
 
@@ -8075,6 +8237,7 @@ static void STBIR__CODER_NAME( stbir__decode_uint8_linear_scaled )( float * deco
   if ( width_times_channels >= 16 )
   {
     decode_end -= 16;
+    STBIR_NO_UNROLL_LOOP_START_INF_FOR
     for(;;)
     {
       #ifdef STBIR_SIMD8
@@ -8130,6 +8293,7 @@ static void STBIR__CODER_NAME( stbir__decode_uint8_linear_scaled )( float * deco
   // try to do blocks of 4 when you can
   #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
   decode += 4;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   while( decode <= decode_end )
   {
     STBIR_SIMD_NO_UNROLL(decode);
@@ -8145,6 +8309,7 @@ static void STBIR__CODER_NAME( stbir__decode_uint8_linear_scaled )( float * deco
 
   // do the remnants
   #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
   while( decode < decode_end )
   {
     STBIR_NO_UNROLL(decode);
@@ -8171,6 +8336,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear_scaled )( void * outpu
   {
     float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2;
     end_output -= stbir__simdfX_float_count*2;
+    STBIR_NO_UNROLL_LOOP_START_INF_FOR
     for(;;)
     {
       stbir__simdfX e0, e1;
@@ -8202,6 +8368,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear_scaled )( void * outpu
   // try to do blocks of 4 when you can
   #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
   output += 4;
+  STBIR_NO_UNROLL_LOOP_START
   while( output <= end_output )
   {
     stbir__simdf e0;
@@ -8220,6 +8387,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear_scaled )( void * outpu
 
   // do the remnants
   #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
   while( output < end_output )
   {
     stbir__simdf e0;
@@ -8256,6 +8424,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear_scaled )( void * outpu
 
   // do the remnants
   #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
   while( output < end_output )
   {
     float f;
@@ -8285,6 +8454,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_linear)( float * decodep, int
   if ( width_times_channels >= 16 )
   {
     decode_end -= 16;
+    STBIR_NO_UNROLL_LOOP_START_INF_FOR
     for(;;)
     {
       #ifdef STBIR_SIMD8
@@ -8334,6 +8504,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_linear)( float * decodep, int
   // try to do blocks of 4 when you can
   #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
   decode += 4;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   while( decode <= decode_end )
   {
     STBIR_SIMD_NO_UNROLL(decode);
@@ -8349,6 +8520,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_linear)( float * decodep, int
 
   // do the remnants
   #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
   while( decode < decode_end )
   {
     STBIR_NO_UNROLL(decode);
@@ -8375,6 +8547,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear )( void * outputp, int
   {
     float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2;
     end_output -= stbir__simdfX_float_count*2;
+    STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
     for(;;)
     {
       stbir__simdfX e0, e1;
@@ -8406,6 +8579,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear )( void * outputp, int
   // try to do blocks of 4 when you can
   #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
   output += 4;
+  STBIR_NO_UNROLL_LOOP_START
   while( output <= end_output )
   {
     stbir__simdf e0;
@@ -8444,6 +8618,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear )( void * outputp, int
 
   // do the remnants
   #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
   while( output < end_output )
   {
     float f;
@@ -8484,6 +8659,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_srgb)( float * decodep, int wi
 
   // do the remnants
   #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
   while( decode < decode_end )
   {
     STBIR_NO_UNROLL(decode);
@@ -8569,12 +8745,12 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb )( void * outputp, int w
   unsigned char * end_output = ( (unsigned char*) output ) + width_times_channels;
 
   #ifdef STBIR_SIMD
-  stbir_uint32 const * to_srgb = fp32_to_srgb8_tab4 - (127-13)*8;
 
   if ( width_times_channels >= 16 )
   {
     float const * end_encode_m16 = encode + width_times_channels - 16;
     end_output -= 16;
+    STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
     for(;;)
     {
       stbir__simdf f0, f1, f2, f3;
@@ -8588,7 +8764,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb )( void * outputp, int w
       stbir__min_max_shift20( i2, f2 );
       stbir__min_max_shift20( i3, f3 );
 
-      stbir__simdi_table_lookup4( i0, i1, i2, i3, to_srgb );
+      stbir__simdi_table_lookup4( i0, i1, i2, i3, ( fp32_to_srgb8_tab4 - (127-13)*8 ) );
 
       stbir__linear_to_srgb_finish( i0, f0 );
       stbir__linear_to_srgb_finish( i1, f1 );
@@ -8613,6 +8789,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb )( void * outputp, int w
   // try to do blocks of 4 when you can
   #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
   output += 4;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   while ( output <= end_output )
   {
     STBIR_SIMD_NO_UNROLL(encode);
@@ -8630,6 +8807,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb )( void * outputp, int w
 
   // do the remnants
   #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
   while( output < end_output )
   {
     STBIR_NO_UNROLL(encode);
@@ -8670,12 +8848,12 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb4_linearalpha )( void * o
   unsigned char * end_output = ( (unsigned char*) output ) + width_times_channels;
 
   #ifdef STBIR_SIMD
-  stbir_uint32 const * to_srgb = fp32_to_srgb8_tab4 - (127-13)*8;
 
   if ( width_times_channels >= 16 )
   {
     float const * end_encode_m16 = encode + width_times_channels - 16;
     end_output -= 16;
+    STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
     for(;;)
     {
       stbir__simdf f0, f1, f2, f3;
@@ -8689,7 +8867,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb4_linearalpha )( void * o
       stbir__min_max_shift20( i2, f2 );
       stbir__scale_and_convert( i3, f3 );
 
-      stbir__simdi_table_lookup3( i0, i1, i2, to_srgb );
+      stbir__simdi_table_lookup3( i0, i1, i2, ( fp32_to_srgb8_tab4 - (127-13)*8 ) );
 
       stbir__linear_to_srgb_finish( i0, f0 );
       stbir__linear_to_srgb_finish( i1, f1 );
@@ -8711,6 +8889,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb4_linearalpha )( void * o
   }
   #endif
 
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float f;
     STBIR_SIMD_NO_UNROLL(encode);
@@ -8761,12 +8940,12 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb2_linearalpha )( void * o
   unsigned char * end_output = ( (unsigned char*) output ) + width_times_channels;
 
   #ifdef STBIR_SIMD
-  stbir_uint32 const * to_srgb = fp32_to_srgb8_tab4 - (127-13)*8;
 
   if ( width_times_channels >= 16 )
   {
     float const * end_encode_m16 = encode + width_times_channels - 16;
     end_output -= 16;
+    STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
     for(;;)
     {
       stbir__simdf f0, f1, f2, f3;
@@ -8780,7 +8959,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb2_linearalpha )( void * o
       stbir__min_max_shift20( i2, f2 );
       stbir__scale_and_convert( i3, f3 );
 
-      stbir__simdi_table_lookup2( i0, i2, to_srgb );
+      stbir__simdi_table_lookup2( i0, i2, ( fp32_to_srgb8_tab4 - (127-13)*8 ) );
 
       stbir__linear_to_srgb_finish( i0, f0 );
       stbir__linear_to_srgb_finish( i2, f2 );
@@ -8800,6 +8979,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb2_linearalpha )( void * o
   }
   #endif
 
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float f;
     STBIR_SIMD_NO_UNROLL(encode);
@@ -8828,6 +9008,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear_scaled)( float * decod
   if ( width_times_channels >= 8 )
   {
     decode_end -= 8;
+    STBIR_NO_UNROLL_LOOP_START_INF_FOR
     for(;;)
     {
       #ifdef STBIR_SIMD8
@@ -8871,6 +9052,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear_scaled)( float * decod
   // try to do blocks of 4 when you can
   #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
   decode += 4;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   while( decode <= decode_end )
   {
     STBIR_SIMD_NO_UNROLL(decode);
@@ -8886,6 +9068,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear_scaled)( float * decod
 
   // do the remnants
   #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
   while( decode < decode_end )
   {
     STBIR_NO_UNROLL(decode);
@@ -8914,6 +9097,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear_scaled)( void * output
     {
       float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2;
       end_output -= stbir__simdfX_float_count*2;
+      STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
       for(;;)
       {
         stbir__simdfX e0, e1;
@@ -8941,6 +9125,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear_scaled)( void * output
   // try to do blocks of 4 when you can
   #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
   output += 4;
+  STBIR_NO_UNROLL_LOOP_START
   while( output <= end_output )
   {
     stbir__simdf e;
@@ -8959,6 +9144,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear_scaled)( void * output
 
   // do the remnants
   #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
   while( output < end_output )
   {
     stbir__simdf e;
@@ -8980,6 +9166,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear_scaled)( void * output
   // try to do blocks of 4 when you can
   #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
   output += 4;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   while( output <= end_output )
   {
     float f;
@@ -8996,6 +9183,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear_scaled)( void * output
 
   // do the remnants
   #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
   while( output < end_output )
   {
     float f;
@@ -9025,6 +9213,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear)( float * decodep, int
   if ( width_times_channels >= 8 )
   {
     decode_end -= 8;
+    STBIR_NO_UNROLL_LOOP_START_INF_FOR
     for(;;)
     {
       #ifdef STBIR_SIMD8
@@ -9065,6 +9254,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear)( float * decodep, int
   // try to do blocks of 4 when you can
   #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
   decode += 4;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   while( decode <= decode_end )
   {
     STBIR_SIMD_NO_UNROLL(decode);
@@ -9080,6 +9270,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear)( float * decodep, int
 
   // do the remnants
   #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
   while( decode < decode_end )
   {
     STBIR_NO_UNROLL(decode);
@@ -9107,6 +9298,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear)( void * outputp, int
     {
       float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2;
       end_output -= stbir__simdfX_float_count*2;
+      STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
       for(;;)
       {
         stbir__simdfX e0, e1;
@@ -9134,6 +9326,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear)( void * outputp, int
   // try to do blocks of 4 when you can
   #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
   output += 4;
+  STBIR_NO_UNROLL_LOOP_START
   while( output <= end_output )
   {
     stbir__simdf e;
@@ -9155,6 +9348,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear)( void * outputp, int
   // try to do blocks of 4 when you can
   #if  stbir__coder_min_num != 3 // doesn't divide cleanly by four
   output += 4;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   while( output <= end_output )
   {
     float f;
@@ -9173,6 +9367,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear)( void * outputp, int
 
   // do the remnants
   #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
   while( output < end_output )
   {
     float f;
@@ -9201,6 +9396,7 @@ static void STBIR__CODER_NAME(stbir__decode_half_float_linear)( float * decodep,
   {
     stbir__FP16 const * end_input_m8 = input + width_times_channels - 8;
     decode_end -= 8;
+    STBIR_NO_UNROLL_LOOP_START_INF_FOR
     for(;;)
     {
       STBIR_NO_UNROLL(decode);
@@ -9242,6 +9438,7 @@ static void STBIR__CODER_NAME(stbir__decode_half_float_linear)( float * decodep,
   // try to do blocks of 4 when you can
   #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
   decode += 4;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   while( decode <= decode_end )
   {
     STBIR_SIMD_NO_UNROLL(decode);
@@ -9257,6 +9454,7 @@ static void STBIR__CODER_NAME(stbir__decode_half_float_linear)( float * decodep,
 
   // do the remnants
   #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
   while( decode < decode_end )
   {
     STBIR_NO_UNROLL(decode);
@@ -9283,6 +9481,7 @@ static void STBIR__CODER_NAME( stbir__encode_half_float_linear )( void * outputp
   {
     float const * end_encode_m8 = encode + width_times_channels - 8;
     end_output -= 8;
+    STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
     for(;;)
     {
       STBIR_SIMD_NO_UNROLL(encode);
@@ -9323,6 +9522,7 @@ static void STBIR__CODER_NAME( stbir__encode_half_float_linear )( void * outputp
   // try to do blocks of 4 when you can
   #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
   output += 4;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   while( output <= end_output )
   {
     STBIR_SIMD_NO_UNROLL(output);
@@ -9338,6 +9538,7 @@ static void STBIR__CODER_NAME( stbir__encode_half_float_linear )( void * outputp
 
   // do the remnants
   #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
   while( output < end_output )
   {
     STBIR_NO_UNROLL(output);
@@ -9366,6 +9567,7 @@ static void STBIR__CODER_NAME(stbir__decode_float_linear)( float * decodep, int
   {
     float const * end_input_m16 = input + width_times_channels - 16;
     decode_end -= 16;
+    STBIR_NO_UNROLL_LOOP_START_INF_FOR
     for(;;)
     {
       STBIR_NO_UNROLL(decode);
@@ -9414,6 +9616,7 @@ static void STBIR__CODER_NAME(stbir__decode_float_linear)( float * decodep, int
   // try to do blocks of 4 when you can
   #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
   decode += 4;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   while( decode <= decode_end )
   {
     STBIR_SIMD_NO_UNROLL(decode);
@@ -9429,6 +9632,7 @@ static void STBIR__CODER_NAME(stbir__decode_float_linear)( float * decodep, int
 
   // do the remnants
   #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
   while( decode < decode_end )
   {
     STBIR_NO_UNROLL(decode);
@@ -9488,6 +9692,7 @@ static void STBIR__CODER_NAME( stbir__encode_float_linear )( void * outputp, int
   {
     float const * end_encode_m8 = encode + width_times_channels - ( stbir__simdfX_float_count * 2 );
     end_output -= ( stbir__simdfX_float_count * 2 );
+    STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
     for(;;)
     {
       stbir__simdfX e0, e1;
@@ -9521,6 +9726,7 @@ static void STBIR__CODER_NAME( stbir__encode_float_linear )( void * outputp, int
   // try to do blocks of 4 when you can
   #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
   output += 4;
+  STBIR_NO_UNROLL_LOOP_START
   while( output <= end_output )
   {
     stbir__simdf e0;
@@ -9545,6 +9751,7 @@ static void STBIR__CODER_NAME( stbir__encode_float_linear )( void * outputp, int
   // try to do blocks of 4 when you can
   #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
   output += 4;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   while( output <= end_output )
   {
     float e;
@@ -9564,6 +9771,7 @@ static void STBIR__CODER_NAME( stbir__encode_float_linear )( void * outputp, int
 
   // do the remnants
   #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
   while( output < end_output )
   {
     float e;
@@ -9674,6 +9882,7 @@ static void STBIR_chans( stbir__vertical_scatter_with_,_coeffs)( float ** output
     stbIF5(stbir__simdfX c5 = stbir__simdf_frepX( c5s ); )
     stbIF6(stbir__simdfX c6 = stbir__simdf_frepX( c6s ); )
     stbIF7(stbir__simdfX c7 = stbir__simdf_frepX( c7s ); )
+    STBIR_SIMD_NO_UNROLL_LOOP_START
     while ( ( (char*)input_end - (char*) input ) >= (16*stbir__simdfX_float_count) )
     {
       stbir__simdfX o0, o1, o2, o3, r0, r1, r2, r3;
@@ -9728,6 +9937,7 @@ static void STBIR_chans( stbir__vertical_scatter_with_,_coeffs)( float ** output
       input += (4*stbir__simdfX_float_count);
       stbIF0( output0 += (4*stbir__simdfX_float_count); ) stbIF1( output1 += (4*stbir__simdfX_float_count); ) stbIF2( output2 += (4*stbir__simdfX_float_count); ) stbIF3( output3 += (4*stbir__simdfX_float_count); ) stbIF4( output4 += (4*stbir__simdfX_float_count); ) stbIF5( output5 += (4*stbir__simdfX_float_count); ) stbIF6( output6 += (4*stbir__simdfX_float_count); ) stbIF7( output7 += (4*stbir__simdfX_float_count); )
     }
+    STBIR_SIMD_NO_UNROLL_LOOP_START
     while ( ( (char*)input_end - (char*) input ) >= 16 )
     {
       stbir__simdf o0, r0;
@@ -9760,6 +9970,7 @@ static void STBIR_chans( stbir__vertical_scatter_with_,_coeffs)( float ** output
     }
   }
   #else
+  STBIR_NO_UNROLL_LOOP_START
   while ( ( (char*)input_end - (char*) input ) >= 16 )
   {
     float r0, r1, r2, r3;
@@ -9791,6 +10002,7 @@ static void STBIR_chans( stbir__vertical_scatter_with_,_coeffs)( float ** output
     stbIF0( output0 += 4; ) stbIF1( output1 += 4; ) stbIF2( output2 += 4; ) stbIF3( output3 += 4; ) stbIF4( output4 += 4; ) stbIF5( output5 += 4; ) stbIF6( output6 += 4; ) stbIF7( output7 += 4; )
   }
   #endif
+  STBIR_NO_UNROLL_LOOP_START
   while ( input < input_end )
   {
     float r = input[0];
@@ -9854,6 +10066,7 @@ static void STBIR_chans( stbir__vertical_gather_with_,_coeffs)( float * outputp,
     stbIF6(stbir__simdfX c6 = stbir__simdf_frepX( c6s ); )
     stbIF7(stbir__simdfX c7 = stbir__simdf_frepX( c7s ); )
 
+    STBIR_SIMD_NO_UNROLL_LOOP_START
     while ( ( (char*)input0_end - (char*) input0 ) >= (16*stbir__simdfX_float_count) )
     {
       stbir__simdfX o0, o1, o2, o3, r0, r1, r2, r3;
@@ -9898,6 +10111,7 @@ static void STBIR_chans( stbir__vertical_gather_with_,_coeffs)( float * outputp,
       stbIF0( input0 += (4*stbir__simdfX_float_count); ) stbIF1( input1 += (4*stbir__simdfX_float_count); ) stbIF2( input2 += (4*stbir__simdfX_float_count); ) stbIF3( input3 += (4*stbir__simdfX_float_count); ) stbIF4( input4 += (4*stbir__simdfX_float_count); ) stbIF5( input5 += (4*stbir__simdfX_float_count); ) stbIF6( input6 += (4*stbir__simdfX_float_count); ) stbIF7( input7 += (4*stbir__simdfX_float_count); )
     }
 
+    STBIR_SIMD_NO_UNROLL_LOOP_START
     while ( ( (char*)input0_end - (char*) input0 ) >= 16 )
     {
       stbir__simdf o0, r0;
@@ -9922,6 +10136,7 @@ static void STBIR_chans( stbir__vertical_gather_with_,_coeffs)( float * outputp,
     }
   }
   #else
+  STBIR_NO_UNROLL_LOOP_START
   while ( ( (char*)input0_end - (char*) input0 ) >= 16 )
   {
     float o0, o1, o2, o3;
@@ -9943,6 +10158,7 @@ static void STBIR_chans( stbir__vertical_gather_with_,_coeffs)( float * outputp,
     stbIF0( input0 += 4; ) stbIF1( input1 += 4; ) stbIF2( input2 += 4; ) stbIF3( input3 += 4; ) stbIF4( input4 += 4; ) stbIF5( input5 += 4; ) stbIF6( input6 += 4; ) stbIF7( input7 += 4; )
   }
   #endif
+  STBIR_NO_UNROLL_LOOP_START
   while ( input0 < input0_end )
   {
     float o0;
@@ -10035,6 +10251,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_1_coeff)( floa
 {
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     float const * hc = horizontal_coefficients;
@@ -10047,6 +10264,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_2_coeffs)( flo
 {
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     float const * hc = horizontal_coefficients;
@@ -10059,6 +10277,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_3_coeffs)( flo
 {
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     float const * hc = horizontal_coefficients;
@@ -10071,6 +10290,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_4_coeffs)( flo
 {
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     float const * hc = horizontal_coefficients;
@@ -10083,6 +10303,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_5_coeffs)( flo
 {
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     float const * hc = horizontal_coefficients;
@@ -10096,6 +10317,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_6_coeffs)( flo
 {
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     float const * hc = horizontal_coefficients;
@@ -10110,6 +10332,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_7_coeffs)( flo
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
   stbir__3_coeff_setup();
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     float const * hc = horizontal_coefficients;
@@ -10124,6 +10347,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_8_coeffs)( flo
 {
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     float const * hc = horizontal_coefficients;
@@ -10137,6 +10361,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_9_coeffs)( flo
 {
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     float const * hc = horizontal_coefficients;
@@ -10151,6 +10376,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_10_coeffs)( fl
 {
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     float const * hc = horizontal_coefficients;
@@ -10166,6 +10392,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_11_coeffs)( fl
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
   stbir__3_coeff_setup();
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     float const * hc = horizontal_coefficients;
@@ -10180,6 +10407,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_12_coeffs)( fl
 {
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     float const * hc = horizontal_coefficients;
@@ -10194,12 +10422,14 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod0
 {
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 4 + 3 ) >> 2;
     float const * hc = horizontal_coefficients;
 
     stbir__4_coeff_start();
+    STBIR_SIMD_NO_UNROLL_LOOP_START
     do {
       hc += 4;
       decode += STBIR__horizontal_channels * 4;
@@ -10214,12 +10444,14 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod1
 {
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 5 + 3 ) >> 2;
     float const * hc = horizontal_coefficients;
 
     stbir__4_coeff_start();
+    STBIR_SIMD_NO_UNROLL_LOOP_START
     do {
       hc += 4;
       decode += STBIR__horizontal_channels * 4;
@@ -10235,12 +10467,14 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod2
 {
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 6 + 3 ) >> 2;
     float const * hc = horizontal_coefficients;
 
     stbir__4_coeff_start();
+    STBIR_SIMD_NO_UNROLL_LOOP_START
     do {
       hc += 4;
       decode += STBIR__horizontal_channels * 4;
@@ -10258,12 +10492,14 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod3
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
   stbir__3_coeff_setup();
+  STBIR_SIMD_NO_UNROLL_LOOP_START
   do {
     float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 7 + 3 ) >> 2;
     float const * hc = horizontal_coefficients;
 
     stbir__4_coeff_start();
+    STBIR_SIMD_NO_UNROLL_LOOP_START
     do {
       hc += 4;
       decode += STBIR__horizontal_channels * 4;
diff --git a/source/thirdparty/stb/stb_truetype.h b/source/thirdparty/stb/stb_truetype.h
index bbf2284b..90a5c2e2 100644
--- a/source/thirdparty/stb/stb_truetype.h
+++ b/source/thirdparty/stb/stb_truetype.h
@@ -54,7 +54,7 @@
 //       Hou Qiming                 Derek Vinyard
 //       Rob Loach                  Cort Stratton
 //       Kenney Phillis Jr.         Brian Costabile
-//       Ken Voskuil (kaesve)
+//       Ken Voskuil (kaesve)       Yakov Galka
 //
 // VERSION HISTORY
 //
@@ -4604,6 +4604,8 @@ STBTT_DEF unsigned char * stbtt_GetGlyphSDF(const stbtt_fontinfo *info, float sc
    scale_y = -scale_y;
 
    {
+      // distance from singular values (in the same units as the pixel grid)
+      const float eps = 1./1024, eps2 = eps*eps;
       int x,y,i,j;
       float *precompute;
       stbtt_vertex *verts;
@@ -4616,15 +4618,15 @@ STBTT_DEF unsigned char * stbtt_GetGlyphSDF(const stbtt_fontinfo *info, float sc
             float x0 = verts[i].x*scale_x, y0 = verts[i].y*scale_y;
             float x1 = verts[j].x*scale_x, y1 = verts[j].y*scale_y;
             float dist = (float) STBTT_sqrt((x1-x0)*(x1-x0) + (y1-y0)*(y1-y0));
-            precompute[i] = (dist == 0) ? 0.0f : 1.0f / dist;
+            precompute[i] = (dist < eps) ? 0.0f : 1.0f / dist;
          } else if (verts[i].type == STBTT_vcurve) {
             float x2 = verts[j].x *scale_x, y2 = verts[j].y *scale_y;
             float x1 = verts[i].cx*scale_x, y1 = verts[i].cy*scale_y;
             float x0 = verts[i].x *scale_x, y0 = verts[i].y *scale_y;
             float bx = x0 - 2*x1 + x2, by = y0 - 2*y1 + y2;
             float len2 = bx*bx + by*by;
-            if (len2 != 0.0f)
-               precompute[i] = 1.0f / (bx*bx + by*by);
+            if (len2 >= eps2)
+               precompute[i] = 1.0f / len2;
             else
                precompute[i] = 0.0f;
          } else
@@ -4689,8 +4691,8 @@ STBTT_DEF unsigned char * stbtt_GetGlyphSDF(const stbtt_fontinfo *info, float sc
                         float a = 3*(ax*bx + ay*by);
                         float b = 2*(ax*ax + ay*ay) + (mx*bx+my*by);
                         float c = mx*ax+my*ay;
-                        if (a == 0.0) { // if a is 0, it's linear
-                           if (b != 0.0) {
+                        if (STBTT_fabs(a) < eps2) { // if a is 0, it's linear
+                           if (STBTT_fabs(b) >= eps2) {
                               res[num++] = -c/b;
                            }
                         } else {
diff --git a/source/transform.c b/source/transform.c
index 67f3f423..8302b093 100644
--- a/source/transform.c
+++ b/source/transform.c
@@ -13,14 +13,14 @@ static transform model = {
   .cache = {1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1},
   .gcache = {1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1},
   .dirty = 0,
-  .jsparent = JS_UNDEFINED,
-  .change_hook = JS_UNDEFINED
 };
 
 transform *make_transform()
 {
   transform *t = malloc(sizeof(transform));
   *t = model;
+  t->jsparent = JS_UNDEFINED; // set per instance after copying the 'model' template, which no longer initializes it
+  t->change_hook = JS_UNDEFINED; // NOTE(review): presumably moved out of the static initializer because JS_UNDEFINED may not be a constant expression - confirm
   return t;
 }