// sdl_gpu.cm - SDL3 GPU Backend for fx_graph
//
// Direct SDL3 GPU implementation - does NOT use prosperon.cm
// Handles window creation, GPU init, texture loading, and rendering

var video = use('sdl3/video')
var gpu_mod = use('sdl3/gpu')
var blob_mod = use('blob')
var io = use('fd')
var png = use('image/png')
var qoi = use('image/qoi')
var gif = use('image/gif')
var aseprite = use('image/aseprite')
var staef = use('staef')
var res = use('resources')
var geometry = use('geometry')

var sdl_gpu = {}

// Private state
var _gpu = null
var _window = null
var _swapchain_format = null
var _window_width = 1280
var _window_height = 720

// Shaders
var _sprite_vert = null
var _sprite_frag = null
var _blit_vert = null
var _blit_frag = null
var _threshold_frag = null
var _blur_frag = null
var _mask_frag = null
var _crt_frag = null
var _accumulator_frag = null
var _text_sdf_frag = null
var _text_msdf_frag = null
var _shape2d_frag = null

// Pipelines
var _pipelines = {}

// Samplers
var _sampler_nearest = null
var _sampler_linear = null

// Texture cache: path -> {texture, width, height}
var _texture_cache = {}
var _white_texture = null

// Font cache: path.size.mode -> font
var _font_cache = {}

// Render target pool
var _target_pool = {}

// ========================================================================
// INITIALIZATION
// ========================================================================

sdl_gpu.init = function(opts) {
  opts = opts || {}
  _window_width = opts.width || 1280
  _window_height = opts.height || 720

  _window = video.window({
    title: opts.title || "Prosperon",
    width: _window_width,
    height: _window_height,
    resizable: true
  })

  _gpu = gpu_mod.gpu({debug: true, shaders_msl: true, lowpower: true})
  _gpu.claim_window(_window)
  _swapchain_format = _gpu.swapchain_format(_window)

  // Load shaders
  if (!_load_shaders()) {
    log.console("sdl_gpu: Failed to load shaders")
    return false
  }

  // Create samplers
  _sampler_nearest = gpu_mod.sampler(_gpu, {
    min_filter: "nearest",
    mag_filter: "nearest",
    u: "clamp_to_edge",
    v: "clamp_to_edge"
  })
  _sampler_linear = gpu_mod.sampler(_gpu, {
    min_filter: "linear",
    mag_filter: "linear",
    u: "clamp_to_edge",
    v: "clamp_to_edge"
  })

  // Create white texture for untextured draws
  var white_pixels = blob_mod(32, true)
  _white_texture = _create_gpu_texture(1, 1, stone(white_pixels))

  // Create pipelines
  _create_pipelines()

  log.console("sdl_gpu: Initialized")
  return true
}

sdl_gpu.get_window = function() { return _window }
sdl_gpu.get_device = function() { return _gpu }

sdl_gpu.set_window_size = function(w, h) {
  _window_width = w
  _window_height = h
}

sdl_gpu.get_window_size = function() {
  return {width: _window_width, height: _window_height}
}

// ========================================================================
// SHADER LOADING
// ========================================================================

function _load_shaders() {
  var sprite_vert_code = io.slurp("shaders/msl/sprite2d.vert.msl")
  var sprite_frag_code = io.slurp("shaders/msl/sprite2d.frag.msl")
  var blit_vert_code = io.slurp("shaders/msl/blit.vert.msl")
  var blit_frag_code = io.slurp("shaders/msl/blit.frag.msl")
  var threshold_frag_code = io.slurp("shaders/msl/threshold.frag.msl")
  var blur_frag_code = io.slurp("shaders/msl/blur.frag.msl")
  var mask_frag_code = io.slurp("shaders/msl/mask.frag.msl")
  var text_sdf_frag_code = io.slurp("shaders/msl/text_sdf.frag.msl")

  if (!sprite_vert_code || !sprite_frag_code) {
    log.console("sdl_gpu: Missing sprite shaders")
    return false
  }

  _sprite_vert = gpu_mod.shader(_gpu, {
    code:
sprite_vert_code, stage: "vertex", format: "msl", entrypoint: "vertex_main", num_uniform_buffers: 1 }) _sprite_frag =gpu_mod.shader(_gpu, { code: sprite_frag_code, stage: "fragment", format: "msl", entrypoint: "fragment_main", num_uniform_buffers: 0, num_samplers: 1 }) if (blit_vert_code && blit_frag_code) { _blit_vert =gpu_mod.shader(_gpu, { code: blit_vert_code, stage: "vertex", format: "msl", entrypoint: "vertex_main", num_uniform_buffers: 0 }) _blit_frag =gpu_mod.shader(_gpu, { code: blit_frag_code, stage: "fragment", format: "msl", entrypoint: "fragment_main", num_uniform_buffers: 0, num_samplers: 1 }) } if (threshold_frag_code) { _threshold_frag =gpu_mod.shader(_gpu, { code: threshold_frag_code, stage: "fragment", format: "msl", entrypoint: "fragment_main", num_uniform_buffers: 1, num_samplers: 1 }) } if (blur_frag_code) { _blur_frag =gpu_mod.shader(_gpu, { code: blur_frag_code, stage: "fragment", format: "msl", entrypoint: "fragment_main", num_uniform_buffers: 1, num_samplers: 1 }) } if (mask_frag_code) { _mask_frag =gpu_mod.shader(_gpu, { code: mask_frag_code, stage: "fragment", format: "msl", entrypoint: "fragment_main", num_uniform_buffers: 2, num_samplers: 2 }) } if (text_sdf_frag_code) { _text_sdf_frag =gpu_mod.shader(_gpu, { code: text_sdf_frag_code, stage: "fragment", format: "msl", entrypoint: "fragment_main", num_uniform_buffers: 1, num_samplers: 1 }) } var text_msdf_frag_code = io.slurp("shaders/msl/text_msdf.frag.msl") if (text_msdf_frag_code) { _text_msdf_frag =gpu_mod.shader(_gpu, { code: text_msdf_frag_code, stage: "fragment", format: "msl", entrypoint: "fragment_main", num_uniform_buffers: 1, num_samplers: 1 }) } var crt_frag_code = io.slurp("shaders/msl/crt.frag.msl") if (crt_frag_code) { _crt_frag =gpu_mod.shader(_gpu, { code: crt_frag_code, stage: "fragment", format: "msl", entrypoint: "fragment_main", num_uniform_buffers: 1, num_samplers: 1 }) } var accumulator_frag_code = io.slurp("shaders/msl/accumulator.frag.msl") if (accumulator_frag_code) { _accumulator_frag =gpu_mod.shader(_gpu, { code: accumulator_frag_code, stage: "fragment", format: "msl", entrypoint: "fragment_main", num_uniform_buffers: 1, num_samplers: 2 }) } var shape2d_frag_code = io.slurp("shaders/msl/shape2d.frag.msl") if (shape2d_frag_code) { _shape2d_frag =gpu_mod.shader(_gpu, { code: shape2d_frag_code, stage: "fragment", format: "msl", entrypoint: "fragment_main", num_uniform_buffers: 1, num_samplers: 0 }) } return true } // ======================================================================== // PIPELINE CREATION // ======================================================================== function _create_pipelines() { // Sprite pipeline (alpha blend) _pipelines.sprite_alpha =gpu_mod.graphics_pipeline(_gpu, { vertex: _sprite_vert, fragment: _sprite_frag, primitive: "triangle", cull: "none", face: "counter_clockwise", fill: "fill", vertex_buffer_descriptions: [{ slot: 0, pitch: 32, // pos(2) + uv(2) + color(4) = 8 floats = 32 bytes input_rate: "vertex" }], vertex_attributes: [ {location: 0, buffer_slot: 0, format: "float2", offset: 0}, // pos {location: 1, buffer_slot: 0, format: "float2", offset: 8}, // uv {location: 2, buffer_slot: 0, format: "float4", offset: 16} // color ], target: { color_targets: [{ format: _swapchain_format, blend: { enabled: true, src_rgb: "src_alpha", dst_rgb: "one_minus_src_alpha", op_rgb: "add", src_alpha: "one", dst_alpha: "one_minus_src_alpha", op_alpha: "add" } }] } }) // Sprite pipeline (additive blend for bloom) _pipelines.sprite_add 
=gpu_mod.graphics_pipeline(_gpu, { vertex: _sprite_vert, fragment: _sprite_frag, primitive: "triangle", cull: "none", face: "counter_clockwise", fill: "fill", vertex_buffer_descriptions: [{ slot: 0, pitch: 32, input_rate: "vertex" }], vertex_attributes: [ {location: 0, buffer_slot: 0, format: "float2", offset: 0}, {location: 1, buffer_slot: 0, format: "float2", offset: 8}, {location: 2, buffer_slot: 0, format: "float4", offset: 16} ], target: { color_targets: [{ format: _swapchain_format, blend: { enabled: true, src_rgb: "one", dst_rgb: "one", op_rgb: "add", src_alpha: "one", dst_alpha: "one", op_alpha: "add" } }] } }) // Blit pipeline (for fullscreen passes) if (_blit_vert && _blit_frag) { _pipelines.blit =gpu_mod.graphics_pipeline(_gpu, { vertex: _blit_vert, fragment: _blit_frag, primitive: "triangle", cull: "none", face: "counter_clockwise", fill: "fill", vertex_buffer_descriptions: [{ slot: 0, pitch: 16, // pos(2) + uv(2) = 4 floats = 16 bytes input_rate: "vertex" }], vertex_attributes: [ {location: 0, buffer_slot: 0, format: "float2", offset: 0}, {location: 1, buffer_slot: 0, format: "float2", offset: 8} ], target: { color_targets: [{ format: _swapchain_format, blend: { enabled: true, src_rgb: "src_alpha", dst_rgb: "one_minus_src_alpha", op_rgb: "add", src_alpha: "one", dst_alpha: "one_minus_src_alpha", op_alpha: "add" } }] } }) } // Threshold pipeline (for bloom extraction) if (_blit_vert && _threshold_frag) { _pipelines.threshold =gpu_mod.graphics_pipeline(_gpu, { vertex: _blit_vert, fragment: _threshold_frag, primitive: "triangle", cull: "none", face: "counter_clockwise", fill: "fill", vertex_buffer_descriptions: [{ slot: 0, pitch: 16, input_rate: "vertex" }], vertex_attributes: [ {location: 0, buffer_slot: 0, format: "float2", offset: 0}, {location: 1, buffer_slot: 0, format: "float2", offset: 8} ], target: { color_targets: [{format: _swapchain_format, blend: {enabled: false}}] } }) } // Blur pipeline if (_blit_vert && _blur_frag) { _pipelines.blur =gpu_mod.graphics_pipeline(_gpu, { vertex: _blit_vert, fragment: _blur_frag, primitive: "triangle", cull: "none", face: "counter_clockwise", fill: "fill", vertex_buffer_descriptions: [{ slot: 0, pitch: 16, input_rate: "vertex" }], vertex_attributes: [ {location: 0, buffer_slot: 0, format: "float2", offset: 0}, {location: 1, buffer_slot: 0, format: "float2", offset: 8} ], target: { color_targets: [{format: _swapchain_format, blend: {enabled: false}}] } }) } // Mask pipeline if (_blit_vert && _mask_frag) { _pipelines.mask =gpu_mod.graphics_pipeline(_gpu, { vertex: _blit_vert, fragment: _mask_frag, primitive: "triangle", cull: "none", face: "counter_clockwise", fill: "fill", vertex_buffer_descriptions: [{ slot: 0, pitch: 16, input_rate: "vertex" }], vertex_attributes: [ {location: 0, buffer_slot: 0, format: "float2", offset: 0}, {location: 1, buffer_slot: 0, format: "float2", offset: 8} ], target: { color_targets: [{ format: _swapchain_format, blend: { enabled: true, src_rgb: "src_alpha", dst_rgb: "one_minus_src_alpha", op_rgb: "add", src_alpha: "one", dst_alpha: "one_minus_src_alpha", op_alpha: "add" } }] } }) } // CRT pipeline if (_blit_vert && _crt_frag) { _pipelines.crt =gpu_mod.graphics_pipeline(_gpu, { vertex: _blit_vert, fragment: _crt_frag, primitive: "triangle", cull: "none", face: "counter_clockwise", fill: "fill", vertex_buffer_descriptions: [{ slot: 0, pitch: 16, input_rate: "vertex" }], vertex_attributes: [ {location: 0, buffer_slot: 0, format: "float2", offset: 0}, {location: 1, buffer_slot: 0, format: "float2", offset: 8} ], 
target: { color_targets: [{format: _swapchain_format, blend: {enabled: false}}] } }) } // Blit additive pipeline (for bloom compositing) if (_blit_vert && _blit_frag) { _pipelines.blit_add =gpu_mod.graphics_pipeline(_gpu, { vertex: _blit_vert, fragment: _blit_frag, primitive: "triangle", cull: "none", face: "counter_clockwise", fill: "fill", vertex_buffer_descriptions: [{ slot: 0, pitch: 16, input_rate: "vertex" }], vertex_attributes: [ {location: 0, buffer_slot: 0, format: "float2", offset: 0}, {location: 1, buffer_slot: 0, format: "float2", offset: 8} ], target: { color_targets: [{ format: _swapchain_format, blend: { enabled: true, src_rgb: "one", dst_rgb: "one", op_rgb: "add", src_alpha: "one", dst_alpha: "one", op_alpha: "add" } }] } }) } // SDF text pipeline if (_sprite_vert && _text_sdf_frag) { _pipelines.text_sdf =gpu_mod.graphics_pipeline(_gpu, { vertex: _sprite_vert, fragment: _text_sdf_frag, primitive: "triangle", cull: "none", face: "counter_clockwise", fill: "fill", vertex_buffer_descriptions: [{ slot: 0, pitch: 32, input_rate: "vertex" }], vertex_attributes: [ {location: 0, buffer_slot: 0, format: "float2", offset: 0}, {location: 1, buffer_slot: 0, format: "float2", offset: 8}, {location: 2, buffer_slot: 0, format: "float4", offset: 16} ], target: { color_targets: [{ format: _swapchain_format, blend: { enabled: true, src_rgb: "src_alpha", dst_rgb: "one_minus_src_alpha", op_rgb: "add", src_alpha: "one", dst_alpha: "one_minus_src_alpha", op_alpha: "add" } }] } }) } // MSDF text pipeline if (_sprite_vert && _text_msdf_frag) { _pipelines.text_msdf =gpu_mod.graphics_pipeline(_gpu, { vertex: _sprite_vert, fragment: _text_msdf_frag, primitive: "triangle", cull: "none", face: "counter_clockwise", fill: "fill", vertex_buffer_descriptions: [{ slot: 0, pitch: 32, input_rate: "vertex" }], vertex_attributes: [ {location: 0, buffer_slot: 0, format: "float2", offset: 0}, {location: 1, buffer_slot: 0, format: "float2", offset: 8}, {location: 2, buffer_slot: 0, format: "float4", offset: 16} ], target: { color_targets: [{ format: _swapchain_format, blend: { enabled: true, src_rgb: "src_alpha", dst_rgb: "one_minus_src_alpha", op_rgb: "add", src_alpha: "one", dst_alpha: "one_minus_src_alpha", op_alpha: "add" } }] } }) } // Accumulator pipeline if (_blit_vert && _accumulator_frag) { _pipelines.accumulator =gpu_mod.graphics_pipeline(_gpu, { vertex: _blit_vert, fragment: _accumulator_frag, primitive: "triangle", cull: "none", face: "counter_clockwise", fill: "fill", vertex_buffer_descriptions: [{ slot: 0, pitch: 16, input_rate: "vertex" }], vertex_attributes: [ {location: 0, buffer_slot: 0, format: "float2", offset: 0}, {location: 1, buffer_slot: 0, format: "float2", offset: 8} ], target: { color_targets: [{format: _swapchain_format, blend: {enabled: false}}] } }) } // Shape2D pipeline if (_sprite_vert && _shape2d_frag) { _pipelines.shape2d =gpu_mod.graphics_pipeline(_gpu, { vertex: _sprite_vert, fragment: _shape2d_frag, primitive: "triangle", cull: "none", face: "counter_clockwise", fill: "fill", vertex_buffer_descriptions: [{ slot: 0, pitch: 32, input_rate: "vertex" }], vertex_attributes: [ {location: 0, buffer_slot: 0, format: "float2", offset: 0}, {location: 1, buffer_slot: 0, format: "float2", offset: 8}, {location: 2, buffer_slot: 0, format: "float4", offset: 16} ], target: { color_targets: [{ format: _swapchain_format, blend: { enabled: true, src_rgb: "src_alpha", dst_rgb: "one_minus_src_alpha", op_rgb: "add", src_alpha: "one", dst_alpha: "one_minus_src_alpha", op_alpha: "add" } }] } }) } } 
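
// ------------------------------------------------------------------------
// Illustrative note (comments only, not executed): byte layout of the two
// vertex formats declared by the pipelines above, spelled out per vertex.
//
//   Sprite / shape / text vertex (pitch 32 bytes):
//     offset  0: pos.x, pos.y   -> float2, location 0
//     offset  8: u, v           -> float2, location 1
//     offset 16: r, g, b, a     -> float4, location 2
//
//   Blit / post-process vertex (pitch 16 bytes):
//     offset  0: pos.x, pos.y   -> float2, location 0
//     offset  8: u, v           -> float2, location 1
//
// One sprite quad is therefore 4 * 32 = 128 bytes of vertex data plus
// 6 * 2 = 12 bytes of 16-bit index data, which matches the blob sizes
// allocated in _build_sprite_vertices below.
// ------------------------------------------------------------------------
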
// ======================================================================== // TEXTURE MANAGEMENT // ======================================================================== function _create_gpu_texture(w, h, pixels) { var tex =gpu_mod.texture(_gpu, { width: w, height: h, format: "rgba8", type: "2d", layers: 1, mip_levels: 1, sampler: true }) var size = w * h * 4 var transfer =gpu_mod.transfer_buffer(_gpu, { size: size, usage: "upload" }) transfer.copy_blob(_gpu, pixels) var cmd = _gpu.acquire_cmd_buffer() var copy = cmd.copy_pass() copy.upload_to_texture( {transfer_buffer: transfer, offset: 0, pixels_per_row: w, rows_per_layer: h}, {texture: tex, x: 0, y: 0, z: 0, w: w, h: h, d: 1}, false ) copy.end() cmd.submit() tex.width = w tex.height = h return tex } function _load_image_file(path) { var bytes = io.slurp(path) var decoded if (!bytes) return null var ext = lower(pop(array(path, '.'))) var surface = null switch (ext) { case 'png': case 'jpg': case 'jpeg': case 'bmp': surface = png.decode(bytes) break case 'qoi': surface = qoi.decode(bytes) break case 'gif': decoded = gif.decode(bytes) if (decoded && decoded.frames && length(decoded.frames) > 0) { surface = decoded.frames[0] } break case 'ase': case 'aseprite': decoded = aseprite.decode(bytes) if (decoded && decoded.frames && length(decoded.frames) > 0) { surface = decoded.frames[0] } break } return surface } sdl_gpu.get_texture = function(path) { if (!path) return _white_texture // Check cache if (_texture_cache[path]) { return _texture_cache[path] } // Find and load image var fullpath = res.find_image(path) if (!fullpath) { log.console(`sdl_gpu: Image not found: ${path}`) return _white_texture } var surface = _load_image_file(fullpath) if (!surface || !surface.pixels) { log.console(`sdl_gpu: Failed to load image: ${path}`) return _white_texture } var tex = _create_gpu_texture(surface.width, surface.height, surface.pixels) _texture_cache[path] = tex return tex } // Get texture info (dimensions) for a path sdl_gpu.get_texture_info = function(path) { var tex = sdl_gpu.get_texture(path) if (!tex) return null return {width: tex.width, height: tex.height} } // ======================================================================== // RENDER TARGET MANAGEMENT // ======================================================================== sdl_gpu.get_or_create_target = function(width, height, key) { // Clamp dimensions to minimum 1x1 to prevent GPU errors if (!width || width < 1) width = 1 if (!height || height < 1) height = 1 var pool_key = `${width}x${height}` if (!_target_pool[pool_key]) _target_pool[pool_key] = [] // Reuse from pool if available // 1. Check if a target with this exact key already exists if (key) { var pool = _target_pool[pool_key] var idx = find(pool, function(t) { return t.key == key }) if (idx != null) { pool[idx].in_use = true return pool[idx] } } // 2. 
Otherwise prefer most recently used (LIFO) or just first available var pool = _target_pool[pool_key] var idx = find(pool, function(t) { return !t.in_use }) if (idx != null) { pool[idx].in_use = true pool[idx].key = key return pool[idx] } // Create new render target texture var tex =gpu_mod.texture(_gpu, { width: width, height: height, format: _swapchain_format, type: "2d", layers: 1, mip_levels: 1, sampler: true, color_target: true }) tex.width = width tex.height = height var target = { texture: tex, width: width, height: height, in_use: true, key: key } push(_target_pool[pool_key], target) return target } sdl_gpu.release_all_targets = function() { arrfor(array(_target_pool), function(pool_key) { arrfor(_target_pool[pool_key], function(target) { target.in_use = false }) }) } // ======================================================================== // GEOMETRY BUILDING // ======================================================================== // Build vertex data for sprites // Vertex format: pos(2) + uv(2) + color(4) = 8 floats = 32 bytes function _build_sprite_vertices(sprites, camera) { var floats_per_vertex = 8 var vertices_per_sprite = 4 var indices_per_sprite = 6 var vertex_data = blob_mod(length(sprites) * vertices_per_sprite * floats_per_vertex * 4) var index_data = blob_mod(length(sprites) * indices_per_sprite * 2) var vertex_count = 0 var white = {r: 1, g: 1, b: 1, a: 1} for(var i = 0; i < length(sprites); i++) { var s = sprites[i] var px = s.pos.x var py = s.pos.y var w = s.width || 1 var h = s.height || 1 var ax = s.anchor_x || 0 var ay = s.anchor_y || 0 var c = s.color || white // Apply tint and opacity var tint = s.tint || white var opacity = s.opacity != null ? s.opacity : 1 var final_r = c.r * tint.r var final_g = c.g * tint.g var final_b = c.b * tint.b var final_a = c.a * (tint.a != null ? tint.a : 1) * opacity // Apply anchor var x = px - w * ax var y = py - h * ay // UV coordinates (handle sprite rect if present) var u0 = s.uv_rect ? s.uv_rect.x : 0 var v0 = s.uv_rect ? s.uv_rect.y : 0 var u1 = s.uv_rect ? (s.uv_rect.x + s.uv_rect.width) : 1 var v1 = s.uv_rect ? 
(s.uv_rect.y + s.uv_rect.height) : 1

    // Apply UV transform (offset, scale, rotate)
    var uv = s.uv
    if (uv) {
      var uv_off = uv.offset || {x: 0, y: 0}
      var uv_scale = uv.scale || {x: 1, y: 1}
      // Apply scale and offset to UVs
      u0 = u0 * uv_scale.x + uv_off.x
      v0 = v0 * uv_scale.y + uv_off.y
      u1 = u1 * uv_scale.x + uv_off.x
      v1 = v1 * uv_scale.y + uv_off.y
    }

    // Apply flip
    var flip = s.flip
    if (flip) {
      if (flip.x) {
        var tmp = u0
        u0 = u1
        u1 = tmp
      }
      if (flip.y) {
        var tmp = v0
        v0 = v1
        v1 = tmp
      }
    }

    // Quad vertices (bottom-left, bottom-right, top-right, top-left)
    // v0: bottom-left
    vertex_data.wf(x)
    vertex_data.wf(y)
    vertex_data.wf(u0)
    vertex_data.wf(v1) // Flip V
    vertex_data.wf(final_r)
    vertex_data.wf(final_g)
    vertex_data.wf(final_b)
    vertex_data.wf(final_a)

    // v1: bottom-right
    vertex_data.wf(x + w)
    vertex_data.wf(y)
    vertex_data.wf(u1)
    vertex_data.wf(v1) // Flip V
    vertex_data.wf(final_r)
    vertex_data.wf(final_g)
    vertex_data.wf(final_b)
    vertex_data.wf(final_a)

    // v2: top-right
    vertex_data.wf(x + w)
    vertex_data.wf(y + h)
    vertex_data.wf(u1)
    vertex_data.wf(v0) // Flip V
    vertex_data.wf(final_r)
    vertex_data.wf(final_g)
    vertex_data.wf(final_b)
    vertex_data.wf(final_a)

    // v3: top-left
    vertex_data.wf(x)
    vertex_data.wf(y + h)
    vertex_data.wf(u0)
    vertex_data.wf(v0) // Flip V
    vertex_data.wf(final_r)
    vertex_data.wf(final_g)
    vertex_data.wf(final_b)
    vertex_data.wf(final_a)

    // Indices (two triangles)
    index_data.w16(vertex_count + 0)
    index_data.w16(vertex_count + 1)
    index_data.w16(vertex_count + 2)
    index_data.w16(vertex_count + 0)
    index_data.w16(vertex_count + 2)
    index_data.w16(vertex_count + 3)

    vertex_count += 4
  }

  return {
    vertices: stone(vertex_data),
    indices: stone(index_data),
    vertex_count: vertex_count,
    index_count: length(sprites) * 6
  }
}

// Build fullscreen quad for blit/post-processing
function _build_fullscreen_quad(dst_rect, target_width, target_height) {
  // Convert pixel rect to NDC
  var x0 = (dst_rect.x / target_width) * 2 - 1
  var y0 = (dst_rect.y / target_height) * 2 - 1
  var x1 = ((dst_rect.x + dst_rect.width) / target_width) * 2 - 1
  var y1 = ((dst_rect.y + dst_rect.height) / target_height) * 2 - 1

  var vertex_data = blob_mod(4 * 4 * 4) // 4 verts * 4 floats * 4 bytes
  var index_data = blob_mod(6 * 2)      // 6 indices * 2 bytes

  // Metal textures have origin at top-left (uv 0,0 = top-left of texture)
  // NDC has origin at center (y=-1 is bottom, y=1 is top)
  // So we need to flip V: screen bottom (y0) samples texture bottom (v=1)
  //                       screen top (y1) samples texture top (v=0)

  // v0: bottom-left (NDC) -> sample texture bottom-left (u=0, v=1)
  vertex_data.wf(x0); vertex_data.wf(y0); vertex_data.wf(0); vertex_data.wf(1); // v=1 (bottom of texture)
  // v1: bottom-right (NDC) -> sample texture bottom-right (u=1, v=1)
  vertex_data.wf(x1); vertex_data.wf(y0); vertex_data.wf(1); vertex_data.wf(1); // v=1 (bottom of texture)
  // v2: top-right (NDC) -> sample texture top-right (u=1, v=0)
  vertex_data.wf(x1); vertex_data.wf(y1); vertex_data.wf(1); vertex_data.wf(0); // v=0 (top of texture)
  // v3: top-left (NDC) -> sample texture top-left (u=0, v=0)
  vertex_data.wf(x0); vertex_data.wf(y1); vertex_data.wf(0); vertex_data.wf(0); // v=0 (top of texture)

  index_data.w16(0)
  index_data.w16(1)
  index_data.w16(2)
  index_data.w16(0)
  index_data.w16(2)
  index_data.w16(3)

  return {
    vertices: stone(vertex_data),
    indices: stone(index_data),
    vertex_count: 4,
    index_count: 6
  }
}
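
// Worked example (illustrative): for a 1280x720 target and
// dst_rect = {x: 0, y: 0, width: 640, height: 360}, the NDC corners are
//   x0 = (0   / 1280) * 2 - 1 = -1
//   y0 = (0   /  720) * 2 - 1 = -1
//   x1 = (640 / 1280) * 2 - 1 =  0
//   y1 = (360 /  720) * 2 - 1 =  0
// i.e. the quad covers the lower-left quadrant of the target, and because
// of the V flip described above its bottom edge (y0) samples v = 1, the
// bottom row of the source texture.
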
// ========================================================================
// MATRIX BUILDING
// ========================================================================

function _build_ortho_matrix(left, right, bottom, top, near, far) {
  var data = blob_mod(64)
  var m = []
  m[0] = 2 / (right - left)
  m[1] = 0
  m[2] = 0
  m[3] = 0
  m[4] = 0
  m[5] = 2 / (top - bottom)
  m[6] = 0
  m[7] = 0
  m[8] = 0
  m[9] = 0
  m[10] = -2 / (far - near)
  m[11] = 0
  m[12] = -(right + left) / (right - left)
  m[13] = -(top + bottom) / (top - bottom)
  m[14] = -(far + near) / (far - near)
  m[15] = 1
  for (var i = 0; i < 16; i++) data.wf(m[i])
  return stone(data)
}

function _build_camera_matrix(camera, target_width, target_height) {
  var pos = camera.pos || {x: 0, y: 0}
  var cam_width = camera.width || target_width
  var cam_height = camera.height || target_height
  var anchor = camera.anchor || {x: 0.5, y: 0.5}

  var left = pos.x - cam_width * anchor.x
  var right = pos.x + cam_width * (1 - anchor.x)
  var bottom = pos.y - cam_height * anchor.y
  var top = pos.y + cam_height * (1 - anchor.y)

  return _build_ortho_matrix(left, right, bottom, top, -1, 1)
}

// ========================================================================
// GRAPH EXECUTION
// ========================================================================

var ex = 0

sdl_gpu.execute_graph = function(graph, window_size, dbg = false) {
  _window_width = window_size.width
  _window_height = window_size.height

  // Execute graph to get all commands
  var result = graph.execute(this)
  if (dbg) {
    log.console(result)
    return
  }

  var all_commands = result.commands || []

  // Pre-load all textures needed
  _preload_textures(all_commands)

  // Execute commands
  _execute_commands(all_commands, window_size)

  // Release targets for next frame
  this.release_all_targets()
}

// Execute commands directly (from compositor)
sdl_gpu.execute_commands = function(commands, window_size, dbg = false) {
  _window_width = window_size.width
  _window_height = window_size.height

  if (dbg) {
    log.console(commands)
    return
  }

  // Pre-load all textures needed
  _preload_textures(commands)

  // Execute commands
  _execute_commands(commands, window_size)

  // Release targets for next frame
  this.release_all_targets()
}

function _preload_textures(commands) {
  var paths = {}
  arrfor(commands, function(cmd) {
    if (cmd.cmd == 'draw_batch' && cmd.texture) {
      if (is_text(cmd.texture)) paths[cmd.texture] = true
    }
  })

  // Load all textures (iterate the collected keys, same pattern as release_all_targets)
  arrfor(array(paths), function(path) { sdl_gpu.get_texture(path) })
}

function _execute_commands(commands, window_size) {
  var cmd_buffer = _gpu.acquire_cmd_buffer()
  var current_pass = null
  var current_target = null
  var current_camera = null
  var pending_draws = []
  var target

  // Cache swapchain texture for the duration of this command buffer
  var _swapchain_tex = null
  function get_swapchain_tex() {
    if (_swapchain_tex) return _swapchain_tex
    _swapchain_tex = cmd_buffer.acquire_swapchain_texture(_window)
    return _swapchain_tex
  }

  arrfor(commands, function(cmd) {
    switch (cmd.cmd) {
      case 'begin_render':
        // Flush pending draws
        if (current_pass && length(pending_draws) > 0) {
          _flush_draws(cmd_buffer, current_pass, pending_draws, current_camera, current_target)
          pending_draws = []
        }
        // End previous pass
        if (current_pass) {
          current_pass.end()
          current_pass = null
        }
        // Start new pass
        target = cmd.target
        var clear = cmd.clear
        if (target == 'screen') {
          var swap_tex = get_swapchain_tex()
          if (swap_tex) {
            current_pass = cmd_buffer.render_pass({
              color_targets: [{
                texture: swap_tex,
                load: clear ? "clear" : "load",
                store: "store",
                clear_color: clear ?
{r: clear.r, g: clear.g, b: clear.b, a: clear.a} : {r: 0, g: 0, b: 0, a: 0} }] }) current_target = {texture: swap_tex, width: swap_tex.width, height: swap_tex.height} } else { log.console("sdl_gpu: Failed to acquire swapchain texture") current_pass = null current_target = window_size } } else { current_pass = cmd_buffer.render_pass({ color_targets: [{ texture: target.texture, load: clear ? "clear" : "load", store: "store", clear_color: clear ? {r: clear.r, g: clear.g, b: clear.b, a: clear.a} : {r: 0, g: 0, b: 0, a: 0} }] }) current_target = target } break case 'set_camera': current_camera = cmd.camera break case 'draw_batch': push(pending_draws, cmd) break case 'draw_text': push(pending_draws, cmd) break case 'draw_texture_ref': push(pending_draws, cmd) break case 'draw_shape': push(pending_draws, cmd) break case 'draw_mesh2d': push(pending_draws, cmd) break case 'blit': // Flush pending draws first if (current_pass && length(pending_draws) > 0) { _flush_draws(cmd_buffer, current_pass, pending_draws, current_camera, current_target) pending_draws = [] } // End current pass - SDL blit works outside render passes if (current_pass) { current_pass.end() current_pass = null } _do_blit(cmd_buffer, cmd, current_target, get_swapchain_tex) break case 'apply_mask': // Flush pending draws first if (current_pass && length(pending_draws) > 0) { _flush_draws(cmd_buffer, current_pass, pending_draws, current_camera, current_target) pending_draws = [] } // End current pass - mask works as blit outside render pass if (current_pass) { current_pass.end() current_pass = null } _do_mask(cmd_buffer, cmd) break case 'shader_pass': // Flush pending draws first if (current_pass && length(pending_draws) > 0) { _flush_draws(cmd_buffer, current_pass, pending_draws, current_camera, current_target) pending_draws = [] } // End current pass - shader passes need their own render pass if (current_pass) { current_pass.end() current_pass = null } _do_shader_pass(cmd_buffer, cmd, get_swapchain_tex) break case 'composite_textures': // Flush pending draws first if (current_pass && length(pending_draws) > 0) { _flush_draws(cmd_buffer, current_pass, pending_draws, current_camera, current_target) pending_draws = [] } // End current pass if (current_pass) { current_pass.end() current_pass = null } _do_composite(cmd_buffer, cmd, window_size) break case 'end_render': // Flush pending draws if (current_pass && length(pending_draws) > 0) { _flush_draws(cmd_buffer, current_pass, pending_draws, current_camera, current_target) pending_draws = [] } if (current_pass) { current_pass.end() current_pass = null } break case 'imgui': // Flush pending draws first if (current_pass && length(pending_draws) > 0) { _flush_draws(cmd_buffer, current_pass, pending_draws, current_camera, current_target) pending_draws = [] } // ImGui needs to be outside a render pass for prepare, but inside for endframe if (current_pass) { current_pass.end() current_pass = null } var imgui_mod = use('imgui') if (cmd.draw) { cmd.draw(imgui_mod) } imgui_mod.prepare(cmd_buffer) // Restart pass to the same target for rendering target = cmd.target var swap_tex = null if (target == 'screen') { swap_tex = get_swapchain_tex() if (swap_tex) { current_pass = cmd_buffer.render_pass({ color_targets: [{ texture: swap_tex, load: "load", store: "store" }] }) } } else if (target && target.texture) { current_pass = cmd_buffer.render_pass({ color_targets: [{ texture: target.texture, load: "load", store: "store" }] }) } if (current_pass) { imgui_mod.endframe(cmd_buffer, current_pass) 
current_pass.end() current_pass = null } break case 'present': // Submit command buffer break } }) // Final flush if (current_pass && length(pending_draws) > 0) { _flush_draws(cmd_buffer, current_pass, pending_draws, current_camera, current_target) } if (current_pass) { current_pass.end() } cmd_buffer.submit() } function _flush_draws(cmd_buffer, pass, draws, camera, target) { var current_batch = null // Iterate draws preserving order arrfor(draws, function(draw) { if (draw.cmd == 'draw_batch') { // Sprite batch handling var tex_path = draw.texture || '_white' var blend = draw.material ? draw.material.blend : 'alpha' var sampler = draw.material ? draw.material.sampler : 'nearest' // Check if we can append to current batch if (current_batch && current_batch.type == 'sprites' && current_batch.texture_path == tex_path && current_batch.blend == blend && current_batch.sampler == sampler) { // Append sprites if (draw.geometry && draw.geometry.sprites) { current_batch.sprites = array(current_batch.sprites, draw.geometry.sprites) } } else { // Flush current batch if (current_batch) _render_batch(cmd_buffer, pass, current_batch, camera, target) // Start new sprite batch current_batch = { type: 'sprites', texture_path: tex_path, blend: blend, sampler: sampler, sprites: [] } if (draw.geometry && draw.geometry.sprites) current_batch.sprites = array(current_batch.sprites, draw.geometry.sprites) } } else if (draw.cmd == 'draw_text') { // Flush current batch if (current_batch) _render_batch(cmd_buffer, pass, current_batch, camera, target) current_batch = null // Render text immediately _render_text(cmd_buffer, pass, draw.drawable, camera, target) } else if (draw.cmd == 'draw_texture_ref') { // Flush current batch if (current_batch) _render_batch(cmd_buffer, pass, current_batch, camera, target) current_batch = null // Render pre-rendered effect texture _render_texture_ref(cmd_buffer, pass, draw.drawable, camera, target) } else if (draw.cmd == 'draw_shape') { // Flush current batch if (current_batch) _render_batch(cmd_buffer, pass, current_batch, camera, target) current_batch = null // Render shape immediately _render_shape(cmd_buffer, pass, draw.drawable, camera, target) } else if (draw.cmd == 'draw_mesh2d') { // Flush current batch if (current_batch) _render_batch(cmd_buffer, pass, current_batch, camera, target) current_batch = null // Render mesh2d batch _render_mesh2d(cmd_buffer, pass, draw, camera, target) } }) // Flush final batch if (current_batch) _render_batch(cmd_buffer, pass, current_batch, camera, target) } function _render_batch(cmd_buffer, pass, batch, camera, target) { if (batch.type == 'sprites') { if (length(batch.sprites) == 0) return var tex = batch.texture_path == '_white' ? 
_white_texture : sdl_gpu.get_texture(batch.texture_path) var geom = _build_sprite_vertices(batch.sprites, camera) // Upload geometry var vb_size = length(geom.vertices) / 8 var ib_size = length(geom.indices) / 8 var vb =gpu_mod.buffer(_gpu, {size: vb_size, vertex: true}) var ib =gpu_mod.buffer(_gpu, {size: ib_size, index: true}) var vb_transfer =gpu_mod.transfer_buffer(_gpu, {size: vb_size, usage: "upload"}) var ib_transfer =gpu_mod.transfer_buffer(_gpu, {size: ib_size, usage: "upload"}) vb_transfer.copy_blob(_gpu, geom.vertices) ib_transfer.copy_blob(_gpu, geom.indices) var copy_cmd = _gpu.acquire_cmd_buffer() var copy = copy_cmd.copy_pass() copy.upload_to_buffer({transfer_buffer: vb_transfer, offset: 0}, {buffer: vb, offset: 0, size: vb_size}, false) copy.upload_to_buffer({transfer_buffer: ib_transfer, offset: 0}, {buffer: ib, offset: 0, size: ib_size}, false) copy.end() copy_cmd.submit() // Build camera matrix var proj = _build_camera_matrix(camera, target.width, target.height) // Select pipeline var pipeline = batch.blend == 'add' ? _pipelines.sprite_add : _pipelines.sprite_alpha // Select sampler based on filter var sampler = (batch.sampler == 'linear') ? _sampler_linear : _sampler_nearest // Draw pass.bind_pipeline(pipeline) pass.bind_vertex_buffers(0, [{buffer: vb, offset: 0}]) pass.bind_index_buffer({buffer: ib, offset: 0}, 16) pass.bind_fragment_samplers(0, [{texture: tex, sampler: sampler}]) cmd_buffer.push_vertex_uniform_data(0, proj) pass.draw_indexed(geom.index_count, 1, 0, 0, 0) } } // Render a pre-rendered texture from an effect group function _render_texture_ref(cmd_buffer, pass, drawable, camera, target) { var tex_target = drawable.texture_target if (!tex_target) return // The texture_target is a compositor target reference - resolve it // It should have already been rendered to and we just need to blit it var pos = drawable.pos || {x: 0, y: 0} var width = drawable.width || target.width var height = drawable.height || target.height // Build a single sprite for the texture reference var sprites = [{ pos: pos, width: width, height: height, anchor_x: 0, anchor_y: 0, color: {r: 1, g: 1, b: 1, a: 1} }] var geom = _build_sprite_vertices(sprites, camera) // Upload geometry var vb_size = length(geom.vertices) / 8 var ib_size = length(geom.indices) / 8 var vb =gpu_mod.buffer(_gpu, {size: vb_size, vertex: true}) var ib =gpu_mod.buffer(_gpu, {size: ib_size, index: true}) var vb_transfer =gpu_mod.transfer_buffer(_gpu, {size: vb_size, usage: "upload"}) var ib_transfer =gpu_mod.transfer_buffer(_gpu, {size: ib_size, usage: "upload"}) vb_transfer.copy_blob(_gpu, geom.vertices) ib_transfer.copy_blob(_gpu, geom.indices) var copy_cmd = _gpu.acquire_cmd_buffer() var copy = copy_cmd.copy_pass() copy.upload_to_buffer({transfer_buffer: vb_transfer, offset: 0}, {buffer: vb, offset: 0, size: vb_size}, false) copy.upload_to_buffer({transfer_buffer: ib_transfer, offset: 0}, {buffer: ib, offset: 0, size: ib_size}, false) copy.end() copy_cmd.submit() // Build camera matrix var proj = _build_camera_matrix(camera, target.width, target.height) // Select pipeline based on blend mode var blend = drawable.blend || 'over' var pipeline = blend == 'add' ? 
_pipelines.sprite_add : _pipelines.sprite_alpha // The texture_target has a .texture property from the target pool var tex = tex_target.texture || tex_target if (!tex) return pass.bind_pipeline(pipeline) pass.bind_vertex_buffers(0, [{buffer: vb, offset: 0}]) pass.bind_index_buffer({buffer: ib, offset: 0}, 16) pass.bind_fragment_samplers(0, [{texture: tex, sampler: _sampler_linear}]) cmd_buffer.push_vertex_uniform_data(0, proj) pass.draw_indexed(geom.index_count, 1, 0, 0, 0) } function _render_shape(cmd_buffer, pass, drawable, camera, target) { if (!_pipelines.shape2d) return var pos = drawable.pos || {x: 0, y: 0} var w = drawable.width || 100 var h = drawable.height || 100 var ax = drawable.anchor_x != null ? drawable.anchor_x : 0.5 var ay = drawable.anchor_y != null ? drawable.anchor_y : 0.5 // Calculate padding required for stroke and feather var stroke_thickness = drawable.stroke_thickness || 0 var feather = drawable.feather != null ? drawable.feather : 0.5 // Resolve stroke alignment (0=inside, 0.5=center, 1=outside) var stroke_aligns = {inside: 0, center: 0.5, outside: 1} var sa = drawable.stroke_align in stroke_aligns ? stroke_aligns[drawable.stroke_align] : 0.5 var pad = feather if (stroke_thickness > 0) { if (sa > 0.75) pad += stroke_thickness // Outside else if (sa > 0.25) pad += stroke_thickness * 0.5 // Center // Inside adds 0 } // Expand quad by padding var x = pos.x - w * ax - pad var y = pos.y - h * ay - pad var qw = w + pad * 2 var qh = h + pad * 2 // Expand UVs to match padding (p calculation depends on this) // logical size is w, h. padded size is qw, qh. // 0..1 maps to w, h. // We need UVs such that (uv - 0.5) * w spans the padded area logic. // u=0 -> -0.5 * w = left edge of shape // target left edge is -pad relative to shape left edge. // So we need uv such that (uv - 0.5) * w = -w/2 - pad // uv * w - w/2 = -w/2 - pad // uv * w = -pad => uv = -pad / w var u0 = -pad / w var v0 = -pad / h var u1 = 1.0 + pad / w var v1 = 1.0 + pad / h var fill = drawable.fill || {r: 1, g: 1, b: 1, a: 1} var opacity = drawable.opacity != null ? 
drawable.opacity : 1 // Vertex data: pos(2) + uv(2) + color(4) = 8 floats = 32 bytes per vertex var vertex_data = blob_mod(4 * 32) var index_data = blob_mod(6 * 2) // v0: bottom-left vertex_data.wf(x); vertex_data.wf(y) vertex_data.wf(u0); vertex_data.wf(v1) vertex_data.wf(1); vertex_data.wf(1); vertex_data.wf(1); vertex_data.wf(1) // v1: bottom-right vertex_data.wf(x + qw); vertex_data.wf(y) vertex_data.wf(u1); vertex_data.wf(v1) vertex_data.wf(1); vertex_data.wf(1); vertex_data.wf(1); vertex_data.wf(1) // v2: top-right vertex_data.wf(x + qw); vertex_data.wf(y + qh) vertex_data.wf(u1); vertex_data.wf(v0) vertex_data.wf(1); vertex_data.wf(1); vertex_data.wf(1); vertex_data.wf(1) // v3: top-left vertex_data.wf(x); vertex_data.wf(y + qh) vertex_data.wf(u0); vertex_data.wf(v0) vertex_data.wf(1); vertex_data.wf(1); vertex_data.wf(1); vertex_data.wf(1) // Indices index_data.w16(0); index_data.w16(1); index_data.w16(2) index_data.w16(0); index_data.w16(2); index_data.w16(3) // Upload geometry var vb_size = 4 * 32 var ib_size = 6 * 2 var vb =gpu_mod.buffer(_gpu, {size: vb_size, vertex: true}) var ib =gpu_mod.buffer(_gpu, {size: ib_size, index: true}) var vb_transfer =gpu_mod.transfer_buffer(_gpu, {size: vb_size, usage: "upload"}) var ib_transfer =gpu_mod.transfer_buffer(_gpu, {size: ib_size, usage: "upload"}) vb_transfer.copy_blob(_gpu, stone(vertex_data)) ib_transfer.copy_blob(_gpu, stone(index_data)) var copy_cmd = _gpu.acquire_cmd_buffer() var copy = copy_cmd.copy_pass() copy.upload_to_buffer({transfer_buffer: vb_transfer, offset: 0}, {buffer: vb, offset: 0, size: vb_size}, false) copy.upload_to_buffer({transfer_buffer: ib_transfer, offset: 0}, {buffer: ib, offset: 0, size: ib_size}, false) copy.end() copy_cmd.submit() // Build camera matrix var proj = _build_camera_matrix(camera, target.width, target.height) // Build shape uniforms - must match ShapeParams struct in shader // Total size: 112 bytes var stroke = drawable.stroke || {r: 0, g: 0, b: 0, a: 0} var shape_types = {rect: 0, circle: 1, ellipse: 2, pill: 3} var shape_type = shape_types[drawable.shape_type] || 0 var u_data = blob_mod(112) u_data.wf(fill.r); u_data.wf(fill.g); u_data.wf(fill.b); u_data.wf(fill.a) // fill_color (16) u_data.wf(stroke.r); u_data.wf(stroke.g); u_data.wf(stroke.b); u_data.wf(stroke.a) // stroke_color (16) u_data.wf(w); u_data.wf(h) // size (8) - PASS LOGICAL SIZE u_data.wf(drawable.radius || 0) // radius (4) u_data.wf(feather) // feather (4) u_data.wf(stroke_thickness) // stroke_thickness (4) u_data.wf(sa) // stroke_align (4) u_data.w32(shape_type) // shape_type (4) u_data.w32(0) // corner_style (4) u_data.wf(drawable.dash_len || 0) // dash_len (4) u_data.wf(drawable.gap_len || 0) // gap_len (4) u_data.wf(drawable.dash_offset || 0) // dash_offset (4) u_data.w32(0) // cap_type (4) // uv_transform (16) if (drawable.uv && drawable.uv.scale) { u_data.wf(drawable.uv.scale.x); u_data.wf(drawable.uv.scale.y) u_data.wf(drawable.uv.offset.x); u_data.wf(drawable.uv.offset.y) } else { u_data.wf(1); u_data.wf(1); u_data.wf(0); u_data.wf(0) } u_data.wf(drawable.uv && drawable.uv.rotate ? 
drawable.uv.rotate : 0) // uv_rotate (4) u_data.w32(0) // has_texture (4) u_data.wf(opacity) // opacity (4) u_data.wf(0) // _pad (4) pass.bind_pipeline(_pipelines.shape2d) pass.bind_vertex_buffers(0, [{buffer: vb, offset: 0}]) pass.bind_index_buffer({buffer: ib, offset: 0}, 16) pass.bind_fragment_samplers(0, [{texture: _white_texture, sampler: _sampler_linear}]) cmd_buffer.push_vertex_uniform_data(0, proj) cmd_buffer.push_fragment_uniform_data(0, stone(u_data)) pass.draw_indexed(6, 1, 0, 0, 0) } function _render_mesh2d(cmd_buffer, pass, draw, camera, target) { var meshes = draw.meshes || [] if (length(meshes) == 0) return var tex_path = draw.texture var tex = tex_path ? sdl_gpu.get_texture(tex_path) : _white_texture var blend = draw.material ? draw.material.blend : 'alpha' var sampler_type = draw.material ? draw.material.sampler : 'linear' // Count total vertices and indices var total_verts = 0 var total_indices = 0 arrfor(meshes, function(m) { if (m.verts) total_verts += length(m.verts) if (m.indices) total_indices += length(m.indices) }) if (total_verts == 0 || total_indices == 0) return // Build combined vertex/index buffers // Vertex format: pos(2) + uv(2) + color(4) = 8 floats = 32 bytes var vertex_data = blob_mod(total_verts * 32) var index_data = blob_mod(total_indices * 2) var vertex_offset = 0 arrfor(meshes, function(m) { var verts = m.verts || [] var indices = m.indices || [] var opacity = m.opacity != null ? m.opacity : 1 var tint = m.tint || {r: 1, g: 1, b: 1, a: 1} // Write vertices arrfor(verts, function(v) { vertex_data.wf(v.x) vertex_data.wf(v.y) vertex_data.wf(v.u != null ? v.u : 0) vertex_data.wf(v.v != null ? v.v : 0) var r = (v.r != null ? v.r : 1) * tint.r var g = (v.g != null ? v.g : 1) * tint.g var b = (v.b != null ? v.b : 1) * tint.b var a = (v.a != null ? v.a : 1) * (tint.a != null ? tint.a : 1) * opacity vertex_data.wf(r) vertex_data.wf(g) vertex_data.wf(b) vertex_data.wf(a) }) // Write indices (offset by current vertex count) arrfor(indices, function(idx) { index_data.w16(vertex_offset + idx) }) vertex_offset += length(verts) }) // Upload geometry var vb_size = total_verts * 32 var ib_size = total_indices * 2 var vb =gpu_mod.buffer(_gpu, {size: vb_size, vertex: true}) var ib =gpu_mod.buffer(_gpu, {size: ib_size, index: true}) var vb_transfer =gpu_mod.transfer_buffer(_gpu, {size: vb_size, usage: "upload"}) var ib_transfer =gpu_mod.transfer_buffer(_gpu, {size: ib_size, usage: "upload"}) vb_transfer.copy_blob(_gpu, stone(vertex_data)) ib_transfer.copy_blob(_gpu, stone(index_data)) var copy_cmd = _gpu.acquire_cmd_buffer() var copy = copy_cmd.copy_pass() copy.upload_to_buffer({transfer_buffer: vb_transfer, offset: 0}, {buffer: vb, offset: 0, size: vb_size}, false) copy.upload_to_buffer({transfer_buffer: ib_transfer, offset: 0}, {buffer: ib, offset: 0, size: ib_size}, false) copy.end() copy_cmd.submit() // Build camera matrix var proj = _build_camera_matrix(camera, target.width, target.height) // Select pipeline var pipeline = blend == 'add' ? _pipelines.sprite_add : _pipelines.sprite_alpha // Select sampler var sampler = sampler_type == 'linear' ? 
_sampler_linear : _sampler_nearest // Draw pass.bind_pipeline(pipeline) pass.bind_vertex_buffers(0, [{buffer: vb, offset: 0}]) pass.bind_index_buffer({buffer: ib, offset: 0}, 16) pass.bind_fragment_samplers(0, [{texture: tex, sampler: sampler}]) cmd_buffer.push_vertex_uniform_data(0, proj) pass.draw_indexed(total_indices, 1, 0, 0, 0) } function _render_text(cmd_buffer, pass, drawable, camera, target) { // Get font - support mode tag: 'bitmap', 'sdf', 'msdf' var font_path = drawable.font var size = drawable.size || 16 var mode = drawable.mode || (drawable.sdf ? 'sdf' : 'bitmap') var font = _get_font_cache(font_path, size, mode) if (!font) return // Generate vertices using staef var pos = drawable.pos var text_pos = {x: pos.x, y: pos.y, width: 0, height: 0} var color = drawable.color || {r:1, g:1, b:1, a:1} // Handle anchor var ax = drawable.anchor_x || 0 var ay = drawable.anchor_y || 0 if (ax != 0 || ay != 0) { var dim = font.text_size(drawable.text) if (dim) { text_pos.x -= dim.x * ax text_pos.y -= dim.y * ay } } var mesh = font.make_text_buffer(drawable.text, text_pos, color) if (!mesh || !mesh.num_vertices) return // Interlace buffers manually var num_verts = mesh.num_vertices var interleaved = geometry.weave([{data:mesh.xy, stride: mesh.xy_stride}, {data:mesh.uv, stride: mesh.uv_stride}, {data:mesh.color, stride: mesh.color_stride}]) var indices = mesh.indices var num_indices = mesh.num_indices // Upload var vb_size = num_verts * 32 var ib_size = num_indices * 2 var vb =gpu_mod.buffer(_gpu, {size: vb_size, vertex: true}) var ib =gpu_mod.buffer(_gpu, {size: ib_size, index: true}) var vb_transfer =gpu_mod.transfer_buffer(_gpu, {size: vb_size, usage: "upload"}) var ib_transfer =gpu_mod.transfer_buffer(_gpu, {size: ib_size, usage: "upload"}) vb_transfer.copy_blob(_gpu, interleaved) ib_transfer.copy_blob(_gpu, indices) var copy_cmd = _gpu.acquire_cmd_buffer() var copy = copy_cmd.copy_pass() copy.upload_to_buffer({transfer_buffer: vb_transfer, offset: 0}, {buffer: vb, offset: 0, size: vb_size}, false) copy.upload_to_buffer({transfer_buffer: ib_transfer, offset: 0}, {buffer: ib, offset: 0, size: ib_size}, false) copy.end() copy_cmd.submit() // Setup pipeline var proj = _build_camera_matrix(camera, target.width, target.height) // Select pipeline based on mode var is_sdf = (mode == 'sdf') var is_msdf = (mode == 'msdf') if (is_msdf && _pipelines.text_msdf) { pass.bind_pipeline(_pipelines.text_msdf) // Build uniforms for MSDF // Struct: float outline_width, float sharpness, float2 _pad, float4 outline_color var u_data = blob_mod(32) // Convert outline_width from pixel-ish units to normalized SDF units // outline_width in drawable is in "visual" units, we need to normalize // A typical range is 0.0-0.3 in SDF units var outline_w = drawable.outline_width || 0 if (outline_w > 0) outline_w = outline_w / 100.0 // Scale down from user units u_data.wf(outline_w) // outline_width u_data.wf(font.sharpness || 1.0) // sharpness from font u_data.wf(0) // _pad.x u_data.wf(0) // _pad.y var oc = drawable.outline_color || {r:0, g:0, b:0, a:1} u_data.wf(oc.r) // outline_color.r u_data.wf(oc.g) // outline_color.g u_data.wf(oc.b) // outline_color.b u_data.wf(oc.a || 1) // outline_color.a cmd_buffer.push_fragment_uniform_data(0, stone(u_data)) } else if (is_sdf && _pipelines.text_sdf) { pass.bind_pipeline(_pipelines.text_sdf) // Build uniforms for SDF // Struct: float outline_width, float sharpness, float2 _pad, float4 outline_color var u_data = blob_mod(32) var outline_w = drawable.outline_width || 0 if (outline_w > 
0) outline_w = outline_w / 100.0 u_data.wf(outline_w) // outline_width u_data.wf(font.sharpness || 1.0) // sharpness from font u_data.wf(0) // _pad.x u_data.wf(0) // _pad.y var oc = drawable.outline_color || {r:0, g:0, b:0, a:1} u_data.wf(oc.r) u_data.wf(oc.g) u_data.wf(oc.b) u_data.wf(oc.a || 1) cmd_buffer.push_fragment_uniform_data(0, stone(u_data)) } else { pass.bind_pipeline(_pipelines.sprite_alpha) } pass.bind_vertex_buffers(0, [{buffer: vb, offset: 0}]) pass.bind_index_buffer({buffer: ib, offset: 0}, 16) // Bind font texture - use linear filtering for SDF/MSDF var font_tex = _get_font_texture(font, mode) var sampler = (is_sdf || is_msdf) ? _sampler_linear : _sampler_nearest pass.bind_fragment_samplers(0, [{texture: font_tex, sampler: sampler}]) cmd_buffer.push_vertex_uniform_data(0, proj) pass.draw_indexed(num_indices, 1, 0, 0, 0) } function _get_font_cache(path, size, mode) { // mode can be 'bitmap', 'sdf', 'msdf', or boolean (legacy) if (mode == true) mode = 'sdf' else if (mode == false || !mode) mode = 'bitmap' var key = `${path}.${size}.${mode}` if (_font_cache[key]) return _font_cache[key] var fullpath = res.find_font(path) if (!fullpath) return null var data = io.slurp(fullpath) if (!data) return null // Create staef font based on mode try { var font if (mode == 'msdf') { // MSDF: em_px=size, range_px=4, padding_px=6, sharpness=1.0 font =staef.msdf_font(data, size, 4.0, 6, 1.0) } else if (mode == 'sdf') { // SDF: em_px=size, range_px=12, padding_px=14, sharpness=1.0 font =staef.sdf_font(data, size, 12.0, 14, 1.0) } else { // Bitmap font =staef.font(data, size, false) } _font_cache[key] = font return font } catch(e) { log.console(`sdl_gpu: Failed to load font ${path}:${size}:${mode}: ${e.message}`) return null } } function _get_font_texture(font, is_sdf) { if (font._gpu_texture) return font._gpu_texture // Create texture from font.texture (pixels, width, height) var ftex = font.texture if (!ftex) return _white_texture // Use linear filtering for SDF? The tex creation just sets format, sampler state is in pipeline/bind. // We can reuse creation logic. var tex = _create_gpu_texture(ftex.width, ftex.height, ftex.pixels) font._gpu_texture = tex return tex } function _do_blit(cmd_buffer, cmd, current_target, get_swapchain_tex) { var src = cmd.texture var dst_rect = cmd.dst_rect var filter = cmd.filter || 'nearest' var target = cmd.target || current_target if (!src || !src.texture) return if (!target) return if (target == 'screen' || (!target.texture && target.width)) { // Cannot use SDL_BlitGPUTexture for screen/swapchain, must use render pass var swap_tex = (target == 'screen') ? 
get_swapchain_tex() : target.texture if (!swap_tex && target == 'screen') swap_tex = get_swapchain_tex() if (!swap_tex) return var pass = cmd_buffer.render_pass({ color_targets: [{ texture: swap_tex, load: "load", // Load existing content to blend layers properly store: "store" }] }) var win_size = sdl_gpu.get_window_size() var geom = _build_fullscreen_quad(dst_rect, win_size.width, win_size.height) _draw_textured_quad(pass, geom, src.texture, _pipelines.blit, filter) pass.end() } else { // Use render pass with alpha blending instead of SDL blit (which overwrites) if (!target || !target.texture) return var geom = _build_fullscreen_quad(dst_rect, target.width, target.height) var pass = cmd_buffer.render_pass({ color_targets: [{ texture: target.texture, load: "load", // IMPORTANT: Load existing content so we blend on top store: "store" }] }) _draw_textured_quad(pass, geom, src.texture, _pipelines.blit, filter) pass.end() } } function _draw_textured_quad(pass, geom, texture, pipeline, filter) { var vb_size = length(geom.vertices) / 8 var ib_size = length(geom.indices) / 8 var vb =gpu_mod.buffer(_gpu, {size: vb_size, vertex: true}) var ib =gpu_mod.buffer(_gpu, {size: ib_size, index: true}) var vb_transfer =gpu_mod.transfer_buffer(_gpu, {size: vb_size, usage: "upload"}) var ib_transfer =gpu_mod.transfer_buffer(_gpu, {size: ib_size, usage: "upload"}) vb_transfer.copy_blob(_gpu, geom.vertices) ib_transfer.copy_blob(_gpu, geom.indices) var copy_cmd = _gpu.acquire_cmd_buffer() var copy = copy_cmd.copy_pass() copy.upload_to_buffer({transfer_buffer: vb_transfer, offset: 0}, {buffer: vb, offset: 0, size: vb_size}, false) copy.upload_to_buffer({transfer_buffer: ib_transfer, offset: 0}, {buffer: ib, offset: 0, size: ib_size}, false) copy.end() copy_cmd.submit() pass.bind_pipeline(pipeline) pass.bind_vertex_buffers(0, [{buffer: vb, offset: 0}]) pass.bind_index_buffer({buffer: ib, offset: 0}, 16) var sampler = filter == 'linear' ? 
_sampler_linear : _sampler_nearest pass.bind_fragment_samplers(0, [{texture: texture, sampler: sampler}]) pass.draw_indexed(6, 1, 0, 0, 0) } function _do_mask(cmd_buffer, cmd) { var content = cmd.content_texture var mask = cmd.mask_texture var output = cmd.output var mode = cmd.mode || 'alpha' var invert = cmd.invert || false if (!content || !content.texture) return if (!mask || !mask.texture) return if (!output || !output.texture) return // Check if mask pipeline is available if (!_pipelines.mask) { log.console("sdl_gpu: Mask pipeline not available, falling back to blit") cmd_buffer.blit({ src: {texture: content.texture, x: 0, y: 0, width: content.width, height: content.height}, dst: {texture: output.texture, x: 0, y: 0, width: output.width, height: output.height}, load: "clear", filter: "nearest" }) return } // Build fullscreen quad var geom = _build_fullscreen_quad({x: 0, y: 0, width: output.width, height: output.height}, output.width, output.height) var vb_size = length(geom.vertices) / 8 var ib_size = length(geom.indices) / 8 var vb =gpu_mod.buffer(_gpu, {size: vb_size, vertex: true}) var ib =gpu_mod.buffer(_gpu, {size: ib_size, index: true}) var vb_transfer =gpu_mod.transfer_buffer(_gpu, {size: vb_size, usage: "upload"}) var ib_transfer =gpu_mod.transfer_buffer(_gpu, {size: ib_size, usage: "upload"}) vb_transfer.copy_blob(_gpu, geom.vertices) ib_transfer.copy_blob(_gpu, geom.indices) var copy_cmd = _gpu.acquire_cmd_buffer() var copy = copy_cmd.copy_pass() copy.upload_to_buffer({transfer_buffer: vb_transfer, offset: 0}, {buffer: vb, offset: 0, size: vb_size}, false) copy.upload_to_buffer({transfer_buffer: ib_transfer, offset: 0}, {buffer: ib, offset: 0, size: ib_size}, false) copy.end() copy_cmd.submit() // Build uniforms: invert, mode var uniform_data = blob_mod(16) uniform_data.wf(invert ? 1.0 : 0.0) // invert uniform_data.wf(mode == 'binary' ? 1.0 : 0.0) // mode (0=alpha, 1=binary) uniform_data.wf(0) // padding uniform_data.wf(0) // padding // Render to output var mask_pass = cmd_buffer.render_pass({ color_targets: [{ texture: output.texture, load: "clear", store: "store", clear_color: {r: 0, g: 0, b: 0, a: 0} }] }) mask_pass.bind_pipeline(_pipelines.mask) mask_pass.bind_vertex_buffers(0, [{buffer: vb, offset: 0}]) mask_pass.bind_index_buffer({buffer: ib, offset: 0}, 16) // Bind both content texture (slot 0) and mask texture (slot 1) mask_pass.bind_fragment_samplers(0, [ {texture: content.texture, sampler: _sampler_nearest}, {texture: mask.texture, sampler: _sampler_nearest} ]) cmd_buffer.push_fragment_uniform_data(0, stone(uniform_data)) mask_pass.draw_indexed(6, 1, 0, 0, 0) mask_pass.end() } function _do_shader_pass(cmd_buffer, cmd, get_swapchain_tex) { var shader = cmd.shader var input = cmd.input var output = cmd.output var uniforms = cmd.uniforms || {} if (!input || !input.texture) return if (output != 'screen' && (!output || !output.texture)) return // Select pipeline based on shader type var pipeline = null switch (shader) { case 'threshold': pipeline = _pipelines.threshold break case 'blur': pipeline = _pipelines.blur break case 'crt': pipeline = _pipelines.crt break case 'accumulator': pipeline = _pipelines.accumulator break case 'mask': pipeline = _pipelines.mask break default: log.console(`sdl_gpu: Unknown shader: ${shader}`) return } if (!pipeline) { log.console(`sdl_gpu: Pipeline not available for shader: ${shader}`) return } // Build fullscreen quad var out_w = output == 'screen' ? _window_width : output.width var out_h = output == 'screen' ? 
_window_height : output.height var geom = _build_fullscreen_quad({x: 0, y: 0, width: out_w, height: out_h}, out_w, out_h) // Upload geometry var vb_size = length(geom.vertices) / 8 var ib_size = length(geom.indices) / 8 var vb =gpu_mod.buffer(_gpu, {size: vb_size, vertex: true}) var ib =gpu_mod.buffer(_gpu, {size: ib_size, index: true}) var vb_transfer =gpu_mod.transfer_buffer(_gpu, {size: vb_size, usage: "upload"}) var ib_transfer =gpu_mod.transfer_buffer(_gpu, {size: ib_size, usage: "upload"}) vb_transfer.copy_blob(_gpu, geom.vertices) ib_transfer.copy_blob(_gpu, geom.indices) var copy_cmd = _gpu.acquire_cmd_buffer() var copy = copy_cmd.copy_pass() copy.upload_to_buffer({transfer_buffer: vb_transfer, offset: 0}, {buffer: vb, offset: 0, size: vb_size}, false) copy.upload_to_buffer({transfer_buffer: ib_transfer, offset: 0}, {buffer: ib, offset: 0, size: ib_size}, false) copy.end() copy_cmd.submit() // Build uniform buffer based on shader type var uniform_data = _build_shader_uniforms(shader, uniforms) // Start render pass to output target var pass if (output == 'screen') { var swap_tex = get_swapchain_tex() if (swap_tex) { pass = cmd_buffer.render_pass({ color_targets: [{ texture: swap_tex, load: "clear", store: "store", clear_color: {r: 0, g: 0, b: 0, a: 1} }] }) } else { return } // pass = cmd_buffer.swapchain_pass(_window, { // color_targets: [{ // load: "clear", // store: "store", // clear_color: {r: 0, g: 0, b: 0, a: 1} // }] // }) } else { pass = cmd_buffer.render_pass({ color_targets: [{ texture: output.texture, load: "clear", store: "store", clear_color: {r: 0, g: 0, b: 0, a: 0} }] }) } pass.bind_pipeline(pipeline) pass.bind_vertex_buffers(0, [{buffer: vb, offset: 0}]) pass.bind_index_buffer({buffer: ib, offset: 0}, 16) // Bind samplers var samplers = [{texture: input.texture, sampler: _sampler_linear}] if (cmd.extra_inputs) { arrfor(cmd.extra_inputs, function(extra) { push(samplers, {texture: extra.texture, sampler: _sampler_linear}) }) } pass.bind_fragment_samplers(0, samplers) if (uniform_data) { cmd_buffer.push_fragment_uniform_data(0, uniform_data) } pass.draw_indexed(6, 1, 0, 0, 0) pass.end() } function _build_shader_uniforms(shader, uniforms) { var data = blob_mod(64) // 16 floats max switch (shader) { case 'threshold': data.wf(uniforms.threshold || 0.8) data.wf(uniforms.intensity || 1.0) data.wf(0) // padding data.wf(0) // padding break case 'blur': var dir = uniforms.direction || {x: 1, y: 0} var texel = uniforms.texel_size || {x: 0.001, y: 0.001} data.wf(dir.x) data.wf(dir.y) data.wf(texel.x) data.wf(texel.y) break case 'crt': data.wf(uniforms.curvature || 0.1) data.wf(uniforms.scanline_intensity || 0.3) data.wf(uniforms.vignette || 0.2) data.wf(0) // padding var res = uniforms.resolution || {width: 1280, height: 720} data.wf(res.width) data.wf(res.height) data.wf(0) // padding data.wf(0) // padding break case 'accumulator': data.wf(uniforms.decay != null ? uniforms.decay : 0.9) data.wf(0) // padding data.wf(0) // padding data.wf(0) // padding break case 'mask': // channel: 0=alpha, 1=luminance // invert: 0=normal, 1=inverted data.wf(uniforms.channel != null ? uniforms.channel : 0) data.wf(uniforms.invert != null ? 
uniforms.invert : 0)
      data.wf(0) // padding
      data.wf(0) // padding
      break
    default:
      return null
  }
  return stone(data)
}

function _do_composite(cmd_buffer, cmd) {
  var base = cmd.base
  var overlay = cmd.overlay
  var output = cmd.output
  var mode = cmd.mode || 'over'

  if (!base || !base.texture || !overlay || !overlay.texture || !output || !output.texture) return

  // Build fullscreen quad
  var geom = _build_fullscreen_quad({x: 0, y: 0, width: output.width, height: output.height}, output.width, output.height)

  var vb_size = length(geom.vertices) / 8
  var ib_size = length(geom.indices) / 8
  var vb = gpu_mod.buffer(_gpu, {size: vb_size, vertex: true})
  var ib = gpu_mod.buffer(_gpu, {size: ib_size, index: true})
  var vb_transfer = gpu_mod.transfer_buffer(_gpu, {size: vb_size, usage: "upload"})
  var ib_transfer = gpu_mod.transfer_buffer(_gpu, {size: ib_size, usage: "upload"})
  vb_transfer.copy_blob(_gpu, geom.vertices)
  ib_transfer.copy_blob(_gpu, geom.indices)

  var copy_cmd = _gpu.acquire_cmd_buffer()
  var copy = copy_cmd.copy_pass()
  copy.upload_to_buffer({transfer_buffer: vb_transfer, offset: 0}, {buffer: vb, offset: 0, size: vb_size}, false)
  copy.upload_to_buffer({transfer_buffer: ib_transfer, offset: 0}, {buffer: ib, offset: 0, size: ib_size}, false)
  copy.end()
  copy_cmd.submit()

  // First render base to output (clear and draw)
  var base_pass = cmd_buffer.render_pass({
    color_targets: [{
      texture: output.texture,
      load: "clear",
      store: "store",
      clear_color: {r: 0, g: 0, b: 0, a: 0}
    }]
  })
  base_pass.bind_pipeline(_pipelines.blit)
  base_pass.bind_vertex_buffers(0, [{buffer: vb, offset: 0}])
  base_pass.bind_index_buffer({buffer: ib, offset: 0}, 16)
  base_pass.bind_fragment_samplers(0, [{texture: base.texture, sampler: _sampler_nearest}])
  base_pass.draw_indexed(6, 1, 0, 0, 0)
  base_pass.end()

  // Then render overlay with blend mode (load and blend on top)
  var overlay_pass = cmd_buffer.render_pass({
    color_targets: [{
      texture: output.texture,
      load: "load",
      store: "store"
    }]
  })
  // Use additive blend pipeline for bloom - use blit_add which has correct 16-byte vertex format
  var pipeline = mode == 'add' ? _pipelines.blit_add : _pipelines.blit
  overlay_pass.bind_pipeline(pipeline)
  overlay_pass.bind_vertex_buffers(0, [{buffer: vb, offset: 0}])
  overlay_pass.bind_index_buffer({buffer: ib, offset: 0}, 16)
  overlay_pass.bind_fragment_samplers(0, [{texture: overlay.texture, sampler: _sampler_linear}])
  overlay_pass.draw_indexed(6, 1, 0, 0, 0)
  overlay_pass.end()
}

return sdl_gpu
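
// ------------------------------------------------------------------------
// Usage sketch (comments only, illustrative): how a caller might drive this
// backend directly with a hand-built command list. The command shapes below
// mirror what _execute_commands consumes; in practice fx_graph or the
// compositor produces them. The module path and asset name are placeholders.
//
//   var backend = use('sdl_gpu')   // hypothetical module path
//   backend.init({title: "demo", width: 1280, height: 720})
//
//   var size = backend.get_window_size()
//   backend.execute_commands([
//     {cmd: 'begin_render', target: 'screen', clear: {r: 0, g: 0, b: 0, a: 1}},
//     {cmd: 'set_camera', camera: {pos: {x: 0, y: 0}, width: size.width, height: size.height}},
//     {cmd: 'draw_batch', texture: "player.png",   // placeholder asset name
//      geometry: {sprites: [{pos: {x: 100, y: 100}, width: 64, height: 64}]}},
//     {cmd: 'end_render'},
//     {cmd: 'present'}
//   ], size)
// ------------------------------------------------------------------------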