// sdl_gpu.cm - SDL3 GPU Backend for fx_graph
//
// Direct SDL3 GPU implementation - does NOT use prosperon.cm
// Handles window creation, GPU init, texture loading, and rendering

var video = use('sdl3/video')
var gpu_mod = use('sdl3/gpu')
var blob_mod = use('blob')
var io = use('fd')
var png = use('image/png')
var qoi = use('image/qoi')
var gif = use('image/gif')
var aseprite = use('image/aseprite')
var aseprite = use('image/aseprite')
var staef = use('staef')
var res = use('resources')
var geometry = use('geometry')

// Public module object; the backend API is attached to it below.
var sdl_gpu = {}

// Private state
var _gpu = null               // GPU device, created in init()
var _window = null            // window object, created in init()
var _swapchain_format = null  // texture format reported by the GPU for the window
var _window_width = 1280      // cached logical window size (default 1280x720)
var _window_height = 720

// Shaders (null until loaded; optional ones stay null when their file is missing)
var _sprite_vert = null
var _sprite_frag = null
var _blit_vert = null
var _blit_frag = null
var _threshold_frag = null
var _blur_frag = null
var _mask_frag = null
var _crt_frag = null
var _text_sdf_frag = null
var _text_msdf_frag = null

// Pipelines: name -> graphics pipeline
var _pipelines = {}

// Samplers
var _sampler_nearest = null
var _sampler_linear = null

// Texture cache: path -> GPU texture (width/height are attached to the texture object)
var _texture_cache = {}
var _white_texture = null  // 1x1 fallback used for untextured draws and failed loads

// Font cache: path.size -> font
var _font_cache = {}

// Render target pool: "<width>x<height>" -> array of {texture, width, height, in_use, key}
var _target_pool = {}

// ========================================================================
// INITIALIZATION
// ========================================================================

// Creates the window and GPU device, then loads shaders, samplers, the
// fallback white texture and all pipelines.
//
// opts (optional): {title, width, height}; falsy values fall back to
// "Prosperon", 1280 and 720 respectively.
// Returns true on success, false when the shaders could not be loaded.
sdl_gpu.init = function(opts) {
  var options = opts || {}
  _window_width = options.width || 1280
  _window_height = options.height || 720

  var window_desc = {
    title: options.title || "Prosperon",
    width: _window_width,
    height: _window_height
  }
  _window = new video.window(window_desc)

  var device_desc = {debug: true, shaders_msl: true, lowpower: true}
  _gpu = new gpu_mod.gpu(device_desc)
  _gpu.claim_window(_window)
  _swapchain_format = _gpu.swapchain_format(_window)

  // Nothing can be drawn without shaders, so bail out before creating more resources
  var shaders_ready = _load_shaders()
  if (!shaders_ready) {
    log.console("sdl_gpu: Failed to load shaders")
    return false
  }

  // Both samplers clamp to edge and differ only in the min/mag filter
  var make_clamped_sampler = function(filter) {
    return new gpu_mod.sampler(_gpu, {
      min_filter: filter,
      mag_filter: filter,
      u: "clamp_to_edge",
      v: "clamp_to_edge"
    })
  }
  _sampler_nearest = make_clamped_sampler("nearest")
  _sampler_linear = make_clamped_sampler("linear")

  // 1x1 white texture for untextured draws
  // NOTE(review): presumably blob(32, true) is 32 set bits, i.e. one opaque white RGBA pixel - confirm against the blob API
  var white_pixel = stone(new blob_mod(32, true))
  _white_texture = _create_gpu_texture(1, 1, white_pixel)

  _create_pipelines()

  log.console("sdl_gpu: Initialized")
  return true
}

// Returns the window object created by init(), or null if init() has not run yet.
sdl_gpu.get_window = function() {
  return _window
}

// Records a new window size (w, h). This only updates the cached values;
// it does not touch the window object itself.
sdl_gpu.set_window_size = function(w, h) {
  _window_width = w
  _window_height = h
}

// Returns the cached window size as a fresh {width, height} object.
sdl_gpu.get_window_size = function() {
  return {width: _window_width, height: _window_height}
}

// ========================================================================
// SHADER LOADING
// ========================================================================

// Reads the MSL shader sources from shaders/msl/ and creates the shader objects.
//
// The sprite vertex/fragment pair is mandatory: when either source is missing
// the function logs and returns false. Every other shader is optional and its
// module-level variable stays null when the file could not be read.
// Returns true once the mandatory shaders exist.
function _load_shaders() {
  // Vertex stage: entry point "vertex_main", no samplers declared
  var make_vertex = function(source, uniform_buffers) {
    return new gpu_mod.shader(_gpu, {
      code: source,
      stage: "vertex",
      format: "msl",
      entrypoint: "vertex_main",
      num_uniform_buffers: uniform_buffers
    })
  }

  // Fragment stage: entry point "fragment_main"
  var make_fragment = function(source, uniform_buffers, samplers) {
    return new gpu_mod.shader(_gpu, {
      code: source,
      stage: "fragment",
      format: "msl",
      entrypoint: "fragment_main",
      num_uniform_buffers: uniform_buffers,
      num_samplers: samplers
    })
  }

  // Sources read up front (properties are evaluated top to bottom)
  var src = {
    sprite_vert: io.slurp("shaders/msl/sprite2d.vert.msl"),
    sprite_frag: io.slurp("shaders/msl/sprite2d.frag.msl"),
    blit_vert: io.slurp("shaders/msl/blit.vert.msl"),
    blit_frag: io.slurp("shaders/msl/blit.frag.msl"),
    threshold_frag: io.slurp("shaders/msl/threshold.frag.msl"),
    blur_frag: io.slurp("shaders/msl/blur.frag.msl"),
    mask_frag: io.slurp("shaders/msl/mask.frag.msl"),
    text_sdf_frag: io.slurp("shaders/msl/text_sdf.frag.msl")
  }

  if (!(src.sprite_vert && src.sprite_frag)) {
    log.console("sdl_gpu: Missing sprite shaders")
    return false
  }

  _sprite_vert = make_vertex(src.sprite_vert, 1)
  _sprite_frag = make_fragment(src.sprite_frag, 0, 1)

  // The blit pair is only useful together
  if (src.blit_vert && src.blit_frag) {
    _blit_vert = make_vertex(src.blit_vert, 0)
    _blit_frag = make_fragment(src.blit_frag, 0, 1)
  }

  if (src.threshold_frag)
    _threshold_frag = make_fragment(src.threshold_frag, 1, 1)

  if (src.blur_frag)
    _blur_frag = make_fragment(src.blur_frag, 1, 1)

  // The mask shader declares two uniform buffers and two samplers
  if (src.mask_frag)
    _mask_frag = make_fragment(src.mask_frag, 2, 2)

  if (src.text_sdf_frag)
    _text_sdf_frag = make_fragment(src.text_sdf_frag, 1, 1)

  // The remaining two sources are read lazily, after the shaders above exist
  var msdf_source = io.slurp("shaders/msl/text_msdf.frag.msl")
  if (msdf_source)
    _text_msdf_frag = make_fragment(msdf_source, 1, 1)

  var crt_source = io.slurp("shaders/msl/crt.frag.msl")
  if (crt_source)
    _crt_frag = make_fragment(crt_source, 1, 1)

  return true
}

// ========================================================================
// PIPELINE CREATION
// ========================================================================

// Creates every graphics pipeline and stores it in _pipelines by name.
//
// Two vertex layouts are used:
//   sprite: pos(2) + uv(2) + color(4) = 8 floats = 32 bytes (sprite vertex shader)
//   quad:   pos(2) + uv(2)            = 4 floats = 16 bytes (blit vertex shader)
// All pipelines render triangles, unculled, into the swapchain format.
// sprite_alpha and sprite_add are always created; the others only when the
// shaders they need were loaded.
function _create_pipelines() {
  // Each factory returns fresh objects so no descriptor is shared between pipelines
  var sprite_attributes = function() {
    return [
      {location: 0, buffer_slot: 0, format: "float2", offset: 0},  // pos
      {location: 1, buffer_slot: 0, format: "float2", offset: 8},  // uv
      {location: 2, buffer_slot: 0, format: "float4", offset: 16}  // color
    ]
  }

  var quad_attributes = function() {
    return [
      {location: 0, buffer_slot: 0, format: "float2", offset: 0},  // pos
      {location: 1, buffer_slot: 0, format: "float2", offset: 8}   // uv
    ]
  }

  // Standard alpha blending
  var alpha_blend = function() {
    return {
      enabled: true,
      src_rgb: "src_alpha",
      dst_rgb: "one_minus_src_alpha",
      op_rgb: "add",
      src_alpha: "one",
      dst_alpha: "one_minus_src_alpha",
      op_alpha: "add"
    }
  }

  // Additive blending (bloom)
  var additive_blend = function() {
    return {
      enabled: true,
      src_rgb: "one",
      dst_rgb: "one",
      op_rgb: "add",
      src_alpha: "one",
      dst_alpha: "one",
      op_alpha: "add"
    }
  }

  var no_blend = function() {
    return {enabled: false}
  }

  var make_pipeline = function(vertex, fragment, pitch, attributes, blend) {
    return new gpu_mod.graphics_pipeline(_gpu, {
      vertex: vertex,
      fragment: fragment,
      primitive: "triangle",
      cull: "none",
      face: "counter_clockwise",
      fill: "fill",
      vertex_buffer_descriptions: [{
        slot: 0,
        pitch: pitch,
        input_rate: "vertex"
      }],
      vertex_attributes: attributes,
      target: {
        color_targets: [{format: _swapchain_format, blend: blend}]
      }
    })
  }

  var sprite_pipeline = function(fragment, blend) {
    return make_pipeline(_sprite_vert, fragment, 32, sprite_attributes(), blend)
  }

  var quad_pipeline = function(fragment, blend) {
    return make_pipeline(_blit_vert, fragment, 16, quad_attributes(), blend)
  }

  // Sprite pipelines (alpha blend, and additive blend for bloom)
  _pipelines.sprite_alpha = sprite_pipeline(_sprite_frag, alpha_blend())
  _pipelines.sprite_add = sprite_pipeline(_sprite_frag, additive_blend())

  // Blit pipeline (for fullscreen passes)
  if (_blit_vert && _blit_frag)
    _pipelines.blit = quad_pipeline(_blit_frag, alpha_blend())

  // Threshold pipeline (for bloom extraction)
  if (_blit_vert && _threshold_frag)
    _pipelines.threshold = quad_pipeline(_threshold_frag, no_blend())

  // Blur pipeline
  if (_blit_vert && _blur_frag)
    _pipelines.blur = quad_pipeline(_blur_frag, no_blend())

  // Mask pipeline
  if (_blit_vert && _mask_frag)
    _pipelines.mask = quad_pipeline(_mask_frag, alpha_blend())

  // CRT pipeline
  if (_blit_vert && _crt_frag)
    _pipelines.crt = quad_pipeline(_crt_frag, no_blend())

  // Blit additive pipeline (for bloom compositing)
  if (_blit_vert && _blit_frag)
    _pipelines.blit_add = quad_pipeline(_blit_frag, additive_blend())

  // SDF text pipeline
  if (_sprite_vert && _text_sdf_frag)
    _pipelines.text_sdf = sprite_pipeline(_text_sdf_frag, alpha_blend())

  // MSDF text pipeline
  if (_sprite_vert && _text_msdf_frag)
    _pipelines.text_msdf = sprite_pipeline(_text_msdf_frag, alpha_blend())
}

// ========================================================================
// TEXTURE MANAGEMENT
// ========================================================================

// Creates a sampleable 2D "rgba8" texture of w x h and fills it with `pixels`.
//
// The pixel blob is staged in an upload transfer buffer of w * h * 4 bytes and
// copied with a dedicated command buffer that is submitted right away.
// Returns the texture with `width` and `height` attached as properties.
function _create_gpu_texture(w, h, pixels) {
  var texture_desc = {
    width: w,
    height: h,
    format: "rgba8",
    type: "2d",
    layers: 1,
    mip_levels: 1,
    sampler: true
  }
  var texture = new gpu_mod.texture(_gpu, texture_desc)

  // Four bytes per pixel
  var staging = new gpu_mod.transfer_buffer(_gpu, {
    size: w * h * 4,
    usage: "upload"
  })
  staging.copy_blob(_gpu, pixels)

  var source_region = {transfer_buffer: staging, offset: 0, pixels_per_row: w, rows_per_layer: h}
  var dest_region = {texture: texture, x: 0, y: 0, z: 0, w: w, h: h, d: 1}

  var upload_cmd = _gpu.acquire_cmd_buffer()
  var upload_pass = upload_cmd.copy_pass()
  upload_pass.upload_to_texture(source_region, dest_region, false)
  upload_pass.end()
  upload_cmd.submit()

  // Callers read the dimensions straight off the texture object
  texture.width = w
  texture.height = h
  return texture
}

// Reads the file at `path` and decodes it according to its (case-insensitive)
// extension. Returns the decoded surface, the first frame for gif/aseprite
// files, or null when the file cannot be read, the extension is unknown, or
// an animation has no frames.
function _load_image_file(path) {
  var bytes = io.slurp(path)
  if (!bytes) return null

  // Animated formats: only the first frame is used
  var first_frame = function(animation) {
    if (animation && animation.frames && animation.frames.length > 0)
      return animation.frames[0]
    return null
  }

  var ext = path.split('.').pop().toLowerCase()

  // NOTE(review): jpg/jpeg/bmp are routed through the png decoder - confirm it accepts them
  if (ext == 'png' || ext == 'jpg' || ext == 'jpeg' || ext == 'bmp')
    return png.decode(bytes)

  if (ext == 'qoi')
    return qoi.decode(bytes)

  if (ext == 'gif')
    return first_frame(gif.decode(bytes))

  if (ext == 'ase' || ext == 'aseprite')
    return first_frame(aseprite.decode(bytes))

  return null
}

// Returns the GPU texture for an image path, loading and caching it on first use.
//
// A falsy path, an image that cannot be located, or one that fails to decode
// all yield the shared white texture. Failures are logged and NOT cached, so
// the lookup is attempted again on the next call.
sdl_gpu.get_texture = function(path) {
  if (!path) return _white_texture

  var cached = _texture_cache[path]
  if (cached) return cached

  // Resolve the logical name to a file on disk
  var resolved = res.find_image(path)
  if (!resolved) {
    log.console(`sdl_gpu: Image not found: ${path}`)
    return _white_texture
  }

  var image = _load_image_file(resolved)
  if (!(image && image.pixels)) {
    log.console(`sdl_gpu: Failed to load image: ${path}`)
    return _white_texture
  }

  // Cached under the requested path, not the resolved one
  var texture = _create_gpu_texture(image.width, image.height, image.pixels)
  _texture_cache[path] = texture
  return texture
}

// ========================================================================
// RENDER TARGET MANAGEMENT
// ========================================================================

// Hands out a render target of the requested size, reusing a free one from
// the pool when possible and creating a new texture otherwise.
//
// width/height: falsy or < 1 values are clamped to 1 to prevent GPU errors.
// key: caller-supplied tag stored on the target.
// Returns {texture, width, height, in_use, key} with in_use set to true.
sdl_gpu.get_or_create_target = function(width, height, key) {
  var w = (!width || width < 1) ? 1 : width
  var h = (!height || height < 1) ? 1 : height

  // Targets are pooled per exact size
  var pool_key = `${w}x${h}`
  var bucket = _target_pool[pool_key]
  if (!bucket) {
    bucket = []
    _target_pool[pool_key] = bucket
  }

  // First free target in the bucket wins
  for (var i = 0; i < bucket.length; i++) {
    var candidate = bucket[i]
    if (candidate.in_use) continue
    candidate.in_use = true
    candidate.key = key
    return candidate
  }

  // Nothing free: create a new texture usable both as color target and as sampler source
  var texture = new gpu_mod.texture(_gpu, {
    width: w,
    height: h,
    format: _swapchain_format,
    type: "2d",
    layers: 1,
    mip_levels: 1,
    sampler: true,
    color_target: true
  })
  texture.width = w
  texture.height = h

  var created = {
    texture: texture,
    width: w,
    height: h,
    in_use: true,
    key: key
  }
  bucket.push(created)
  return created
}

// Marks every pooled render target as free so it can be handed out again.
// The textures themselves are kept.
sdl_gpu.release_all_targets = function() {
  for (var pool_key in _target_pool) {
    var bucket = _target_pool[pool_key]
    for (var i = 0; i < bucket.length; i++)
      bucket[i].in_use = false
  }
}

// ========================================================================
// GEOMETRY BUILDING
// ========================================================================

// Builds the vertex and index blobs for a list of sprite quads.
// Vertex format: pos(2) + uv(2) + color(4) = 8 floats = 32 bytes
//
// Each sprite may provide: pos ({x, y} or [x, y]), width/height (default 1),
// anchor_x/anchor_y (default 0), color ({r, g, b, a}, default white) and
// uv_rect ({x, y, width, height}, default the full texture).
// `camera` is accepted but not used here.
// Indices are written as 16-bit values, four vertices per sprite.
// Returns {vertices, indices, vertex_count, index_count}.
function _build_sprite_vertices(sprites, camera) {
  var quad_count = sprites.length

  // 4 vertices * 8 floats * 4 bytes, and 6 indices * 2 bytes, per sprite
  var vertex_data = new blob_mod(quad_count * 4 * 8 * 4)
  var index_data = new blob_mod(quad_count * 6 * 2)

  // Reads one coordinate from either the {x, y} or the [x, y] form
  var axis = function(pos, name, slot) {
    return pos[name] != null ? pos[name] : (pos[slot] || 0)
  }

  var put_vertex = function(vx, vy, u, v, tint) {
    vertex_data.wf(vx)
    vertex_data.wf(vy)
    vertex_data.wf(u)
    vertex_data.wf(v)
    vertex_data.wf(tint.r)
    vertex_data.wf(tint.g)
    vertex_data.wf(tint.b)
    vertex_data.wf(tint.a)
  }

  var base = 0
  for (var i = 0; i < quad_count; i++) {
    var s = sprites[i]
    var w = s.width || 1
    var h = s.height || 1
    var tint = s.color || {r: 1, g: 1, b: 1, a: 1}

    // The anchor shifts the quad so that pos lands on the anchored point
    var left = axis(s.pos, 'x', 0) - w * (s.anchor_x || 0)
    var bottom = axis(s.pos, 'y', 1) - h * (s.anchor_y || 0)
    var right = left + w
    var top = bottom + h

    // UV rectangle (whole texture unless the sprite carries a uv_rect)
    var rect = s.uv_rect
    var u0 = rect ? rect.x : 0
    var v0 = rect ? rect.y : 0
    var u1 = rect ? (rect.x + rect.width) : 1
    var v1 = rect ? (rect.y + rect.height) : 1

    // V is flipped: the bottom edge samples v1, the top edge samples v0
    put_vertex(left, bottom, u0, v1, tint)   // bottom-left
    put_vertex(right, bottom, u1, v1, tint)  // bottom-right
    put_vertex(right, top, u1, v0, tint)     // top-right
    put_vertex(left, top, u0, v0, tint)      // top-left

    // Two triangles: (0, 1, 2) and (0, 2, 3)
    index_data.w16(base + 0)
    index_data.w16(base + 1)
    index_data.w16(base + 2)
    index_data.w16(base + 0)
    index_data.w16(base + 2)
    index_data.w16(base + 3)

    base += 4
  }

  return {
    vertices: stone(vertex_data),
    indices: stone(index_data),
    vertex_count: base,
    index_count: quad_count * 6
  }
}

// Builds a textured quad (pos(2) + uv(2) per vertex) covering `dst_rect`
// inside a target of target_width x target_height, for blit/post-processing.
//
// dst_rect is {x, y, width, height} in pixels and is mapped to NDC [-1, 1].
// Returns {vertices, indices, vertex_count: 4, index_count: 6}.
function _build_fullscreen_quad(dst_rect, target_width, target_height) {
  // Pixel coordinate -> normalized device coordinate
  var to_ndc = function(value, extent) {
    return (value / extent) * 2 - 1
  }

  var x0 = to_ndc(dst_rect.x, target_width)
  var y0 = to_ndc(dst_rect.y, target_height)
  var x1 = to_ndc(dst_rect.x + dst_rect.width, target_width)
  var y1 = to_ndc(dst_rect.y + dst_rect.height, target_height)

  // Metal textures have their origin at the top-left (uv 0,0), while NDC y=-1
  // is the bottom of the screen, so V is flipped: the bottom edge (y0) samples
  // v=1 and the top edge (y1) samples v=0.
  var corners = [
    [x0, y0, 0, 1],  // bottom-left
    [x1, y0, 1, 1],  // bottom-right
    [x1, y1, 1, 0],  // top-right
    [x0, y1, 0, 0]   // top-left
  ]

  var vertex_data = new blob_mod(4 * 4 * 4) // 4 verts * 4 floats * 4 bytes
  for (var i = 0; i < corners.length; i++) {
    var corner = corners[i]
    for (var j = 0; j < corner.length; j++)
      vertex_data.wf(corner[j])
  }

  // Two triangles sharing the 0-2 diagonal
  var order = [0, 1, 2, 0, 2, 3]
  var index_data = new blob_mod(6 * 2) // 6 indices * 2 bytes
  for (var k = 0; k < order.length; k++)
    index_data.w16(order[k])

  return {
    vertices: stone(vertex_data),
    indices: stone(index_data),
    vertex_count: 4,
    index_count: 6
  }
}

// ========================================================================
// MATRIX BUILDING
// ========================================================================

// Builds an orthographic projection matrix and returns it as a blob of 16
// floats. The translation terms occupy elements 12-14.
function _build_ortho_matrix(left, right, bottom, top, near, far) {
  var span_x = right - left
  var span_y = top - bottom
  var span_z = far - near

  var elements = [
    2 / span_x, 0, 0, 0,
    0, 2 / span_y, 0, 0,
    0, 0, -2 / span_z, 0,
    -(right + left) / span_x, -(top + bottom) / span_y, -(far + near) / span_z, 1
  ]

  var data = new blob_mod(64) // 16 floats * 4 bytes
  for (var value of elements)
    data.wf(value)

  return stone(data)
}

// Builds the orthographic projection for a 2D camera.
//
// camera: {pos: [x, y], width, height, anchor: [ax, ay]}; every field is
//   optional. pos defaults to [0, 0], the size to the target size and the
//   anchor to [0.5, 0.5] (pos is the center of the view). A null/undefined
//   camera is treated as an empty one, because draws can be flushed before
//   any set_camera command has been seen.
// target_width/target_height: fallback view size.
// Returns the matrix produced by _build_ortho_matrix with near=-1, far=1.
function _build_camera_matrix(camera, target_width, target_height) {
  var cam = camera || {}
  var pos = cam.pos || [0, 0]
  var cam_width = cam.width || target_width
  var cam_height = cam.height || target_height
  var anchor = cam.anchor || [0.5, 0.5]

  // The anchor says which fraction of the view lies below/left of pos
  var left = pos[0] - cam_width * anchor[0]
  var right = pos[0] + cam_width * (1 - anchor[0])
  var bottom = pos[1] - cam_height * anchor[1]
  var top = pos[1] + cam_height * (1 - anchor[1])

  return _build_ortho_matrix(left, right, bottom, top, -1, 1)
}

// ========================================================================
// GRAPH EXECUTION
// ========================================================================
// NOTE(review): not referenced in the visible part of this file
var ex = 0

// Runs one frame of a render graph.
//
// graph: object whose execute(backend) returns {commands: [...]}.
// window_size: {width, height}; stored as the current window size.
// dbg: when true the graph result is only logged and nothing is rendered.
//
// Render targets handed out while the graph executes are released before
// returning on both paths, so repeated debug calls do not keep targets
// marked as in use and force the pool to grow.
sdl_gpu.execute_graph = function(graph, window_size, dbg = false) {
  _window_width = window_size.width
  _window_height = window_size.height
  
  // Execute graph to get all commands
  var result = graph.execute(this)
  if (dbg) {
    log.console(result)
    // Give back whatever the graph acquired even though nothing is drawn
    this.release_all_targets()
    return
  }
  var all_commands = result.commands || []

  // Pre-load all textures needed
  _preload_textures(all_commands)
  
  // Execute commands
  _execute_commands(all_commands, window_size)
  
  // Release targets for next frame
  this.release_all_targets()
}

// Loads every texture referenced by path in the given command list, so
// later draws hit the texture cache. Only 'draw_batch' commands whose
// `texture` is a non-empty string are considered; each distinct path is
// requested once, in order of first appearance.
function _preload_textures(commands) {
  // Object used as an ordered set of paths
  var wanted = {}

  for (var i = 0; i < commands.length; i++) {
    var entry = commands[i]
    if (entry.cmd != 'draw_batch') continue
    if (!entry.texture || typeof entry.texture != 'string') continue
    wanted[entry.texture] = true
  }

  for (var texture_path in wanted)
    sdl_gpu.get_texture(texture_path)
}

// Replays a list of graph commands on a single command buffer and submits it.
//
// Draw commands ('draw_batch', 'draw_text') are queued and flushed into the
// active render pass whenever that pass is about to end: on the next
// 'begin_render', on 'end_render', before 'blit' / 'apply_mask' /
// 'shader_pass' / 'composite_textures' (which run outside the current pass),
// and once more after the last command.
//
// NOTE(review): draws queued while no pass is active are kept, not dropped,
// so they end up in the next pass that gets flushed - confirm this is intended.
function _execute_commands(commands, window_size) {
  var cmd_buffer = _gpu.acquire_cmd_buffer()
  var active_pass = null
  var active_target = null
  var active_camera = null
  var queued_draws = []

  // Flushes queued draws into the active pass and ends it; no-op without a pass
  var close_pass = function() {
    if (!active_pass) return

    if (queued_draws.length > 0) {
      _flush_draws(cmd_buffer, active_pass, queued_draws, active_camera, active_target)
      queued_draws = []
    }

    active_pass.end()
    active_pass = null
  }

  // Starts the render pass described by a 'begin_render' command
  var open_pass = function(begin) {
    var target = begin.target
    var clear = begin.clear
    var to_screen = target == 'screen'

    var attachment = {
      texture: to_screen ? null : target.texture,
      load: clear ? "clear" : "load",
      store: "store",
      clear_color: clear ? {r: clear.r, g: clear.g, b: clear.b, a: clear.a} : {r: 0, g: 0, b: 0, a: 0}
    }
    var pass_desc = {color_targets: [attachment]}

    if (!to_screen) {
      active_pass = cmd_buffer.render_pass(pass_desc)
      active_target = target
      return
    }

    // Screen passes render into the swapchain texture, which may be unavailable
    var swap_tex = cmd_buffer.acquire_swapchain_texture(_window)
    if (!swap_tex) {
      log.console("sdl_gpu: Failed to acquire swapchain texture")
      active_pass = null
      active_target = window_size
      return
    }

    attachment.texture = swap_tex
    active_pass = cmd_buffer.render_pass(pass_desc)
    active_target = {width: swap_tex.width, height: swap_tex.height}
  }

  for (var cmd of commands) {
    switch (cmd.cmd) {
      case 'begin_render':
        close_pass()
        open_pass(cmd)
        break

      case 'set_camera':
        active_camera = cmd.camera
        break

      case 'draw_batch':
      case 'draw_text':
        queued_draws.push(cmd)
        break

      case 'blit':
        // SDL blit works outside render passes
        close_pass()
        _do_blit(cmd_buffer, cmd, active_target)
        break

      case 'apply_mask':
        // Mask works as a blit outside the render pass
        close_pass()
        _do_mask(cmd_buffer, cmd)
        break

      case 'shader_pass':
        // Shader passes need their own render pass
        close_pass()
        _do_shader_pass(cmd_buffer, cmd)
        break

      case 'composite_textures':
        close_pass()
        _do_composite(cmd_buffer, cmd, window_size)
        break

      case 'end_render':
        close_pass()
        break

      case 'present':
        // Nothing to do here: the command buffer is submitted after the loop
        break
    }
  }

  // Final flush for a pass that was never explicitly ended
  close_pass()

  cmd_buffer.submit()
}

// Renders a list of queued draw commands into `pass`, preserving their order.
//
// Consecutive 'draw_batch' commands that share texture and blend mode are
// merged into one sprite batch. A 'draw_text' command ends the current batch
// and is rendered immediately. Missing textures map to '_white'; a draw
// without a material uses 'alpha' blending.
function _flush_draws(cmd_buffer, pass, draws, camera, target) {
  var open_batch = null

  // Renders the batch collected so far, if there is one
  var emit_open_batch = function() {
    if (open_batch) _render_batch(cmd_buffer, pass, open_batch, camera, target)
  }

  for (var i = 0; i < draws.length; i++) {
    var draw = draws[i]

    if (draw.cmd == 'draw_text') {
      emit_open_batch()
      open_batch = null
      _render_text(cmd_buffer, pass, draw.drawable, camera, target)
      continue
    }

    if (draw.cmd != 'draw_batch') continue

    var tex_path = draw.texture || '_white'
    var blend = draw.material ? draw.material.blend : 'alpha'

    var can_append = open_batch &&
      open_batch.type == 'sprites' &&
      open_batch.texture_path == tex_path &&
      open_batch.blend == blend

    // A change of texture or blend mode closes the current batch
    if (!can_append) {
      emit_open_batch()
      open_batch = {
        type: 'sprites',
        texture_path: tex_path,
        blend: blend,
        sprites: []
      }
    }

    var incoming = draw.geometry && draw.geometry.sprites
    if (incoming) {
      for (var sprite of incoming)
        open_batch.sprites.push(sprite)
    }
  }

  // Whatever is still open at the end goes out last
  emit_open_batch()
}

// Records one accumulated batch into an already-open render pass.
// Only batches of type 'sprites' are handled; any other type, or an empty
// sprite list, is a no-op.
//   cmd_buffer - command buffer that receives the vertex uniform push
//   pass       - render pass the draw is recorded into
//   batch      - {type, texture_path, blend, sprites}
//   camera     - forwarded to the vertex and camera-matrix builders
//   target     - object whose width/height size the projection
function _render_batch(cmd_buffer, pass, batch, camera, target) {
  if (batch.type != 'sprites') return
  if (batch.sprites.length == 0) return

  // '_white' is the sentinel path for untextured sprites
  var texture
  if (batch.texture_path == '_white') {
    texture = _white_texture
  } else {
    texture = sdl_gpu.get_texture(batch.texture_path)
  }

  var mesh = _build_sprite_vertices(batch.sprites, camera)

  // Buffer sizes are the blob lengths divided by 8
  // (presumably blob.length is in bits - TODO confirm)
  var vertex_bytes = mesh.vertices.length / 8
  var index_bytes = mesh.indices.length / 8

  var vertex_buffer = new gpu_mod.buffer(_gpu, {size: vertex_bytes, vertex: true})
  var index_buffer = new gpu_mod.buffer(_gpu, {size: index_bytes, index: true})

  var vertex_staging = new gpu_mod.transfer_buffer(_gpu, {size: vertex_bytes, usage: "upload"})
  var index_staging = new gpu_mod.transfer_buffer(_gpu, {size: index_bytes, usage: "upload"})

  vertex_staging.copy_blob(_gpu, mesh.vertices)
  index_staging.copy_blob(_gpu, mesh.indices)

  // The upload goes through its own command buffer, submitted right away
  var upload_cmd = _gpu.acquire_cmd_buffer()
  var upload_pass = upload_cmd.copy_pass()
  upload_pass.upload_to_buffer(
    {transfer_buffer: vertex_staging, offset: 0},
    {buffer: vertex_buffer, offset: 0, size: vertex_bytes},
    false
  )
  upload_pass.upload_to_buffer(
    {transfer_buffer: index_staging, offset: 0},
    {buffer: index_buffer, offset: 0, size: index_bytes},
    false
  )
  upload_pass.end()
  upload_cmd.submit()

  var projection = _build_camera_matrix(camera, target.width, target.height)

  // 'add' gets the additive pipeline; every other blend value uses alpha
  var sprite_pipeline = _pipelines.sprite_alpha
  if (batch.blend == 'add') sprite_pipeline = _pipelines.sprite_add

  pass.bind_pipeline(sprite_pipeline)
  pass.bind_vertex_buffers(0, [{buffer: vertex_buffer, offset: 0}])
  pass.bind_index_buffer({buffer: index_buffer, offset: 0}, 16)
  pass.bind_fragment_samplers(0, [{texture: texture, sampler: _sampler_nearest}])
  cmd_buffer.push_vertex_uniform_data(0, projection)
  pass.draw_indexed(mesh.index_count, 1, 0, 0, 0)
}

// Packs the fragment uniforms shared by the SDF and MSDF text shaders.
// Layout (32 bytes): float outline_width, float sharpness, float2 _pad, float4 outline_color
function _build_text_sdf_uniforms(drawable, font) {
  var u_data = new blob_mod(32)

  // outline_width arrives in user-facing units; dividing by 100 brings it
  // into normalised SDF units (a typical range is 0.0-0.3)
  var outline_w = drawable.outline_width || 0
  if (outline_w > 0) outline_w = outline_w / 100.0

  u_data.wf(outline_w)                    // outline_width
  u_data.wf(font.sharpness || 1.0)        // sharpness from font
  u_data.wf(0)                            // _pad.x
  u_data.wf(0)                            // _pad.y

  var oc = drawable.outline_color || {r:0, g:0, b:0, a:1}
  u_data.wf(oc.r)                         // outline_color.r
  u_data.wf(oc.g)                         // outline_color.g
  u_data.wf(oc.b)                         // outline_color.b
  // Only a missing alpha defaults to 1; an explicit 0 must stay transparent
  u_data.wf(oc.a == null ? 1 : oc.a)      // outline_color.a

  return stone(u_data)
}

// Renders one text drawable into an already-open render pass.
//   cmd_buffer - command buffer that receives the uniform pushes
//   pass       - render pass the draw is recorded into
//   drawable   - {font, size, mode|sdf, pos, color, anchor_x, anchor_y, text,
//                 outline_width, outline_color}; size defaults to 16, color to
//                 opaque white, anchors to 0
//   camera     - forwarded to _build_camera_matrix
//   target     - object whose width/height size the projection
// Returns nothing. Silently does nothing when the font cannot be loaded or the
// text produces no vertices.
function _render_text(cmd_buffer, pass, drawable, camera, target) {
  // Resolve the rendering mode: 'bitmap', 'sdf' or 'msdf'.
  // A legacy boolean `true` means 'sdf'; normalise it here so the pipeline and
  // sampler choice below agree with the font _get_font_cache hands back.
  var mode = drawable.mode || (drawable.sdf ? 'sdf' : 'bitmap')
  if (mode == true) mode = 'sdf'

  var font = _get_font_cache(drawable.font, drawable.size || 16, mode)
  if (!font) return

  var pos = drawable.pos
  var text_pos = {x: pos.x, y: pos.y, width: 0, height: 0}
  var color = drawable.color || {r:1, g:1, b:1, a:1}

  // Anchor: shift the origin by a fraction of the measured text size
  var ax = drawable.anchor_x || 0
  var ay = drawable.anchor_y || 0

  if (ax != 0 || ay != 0) {
    var dim = font.text_size(drawable.text)
    if (dim) {
      text_pos.x -= dim.x * ax
      text_pos.y -= dim.y * ay
    }
  }

  var mesh = font.make_text_buffer(drawable.text, text_pos, color)
  if (!mesh || !mesh.num_vertices) return

  // The font returns separate position / uv / color streams; weave them into
  // one interleaved vertex buffer
  var num_verts = mesh.num_vertices
  var interleaved = geometry.weave([
    {data: mesh.xy, stride: mesh.xy_stride},
    {data: mesh.uv, stride: mesh.uv_stride},
    {data: mesh.color, stride: mesh.color_stride}
  ])

  var indices = mesh.indices
  var num_indices = mesh.num_indices

  // Sizes in bytes: 32 per vertex (assumes xy + uv + rgba as floats - TODO
  // confirm against geometry.weave output) and 2 per 16-bit index
  var vb_size = num_verts * 32
  var ib_size = num_indices * 2

  var vb = new gpu_mod.buffer(_gpu, {size: vb_size, vertex: true})
  var ib = new gpu_mod.buffer(_gpu, {size: ib_size, index: true})

  var vb_transfer = new gpu_mod.transfer_buffer(_gpu, {size: vb_size, usage: "upload"})
  var ib_transfer = new gpu_mod.transfer_buffer(_gpu, {size: ib_size, usage: "upload"})

  vb_transfer.copy_blob(_gpu, interleaved)
  ib_transfer.copy_blob(_gpu, indices)

  // The upload goes through its own command buffer, submitted right away
  var copy_cmd = _gpu.acquire_cmd_buffer()
  var copy = copy_cmd.copy_pass()
  copy.upload_to_buffer({transfer_buffer: vb_transfer, offset: 0}, {buffer: vb, offset: 0, size: vb_size}, false)
  copy.upload_to_buffer({transfer_buffer: ib_transfer, offset: 0}, {buffer: ib, offset: 0, size: ib_size}, false)
  copy.end()
  copy_cmd.submit()

  var proj = _build_camera_matrix(camera, target.width, target.height)

  var is_sdf = (mode == 'sdf')
  var is_msdf = (mode == 'msdf')

  // Distance-field modes use their dedicated pipeline when it exists;
  // otherwise (and for bitmap text) fall back to the alpha sprite pipeline
  var field_pipeline = null
  if (is_msdf) field_pipeline = _pipelines.text_msdf
  else if (is_sdf) field_pipeline = _pipelines.text_sdf

  if (field_pipeline) {
    pass.bind_pipeline(field_pipeline)
    cmd_buffer.push_fragment_uniform_data(0, _build_text_sdf_uniforms(drawable, font))
  } else {
    pass.bind_pipeline(_pipelines.sprite_alpha)
  }

  pass.bind_vertex_buffers(0, [{buffer: vb, offset: 0}])
  pass.bind_index_buffer({buffer: ib, offset: 0}, 16)

  // Bind font texture - use linear filtering for SDF/MSDF
  var font_tex = _get_font_texture(font, mode)
  var sampler = (is_sdf || is_msdf) ? _sampler_linear : _sampler_nearest

  pass.bind_fragment_samplers(0, [{texture: font_tex, sampler: sampler}])
  cmd_buffer.push_vertex_uniform_data(0, proj)
  pass.draw_indexed(num_indices, 1, 0, 0, 0)
}

// Returns the staef font for (path, size, mode), loading and caching it on
// first use.
//   path - font name handed to res.find_font
//   size - passed to the staef font constructors as the size / em_px argument
//   mode - 'bitmap', 'sdf', 'msdf', or a legacy boolean (true means 'sdf';
//          false or any other falsy value means 'bitmap')
// Returns null when the font file cannot be located or read, or when
// constructing the font throws (the failure is logged).
function _get_font_cache(path, size, mode) {
  // Collapse legacy booleans / missing values into a mode name
  var kind = mode
  if (mode == true) {
    kind = 'sdf'
  } else if (mode == false || !mode) {
    kind = 'bitmap'
  }

  var cache_key = `${path}.${size}.${kind}`
  var cached = _font_cache[cache_key]
  if (cached) return cached

  var located = res.find_font(path)
  if (!located) return null

  var bytes = io.slurp(located)
  if (!bytes) return null

  try {
    var loaded
    switch (kind) {
      case 'msdf':
        // MSDF: em_px=size, range_px=4, padding_px=6, sharpness=1.0
        loaded = new staef.msdf_font(bytes, size, 4.0, 6, 1.0)
        break
      case 'sdf':
        // SDF: em_px=size, range_px=12, padding_px=14, sharpness=1.0
        loaded = new staef.sdf_font(bytes, size, 12.0, 14, 1.0)
        break
      default:
        // Anything else is loaded as a plain bitmap font
        loaded = new staef.font(bytes, size, false)
        break
    }
    _font_cache[cache_key] = loaded
    return loaded
  } catch(e) {
    log.console(`sdl_gpu: Failed to load font ${path}:${size}:${kind}: ${e.message}`)
    return null
  }
}


// Returns the GPU texture for a font's atlas, creating it on first request and
// memoising it on the font object as `_gpu_texture`.
//   font   - staef font; its `texture` field supplies {width, height, pixels}
//   is_sdf - accepted but not used here: filtering is chosen by the sampler at
//            bind time, not at texture creation
// Falls back to the white texture when the font exposes no atlas.
function _get_font_texture(font, is_sdf) {
  if (!font._gpu_texture) {
    var atlas = font.texture
    if (!atlas) return _white_texture

    font._gpu_texture = _create_gpu_texture(atlas.width, atlas.height, atlas.pixels)
  }
  return font._gpu_texture
}

// Draws cmd.texture into a destination through a render pass using the blit
// pipeline, so the source is composited rather than overwriting the
// destination (a raw SDL blit overwrites, it does not blend).
//   cmd_buffer     - command buffer the render pass is recorded on
//   cmd            - {texture, dst_rect, filter, target}; filter defaults to 'nearest'
//   current_target - destination used when cmd.target is not set
// The destination is either the string 'screen' (the swapchain texture, cleared
// to opaque black first) or a render target whose existing contents are loaded
// and drawn over. Does nothing when the source or destination has no texture,
// or when no swapchain texture could be acquired.
function _do_blit(cmd_buffer, cmd, current_target) {
  var source = cmd.texture
  if (!source || !source.texture) return

  var rect = cmd.dst_rect
  var sampling = cmd.filter || 'nearest'
  var dest = cmd.target || current_target

  var quad
  var blit_pass

  if (dest != 'screen') {
    // Offscreen destination
    if (!dest || !dest.texture) return

    quad = _build_fullscreen_quad(rect, dest.width, dest.height)

    blit_pass = cmd_buffer.render_pass({
      color_targets: [{
        texture: dest.texture,
        load: "load",  // keep what is already there so the blit lands on top
        store: "store"
      }]
    })
  } else {
    // The swapchain can only be written through a render pass
    var backbuffer = cmd_buffer.acquire_swapchain_texture(_window)
    if (!backbuffer) return

    blit_pass = cmd_buffer.render_pass({
      color_targets: [{
        texture: backbuffer,
        load: "clear",
        store: "store",
        clear_color: {r: 0, g: 0, b: 0, a: 1}
      }]
    })

    var window_size = sdl_gpu.get_window_size()
    quad = _build_fullscreen_quad(rect, window_size.width, window_size.height)
  }

  _draw_textured_quad(blit_pass, quad, source.texture, _pipelines.blit, sampling)
  blit_pass.end()
}

// Uploads a quad's geometry to fresh GPU buffers and records one textured,
// indexed draw (6 indices) into `pass`.
//   pass     - open render pass to record into
//   geom     - {vertices, indices} blobs
//   texture  - texture bound to fragment sampler slot 0
//   pipeline - graphics pipeline to bind
//   filter   - 'linear' selects the linear sampler; anything else uses nearest
function _draw_textured_quad(pass, geom, texture, pipeline, filter) {
  // Buffer sizes are the blob lengths divided by 8
  // (presumably blob.length is in bits - TODO confirm)
  var vertex_bytes = geom.vertices.length / 8
  var index_bytes = geom.indices.length / 8

  var vertex_buffer = new gpu_mod.buffer(_gpu, {size: vertex_bytes, vertex: true})
  var index_buffer = new gpu_mod.buffer(_gpu, {size: index_bytes, index: true})

  var vertex_staging = new gpu_mod.transfer_buffer(_gpu, {size: vertex_bytes, usage: "upload"})
  var index_staging = new gpu_mod.transfer_buffer(_gpu, {size: index_bytes, usage: "upload"})

  vertex_staging.copy_blob(_gpu, geom.vertices)
  index_staging.copy_blob(_gpu, geom.indices)

  // The upload goes through its own command buffer, submitted right away
  var upload_cmd = _gpu.acquire_cmd_buffer()
  var upload_pass = upload_cmd.copy_pass()
  upload_pass.upload_to_buffer(
    {transfer_buffer: vertex_staging, offset: 0},
    {buffer: vertex_buffer, offset: 0, size: vertex_bytes},
    false
  )
  upload_pass.upload_to_buffer(
    {transfer_buffer: index_staging, offset: 0},
    {buffer: index_buffer, offset: 0, size: index_bytes},
    false
  )
  upload_pass.end()
  upload_cmd.submit()

  pass.bind_pipeline(pipeline)
  pass.bind_vertex_buffers(0, [{buffer: vertex_buffer, offset: 0}])
  pass.bind_index_buffer({buffer: index_buffer, offset: 0}, 16)

  var chosen_sampler = _sampler_nearest
  if (filter == 'linear') chosen_sampler = _sampler_linear
  pass.bind_fragment_samplers(0, [{texture: texture, sampler: chosen_sampler}])

  pass.draw_indexed(6, 1, 0, 0, 0)
}

// Renders cmd.content_texture through cmd.mask_texture into cmd.output using
// the mask pipeline.
//   cmd_buffer - command buffer the pass and uniform push are recorded on
//   cmd        - {content_texture, mask_texture, output, mode, invert};
//                mode defaults to 'alpha', invert to false
// Does nothing unless content, mask and output all carry a texture. When the
// mask pipeline is missing, logs and falls back to a plain blit of the content
// into the output.
function _do_mask(cmd_buffer, cmd) {
  var content = cmd.content_texture
  var mask = cmd.mask_texture
  var dest = cmd.output

  if (!content || !content.texture || !mask || !mask.texture || !dest || !dest.texture) return

  if (!_pipelines.mask) {
    log.console("sdl_gpu: Mask pipeline not available, falling back to blit")
    cmd_buffer.blit({
      src: {texture: content.texture, x: 0, y: 0, width: content.width, height: content.height},
      dst: {texture: dest.texture, x: 0, y: 0, width: dest.width, height: dest.height},
      load: "clear",
      filter: "nearest"
    })
    return
  }

  // Quad covering the whole output
  var full_rect = {x: 0, y: 0, width: dest.width, height: dest.height}
  var quad = _build_fullscreen_quad(full_rect, dest.width, dest.height)

  // Buffer sizes are the blob lengths divided by 8
  // (presumably blob.length is in bits - TODO confirm)
  var vertex_bytes = quad.vertices.length / 8
  var index_bytes = quad.indices.length / 8

  var vertex_buffer = new gpu_mod.buffer(_gpu, {size: vertex_bytes, vertex: true})
  var index_buffer = new gpu_mod.buffer(_gpu, {size: index_bytes, index: true})

  var vertex_staging = new gpu_mod.transfer_buffer(_gpu, {size: vertex_bytes, usage: "upload"})
  var index_staging = new gpu_mod.transfer_buffer(_gpu, {size: index_bytes, usage: "upload"})

  vertex_staging.copy_blob(_gpu, quad.vertices)
  index_staging.copy_blob(_gpu, quad.indices)

  // The upload goes through its own command buffer, submitted right away
  var upload_cmd = _gpu.acquire_cmd_buffer()
  var upload_pass = upload_cmd.copy_pass()
  upload_pass.upload_to_buffer(
    {transfer_buffer: vertex_staging, offset: 0},
    {buffer: vertex_buffer, offset: 0, size: vertex_bytes},
    false
  )
  upload_pass.upload_to_buffer(
    {transfer_buffer: index_staging, offset: 0},
    {buffer: index_buffer, offset: 0, size: index_bytes},
    false
  )
  upload_pass.end()
  upload_cmd.submit()

  // Fragment uniforms: invert flag, mode flag (0=alpha, 1=binary), two pads
  var invert_flag = cmd.invert ? 1.0 : 0.0
  var binary_flag = (cmd.mode || 'alpha') == 'binary' ? 1.0 : 0.0
  var mask_uniforms = new blob_mod(16)
  mask_uniforms.wf(invert_flag)
  mask_uniforms.wf(binary_flag)
  mask_uniforms.wf(0)
  mask_uniforms.wf(0)

  // The output is cleared to transparent before the masked content is drawn
  var mask_pass = cmd_buffer.render_pass({
    color_targets: [{
      texture: dest.texture,
      load: "clear",
      store: "store",
      clear_color: {r: 0, g: 0, b: 0, a: 0}
    }]
  })

  mask_pass.bind_pipeline(_pipelines.mask)
  mask_pass.bind_vertex_buffers(0, [{buffer: vertex_buffer, offset: 0}])
  mask_pass.bind_index_buffer({buffer: index_buffer, offset: 0}, 16)
  // Slot 0 is the content texture, slot 1 the mask texture
  mask_pass.bind_fragment_samplers(0, [
    {texture: content.texture, sampler: _sampler_nearest},
    {texture: mask.texture, sampler: _sampler_nearest}
  ])
  cmd_buffer.push_fragment_uniform_data(0, stone(mask_uniforms))
  mask_pass.draw_indexed(6, 1, 0, 0, 0)
  mask_pass.end()
}

// Runs a full-target post-process shader: samples cmd.input and draws a quad
// covering cmd.output with the pipeline named by cmd.shader.
//   cmd_buffer - command buffer the pass and uniform push are recorded on
//   cmd        - {shader, input, output, uniforms}; shader is 'threshold',
//                'blur' or 'crt'; output is a render target or the string
//                'screen'; uniforms defaults to {}
// Does nothing when the input or output has no texture or no swapchain texture
// can be acquired. An unknown shader name or a missing pipeline is logged and
// skipped. The destination is always cleared before drawing.
function _do_shader_pass(cmd_buffer, cmd) {
  var effect = cmd.shader
  var source = cmd.input
  var dest = cmd.output
  var params = cmd.uniforms || {}
  var to_screen = dest == 'screen'

  if (!source || !source.texture) return
  if (!to_screen && (!dest || !dest.texture)) return

  // Map the shader name onto its pipeline
  var effect_pipeline = null
  switch (effect) {
    case 'threshold': effect_pipeline = _pipelines.threshold; break
    case 'blur':      effect_pipeline = _pipelines.blur;      break
    case 'crt':       effect_pipeline = _pipelines.crt;       break
    default:
      log.console(`sdl_gpu: Unknown shader: ${effect}`)
      return
  }

  if (!effect_pipeline) {
    log.console(`sdl_gpu: Pipeline not available for shader: ${effect}`)
    return
  }

  // Quad covering the whole destination; the screen uses the stored window size
  var out_w
  var out_h
  if (to_screen) {
    out_w = _window_width
    out_h = _window_height
  } else {
    out_w = dest.width
    out_h = dest.height
  }
  var quad = _build_fullscreen_quad({x: 0, y: 0, width: out_w, height: out_h}, out_w, out_h)

  // Buffer sizes are the blob lengths divided by 8
  // (presumably blob.length is in bits - TODO confirm)
  var vertex_bytes = quad.vertices.length / 8
  var index_bytes = quad.indices.length / 8

  var vertex_buffer = new gpu_mod.buffer(_gpu, {size: vertex_bytes, vertex: true})
  var index_buffer = new gpu_mod.buffer(_gpu, {size: index_bytes, index: true})

  var vertex_staging = new gpu_mod.transfer_buffer(_gpu, {size: vertex_bytes, usage: "upload"})
  var index_staging = new gpu_mod.transfer_buffer(_gpu, {size: index_bytes, usage: "upload"})

  vertex_staging.copy_blob(_gpu, quad.vertices)
  index_staging.copy_blob(_gpu, quad.indices)

  // The upload goes through its own command buffer, submitted right away
  var upload_cmd = _gpu.acquire_cmd_buffer()
  var upload_pass = upload_cmd.copy_pass()
  upload_pass.upload_to_buffer(
    {transfer_buffer: vertex_staging, offset: 0},
    {buffer: vertex_buffer, offset: 0, size: vertex_bytes},
    false
  )
  upload_pass.upload_to_buffer(
    {transfer_buffer: index_staging, offset: 0},
    {buffer: index_buffer, offset: 0, size: index_bytes},
    false
  )
  upload_pass.end()
  upload_cmd.submit()

  // Shader-specific fragment uniforms (may be null)
  var packed_uniforms = _build_shader_uniforms(effect, params)

  // Describe the colour target: swapchain clears to opaque black,
  // offscreen targets clear to transparent
  var color_target
  if (to_screen) {
    var backbuffer = cmd_buffer.acquire_swapchain_texture(_window)
    if (!backbuffer) return
    color_target = {
      texture: backbuffer,
      load: "clear",
      store: "store",
      clear_color: {r: 0, g: 0, b: 0, a: 1}
    }
  } else {
    color_target = {
      texture: dest.texture,
      load: "clear",
      store: "store",
      clear_color: {r: 0, g: 0, b: 0, a: 0}
    }
  }

  var effect_pass = cmd_buffer.render_pass({color_targets: [color_target]})

  effect_pass.bind_pipeline(effect_pipeline)
  effect_pass.bind_vertex_buffers(0, [{buffer: vertex_buffer, offset: 0}])
  effect_pass.bind_index_buffer({buffer: index_buffer, offset: 0}, 16)
  effect_pass.bind_fragment_samplers(0, [{texture: source.texture, sampler: _sampler_linear}])

  if (packed_uniforms) {
    cmd_buffer.push_fragment_uniform_data(0, packed_uniforms)
  }

  effect_pass.draw_indexed(6, 1, 0, 0, 0)
  effect_pass.end()
}

// Returns `value`, or `fallback` when the value is absent (null).
// Unlike `value || fallback`, an explicit 0 is kept.
function _uniform_or(value, fallback) {
  return value == null ? fallback : value
}

// Packs the fragment uniforms for a post-process shader into a blob.
//   shader   - 'threshold', 'blur' or 'crt'
//   uniforms - optional per-shader overrides:
//                threshold: threshold (0.8), intensity (1.0)
//                blur:      direction ([1, 0]), texel_size ([0.001, 0.001])
//                crt:       curvature (0.1), scanline_intensity (0.3),
//                           vignette (0.2), resolution ([1280, 720])
// Returns the stoned blob, or null for an unknown shader.
// Defaults apply only to values that are absent, so a caller can pass 0 to
// switch an effect parameter off (e.g. curvature: 0 for a flat CRT).
function _build_shader_uniforms(shader, uniforms) {
  var data = new blob_mod(64) // 16 floats max

  switch (shader) {
    case 'threshold':
      data.wf(_uniform_or(uniforms.threshold, 0.8))
      data.wf(_uniform_or(uniforms.intensity, 1.0))
      data.wf(0) // padding
      data.wf(0) // padding
      break
    case 'blur':
      var dir = uniforms.direction || [1, 0]
      var texel = uniforms.texel_size || [0.001, 0.001]
      data.wf(dir[0])
      data.wf(dir[1])
      data.wf(texel[0])
      data.wf(texel[1])
      break
    case 'crt':
      data.wf(_uniform_or(uniforms.curvature, 0.1))
      data.wf(_uniform_or(uniforms.scanline_intensity, 0.3))
      data.wf(_uniform_or(uniforms.vignette, 0.2))
      data.wf(0) // padding
      // Named `resolution` so it does not shadow the module-level `res`
      var resolution = uniforms.resolution || [1280, 720]
      data.wf(resolution[0])
      data.wf(resolution[1])
      data.wf(0) // padding
      data.wf(0) // padding
      break
    default:
      return null
  }

  return stone(data)
}

// Composites cmd.overlay over cmd.base into cmd.output in two render passes:
// the base is drawn into a cleared output, then the overlay is drawn on top of
// the loaded result.
//   cmd_buffer - command buffer both passes are recorded on
//   cmd        - {base, overlay, output, mode}; mode defaults to 'over', and
//                'add' switches the overlay pass to the blit_add pipeline
// Does nothing unless base, overlay and output all carry a texture.
function _do_composite(cmd_buffer, cmd) {
  var base = cmd.base
  var overlay = cmd.overlay
  var dest = cmd.output

  if (!base || !base.texture) return
  if (!overlay || !overlay.texture) return
  if (!dest || !dest.texture) return

  var additive = (cmd.mode || 'over') == 'add'

  // One quad covering the whole output, shared by both passes
  var full_rect = {x: 0, y: 0, width: dest.width, height: dest.height}
  var quad = _build_fullscreen_quad(full_rect, dest.width, dest.height)

  // Buffer sizes are the blob lengths divided by 8
  // (presumably blob.length is in bits - TODO confirm)
  var vertex_bytes = quad.vertices.length / 8
  var index_bytes = quad.indices.length / 8

  var vertex_buffer = new gpu_mod.buffer(_gpu, {size: vertex_bytes, vertex: true})
  var index_buffer = new gpu_mod.buffer(_gpu, {size: index_bytes, index: true})

  var vertex_staging = new gpu_mod.transfer_buffer(_gpu, {size: vertex_bytes, usage: "upload"})
  var index_staging = new gpu_mod.transfer_buffer(_gpu, {size: index_bytes, usage: "upload"})

  vertex_staging.copy_blob(_gpu, quad.vertices)
  index_staging.copy_blob(_gpu, quad.indices)

  // The upload goes through its own command buffer, submitted right away
  var upload_cmd = _gpu.acquire_cmd_buffer()
  var upload_pass = upload_cmd.copy_pass()
  upload_pass.upload_to_buffer(
    {transfer_buffer: vertex_staging, offset: 0},
    {buffer: vertex_buffer, offset: 0, size: vertex_bytes},
    false
  )
  upload_pass.upload_to_buffer(
    {transfer_buffer: index_staging, offset: 0},
    {buffer: index_buffer, offset: 0, size: index_bytes},
    false
  )
  upload_pass.end()
  upload_cmd.submit()

  // Records the shared quad into `layer_pass` with one texture, then ends the pass
  var record_layer = function(layer_pass, layer_pipeline, layer_texture, layer_sampler) {
    layer_pass.bind_pipeline(layer_pipeline)
    layer_pass.bind_vertex_buffers(0, [{buffer: vertex_buffer, offset: 0}])
    layer_pass.bind_index_buffer({buffer: index_buffer, offset: 0}, 16)
    layer_pass.bind_fragment_samplers(0, [{texture: layer_texture, sampler: layer_sampler}])
    layer_pass.draw_indexed(6, 1, 0, 0, 0)
    layer_pass.end()
  }

  // Pass 1: clear the output to transparent and lay down the base
  var base_pass = cmd_buffer.render_pass({
    color_targets: [{
      texture: dest.texture,
      load: "clear",
      store: "store",
      clear_color: {r: 0, g: 0, b: 0, a: 0}
    }]
  })
  record_layer(base_pass, _pipelines.blit, base.texture, _sampler_nearest)

  // Pass 2: keep what pass 1 wrote and draw the overlay over it
  var overlay_pass = cmd_buffer.render_pass({
    color_targets: [{
      texture: dest.texture,
      load: "load",
      store: "store"
    }]
  })

  // 'add' (used for bloom) takes the additive blit pipeline
  var overlay_pipeline = _pipelines.blit
  if (additive) overlay_pipeline = _pipelines.blit_add
  record_layer(overlay_pass, overlay_pipeline, overlay.texture, _sampler_linear)
}

// Module result: the sdl_gpu object is what this file returns at top level
return sdl_gpu