
Performance

Optimization strategies for getting the most out of your GPU.

Profiling

Measure GPU Time

javascript
const start = performance.now()

device.queueSubmit(encoder.finish())
device.poll(true)  // Block until the GPU has finished

const elapsed = performance.now() - start
console.log(`GPU time: ${elapsed.toFixed(2)}ms`)
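
Note that this measures wall-clock time on the CPU side, so it includes command submission and synchronization overhead, not just GPU execution.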

Identify Bottlenecks

javascript
// Test different workgroup sizes
for (const size of [32, 64, 128, 256]) {
  const start = performance.now()
  runCompute(size)  // assumed to block until the GPU finishes (e.g. via device.poll(true))
  const elapsed = performance.now() - start
  console.log(`Workgroup ${size}: ${elapsed.toFixed(2)}ms`)
}

Buffer Optimization

1. Minimize Transfers

javascript
// ❌ Don't: Transfer every frame
for (let i = 0; i < 1000; i++) {
  device.queueWriteBuffer(buffer, 0, data)
  // ... use buffer
}

// ✅ Do: Transfer once
device.queueWriteBuffer(buffer, 0, data)
for (let i = 0; i < 1000; i++) {
  // ... use buffer
}

2. Use Appropriate Usage Flags

javascript
// ❌ Don't: Unnecessary flags
const buffer = device.createBuffer(
  size,
  BufferUsage.STORAGE | BufferUsage.MAP_READ | BufferUsage.MAP_WRITE,
  false
)

// ✅ Do: Only what you need
const buffer = device.createBuffer(
  size,
  BufferUsage.STORAGE,
  false
)

3. Batch Updates

javascript
// ❌ Don't: Many small writes
for (const item of items) {
  device.queueWriteBuffer(buffer, item.offset, item.data)
}

// ✅ Do: Single large write
const combinedData = combineData(items)
device.queueWriteBuffer(buffer, 0, combinedData)
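
If every item targets one contiguous layout, combineData can simply pack each chunk into a single typed array. A minimal sketch, assuming each item carries a byte offset and a typed-array payload (both hypothetical fields):

javascript
// Hypothetical combineData helper: packs each item's bytes at its offset
function combineData(items) {
  const totalBytes = Math.max(...items.map(i => i.offset + i.data.byteLength))
  const combined = new Uint8Array(totalBytes)
  for (const item of items) {
    const bytes = new Uint8Array(item.data.buffer, item.data.byteOffset, item.data.byteLength)
    combined.set(bytes, item.offset)
  }
  return combined
}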

4. Reuse Buffers

javascript
// ✅ Create once
const buffer = device.createBuffer(maxSize, usage, false)

// Reuse for different data
for (const dataset of datasets) {
  device.queueWriteBuffer(buffer, 0, dataset)
  runComputation()
}

// Cleanup once
buffer.destroy()

Compute Optimization

1. Choose Optimal Workgroup Size

Test different sizes to find the optimal size for your GPU:

javascript
const sizes = [32, 64, 128, 256]
let bestSize = 64
let bestTime = Infinity

for (const size of sizes) {
  const time = benchmarkWorkgroupSize(size)
  if (time < bestTime) {
    bestTime = time
    bestSize = size
  }
}

console.log(`Optimal workgroup size: ${bestSize}`)
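
A possible benchmarkWorkgroupSize reuses the timing pattern from the profiling section; runCompute(size) is assumed to dispatch the workload with the given workgroup size:

javascript
// Hypothetical benchmark helper built on the profiling pattern above
function benchmarkWorkgroupSize(size) {
  const start = performance.now()
  runCompute(size)
  device.poll(true)  // Wait for the GPU to finish before stopping the clock
  return performance.now() - start
}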

2. Coalesce Memory Access

wgsl
// ✅ Do: Sequential access (fast)
@compute @workgroup_size(64)
fn good(@builtin(global_invocation_id) id: vec3<u32>) {
  let index = id.x;
  output[index] = input[index];
}

// ❌ Don't: Strided access (slow)
@compute @workgroup_size(64)
fn bad(@builtin(global_invocation_id) id: vec3<u32>) {
  let index = id.x * 13u;  // Non-sequential
  output[index] = input[index];
}

3. Use Shared Memory

wgsl
var<workgroup> tile: array<f32, 256>;  // `shared` is a reserved word in WGSL

@compute @workgroup_size(256)
fn optimized(
  @builtin(local_invocation_id) local_id: vec3<u32>,
  @builtin(global_invocation_id) global_id: vec3<u32>
) {
  let tid = local_id.x;

  // Load into fast workgroup (shared) memory
  tile[tid] = input[global_id.x];
  workgroupBarrier();

  // Access from workgroup memory (faster). tid is u32, so guard with
  // select() instead of max(tid - 1u, 0u), which would wrap at tid == 0
  let left = tile[select(tid - 1u, 0u, tid == 0u)];
  let right = tile[min(tid + 1u, 255u)];
  let value = tile[tid];

  output[global_id.x] = (left + value + right) / 3.0;
}

4. Minimize Divergence

wgsl
// ❌ Don't: Divergent branches
if (id.x % 2u == 0u) {
  // Half threads do expensive work
  output[id.x] = expensiveComputation();
} else {
  // Other half idle
  output[id.x] = 0.0;
}

// ✅ Do: Uniform work
// All threads do same work
output[id.x] = computation(id.x);

5. Reduce Register Pressure

wgsl
// ❌ Don't: Too many temporaries
let temp1 = a + b;
let temp2 = c + d;
let temp3 = e + f;
let temp4 = temp1 + temp2;
let temp5 = temp3 + temp4;
return temp5;

// ✅ Do: Reuse variables
var result = a + b;
result += c + d;
result += e + f;
return result;

Render Optimization

1. Minimize State Changes

javascript
// ❌ Don't: Frequent pipeline changes
for (const obj of objects) {
  pass.setPipeline(obj.pipeline)
  pass.setBindGroup(0, obj.bindGroup)
  pass.draw(obj.vertexCount, 1, 0, 0)
}

// ✅ Do: Sort by pipeline
const sorted = sortByPipeline(objects)
for (const [pipeline, objs] of sorted) {
  pass.setPipeline(pipeline)
  for (const obj of objs) {
    pass.setBindGroup(0, obj.bindGroup)
    pass.draw(obj.vertexCount, 1, 0, 0)
  }
}
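
sortByPipeline above is a placeholder; one straightforward implementation groups objects into a Map keyed by their pipeline:

javascript
// Possible sortByPipeline: group objects by the pipeline they use
function sortByPipeline(objects) {
  const groups = new Map()
  for (const obj of objects) {
    if (!groups.has(obj.pipeline)) {
      groups.set(obj.pipeline, [])
    }
    groups.get(obj.pipeline).push(obj)
  }
  return groups  // iterating a Map yields [pipeline, objs] pairs
}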

2. Use Instancing

javascript
// ❌ Don't: Many draw calls
for (let i = 0; i < 1000; i++) {
  updateUniforms(i)
  pass.draw(vertexCount, 1, 0, 0)
}

// ✅ Do: Single instanced draw
pass.draw(vertexCount, 1000, 0, 0)

3. Frustum Culling

javascript
function isInFrustum(obj, camera) {
  // Check if object is visible
  return camera.frustum.contains(obj.boundingBox)
}

const visible = objects.filter(obj => isInFrustum(obj, camera))

for (const obj of visible) {
  // Only render visible objects
  renderObject(obj)
}
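
camera.frustum.contains is engine-specific. A common approach tests a bounding sphere against the six frustum planes; a minimal sketch, assuming hypothetical planes of the form { normal: { x, y, z }, d } and a precomputed bounding sphere:

javascript
// Hypothetical sphere-vs-frustum test: the sphere is outside the frustum
// if it lies fully behind any one of the six planes
function sphereInFrustum(planes, sphere) {
  for (const p of planes) {
    const dist =
      p.normal.x * sphere.center.x +
      p.normal.y * sphere.center.y +
      p.normal.z * sphere.center.z + p.d
    if (dist < -sphere.radius) return false
  }
  return true
}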

4. Level of Detail (LOD)

javascript
function selectLOD(obj, camera) {
  // Assumes positions expose x/y/z; Math.hypot computes the Euclidean distance
  const distance = Math.hypot(
    obj.position.x - camera.position.x,
    obj.position.y - camera.position.y,
    obj.position.z - camera.position.z
  )

  if (distance < 10) return obj.lodHigh
  if (distance < 50) return obj.lodMed
  return obj.lodLow
}

for (const obj of objects) {
  const lod = selectLOD(obj, camera)
  pass.setVertexBuffer(0, lod.buffer)
  pass.draw(lod.vertexCount, 1, 0, 0)
}

5. Occlusion Culling

javascript
// Don't render objects behind other objects
const sorted = sortFrontToBack(objects, camera)
const occluded = new Set()

for (const obj of sorted) {
  if (occluded.has(obj)) continue

  renderObject(obj)

  // Mark objects behind this as occluded
  if (obj.isOccluder) {
    for (const other of objectsBehind(obj, camera)) {
      occluded.add(other)
    }
  }
}
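
sortFrontToBack and objectsBehind above are placeholders; the sort itself is straightforward, while objectsBehind depends on your scene's bounding data and is not sketched here. Assuming positions expose x/y/z:

javascript
// Hypothetical front-to-back sort by distance to the camera
function sortFrontToBack(objects, camera) {
  const dist = (o) => Math.hypot(
    o.position.x - camera.position.x,
    o.position.y - camera.position.y,
    o.position.z - camera.position.z
  )
  return [...objects].sort((a, b) => dist(a) - dist(b))
}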

Texture Optimization

1. Use Appropriate Formats

javascript
// ❌ Don't: Waste memory
const texture = device.createTexture({
  format: 'rgba32float',  // 16 bytes/pixel - overkill
  // ...
})

// ✅ Do: Use efficient format
const texture = device.createTexture({
  format: 'rgba8unorm',  // 4 bytes/pixel
  // ...
})

2. Generate Mipmaps

javascript
// Mipmap generation (conceptual - requires a compute or render pass per level)
function generateMipmaps(texture) {
  const levels = Math.floor(Math.log2(Math.max(texture.width, texture.height))) + 1

  for (let i = 1; i < levels; i++) {
    // Downsample previous level to current level
    downsampleLevel(texture, i - 1, i)
  }
}
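
downsampleLevel is left abstract above. The shader behind it is typically a 2x2 box filter; a minimal WGSL sketch, assuming the source mip level is bound as a sampled texture and the destination level as a write-only storage texture:

wgsl
@group(0) @binding(0) var srcLevel: texture_2d<f32>;
@group(0) @binding(1) var dstLevel: texture_storage_2d<rgba8unorm, write>;

@compute @workgroup_size(8, 8)
fn downsample(@builtin(global_invocation_id) id: vec3<u32>) {
  let dstSize = textureDimensions(dstLevel);
  if (id.x >= dstSize.x || id.y >= dstSize.y) {
    return;
  }

  // Average the 2x2 source block that maps to this destination texel
  let src = vec2<i32>(id.xy * 2u);
  let color = (textureLoad(srcLevel, src, 0)
    + textureLoad(srcLevel, src + vec2<i32>(1, 0), 0)
    + textureLoad(srcLevel, src + vec2<i32>(0, 1), 0)
    + textureLoad(srcLevel, src + vec2<i32>(1, 1), 0)) * 0.25;
  textureStore(dstLevel, vec2<i32>(id.xy), color);
}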

3. Use Texture Compression

Check for compression support:

javascript
const features = adapter.getFeatures()

let format = 'rgba8unorm'
if (features.includes('texture-compression-bc')) {
  format = 'bc1-rgba-unorm'  // 8:1 compression vs rgba8unorm
}

4. Minimize Texture Uploads

javascript
// ✅ Upload once at startup
const texture = createAndUploadTexture(imageData)

// Reuse in render loop
for (let frame = 0; frame < 1000; frame++) {
  renderWithTexture(texture)
}

Pipeline Optimization

1. Cache Pipelines

javascript
const pipelineCache = new Map()

function getPipeline(config) {
  const key = JSON.stringify(config)

  if (!pipelineCache.has(key)) {
    pipelineCache.set(key, device.createRenderPipeline(config))
  }

  return pipelineCache.get(key)
}
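
Note that JSON.stringify is sensitive to property order, so construct pipeline configs consistently (or derive the key from a canonical serialization) to avoid redundant cache entries for equivalent configs.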

2. Use Render Bundles

Pre-record static geometry:

javascript
// Create once
const bundle = createRenderBundle(staticGeometry)

// Reuse every frame
for (let frame = 0; frame < 1000; frame++) {
  const encoder = device.createCommandEncoder()
  const pass = encoder.beginRenderPass({ /* ... */ })
  pass.executeBundles([bundle])
  pass.end()
  device.queueSubmit(encoder.finish())
}
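
createRenderBundle is a placeholder. Assuming the binding exposes a WebGPU-style createRenderBundleEncoder (an assumption, not confirmed API), it might look like this, with the color format and geometry fields purely illustrative:

javascript
// Hypothetical helper: record static geometry into a reusable bundle
function createRenderBundle(geometry) {
  const bundleEncoder = device.createRenderBundleEncoder({
    colorFormats: ['rgba8unorm'],  // must match the render pass targets
  })
  bundleEncoder.setPipeline(geometry.pipeline)
  bundleEncoder.setVertexBuffer(0, geometry.buffer)
  bundleEncoder.draw(geometry.vertexCount, 1, 0, 0)
  return bundleEncoder.finish()
}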

Command Encoding

1. Minimize Command Encoders

javascript
// ❌ Don't: Many encoders
for (const obj of objects) {
  const encoder = device.createCommandEncoder()
  const pass = encoder.beginComputePass()
  // ... encode work for obj
  pass.end()
  device.queueSubmit(encoder.finish())
}

// ✅ Do: Single encoder
const encoder = device.createCommandEncoder()
for (const obj of objects) {
  const pass = encoder.beginComputePass()
  // ... encode work for obj
  pass.end()
}
device.queueSubmit(encoder.finish())

2. Batch Submissions

javascript
// ❌ Don't: Submit every iteration
for (let i = 0; i < 100; i++) {
  const encoder = device.createCommandEncoder()
  // ... commands
  device.queueSubmit(encoder.finish())
}

// ✅ Do: Batch submissions
const commands = []
for (let i = 0; i < 100; i++) {
  const encoder = device.createCommandEncoder()
  // ... commands
  commands.push(encoder.finish())
}
device.queueSubmit(...commands)

Memory Management

1. Destroy Unused Resources

javascript
// ✅ Explicit cleanup
const buffer = device.createBuffer(size, usage, false)
// ... use buffer
buffer.destroy()

const texture = device.createTexture({ /* ... */ })
// ... use texture
texture.destroy()

2. Pool Allocations

javascript
class BufferPool {
  constructor(device, size, usage) {
    this.device = device
    this.size = size
    this.usage = usage
    this.free = []
    this.used = new Set()
  }

  acquire() {
    // Reuse a free buffer if one exists, otherwise allocate a new one
    const buffer = this.free.pop() ??
      this.device.createBuffer(this.size, this.usage, false)
    this.used.add(buffer)
    return buffer
  }

  release(buffer) {
    this.used.delete(buffer)
    this.free.push(buffer)
  }
}

const pool = new BufferPool(device, 1024, BufferUsage.STORAGE)

// Use
const buffer = pool.acquire()
// ... use buffer
pool.release(buffer)

Benchmarking

Create Benchmark Suite

javascript
class Benchmark {
  constructor(name) {
    this.name = name
    this.samples = []
  }

  async run(fn, iterations = 10) {
    for (let i = 0; i < iterations; i++) {
      const start = performance.now()
      await fn()
      const elapsed = performance.now() - start
      this.samples.push(elapsed)
    }

    return this.report()
  }

  report() {
    const avg = this.samples.reduce((a, b) => a + b) / this.samples.length
    const min = Math.min(...this.samples)
    const max = Math.max(...this.samples)

    console.log(`${this.name}:`)
    console.log(`  Average: ${avg.toFixed(2)}ms`)
    console.log(`  Min: ${min.toFixed(2)}ms`)
    console.log(`  Max: ${max.toFixed(2)}ms`)

    return { avg, min, max }
  }
}

// Usage
const bench = new Benchmark('Vector Addition')
await bench.run(() => runVectorAdd(), 100)

Platform-Specific Tips

macOS (Metal)

  • Metal backend is highly optimized
  • Unified memory architecture benefits buffer access
  • Often lower CPU overhead than Vulkan

Linux (Vulkan)

  • Vulkan may have higher CPU overhead
  • Good multi-threading support
  • Test on target hardware

Windows (DX12/Vulkan)

  • DX12 may be faster on Windows
  • Vulkan more portable
  • Test both backends if available

Profiling Tools

Built-in Timing

javascript
async function profileGPU(name, fn) {
  const start = performance.now()

  await fn()
  device.poll(true)

  const elapsed = performance.now() - start
  console.log(`${name}: ${elapsed.toFixed(2)}ms`)
}

await profileGPU('Compute Shader', async () => {
  const encoder = device.createCommandEncoder()
  const pass = encoder.beginComputePass()
  // ... compute work
  pass.end()
  device.queueSubmit(encoder.finish())
})

Summary

Key Performance Principles:

  1. Minimize transfers between CPU and GPU
  2. Batch operations to reduce overhead
  3. Reuse resources instead of recreating
  4. Sort draw calls to minimize state changes
  5. Use instancing for repeated geometry
  6. Profile first, optimize bottlenecks
  7. Test on target hardware - results vary by GPU
