From 3b81bbe36212325152f276f1c5ed20cd65251970 Mon Sep 17 00:00:00 2001
From: shijiashuai <jiashuai.shi@qq.com>
Date: Fri, 22 May 2026 15:11:26 +0800
Subject: [PATCH 1/2] refactor: extract scan module and archive completed
 changes

- Extract GPU prefix sum into dedicated ScanModule
  - Create src/sorting/scan/ with ScanModule class
  - Split scan WGSL kernels from radix.wgsl to scan.wgsl
  - Refactor RadixSorter to consume ScanModule
  - Add ScanModule interface tests

- Archive completed OpenSpec changes:
  - sorting-runtime-hardening
  - injectable-gpu-runtime
  - scan-module-extraction

This improves code organization by giving GPU prefix sum its own
module, shader ownership, and tests. RadixSorter is now reduced
to histogram/scatter orchestration plus scan consumption.
---
 .../.openspec.yaml                            |   2 +
 .../2026-05-injectable-gpu-runtime/design.md  |  63 ++++
 .../proposal.md                               |  27 ++
 .../specs/infrastructure/spec.md              |  25 ++
 .../2026-05-injectable-gpu-runtime/tasks.md   |  18 +
 .../.openspec.yaml                            |   2 +
 .../2026-05-scan-module-extraction/design.md  |  39 +++
 .../proposal.md                               |  27 ++
 .../specs/gpu-prefix-sum-module/spec.md       |  15 +
 .../specs/sorting/spec.md                     |  15 +
 .../2026-05-scan-module-extraction/tasks.md   |  18 +
 .../.openspec.yaml                            |   2 +
 .../design.md                                 |  71 ++++
 .../proposal.md                               |  30 ++
 .../specs/infrastructure/spec.md              |  39 +++
 .../specs/sorting/spec.md                     |  20 ++
 .../tasks.md                                  |  21 ++
 src/shaders/radix.wgsl                        | 248 +-------------
 src/shaders/scan.wgsl                         | 244 +++++++++++++
 src/sorting/RadixSorter.ts                    | 181 +---------
 src/sorting/scan/ScanModule.ts                | 324 ++++++++++++++++++
 src/sorting/scan/index.ts                     |   5 +
 test/sorting/ScanModule.test.ts               | 199 +++++++++++
 23 files changed, 1220 insertions(+), 415 deletions(-)
 create mode 100644 openspec/archive/2026-05-injectable-gpu-runtime/.openspec.yaml
 create mode 100644 openspec/archive/2026-05-injectable-gpu-runtime/design.md
 create mode 100644 openspec/archive/2026-05-injectable-gpu-runtime/proposal.md
 create mode 100644 openspec/archive/2026-05-injectable-gpu-runtime/specs/infrastructure/spec.md
 create mode 100644 openspec/archive/2026-05-injectable-gpu-runtime/tasks.md
 create mode 100644 openspec/archive/2026-05-scan-module-extraction/.openspec.yaml
 create mode 100644 openspec/archive/2026-05-scan-module-extraction/design.md
 create mode 100644 openspec/archive/2026-05-scan-module-extraction/proposal.md
 create mode 100644 openspec/archive/2026-05-scan-module-extraction/specs/gpu-prefix-sum-module/spec.md
 create mode 100644 openspec/archive/2026-05-scan-module-extraction/specs/sorting/spec.md
 create mode 100644 openspec/archive/2026-05-scan-module-extraction/tasks.md
 create mode 100644 openspec/archive/2026-05-sorting-runtime-hardening/.openspec.yaml
 create mode 100644 openspec/archive/2026-05-sorting-runtime-hardening/design.md
 create mode 100644 openspec/archive/2026-05-sorting-runtime-hardening/proposal.md
 create mode 100644 openspec/archive/2026-05-sorting-runtime-hardening/specs/infrastructure/spec.md
 create mode 100644 openspec/archive/2026-05-sorting-runtime-hardening/specs/sorting/spec.md
 create mode 100644 openspec/archive/2026-05-sorting-runtime-hardening/tasks.md
 create mode 100644 src/shaders/scan.wgsl
 create mode 100644 src/sorting/scan/ScanModule.ts
 create mode 100644 src/sorting/scan/index.ts
 create mode 100644 test/sorting/ScanModule.test.ts

diff --git a/openspec/archive/2026-05-injectable-gpu-runtime/.openspec.yaml b/openspec/archive/2026-05-injectable-gpu-runtime/.openspec.yaml
new file mode 100644
index 0000000..4a1c677
--- /dev/null
+++ b/openspec/archive/2026-05-injectable-gpu-runtime/.openspec.yaml
@@ -0,0 +1,2 @@
+schema: spec-driven
+created: 2026-05-22
diff --git a/openspec/archive/2026-05-injectable-gpu-runtime/design.md b/openspec/archive/2026-05-injectable-gpu-runtime/design.md
new file mode 100644
index 0000000..36740ce
--- /dev/null
+++ b/openspec/archive/2026-05-injectable-gpu-runtime/design.md
@@ -0,0 +1,63 @@
+## Context
+
+`GPUContext` currently owns both the interface and the browser-specific implementation for support checks, adapter acquisition, device creation, limits mapping, and device-loss subscription. Tests can only reach most branches by stubbing browser globals, which keeps the interface shallow and the implementation hard to move.
+
+The goal of this change is to deepen `GPUContext`, not replace it. Callers should still use one lifecycle module, but the browser runtime becomes an adapter behind a seam.
+
+## Goals / Non-Goals
+
+**Goals:**
+
+- Keep `GPUContext` as the public lifecycle module.
+- Introduce an injectable runtime interface for support checks and adapter acquisition.
+- Preserve `new GPUContext()` as the default browser path.
+- Make initialization, failure, loss, and recovery paths unit-testable without browser globals.
+
+**Non-Goals:**
+
+- Refactor demo/benchmark orchestration in this change.
+- Add non-browser production adapters yet.
+- Change sorting algorithms or shader ownership.
+
+## Decisions
+
+### 1. Add a `GPURuntime` seam under `src/core/runtime/`
+
+Create an interface that owns:
+
+- support detection
+- adapter acquisition
+
+Add `browserGPURuntime` as the default adapter that wraps `navigator.gpu`.
+
+**Why:** Browser-global access is the shallow part. Moving it behind one seam gives `GPUContext` leverage and keeps future adapters possible.
+
+**Alternative:** Keep static browser-global checks and only inject `requestAdapter`. Rejected because support detection and acquisition belong to the same runtime adapter.
+
+### 2. Keep `GPUContext` as the public module
+
+`GPUContext` constructor accepts an optional runtime adapter, defaulting to `browserGPURuntime`. `initialize()` and `recover()` continue to define lifecycle semantics and error shaping.
+
+**Why:** Callers keep one familiar interface. The seam changes implementation ownership, not the high-level API shape.
+
+**Alternative:** Replace `GPUContext` with factories or free functions. Rejected because it would increase churn before the orchestration seam lands.
+
+### 3. Export runtime types, not extra orchestration
+
+Export the runtime interface and browser adapter from `src/index.ts` so tests and advanced consumers can inject them directly.
+
+**Why:** Constructor injection without exported types keeps the seam half-hidden.
+
+**Alternative:** Keep runtime files internal. Rejected because public constructor injection needs a public type.
+
+### 4. Rewrite `GPUContext` tests around fake runtimes
+
+Use injected fake runtimes/adapters/devices for initialization branches, limits mapping, recovery, and device-loss callbacks. Keep only small browser-global tests for the default static support path.
+
+**Why:** The interface is the test surface. The fake runtime gives locality for failure-path testing without browser globals.
+
+## Risks / Trade-offs
+
+- **Public API surface grows slightly** -> Limit it to runtime interface + default browser adapter.
+- **Fake GPU objects can drift from WebGPU reality** -> Keep browser E2E smoke tests and only fake branches that do not need real hardware.
+- **Future adapters may want more hooks** -> Start with the smallest seam: support detection + adapter acquisition.
diff --git a/openspec/archive/2026-05-injectable-gpu-runtime/proposal.md b/openspec/archive/2026-05-injectable-gpu-runtime/proposal.md
new file mode 100644
index 0000000..305c968
--- /dev/null
+++ b/openspec/archive/2026-05-injectable-gpu-runtime/proposal.md
@@ -0,0 +1,27 @@
+## Why
+
+`GPUContext` still hard-codes `navigator.gpu`, adapter acquisition, and device-loss wiring inside one shallow module. That blocks deterministic unit tests, leaks browser globals into call sites, and makes future runtime adapters impossible without editing the implementation in place.
+
+## What Changes
+
+- Add an injectable GPU runtime seam so `GPUContext` acquires adapters and support checks through an adapter interface instead of browser globals.
+- Keep `GPUContext` as the public lifecycle module, but move browser-global access into a dedicated browser adapter.
+- Expand unit coverage for adapter absence, device request failure, limits mapping, device-loss callbacks, and recovery using fake runtimes.
+- Export the runtime types needed for external injection while keeping the default browser path intact.
+
+## Capabilities
+
+### New Capabilities
+
+- None.
+
+### Modified Capabilities
+
+- `infrastructure`: GPU context initialization must support injected runtime adapters while preserving the default browser runtime path.
+
+## Impact
+
+- **Affected code:** `src/core/GPUContext.ts`, new `src/core/runtime/` files, `src/index.ts`, `test/core/GPUContext.test.ts`
+- **Affected APIs:** `GPUContext` constructor gains an optional runtime adapter; runtime interface/types become importable.
+- **Dependencies:** No new packages.
+- **Systems:** Core runtime initialization, lifecycle handling, and unit-test seams.
diff --git a/openspec/archive/2026-05-injectable-gpu-runtime/specs/infrastructure/spec.md b/openspec/archive/2026-05-injectable-gpu-runtime/specs/infrastructure/spec.md
new file mode 100644
index 0000000..38341f2
--- /dev/null
+++ b/openspec/archive/2026-05-injectable-gpu-runtime/specs/infrastructure/spec.md
@@ -0,0 +1,25 @@
+## MODIFIED Requirements
+
+### Requirement: GPU Context Initialization
+
+The GPUContext SHALL initialize through an injectable runtime adapter while preserving the default browser runtime path and existing lifecycle guarantees.
+
+#### Scenario: Default browser runtime remains available
+
+- **WHEN** a caller constructs `GPUContext` without providing a runtime adapter
+- **THEN** the context SHALL use the browser runtime adapter for support detection and adapter acquisition
+
+#### Scenario: Injected runtime drives initialization
+
+- **WHEN** a caller constructs `GPUContext` with a runtime adapter
+- **THEN** `initialize()` SHALL use that adapter instead of reading browser globals directly
+
+#### Scenario: Injected runtime failure is surfaced as typed errors
+
+- **WHEN** the injected runtime reports no adapter or device creation fails
+- **THEN** GPUContext SHALL preserve the existing typed error semantics for adapter and device failures
+
+#### Scenario: Injected runtime remains compatible with recovery and loss callbacks
+
+- **WHEN** device loss occurs and `recover()` is called on a context using an injected runtime
+- **THEN** the context SHALL reset state, reacquire a device through the same runtime adapter, and continue notifying registered callbacks
diff --git a/openspec/archive/2026-05-injectable-gpu-runtime/tasks.md b/openspec/archive/2026-05-injectable-gpu-runtime/tasks.md
new file mode 100644
index 0000000..550e47e
--- /dev/null
+++ b/openspec/archive/2026-05-injectable-gpu-runtime/tasks.md
@@ -0,0 +1,18 @@
+## 1. OpenSpec and test setup
+
+- [x] 1.1 Finalize infrastructure delta spec for injected GPU runtime behavior
+- [x] 1.2 Add failing `GPUContext` tests for injected runtime initialization, failure paths, limits mapping, and recovery/loss callbacks
+
+## 2. Runtime seam implementation
+
+- [x] 2.1 Add `src/core/runtime/` runtime interface and default browser adapter
+- [x] 2.2 Refactor `GPUContext` to use injected runtime adapters while preserving `new GPUContext()` as the default path
+- [x] 2.3 Export runtime seam types from `src/index.ts`
+
+## 3. Validation
+
+- [x] 3.1 Update `GPUContext` tests to remove unnecessary browser-global stubbing from non-browser paths
+- [x] 3.2 Run `npm run typecheck`
+- [x] 3.3 Run `npm run lint`
+- [x] 3.4 Run `npm run test`
+- [x] 3.5 Run `npm run build`
diff --git a/openspec/archive/2026-05-scan-module-extraction/.openspec.yaml b/openspec/archive/2026-05-scan-module-extraction/.openspec.yaml
new file mode 100644
index 0000000..4a1c677
--- /dev/null
+++ b/openspec/archive/2026-05-scan-module-extraction/.openspec.yaml
@@ -0,0 +1,2 @@
+schema: spec-driven
+created: 2026-05-22
diff --git a/openspec/archive/2026-05-scan-module-extraction/design.md b/openspec/archive/2026-05-scan-module-extraction/design.md
new file mode 100644
index 0000000..88ccf47
--- /dev/null
+++ b/openspec/archive/2026-05-scan-module-extraction/design.md
@@ -0,0 +1,39 @@
+## Context
+
+The repository already moved radix prefix-sum onto the GPU, but the implementation stayed embedded in `RadixSorter`. That gives callers no scan seam, couples shader ownership to radix sorting, and leaves scan behavior hard to test independently.
+
+This change extracts the scan path into a deep module that RadixSorter can consume.
+
+## Goals / Non-Goals
+
+**Goals:**
+
+- Give GPU prefix-sum its own module, shader ownership, and tests.
+- Reduce `RadixSorter` to histogram/scatter orchestration plus scan consumption.
+- Align docs/specs with actual GPU prefix-sum behavior.
+
+**Non-Goals:**
+
+- Change radix sort user-facing behavior.
+- Refactor demo/benchmark orchestration here.
+- Build a general-purpose GPU algorithm framework.
+
+## Decisions
+
+### 1. Add `src/sorting/scan/`
+
+Create a scan module with its own interface, initialization, dispatch, and teardown.
+
+### 2. Split shader ownership
+
+Move Blelloch scan kernels out of `radix.wgsl` into scan-owned WGSL so shader locality matches module ownership.
+
+### 3. Keep RadixSorter as a consumer
+
+RadixSorter will request exclusive prefix sums through a narrow interface and keep radix-specific buffers/passes local.
+
+## Risks / Trade-offs
+
+- **Shader split churn** -> Accept one-time movement for better locality and reuse.
+- **Extra module lifecycle** -> Keep scan interface narrow and explicit.
+- **Spec/doc drift risk** -> Update stable sorting specs as part of the change.
diff --git a/openspec/archive/2026-05-scan-module-extraction/proposal.md b/openspec/archive/2026-05-scan-module-extraction/proposal.md
new file mode 100644
index 0000000..80066a4
--- /dev/null
+++ b/openspec/archive/2026-05-scan-module-extraction/proposal.md
@@ -0,0 +1,27 @@
+## Why
+
+`RadixSorter` still owns histogram, scatter, and GPU prefix-sum orchestration inside one shallow module. The scan pipeline has real leverage on its own, but today its interface is trapped inside `RadixSorter` and its shader kernels are mixed into `radix.wgsl`.
+
+## What Changes
+
+- Extract GPU prefix-sum into a dedicated scan module with its own initialization, dispatch, and cleanup.
+- Split scan WGSL kernels from radix-only kernels.
+- Update RadixSorter to depend on the scan seam instead of owning scan pipeline details directly.
+- Add dedicated scan tests and align specs/docs with GPU-based prefix-sum ownership.
+
+## Capabilities
+
+### New Capabilities
+
+- `gpu-prefix-sum-module`: Dedicated scan module for GPU exclusive prefix sums with reusable runtime ownership.
+
+### Modified Capabilities
+
+- `sorting`: Radix sort shall consume the standalone scan module instead of embedding scan orchestration directly.
+
+## Impact
+
+- **Affected code:** `src/sorting/RadixSorter.ts`, new `src/sorting/scan/` files, `src/shaders/radix.wgsl`, new scan shader file(s), related tests/docs/specs
+- **Affected APIs:** New scan module surface; RadixSorter internal orchestration changes.
+- **Dependencies:** Overlaps conceptually with the existing `gpu-prefix-sum` implementation history, but lands as a separate architectural extraction change.
+- **Systems:** Radix sorting internals, shader ownership, and scan-specific tests.
diff --git a/openspec/archive/2026-05-scan-module-extraction/specs/gpu-prefix-sum-module/spec.md b/openspec/archive/2026-05-scan-module-extraction/specs/gpu-prefix-sum-module/spec.md
new file mode 100644
index 0000000..b6bf18b
--- /dev/null
+++ b/openspec/archive/2026-05-scan-module-extraction/specs/gpu-prefix-sum-module/spec.md
@@ -0,0 +1,15 @@
+## ADDED Requirements
+
+### Requirement: GPU prefix-sum is a standalone module
+
+The system SHALL provide GPU exclusive prefix-sum through a dedicated scan module with its own initialization, dispatch, and cleanup interface.
+
+#### Scenario: Scan module computes exclusive prefix sums
+
+- **WHEN** a caller provides input, output, and scan configuration buffers
+- **THEN** the scan module SHALL execute the GPU prefix-sum passes without requiring RadixSorter-specific knowledge
+
+#### Scenario: Scan module owns scan pipeline resources
+
+- **WHEN** the scan module is initialized or destroyed
+- **THEN** it SHALL create and clean up scan-specific pipeline and shader resources independently
diff --git a/openspec/archive/2026-05-scan-module-extraction/specs/sorting/spec.md b/openspec/archive/2026-05-scan-module-extraction/specs/sorting/spec.md
new file mode 100644
index 0000000..e10e679
--- /dev/null
+++ b/openspec/archive/2026-05-scan-module-extraction/specs/sorting/spec.md
@@ -0,0 +1,15 @@
+## MODIFIED Requirements
+
+### Requirement: Radix Sort Implementation
+
+The RadixSorter SHALL perform histogram and scatter passes while delegating GPU prefix-sum work to a dedicated scan module.
+
+#### Scenario: RadixSorter consumes the scan seam
+
+- **WHEN** a radix pass needs prefix sums
+- **THEN** RadixSorter SHALL invoke the standalone scan module instead of owning scan dispatch logic directly
+
+#### Scenario: Sorting behavior remains unchanged
+
+- **WHEN** valid input arrays are sorted through RadixSorter
+- **THEN** the sorter SHALL continue producing correctly sorted ascending output
diff --git a/openspec/archive/2026-05-scan-module-extraction/tasks.md b/openspec/archive/2026-05-scan-module-extraction/tasks.md
new file mode 100644
index 0000000..ee931de
--- /dev/null
+++ b/openspec/archive/2026-05-scan-module-extraction/tasks.md
@@ -0,0 +1,18 @@
+## 1. Test-first scan seam
+
+- [x] 1.1 Add failing tests for scan-module interface and scan-specific runtime behavior
+- [x] 1.2 Add failing integration coverage for RadixSorter consuming the scan seam
+
+## 2. Implementation
+
+- [x] 2.1 Add scan module files under `src/sorting/scan/`
+- [x] 2.2 Split scan WGSL kernels from `radix.wgsl`
+- [x] 2.3 Refactor `RadixSorter` to consume the scan module
+- [x] 2.4 Update docs/specs for GPU prefix-sum ownership
+
+## 3. Validation
+
+- [x] 3.1 Run `npm run typecheck`
+- [x] 3.2 Run `npm run lint`
+- [x] 3.3 Run `npm run test`
+- [x] 3.4 Run `npm run build`
diff --git a/openspec/archive/2026-05-sorting-runtime-hardening/.openspec.yaml b/openspec/archive/2026-05-sorting-runtime-hardening/.openspec.yaml
new file mode 100644
index 0000000..af43829
--- /dev/null
+++ b/openspec/archive/2026-05-sorting-runtime-hardening/.openspec.yaml
@@ -0,0 +1,2 @@
+schema: spec-driven
+created: 2026-05-21
diff --git a/openspec/archive/2026-05-sorting-runtime-hardening/design.md b/openspec/archive/2026-05-sorting-runtime-hardening/design.md
new file mode 100644
index 0000000..6a4776e
--- /dev/null
+++ b/openspec/archive/2026-05-sorting-runtime-hardening/design.md
@@ -0,0 +1,71 @@
+## Context
+
+Three runtime hot spots are coupled today:
+
+1. `Benchmark.generateRandomData()` and several browser tests call `crypto.getRandomValues()` once per array. The Web Crypto API rejects any single request larger than 65,536 bytes with a `QuotaExceededError`, so this breaks for the repository's own 100K and 1M benchmark sizes.
+2. `BitonicSorter` and `RadixSorter` each own temporary GPU buffer allocation and cleanup inline, but they do it differently. Bitonic cleanup does not cover every failure path, while Radix directly destroys buffers outside `BufferManager` ownership.
+3. `BufferManager.readBuffer()` is the readback seam for both sorters, yet it does not use the repository's existing timeout guard.
+
+This change hardens runtime behavior without relitigating algorithm choice or public sorter APIs.
+
+## Goals / Non-Goals
+
+**Goals:**
+
+- Guarantee large-array random data generation for benchmark, demo-adjacent, and browser test flows.
+- Create one deep temporary-buffer seam shared by both sorters.
+- Make readback failure modes bounded and deterministic.
+- Strengthen tests around the runtime behaviors that previously escaped coverage.
+
+**Non-Goals:**
+
+- Redesign `GPUContext` device-loss recovery in this change.
+- Change WGSL algorithm behavior or benchmark UX.
+- Introduce a general-purpose GPU buffer pool or new external dependency.
+
+## Decisions
+
+### 1. Add a shared random-data utility in `src/shared/`
+
+Create a focused utility that fills `Uint32Array` instances in quota-safe chunks and provides a convenience generator for callers that need new arrays. `Benchmark` and browser tests will consume the same seam.
+
+**Why:** Large-array randomness is a cross-cutting runtime concern, not benchmark-only behavior. One utility concentrates the browser quota rule and removes repeated inline crypto logic.
+
+**Alternatives considered:**
+
+- Patch only `Benchmark.generateRandomData()` -> rejected because browser tests would keep the same failure mode.
+- Use `Math.random()` everywhere -> rejected because it weakens existing randomness guarantees where crypto is available.
+
+### 2. Introduce a transient buffer scope for sorter-owned temporary buffers
+
+Add a small internal module that registers temporary GPU buffers, releases them exactly once in `finally`, and stays separate from explicit preallocation ownership. Both sorters will use this seam for per-sort buffers while preallocated buffers remain outside the scope.
+
+**Why:** The current modules are shallow around cleanup: each sorter repeats ownership decisions inline, so bugs hide in call sequencing instead of one interface. A transient scope increases locality and gives tests a single interface to verify.
+
+**Alternatives considered:**
+
+- Expand `BufferManager` to own every buffer in the system -> rejected because it deepens a mixed-responsibility module in the wrong direction.
+- Leave cleanup inline and add comments -> rejected because comments do not create a testable seam.
+
+### 3. Apply timeout policy at the readback seam
+
+Wrap `mapAsync()` in `BufferManager.readBuffer()` with the existing timeout utility and preserve cleanup for both success and failure paths.
+
+**Why:** Readback is where an unresponsive GPU becomes visible to CPU callers. The timeout belongs at this seam so callers do not each reinvent it.
+
+**Alternatives considered:**
+
+- Timeout at individual sorter call sites -> rejected because it duplicates policy and misses future `BufferManager` consumers.
+- No timeout, rely on device loss -> rejected because hanging promises do not surface actionable errors.
+
+### 4. Tighten tests around real runtime behavior
+
+Add unit tests for quota-safe random filling and buffer-scope cleanup, then update browser tests to use the shared random utility and call `gpu.isInitialized()` instead of asserting on a method reference.
+
+**Why:** Existing tests mostly cover pure helpers; they do not protect the runtime seams that actually failed.
+
+## Risks / Trade-offs
+
+- **More internal modules** -> Keep new seams narrowly scoped and internal-only to avoid API sprawl.
+- **Timeout values may need tuning across devices** -> Reuse existing timeout defaults first; keep the timeout wrapper centralized for later adjustment.
+- **Browser tests still depend on WebGPU availability** -> Focus new assertions on deterministic setup and data generation so skipped environments do not hide logic bugs.
diff --git a/openspec/archive/2026-05-sorting-runtime-hardening/proposal.md b/openspec/archive/2026-05-sorting-runtime-hardening/proposal.md
new file mode 100644
index 0000000..59c777c
--- /dev/null
+++ b/openspec/archive/2026-05-sorting-runtime-hardening/proposal.md
@@ -0,0 +1,30 @@
+# Change Proposal: Sorting Runtime Hardening
+
+## Why
+
+The current runtime promises large-array benchmarking and browser validation, but multiple code paths still use one-shot `crypto.getRandomValues()` calls that fail once arrays exceed browser quota limits. At the same time, temporary GPU buffer ownership is duplicated across sorters, making failure-path cleanup shallow, fragile, and hard to test.
+
+## What Changes
+
+- Add a shared quota-safe random `Uint32Array` generator and route benchmark/demo/test paths through it.
+- Introduce a dedicated transient GPU buffer seam so Bitonic and Radix sorters clean up temporary buffers through one ownership model.
+- Harden buffer readback with timeout-backed mapping and deterministic staging-buffer cleanup.
+- Tighten browser and unit tests so large-array generation and initialization assertions catch real runtime behavior instead of passing accidentally.
+
+## Capabilities
+
+### New Capabilities
+
+- None.
+
+### Modified Capabilities
+
+- `sorting`: Benchmark-driven sorting flows must support default large dataset sizes without failing because of randomness API call limits.
+- `infrastructure`: Buffer readback and sorter resource lifecycle must time out or clean up deterministically on every exit path.
+
+## Impact
+
+- **Affected code:** `src/benchmark/`, `src/shared/`, `src/core/`, `src/sorting/`, `src/index.ts`, `test/benchmark/`, `test/core/`, `test/browser/`
+- **Affected APIs:** No intentional public API expansion; internal runtime contracts become stricter and more explicit.
+- **Dependencies:** Reuses existing timeout utility and Web Crypto support; no new packages.
+- **Systems:** Browser benchmark/demo flow, GPU sorter failure handling, and runtime-facing test coverage.
diff --git a/openspec/archive/2026-05-sorting-runtime-hardening/specs/infrastructure/spec.md b/openspec/archive/2026-05-sorting-runtime-hardening/specs/infrastructure/spec.md
new file mode 100644
index 0000000..9571c6e
--- /dev/null
+++ b/openspec/archive/2026-05-sorting-runtime-hardening/specs/infrastructure/spec.md
@@ -0,0 +1,39 @@
+## MODIFIED Requirements
+
+### Requirement: GPU Buffer Management
+
+The BufferManager SHALL provide bounded readback behavior and deterministic staging-buffer cleanup for every buffer read operation.
+
+#### Scenario: Successful readback releases staging buffer
+
+- **WHEN** `readBuffer()` completes successfully
+- **THEN** it SHALL unmap the staging buffer, release it, and return a `Uint32Array` containing the requested byte range
+
+#### Scenario: Timed-out readback surfaces explicit failure
+
+- **WHEN** staging-buffer mapping exceeds the configured timeout
+- **THEN** `readBuffer()` SHALL reject with `GPUTimeoutError` and release the staging buffer before returning control to the caller
+
+#### Scenario: Readback failure still cleans up
+
+- **WHEN** staging-buffer mapping or copy fails for any other reason
+- **THEN** `readBuffer()` SHALL release the staging buffer and surface a typed buffer readback error
+
+### Requirement: Resource Lifecycle
+
+Sorter implementations SHALL release temporary GPU resources through a dedicated per-sort ownership seam while preserving explicitly preallocated buffers until callers clear or destroy them.
+
+#### Scenario: Temporary buffers are cleaned up on success
+
+- **WHEN** a sorter completes a sort using temporary buffers
+- **THEN** all temporary buffers created for that sort SHALL be released before the sorter returns
+
+#### Scenario: Temporary buffers are cleaned up on failure
+
+- **WHEN** a sorter throws after allocating temporary buffers
+- **THEN** the per-sort ownership seam SHALL still release those buffers before the error is propagated
+
+#### Scenario: Preallocated buffers remain opt-in
+
+- **WHEN** a caller uses `preallocate()` and later sorts data within that capacity
+- **THEN** the sorter SHALL reuse preallocated buffers without releasing them until `clearPreallocation()` or `destroy()` is called
diff --git a/openspec/archive/2026-05-sorting-runtime-hardening/specs/sorting/spec.md b/openspec/archive/2026-05-sorting-runtime-hardening/specs/sorting/spec.md
new file mode 100644
index 0000000..273aa84
--- /dev/null
+++ b/openspec/archive/2026-05-sorting-runtime-hardening/specs/sorting/spec.md
@@ -0,0 +1,20 @@
+## MODIFIED Requirements
+
+### Requirement: Performance Benchmarking
+
+The system SHALL benchmark JavaScript native sort, Bitonic sort, and Radix sort across supported dataset sizes without failing because benchmark input generation exceeds browser randomness API quotas.
+
+#### Scenario: Benchmark generates large default datasets
+
+- **WHEN** the benchmark generates input for configured sizes such as 100K or 1M `u32` values
+- **THEN** it SHALL fill the array in quota-safe chunks and preserve the exact requested length
+
+#### Scenario: Benchmark reports averaged timings
+
+- **WHEN** multiple iterations complete for a benchmark run
+- **THEN** the system SHALL report averaged total timing and GPU timing when the selected algorithm exposes it
+
+#### Scenario: Benchmark compares GPU runs against native sort
+
+- **WHEN** Bitonic or Radix results are reported
+- **THEN** the system SHALL include speedup data derived from the corresponding JavaScript native benchmark result
diff --git a/openspec/archive/2026-05-sorting-runtime-hardening/tasks.md b/openspec/archive/2026-05-sorting-runtime-hardening/tasks.md
new file mode 100644
index 0000000..64a65e1
--- /dev/null
+++ b/openspec/archive/2026-05-sorting-runtime-hardening/tasks.md
@@ -0,0 +1,21 @@
+## 1. Spec and test seam setup
+
+- [x] 1.1 Finalize delta specs for sorting benchmark input generation and infrastructure cleanup/timeout behavior
+- [x] 1.2 Add failing benchmark tests for quota-limited random generation
+- [x] 1.3 Add failing core tests for transient buffer cleanup and timed readback behavior
+
+## 2. Runtime hardening implementation
+
+- [x] 2.1 Implement a shared quota-safe random `Uint32Array` utility in `src/shared/`
+- [x] 2.2 Refactor `Benchmark` and browser tests to use the shared random utility
+- [x] 2.3 Introduce a transient GPU buffer scope and refactor both sorters to use it
+- [x] 2.4 Wrap `BufferManager.readBuffer()` with the existing timeout utility and preserve cleanup on all paths
+
+## 3. Validation and cleanup
+
+- [x] 3.1 Remove sorter non-null assertions by replacing them with explicit ownership guards
+- [x] 3.2 Fix browser initialization assertions so they invoke real APIs
+- [x] 3.3 Run `npm run lint`
+- [x] 3.4 Run `npm run typecheck`
+- [x] 3.5 Run `npm run test`
+- [x] 3.6 Run `npm run build`
diff --git a/src/shaders/radix.wgsl b/src/shaders/radix.wgsl
index 8486490..346cb42 100644
--- a/src/shaders/radix.wgsl
+++ b/src/shaders/radix.wgsl
@@ -1,6 +1,6 @@
 // Radix Sort WGSL Compute Shaders
-// Implements 4-bit radix sort with histogram, prefix sum, and scatter
-// Includes GPU-based Blelloch scan for prefix sum computation
+// Implements 4-bit radix sort with histogram and scatter
+// Prefix sum computation is handled by scan.wgsl
 //
 // IMPORTANT: WORKGROUP_SIZE and RADIX must match the values in src/constants.ts
 // @see src/constants.ts - WORKGROUP_SIZE = 256, RADIX = 16
@@ -12,16 +12,8 @@ struct RadixUniforms {
   _pad: u32,
 }
 
-struct ScanUniforms {
-  data_size: u32,      // Total elements to scan
-  num_blocks: u32,     // Number of workgroups/blocks
-  _pad1: u32,
-  _pad2: u32,
-}
-
 const WORKGROUP_SIZE: u32 = 256u;
 const RADIX: u32 = 16u;  // 4-bit radix = 16 buckets
-const SCAN_WORKGROUP_SIZE: u32 = 256u;  // Size for prefix sum scan
 
 @group(0) @binding(0) var<storage, read> input_data: array<u32>;
 @group(0) @binding(1) var<storage, read_write> output_data: array<u32>;
@@ -106,239 +98,3 @@ fn scatter(
     output_data[global_offset] = value;
   }
 }
-
-// ============================================================================
-// Blelloch Scan (Work-Efficient Parallel Prefix Sum)
-// ============================================================================
-
-// Shared memory for local Blelloch scan within a workgroup
-var<workgroup> scan_shared: array<u32, 512>;  // Must be >= 2 * SCAN_WORKGROUP_SIZE
-
-// Bindings for prefix sum scan
-@group(0) @binding(0) var<storage, read> scan_input: array<u32>;
-@group(0) @binding(1) var<storage, read_write> scan_output: array<u32>;
-@group(0) @binding(2) var<storage, read_write> block_sums: array<u32>;  // Sum of each block
-@group(0) @binding(3) var<uniform> scan_uniforms: ScanUniforms;
-
-// Blelloch scan - exclusive prefix sum
-// Phase 1: Up-sweep (reduce) - build binary tree of partial sums
-// Phase 2: Down-sweep (distribute) - propagate sums down the tree
-@compute @workgroup_size(SCAN_WORKGROUP_SIZE)
-fn blelloch_scan(
-  @builtin(global_invocation_id) global_id: vec3<u32>,
-  @builtin(local_invocation_id) local_id: vec3<u32>,
-  @builtin(workgroup_id) workgroup_id: vec3<u32>
-) {
-  let tid = local_id.x;
-  let gid = global_id.x;
-  let block_id = workgroup_id.x;
-  let n = scan_uniforms.data_size;
-  let block_size = SCAN_WORKGROUP_SIZE * 2u;  // Each workgroup processes 512 elements
-
-  // Calculate the range this workgroup handles
-  let block_start = block_id * block_size;
-  let block_end = min(block_start + block_size, n);
-  let local_n = block_end - block_start;
-
-  // Load data into shared memory (coalesced reads)
-  // Each thread loads 2 elements
-  let idx0 = block_start + tid;
-  let idx1 = block_start + tid + SCAN_WORKGROUP_SIZE;
-
-  // Initialize shared memory
-  if (idx0 < n) {
-    scan_shared[tid] = scan_input[idx0];
-  } else {
-    scan_shared[tid] = 0u;
-  }
-
-  if (idx1 < n) {
-    scan_shared[tid + SCAN_WORKGROUP_SIZE] = scan_input[idx1];
-  } else {
-    scan_shared[tid + SCAN_WORKGROUP_SIZE] = 0u;
-  }
-
-  workgroupBarrier();
-
-  // ========================================================================
-  // Phase 1: Up-sweep (Reduce)
-  // Build a binary tree of partial sums from leaves to root
-  // ========================================================================
-  var offset = 1u;
-  var d = block_size / 2u;
-  while (d > 0u) {
-    workgroupBarrier();
-
-    if (tid < d) {
-      let ai = offset * (2u * tid + 1u) - 1u;
-      let bi = offset * (2u * tid + 2u) - 1u;
-
-      // Only process if within our data range
-      if (bi < local_n) {
-        scan_shared[bi] = scan_shared[ai] + scan_shared[bi];
-      }
-    }
-
-    offset *= 2u;
-    d /= 2u;
-  }
-
-  workgroupBarrier();
-
-  // ========================================================================
-  // Phase 2: Down-sweep (Distribute)
-  // Propagate partial sums from root to leaves
-  // ========================================================================
-  // Clear the last element (makes it exclusive scan)
-  if (tid == 0u) {
-    // Store block sum before clearing (for multi-block scan)
-    let block_sum = scan_shared[block_size - 1u];
-    block_sums[block_id] = block_sum;
-    scan_shared[block_size - 1u] = 0u;
-  }
-
-  workgroupBarrier();
-
-  d = 1u;
-  while (d < block_size) {
-    offset /= 2u;
-    workgroupBarrier();
-
-    if (tid < d) {
-      let ai = offset * (2u * tid + 1u) - 1u;
-      let bi = offset * (2u * tid + 2u) - 1u;
-
-      if (bi < local_n) {
-        let t = scan_shared[ai];
-        scan_shared[ai] = scan_shared[bi];
-        scan_shared[bi] = t + scan_shared[bi];
-      }
-    }
-
-    d *= 2u;
-  }
-
-  workgroupBarrier();
-
-  // ========================================================================
-  // Write results back to global memory
-  // ========================================================================
-  if (idx0 < n) {
-    scan_output[idx0] = scan_shared[tid];
-  }
-
-  if (idx1 < n) {
-    scan_output[idx1] = scan_shared[tid + SCAN_WORKGROUP_SIZE];
-  }
-}
-
-// Shared memory for block sum scan (smaller, max blocks typically < 1024)
-var<workgroup> block_scan_shared: array<u32, 512>;
-
-// Scan the block sums (second level of two-level scan)
-@compute @workgroup_size(SCAN_WORKGROUP_SIZE)
-fn scan_block_sums(
-  @builtin(global_invocation_id) global_id: vec3<u32>,
-  @builtin(local_invocation_id) local_id: vec3<u32>,
-  @builtin(workgroup_id) workgroup_id: vec3<u32>
-) {
-  let tid = local_id.x;
-  let n = scan_uniforms.num_blocks;
-
-  // Load block sums into shared memory
-  if (tid < n) {
-    block_scan_shared[tid] = block_sums[tid];
-  } else {
-    block_scan_shared[tid] = 0u;
-  }
-
-  // Pad with zeros
-  if (tid + SCAN_WORKGROUP_SIZE < n) {
-    block_scan_shared[tid + SCAN_WORKGROUP_SIZE] = block_sums[tid + SCAN_WORKGROUP_SIZE];
-  } else if (tid + SCAN_WORKGROUP_SIZE < 512u) {
-    block_scan_shared[tid + SCAN_WORKGROUP_SIZE] = 0u;
-  }
-
-  workgroupBarrier();
-
-  // ========================================================================
-  // Phase 1: Up-sweep (Reduce)
-  // ========================================================================
-  var offset = 1u;
-  var d = 256u;  // SCAN_WORKGROUP_SIZE
-  while (d > 0u) {
-    workgroupBarrier();
-
-    if (tid < d) {
-      let ai = offset * (2u * tid + 1u) - 1u;
-      let bi = offset * (2u * tid + 2u) - 1u;
-
-      if (bi < n) {
-        block_scan_shared[bi] = block_scan_shared[ai] + block_scan_shared[bi];
-      }
-    }
-
-    offset *= 2u;
-    d /= 2u;
-  }
-
-  workgroupBarrier();
-
-  // ========================================================================
-  // Phase 2: Down-sweep (Distribute)
-  // ========================================================================
-  if (tid == 0u) {
-    block_scan_shared[511u] = 0u;  // Clear last element for exclusive scan
-  }
-
-  workgroupBarrier();
-
-  d = 1u;
-  while (d < 512u) {
-    offset /= 2u;
-    workgroupBarrier();
-
-    if (tid < d) {
-      let ai = offset * (2u * tid + 1u) - 1u;
-      let bi = offset * (2u * tid + 2u) - 1u;
-
-      if (bi < n) {
-        let t = block_scan_shared[ai];
-        block_scan_shared[ai] = block_scan_shared[bi];
-        block_scan_shared[bi] = t + block_scan_shared[bi];
-      }
-    }
-
-    d *= 2u;
-  }
-
-  workgroupBarrier();
-
-  // Write scanned block sums back
-  if (tid < n) {
-    block_sums[tid] = block_scan_shared[tid];
-  }
-}
-
-// Add block prefixes to each block's local scan results
-// This is the third step of two-level scan
-@compute @workgroup_size(SCAN_WORKGROUP_SIZE)
-fn add_block_prefixes(
-  @builtin(global_invocation_id) global_id: vec3<u32>,
-  @builtin(local_invocation_id) local_id: vec3<u32>,
-  @builtin(workgroup_id) workgroup_id: vec3<u32>
-) {
-  let tid = local_id.x;
-  let gid = global_id.x;
-  let block_id = workgroup_id.x;
-  let n = scan_uniforms.data_size;
-
-  // Get the prefix for this block (sum of all previous blocks)
-  let block_prefix = block_sums[block_id];
-
-  // Add block prefix to each element in this block
-  let idx = gid;
-  if (idx < n) {
-    scan_output[idx] = scan_output[idx] + block_prefix;
-  }
-}
diff --git a/src/shaders/scan.wgsl b/src/shaders/scan.wgsl
new file mode 100644
index 0000000..513da1e
--- /dev/null
+++ b/src/shaders/scan.wgsl
@@ -0,0 +1,244 @@
+// Blelloch Scan WGSL Compute Shaders
+// Work-efficient parallel exclusive prefix sum
+// This module provides GPU-based prefix sum computation
+
+struct ScanUniforms {
+  data_size: u32,      // Total elements to scan
+  num_blocks: u32,     // Number of scan blocks (one workgroup, 512 elements each)
+  _pad1: u32,          // Padding; the host writes 0
+  _pad2: u32,          // Padding; the host writes 0 (struct totals 16 bytes)
+}
+
+const SCAN_WORKGROUP_SIZE: u32 = 256u;  // Threads per workgroup; keep in sync with ScanModule.ts
+
+// Shared memory for local Blelloch scan within a workgroup
+var<workgroup> scan_shared: array<u32, 512>;  // Must be >= 2 * SCAN_WORKGROUP_SIZE
+
+// Bindings for prefix sum scan (all three entry points share this layout)
+@group(0) @binding(0) var<storage, read> scan_input: array<u32>;
+@group(0) @binding(1) var<storage, read_write> scan_output: array<u32>;
+@group(0) @binding(2) var<storage, read_write> block_sums: array<u32>;  // Per-block sums; scan_block_sums overwrites them with their exclusive scan
+@group(0) @binding(3) var<uniform> scan_uniforms: ScanUniforms;
+
+// Blelloch scan - exclusive prefix sum over one 512-element block per workgroup
+// Phase 1: Up-sweep (reduce) - build binary tree of partial sums
+// Phase 2: Down-sweep (distribute) - propagate sums down the tree
+@compute @workgroup_size(SCAN_WORKGROUP_SIZE)
+fn blelloch_scan(
+  @builtin(global_invocation_id) global_id: vec3<u32>,
+  @builtin(local_invocation_id) local_id: vec3<u32>,
+  @builtin(workgroup_id) workgroup_id: vec3<u32>
+) {
+  let tid = local_id.x;
+  let block_id = workgroup_id.x;
+  let n = scan_uniforms.data_size;
+  let block_size = SCAN_WORKGROUP_SIZE * 2u;  // Each workgroup processes 512 elements
+
+  // First global index handled by this workgroup; indices at or beyond n are
+  // treated as zero padding when loading below.
+  let block_start = block_id * block_size;
+
+  // Load data into shared memory (coalesced reads)
+  // Each thread loads 2 elements
+  let idx0 = block_start + tid;
+  let idx1 = block_start + tid + SCAN_WORKGROUP_SIZE;
+
+  // Initialize shared memory. Slots past the end of the data are zero-filled,
+  // so a partial block is scanned as a complete 512-wide tree and neither
+  // sweep needs a per-step bounds check.
+  if (idx0 < n) {
+    scan_shared[tid] = scan_input[idx0];
+  } else {
+    scan_shared[tid] = 0u;
+  }
+
+  if (idx1 < n) {
+    scan_shared[tid + SCAN_WORKGROUP_SIZE] = scan_input[idx1];
+  } else {
+    scan_shared[tid + SCAN_WORKGROUP_SIZE] = 0u;
+  }
+
+  workgroupBarrier();
+
+  // ========================================================================
+  // Phase 1: Up-sweep (Reduce)
+  // Build a binary tree of partial sums from leaves to root
+  // ========================================================================
+  var offset = 1u;
+  var d = block_size / 2u;
+  while (d > 0u) {
+    workgroupBarrier();
+
+    if (tid < d) {
+      let ai = offset * (2u * tid + 1u) - 1u;
+      let bi = offset * (2u * tid + 2u) - 1u;
+
+      // bi never exceeds block_size - 1 here, so every tree node is updated;
+      // skipping nodes past the data length would leave the root (and the
+      // block sum read below) without the block total.
+      scan_shared[bi] = scan_shared[ai] + scan_shared[bi];
+    }
+
+    offset *= 2u;
+    d /= 2u;
+  }
+
+  workgroupBarrier();
+
+  // ========================================================================
+  // Phase 2: Down-sweep (Distribute)
+  // Propagate partial sums from root to leaves
+  // ========================================================================
+  // Clear the last element (makes it exclusive scan)
+  if (tid == 0u) {
+    // Store block sum before clearing (for multi-block scan)
+    let block_sum = scan_shared[block_size - 1u];
+    block_sums[block_id] = block_sum;
+    scan_shared[block_size - 1u] = 0u;
+  }
+
+  workgroupBarrier();
+
+  d = 1u;
+  while (d < block_size) {
+    offset /= 2u;
+    workgroupBarrier();
+
+    if (tid < d) {
+      let ai = offset * (2u * tid + 1u) - 1u;
+      let bi = offset * (2u * tid + 2u) - 1u;
+
+      // Unconditional swap-and-add: the zero written at the root must reach
+      // every leaf, including when the block is only partially filled.
+      let t = scan_shared[ai];
+      scan_shared[ai] = scan_shared[bi];
+      scan_shared[bi] = t + scan_shared[bi];
+    }
+
+    d *= 2u;
+  }
+
+  workgroupBarrier();
+
+  // ========================================================================
+  // Write results back to global memory
+  // ========================================================================
+  if (idx0 < n) {
+    scan_output[idx0] = scan_shared[tid];
+  }
+
+  if (idx1 < n) {
+    scan_output[idx1] = scan_shared[tid + SCAN_WORKGROUP_SIZE];
+  }
+}
+
+// Shared memory for block sum scan: 512 slots, so at most 512 block sums are supported
+var<workgroup> block_scan_shared: array<u32, 512>;
+
+// Scan the block sums (second level of two-level scan)
+@compute @workgroup_size(SCAN_WORKGROUP_SIZE)
+fn scan_block_sums(
+  @builtin(global_invocation_id) global_id: vec3<u32>,
+  @builtin(local_invocation_id) local_id: vec3<u32>,
+  @builtin(workgroup_id) workgroup_id: vec3<u32>
+) {
+  let tid = local_id.x;
+  let n = scan_uniforms.num_blocks;
+
+  // Load block sums into shared memory
+  if (tid < n) {
+    block_scan_shared[tid] = block_sums[tid];
+  } else {
+    block_scan_shared[tid] = 0u;
+  }
+
+  // Second half of the array: load the block sum if present, otherwise pad with zero
+  if (tid + SCAN_WORKGROUP_SIZE < n) {
+    block_scan_shared[tid + SCAN_WORKGROUP_SIZE] = block_sums[tid + SCAN_WORKGROUP_SIZE];
+  } else {
+    block_scan_shared[tid + SCAN_WORKGROUP_SIZE] = 0u;
+  }
+
+  workgroupBarrier();
+
+  // ========================================================================
+  // Phase 1: Up-sweep (Reduce)
+  // ========================================================================
+  var offset = 1u;
+  var d = 256u;  // SCAN_WORKGROUP_SIZE
+  while (d > 0u) {
+    workgroupBarrier();
+
+    if (tid < d) {
+      let ai = offset * (2u * tid + 1u) - 1u;
+      let bi = offset * (2u * tid + 2u) - 1u;
+      // No bounds check: unused slots hold zeros and bi stays below 512
+      block_scan_shared[bi] = block_scan_shared[ai] + block_scan_shared[bi];
+    }
+
+    offset *= 2u;
+    d /= 2u;
+  }
+
+  workgroupBarrier();
+
+  // ========================================================================
+  // Phase 2: Down-sweep (Distribute)
+  // ========================================================================
+  if (tid == 0u) {
+    block_scan_shared[511u] = 0u;  // Clear last element for exclusive scan
+  }
+
+  workgroupBarrier();
+
+  d = 1u;
+  while (d < 512u) {
+    offset /= 2u;
+    workgroupBarrier();
+
+    if (tid < d) {
+      let ai = offset * (2u * tid + 1u) - 1u;
+      let bi = offset * (2u * tid + 2u) - 1u;
+      // Unconditional so the zero written at slot 511 propagates to every slot
+      let t = block_scan_shared[ai];
+      block_scan_shared[ai] = block_scan_shared[bi];
+      block_scan_shared[bi] = t + block_scan_shared[bi];
+    }
+
+    d *= 2u;
+  }
+
+  workgroupBarrier();
+
+  // Write scanned block sums back (two slots per thread, matching the load)
+  if (tid < n) {
+    block_sums[tid] = block_scan_shared[tid];
+  }
+
+  if (tid + SCAN_WORKGROUP_SIZE < n) {
+    block_sums[tid + SCAN_WORKGROUP_SIZE] = block_scan_shared[tid + SCAN_WORKGROUP_SIZE];
+  }
+}
+
+// Add block prefixes to each block's local scan results
+// Third step of two-level scan; one workgroup per 512-element scan block
+@compute @workgroup_size(SCAN_WORKGROUP_SIZE)
+fn add_block_prefixes(
+  @builtin(global_invocation_id) global_id: vec3<u32>,
+  @builtin(local_invocation_id) local_id: vec3<u32>,
+  @builtin(workgroup_id) workgroup_id: vec3<u32>
+) {
+  let n = scan_uniforms.data_size;
+  // Scanned sum of all previous blocks; workgroup i owns scan block i
+  let block_prefix = block_sums[workgroup_id.x];
+
+  // Each thread updates two elements, mirroring the layout used by blelloch_scan
+  let idx0 = workgroup_id.x * SCAN_WORKGROUP_SIZE * 2u + local_id.x;
+  let idx1 = idx0 + SCAN_WORKGROUP_SIZE;
+  if (idx0 < n) {
+    scan_output[idx0] = scan_output[idx0] + block_prefix;
+  }
+  if (idx1 < n) {
+    scan_output[idx1] = scan_output[idx1] + block_prefix;
+  }
+}
\ No newline at end of file
diff --git a/src/sorting/RadixSorter.ts b/src/sorting/RadixSorter.ts
index 3fae6df..a3c6403 100644
--- a/src/sorting/RadixSorter.ts
+++ b/src/sorting/RadixSorter.ts
@@ -4,36 +4,22 @@ import { BufferScope } from '../core/BufferScope';
 import { SortResult, SortOptions } from '../shared/types';
 import { ShaderCompilationError } from '../core/errors';
 import { Validator } from '../core/Validator';
+import { ScanModule } from './scan/ScanModule';
 import radixShaderCode from '../shaders/radix.wgsl?raw';
 import { WORKGROUP_SIZE, RADIX, BITS_PER_PASS, NUM_PASSES } from '../shared/constants';
 
-/**
- * IMPORTANT: These constants must match the values in src/shaders/radix.wgsl
- * @see src/shaders/radix.wgsl:14-15 - const WORKGROUP_SIZE: u32 = 256u; const RADIX: u32 = 16u;
- * @see src/shaders/radix.wgsl:16 - const SCAN_WORKGROUP_SIZE: u32 = 256u;
- */
-
-/** Size for Blelloch scan workgroups */
-const SCAN_WORKGROUP_SIZE = 256;
-/** Elements processed per scan workgroup (each thread handles 2 elements) */
-const ELEMENTS_PER_SCAN_BLOCK = SCAN_WORKGROUP_SIZE * 2;
-
 /**
  * GPU-accelerated Radix Sort implementation with GPU-based prefix sum
  */
 export class RadixSorter {
   private device: GPUDevice;
   private bufferManager: BufferManager;
+  private scanModule: ScanModule;
+
   private histogramPipeline: GPUComputePipeline | null = null;
   private scatterPipeline: GPUComputePipeline | null = null;
   private bindGroupLayout: GPUBindGroupLayout | null = null;
 
-  // Blelloch scan pipelines
-  private blellochScanPipeline: GPUComputePipeline | null = null;
-  private scanBlockSumsPipeline: GPUComputePipeline | null = null;
-  private addBlockPrefixesPipeline: GPUComputePipeline | null = null;
-  private scanBindGroupLayout: GPUBindGroupLayout | null = null;
-
   // Preallocation state
   private preallocatedBuffers: {
     input: GPUBuffer;
@@ -50,6 +36,7 @@ export class RadixSorter {
   constructor(context: GPUContext) {
     this.device = context.getDevice();
     this.bufferManager = new BufferManager(this.device);
+    this.scanModule = new ScanModule(context);
   }
 
   /**
@@ -68,9 +55,10 @@ export class RadixSorter {
     // Release any existing preallocation
     this.clearPreallocation();
 
+    const { elementsPerScanBlock } = ScanModule.getConstants();
     const numWorkgroups = Math.ceil(maxSize / WORKGROUP_SIZE);
     const histogramSize = RADIX * numWorkgroups;
-    const numScanBlocks = Math.ceil(histogramSize / ELEMENTS_PER_SCAN_BLOCK);
+    const numScanBlocks = Math.ceil(histogramSize / elementsPerScanBlock);
 
     this.preallocatedBuffers = {
       input: this.device.createBuffer({
@@ -171,155 +159,12 @@ export class RadixSorter {
       },
     });
 
-    // Create scan bind group layout for Blelloch scan
-    this.scanBindGroupLayout = this.device.createBindGroupLayout({
-      label: 'scan-bind-group-layout',
-      entries: [
-        { binding: 0, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'read-only-storage' } },
-        { binding: 1, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'storage' } },
-        { binding: 2, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'storage' } },
-        { binding: 3, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'uniform' } },
-      ],
-    });
-
-    const scanPipelineLayout = this.device.createPipelineLayout({
-      label: 'scan-pipeline-layout',
-      bindGroupLayouts: [this.scanBindGroupLayout],
-    });
-
-    // Create Blelloch scan pipelines
-    this.blellochScanPipeline = this.device.createComputePipeline({
-      label: 'blelloch-scan-pipeline',
-      layout: scanPipelineLayout,
-      compute: {
-        module: shaderModule,
-        entryPoint: 'blelloch_scan',
-      },
-    });
-
-    this.scanBlockSumsPipeline = this.device.createComputePipeline({
-      label: 'scan-block-sums-pipeline',
-      layout: scanPipelineLayout,
-      compute: {
-        module: shaderModule,
-        entryPoint: 'scan_block_sums',
-      },
-    });
-
-    this.addBlockPrefixesPipeline = this.device.createComputePipeline({
-      label: 'add-block-prefixes-pipeline',
-      layout: scanPipelineLayout,
-      compute: {
-        module: shaderModule,
-        entryPoint: 'add_block_prefixes',
-      },
-    });
+    // Initialize scan module
+    await this.scanModule.initialize();
 
     this.initialized = true;
   }
 
-  /**
-   * Compute exclusive prefix sum on GPU using Blelloch scan
-   * Uses a two-level scan for large histograms:
-   * 1. Local scan within each workgroup
-   * 2. Scan of block sums
-   * 3. Add block prefixes to local results
-   */
-  private computePrefixSumGPU(
-    inputBuffer: GPUBuffer,
-    outputBuffer: GPUBuffer,
-    blockSumsBuffer: GPUBuffer,
-    scanUniformBuffer: GPUBuffer,
-    dataSize: number
-  ): void {
-    const scanBindGroupLayout = this.scanBindGroupLayout;
-    const blellochPipeline = this.blellochScanPipeline;
-    const scanBlockSumsPipeline = this.scanBlockSumsPipeline;
-    const addBlockPrefixesPipeline = this.addBlockPrefixesPipeline;
-
-    if (
-      !scanBindGroupLayout ||
-      !blellochPipeline ||
-      !scanBlockSumsPipeline ||
-      !addBlockPrefixesPipeline
-    ) {
-      throw new ShaderCompilationError('Scan pipelines not initialized');
-    }
-
-    // Calculate number of scan blocks
-    const numScanBlocks = Math.ceil(dataSize / ELEMENTS_PER_SCAN_BLOCK);
-
-    // Update scan uniforms
-    const scanUniformData = new Uint32Array([dataSize, numScanBlocks, 0, 0]);
-    this.device.queue.writeBuffer(scanUniformBuffer, 0, scanUniformData);
-
-    // Step 1: Local Blelloch scan within each workgroup
-    {
-      const bindGroup = this.device.createBindGroup({
-        label: 'blelloch-scan-bind-group',
-        layout: scanBindGroupLayout,
-        entries: [
-          { binding: 0, resource: { buffer: inputBuffer } },
-          { binding: 1, resource: { buffer: outputBuffer } },
-          { binding: 2, resource: { buffer: blockSumsBuffer } },
-          { binding: 3, resource: { buffer: scanUniformBuffer } },
-        ],
-      });
-
-      const commandEncoder = this.device.createCommandEncoder();
-      const passEncoder = commandEncoder.beginComputePass();
-      passEncoder.setPipeline(blellochPipeline);
-      passEncoder.setBindGroup(0, bindGroup);
-      passEncoder.dispatchWorkgroups(numScanBlocks);
-      passEncoder.end();
-      this.device.queue.submit([commandEncoder.finish()]);
-    }
-
-    // Step 2: Scan the block sums (if more than one block)
-    if (numScanBlocks > 1) {
-      const bindGroup = this.device.createBindGroup({
-        label: 'scan-block-sums-bind-group',
-        layout: scanBindGroupLayout,
-        entries: [
-          { binding: 0, resource: { buffer: blockSumsBuffer } },
-          { binding: 1, resource: { buffer: blockSumsBuffer } },
-          { binding: 2, resource: { buffer: blockSumsBuffer } },
-          { binding: 3, resource: { buffer: scanUniformBuffer } },
-        ],
-      });
-
-      const commandEncoder = this.device.createCommandEncoder();
-      const passEncoder = commandEncoder.beginComputePass();
-      passEncoder.setPipeline(scanBlockSumsPipeline);
-      passEncoder.setBindGroup(0, bindGroup);
-      passEncoder.dispatchWorkgroups(1);
-      passEncoder.end();
-      this.device.queue.submit([commandEncoder.finish()]);
-
-      // Step 3: Add block prefixes to each block's local results
-      {
-        const bindGroup = this.device.createBindGroup({
-          label: 'add-block-prefixes-bind-group',
-          layout: scanBindGroupLayout,
-          entries: [
-            { binding: 0, resource: { buffer: inputBuffer } },
-            { binding: 1, resource: { buffer: outputBuffer } },
-            { binding: 2, resource: { buffer: blockSumsBuffer } },
-            { binding: 3, resource: { buffer: scanUniformBuffer } },
-          ],
-        });
-
-        const commandEncoder = this.device.createCommandEncoder();
-        const passEncoder = commandEncoder.beginComputePass();
-        passEncoder.setPipeline(addBlockPrefixesPipeline);
-        passEncoder.setBindGroup(0, bindGroup);
-        passEncoder.dispatchWorkgroups(numScanBlocks);
-        passEncoder.end();
-        this.device.queue.submit([commandEncoder.finish()]);
-      }
-    }
-  }
-
   /**
    * Sort an array using GPU radix sort
    * @param data - The array to sort
@@ -340,9 +185,10 @@ export class RadixSorter {
       };
     }
 
+    const { elementsPerScanBlock } = ScanModule.getConstants();
     const numWorkgroups = Math.ceil(size / WORKGROUP_SIZE);
     const histogramSize = RADIX * numWorkgroups;
-    const numScanBlocks = Math.ceil(histogramSize / ELEMENTS_PER_SCAN_BLOCK);
+    const numScanBlocks = Math.ceil(histogramSize / elementsPerScanBlock);
 
     // Check if preallocated buffers can be used
     const preallocatedBuffers = this.preallocatedBuffers;
@@ -479,7 +325,7 @@ export class RadixSorter {
         }
 
         // Step 2: Compute prefix sum on GPU using Blelloch scan
-        this.computePrefixSumGPU(
+        this.scanModule.computePrefixSumGPU(
           histogramBuffer,
           prefixSumBuffer,
           blockSumsBuffer,
@@ -550,13 +396,10 @@ export class RadixSorter {
   destroy(): void {
     this.clearPreallocation();
     this.bufferManager.releaseAll();
+    this.scanModule.destroy();
     this.histogramPipeline = null;
     this.scatterPipeline = null;
     this.bindGroupLayout = null;
-    this.blellochScanPipeline = null;
-    this.scanBlockSumsPipeline = null;
-    this.addBlockPrefixesPipeline = null;
-    this.scanBindGroupLayout = null;
     this.initialized = false;
   }
 }
diff --git a/src/sorting/scan/ScanModule.ts b/src/sorting/scan/ScanModule.ts
new file mode 100644
index 0000000..e5eb800
--- /dev/null
+++ b/src/sorting/scan/ScanModule.ts
@@ -0,0 +1,324 @@
+/**
+ * ScanModule - GPU-based exclusive prefix sum (Blelloch scan)
+ *
+ * This module provides a dedicated scan interface for computing
+ * exclusive prefix sums on the GPU using the Blelloch algorithm.
+ */
+
+import { GPUContext } from '../../core/GPUContext';
+import { BufferManager } from '../../core/BufferManager';
+import { BufferScope } from '../../core/BufferScope';
+import { ShaderCompilationError } from '../../core/errors';
+import scanShaderCode from '../../shaders/scan.wgsl?raw';
+
+/** Size for Blelloch scan workgroups */
+const SCAN_WORKGROUP_SIZE = 256;
+/** Elements processed per scan workgroup (each thread handles 2 elements) */
+const ELEMENTS_PER_SCAN_BLOCK = SCAN_WORKGROUP_SIZE * 2;
+
+/**
+ * GPU-based exclusive prefix sum module using Blelloch scan
+ */
+export class ScanModule {
+  private device: GPUDevice;
+  private bufferManager: BufferManager;
+
+  // Scan pipelines
+  private blellochScanPipeline: GPUComputePipeline | null = null;
+  private scanBlockSumsPipeline: GPUComputePipeline | null = null;
+  private addBlockPrefixesPipeline: GPUComputePipeline | null = null;
+  private scanBindGroupLayout: GPUBindGroupLayout | null = null;
+
+  private initialized = false;
+
+  /**
+   * @param context - GPU context whose device is used for all scan resources
+   */
+  constructor(context: GPUContext) {
+    this.device = context.getDevice();
+    this.bufferManager = new BufferManager(this.device);
+  }
+
+  /**
+   * Initialize scan pipelines. Calling it again after success is a no-op.
+   *
+   * @throws ShaderCompilationError if the scan shader reports compilation errors
+   */
+  async initialize(): Promise<void> {
+    if (this.initialized) return;
+
+    const shaderModule = this.device.createShaderModule({
+      label: 'scan-shader',
+      code: scanShaderCode,
+    });
+
+    const compilationInfo = await shaderModule.getCompilationInfo();
+    const errors = compilationInfo.messages.filter((m) => m.type === 'error');
+    if (errors.length > 0) {
+      throw new ShaderCompilationError(
+        `Scan shader compilation failed: ${errors.map((e) => e.message).join(', ')}`
+      );
+    }
+
+    // Create scan bind group layout
+    this.scanBindGroupLayout = this.device.createBindGroupLayout({
+      label: 'scan-bind-group-layout',
+      entries: [
+        { binding: 0, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'read-only-storage' } },
+        { binding: 1, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'storage' } },
+        { binding: 2, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'storage' } },
+        { binding: 3, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'uniform' } },
+      ],
+    });
+
+    const scanPipelineLayout = this.device.createPipelineLayout({
+      label: 'scan-pipeline-layout',
+      bindGroupLayouts: [this.scanBindGroupLayout],
+    });
+
+    // Create Blelloch scan pipelines
+    this.blellochScanPipeline = this.device.createComputePipeline({
+      label: 'blelloch-scan-pipeline',
+      layout: scanPipelineLayout,
+      compute: {
+        module: shaderModule,
+        entryPoint: 'blelloch_scan',
+      },
+    });
+
+    this.scanBlockSumsPipeline = this.device.createComputePipeline({
+      label: 'scan-block-sums-pipeline',
+      layout: scanPipelineLayout,
+      compute: {
+        module: shaderModule,
+        entryPoint: 'scan_block_sums',
+      },
+    });
+
+    this.addBlockPrefixesPipeline = this.device.createComputePipeline({
+      label: 'add-block-prefixes-pipeline',
+      layout: scanPipelineLayout,
+      compute: {
+        module: shaderModule,
+        entryPoint: 'add_block_prefixes',
+      },
+    });
+
+    this.initialized = true;
+  }
+
+  /**
+   * Compute exclusive prefix sum on GPU using Blelloch scan
+   *
+   * Uses a two-level scan for large arrays:
+   * 1. Local scan within each workgroup
+   * 2. Scan of block sums
+   * 3. Add block prefixes to local results
+   *
+   * @param input - Input array to compute prefix sum for
+   * @returns Exclusive prefix sum of input
+   */
+  async computeExclusivePrefixSum(input: Uint32Array): Promise<Uint32Array> {
+    if (!this.initialized) {
+      throw new ShaderCompilationError('ScanModule not initialized. Call initialize() first.');
+    }
+
+    const dataSize = input.length;
+
+    // Handle edge cases
+    if (dataSize === 0) {
+      return new Uint32Array(0);
+    }
+
+    if (dataSize === 1) {
+      return new Uint32Array([0]);
+    }
+
+    const numScanBlocks = Math.ceil(dataSize / ELEMENTS_PER_SCAN_BLOCK);
+    const bufferScope = new BufferScope();
+
+    try {
+      // Create buffers
+      const inputBuffer = bufferScope.track(
+        this.bufferManager.createStorageBuffer(input, 'scan-input'),
+        (buffer) => this.bufferManager.releaseBuffer(buffer)
+      );
+
+      const outputBuffer = bufferScope.track(
+        this.device.createBuffer({
+          label: 'scan-output',
+          size: BufferManager.alignSize(dataSize * 4, 4),
+          usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST,
+        })
+      );
+
+      const blockSumsBuffer = bufferScope.track(
+        this.device.createBuffer({
+          label: 'scan-block-sums',
+          size: BufferManager.alignSize(numScanBlocks * 4, 4),
+          usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST,
+        })
+      );
+
+      const uniformBuffer = bufferScope.track(
+        this.bufferManager.createUniformBuffer(16, 'scan-uniforms'),
+        (buffer) => this.bufferManager.releaseBuffer(buffer)
+      );
+
+      // Perform the scan
+      this.computePrefixSumGPU(inputBuffer, outputBuffer, blockSumsBuffer, uniformBuffer, dataSize);
+
+      // Wait for GPU to finish
+      await this.device.queue.onSubmittedWorkDone();
+
+      // Read results
+      const result = await this.bufferManager.readBuffer(outputBuffer, dataSize * 4);
+
+      return result;
+    } finally {
+      bufferScope.releaseAll();
+    }
+  }
+
+  /**
+   * Encode and submit the scan passes for buffers the caller already owns.
+   *
+   * This is exposed for use by RadixSorter which already has buffers allocated.
+   * Pass 1 scans each 512-element block; when the data spans several blocks,
+   * pass 2 scans the per-block sums and pass 3 adds each block's prefix to its
+   * local results. The work is submitted to the queue but not awaited.
+   *
+   * @param inputBuffer - Storage buffer holding the u32 values to scan
+   * @param outputBuffer - Storage buffer that receives the exclusive prefix sums
+   * @param blockSumsBuffer - Storage buffer with room for one u32 per scan block
+   * @param uniformBuffer - Uniform buffer (16 bytes are written at offset 0)
+   * @param dataSize - Number of u32 elements to scan
+   * @throws ShaderCompilationError if the pipelines have not been initialized
+   * @throws RangeError if dataSize needs more scan blocks than pass 2 can handle
+   */
+  computePrefixSumGPU(
+    inputBuffer: GPUBuffer,
+    outputBuffer: GPUBuffer,
+    blockSumsBuffer: GPUBuffer,
+    uniformBuffer: GPUBuffer,
+    dataSize: number
+  ): void {
+    const scanBindGroupLayout = this.scanBindGroupLayout;
+    const blellochPipeline = this.blellochScanPipeline;
+    const scanBlockSumsPipeline = this.scanBlockSumsPipeline;
+    const addBlockPrefixesPipeline = this.addBlockPrefixesPipeline;
+
+    if (
+      !scanBindGroupLayout ||
+      !blellochPipeline ||
+      !scanBlockSumsPipeline ||
+      !addBlockPrefixesPipeline
+    ) {
+      throw new ShaderCompilationError('Scan pipelines not initialized');
+    }
+
+    // Calculate number of scan blocks
+    const numScanBlocks = Math.ceil(dataSize / ELEMENTS_PER_SCAN_BLOCK);
+
+    // The block-sum pass runs as a single workgroup that handles two values
+    // per thread, so it can scan at most ELEMENTS_PER_SCAN_BLOCK block sums.
+    // Larger inputs would produce silently wrong prefixes; reject them.
+    const maxElements = ELEMENTS_PER_SCAN_BLOCK * ELEMENTS_PER_SCAN_BLOCK;
+    if (dataSize > maxElements) {
+      throw new RangeError(
+        `Scan supports at most ${maxElements} elements per call, got ${dataSize}`
+      );
+    }
+
+    // Update scan uniforms
+    const scanUniformData = new Uint32Array([dataSize, numScanBlocks, 0, 0]);
+    this.device.queue.writeBuffer(uniformBuffer, 0, scanUniformData);
+
+    // One bind group serves all three passes: every entry point reads only the
+    // bindings it needs. Binding the block-sums buffer to several slots at once
+    // (as a read-only and a writable storage binding) is rejected by WebGPU
+    // validation, so each buffer is bound to exactly one slot.
+    const bindGroup = this.device.createBindGroup({
+      label: 'scan-bind-group',
+      layout: scanBindGroupLayout,
+      entries: [
+        { binding: 0, resource: { buffer: inputBuffer } },
+        { binding: 1, resource: { buffer: outputBuffer } },
+        { binding: 2, resource: { buffer: blockSumsBuffer } },
+        { binding: 3, resource: { buffer: uniformBuffer } },
+      ],
+    });
+
+    // Pass 1 always runs; passes 2 and 3 are only needed when the data spans
+    // more than one block.
+    const passes: Array<{ pipeline: GPUComputePipeline; workgroups: number }> = [
+      { pipeline: blellochPipeline, workgroups: numScanBlocks },
+    ];
+    if (numScanBlocks > 1) {
+      passes.push(
+        { pipeline: scanBlockSumsPipeline, workgroups: 1 },
+        { pipeline: addBlockPrefixesPipeline, workgroups: numScanBlocks }
+      );
+    }
+
+    // Separate compute passes keep the steps ordered: each pass sees the
+    // storage writes of the passes encoded before it.
+    const commandEncoder = this.device.createCommandEncoder();
+    for (const { pipeline, workgroups } of passes) {
+      const passEncoder = commandEncoder.beginComputePass();
+      passEncoder.setPipeline(pipeline);
+      passEncoder.setBindGroup(0, bindGroup);
+      passEncoder.dispatchWorkgroups(workgroups);
+      passEncoder.end();
+    }
+    this.device.queue.submit([commandEncoder.finish()]);
+  }
+
+  /**
+   * Get the scan bind group layout for external use
+   */
+  getBindGroupLayout(): GPUBindGroupLayout | null {
+    return this.scanBindGroupLayout;
+  }
+
+  /**
+   * Get the scan pipelines for external use
+   */
+  getPipelines(): {
+    blellochScan: GPUComputePipeline | null;
+    scanBlockSums: GPUComputePipeline | null;
+    addBlockPrefixes: GPUComputePipeline | null;
+  } {
+    return {
+      blellochScan: this.blellochScanPipeline,
+      scanBlockSums: this.scanBlockSumsPipeline,
+      addBlockPrefixes: this.addBlockPrefixesPipeline,
+    };
+  }
+
+  /**
+   * Get constants for external use
+   */
+  static getConstants(): {
+    scanWorkgroupSize: number;
+    elementsPerScanBlock: number;
+  } {
+    return {
+      scanWorkgroupSize: SCAN_WORKGROUP_SIZE,
+      elementsPerScanBlock: ELEMENTS_PER_SCAN_BLOCK,
+    };
+  }
+
+  /**
+   * Release all resources and return to the uninitialized state.
+   * initialize() must be called again before the module is reused.
+   */
+  destroy(): void {
+    this.bufferManager.releaseAll();
+    this.blellochScanPipeline = null;
+    this.scanBlockSumsPipeline = null;
+    this.addBlockPrefixesPipeline = null;
+    this.scanBindGroupLayout = null;
+    this.initialized = false;
+  }
+}
diff --git a/src/sorting/scan/index.ts b/src/sorting/scan/index.ts
new file mode 100644
index 0000000..31825b9
--- /dev/null
+++ b/src/sorting/scan/index.ts
@@ -0,0 +1,5 @@
+/**
+ * Scan module exports
+ */
+
+export { ScanModule } from './ScanModule';
diff --git a/test/sorting/ScanModule.test.ts b/test/sorting/ScanModule.test.ts
new file mode 100644
index 0000000..1908bc7
--- /dev/null
+++ b/test/sorting/ScanModule.test.ts
@@ -0,0 +1,199 @@
+/**
+ * Tests for ScanModule interface
+ * The ScanModule provides GPU-based exclusive prefix sum (Blelloch scan)
+ */
+
+import { describe, it, expect, beforeEach, afterEach } from 'vitest';
+import { GPUContext } from '../../src/core/GPUContext';
+import { ScanModule } from '../../src/sorting/scan/ScanModule';
+
+describe('ScanModule', () => {
+  let context: GPUContext;
+  let scanModule: ScanModule;
+
+  beforeEach(async () => {
+    // No WebGPU: leave context/scanModule unset; every test below then returns early without asserting
+    if (!GPUContext.isSupported()) {
+      return;
+    }
+
+    context = new GPUContext();
+    await context.initialize();
+    scanModule = new ScanModule(context);
+  });
+
+  afterEach(() => {
+    if (scanModule) {
+      scanModule.destroy();
+    }
+    if (context) {
+      context.destroy();
+    }
+  });
+
+  describe('initialization', () => {
+    it('should initialize successfully', async () => {
+      if (!GPUContext.isSupported()) {
+        return; // NOTE(review): early return reports a pass, not a skip, when WebGPU is missing (e.g. Node.js)
+      }
+
+      await expect(scanModule.initialize()).resolves.not.toThrow();
+    });
+
+    it('should be idempotent - calling initialize twice should not throw', async () => {
+      if (!GPUContext.isSupported()) {
+        return;
+      }
+
+      await scanModule.initialize();
+      await expect(scanModule.initialize()).resolves.not.toThrow();
+    });
+  });
+
+  describe('computeExclusivePrefixSum', () => {
+    beforeEach(async () => {
+      if (!GPUContext.isSupported()) {
+        return;
+      }
+      await scanModule.initialize();
+    });
+
+    it('should handle single element input', async () => {
+      if (!GPUContext.isSupported()) {
+        return;
+      }
+
+      const input = new Uint32Array([5]);
+      const result = await scanModule.computeExclusivePrefixSum(input);
+
+      expect(result.length).toBe(1);
+      expect(result[0]).toBe(0); // Exclusive prefix sum of [5] is [0]
+    });
+
+    it('should compute correct prefix sum for small arrays', async () => {
+      if (!GPUContext.isSupported()) {
+        return;
+      }
+
+      const input = new Uint32Array([3, 1, 7, 0]);
+      const result = await scanModule.computeExclusivePrefixSum(input);
+
+      // Exclusive prefix sum: [0, 3, 4, 11]
+      expect(result.length).toBe(4);
+      expect(result[0]).toBe(0);
+      expect(result[1]).toBe(3);
+      expect(result[2]).toBe(4);
+      expect(result[3]).toBe(11);
+    });
+
+    it('should handle array of zeros', async () => {
+      if (!GPUContext.isSupported()) {
+        return;
+      }
+
+      const input = new Uint32Array([0, 0, 0, 0]);
+      const result = await scanModule.computeExclusivePrefixSum(input);
+
+      expect(result).toEqual(new Uint32Array([0, 0, 0, 0]));
+    });
+
+    it('should handle larger arrays', async () => {
+      if (!GPUContext.isSupported()) {
+        return;
+      }
+
+      // Input is the sequence 1..100
+      const input = new Uint32Array(100);
+      for (let i = 0; i < 100; i++) {
+        input[i] = i + 1;
+      }
+
+      const result = await scanModule.computeExclusivePrefixSum(input);
+
+      // Each result[i] must equal the running sum of input[0..i-1] (0 for i = 0)
+      let expectedSum = 0;
+      for (let i = 0; i < 100; i++) {
+        expect(result[i]).toBe(expectedSum);
+        expectedSum += input[i];
+      }
+    });
+
+    it('should handle non-power-of-2 sized arrays', async () => {
+      if (!GPUContext.isSupported()) {
+        return;
+      }
+
+      const input = new Uint32Array([1, 2, 3, 4, 5]); // 5 elements, not power of 2
+      const result = await scanModule.computeExclusivePrefixSum(input);
+
+      // Expected: [0, 1, 3, 6, 10]
+      expect(result).toEqual(new Uint32Array([0, 1, 3, 6, 10]));
+    });
+
+    it('should throw error if not initialized', async () => {
+      if (!GPUContext.isSupported()) {
+        return;
+      }
+
+      // Fresh module on the shared context; initialize() is deliberately not called
+      const uninitializedModule = new ScanModule(context);
+
+      const input = new Uint32Array([1, 2, 3]);
+
+      await expect(uninitializedModule.computeExclusivePrefixSum(input)).rejects.toThrow();
+    });
+  });
+
+  describe('destroy', () => {
+    it('should release resources without throwing', async () => {
+      if (!GPUContext.isSupported()) {
+        return;
+      }
+
+      await scanModule.initialize();
+      expect(() => scanModule.destroy()).not.toThrow();
+    });
+
+    it('should be idempotent - calling destroy twice should not throw', async () => {
+      if (!GPUContext.isSupported()) {
+        return;
+      }
+
+      await scanModule.initialize();
+      scanModule.destroy();
+      expect(() => scanModule.destroy()).not.toThrow();
+    });
+  });
+
+  describe('integration with RadixSorter', () => {
+    it('should produce correct histogram prefix sums for radix sort', async () => {
+      if (!GPUContext.isSupported()) {
+        return;
+      }
+
+      await scanModule.initialize();
+
+      // Simulate a histogram that might be produced by RadixSorter
+      // Bucket-major layout: histogram[bucket * 4 + wg], one count per (bucket, workgroup) pair
+      const histogram = new Uint32Array(16 * 4); // RADIX=16, 4 workgroups
+      for (let bucket = 0; bucket < 16; bucket++) {
+        for (let wg = 0; wg < 4; wg++) {
+          // Arbitrary deterministic count for this pair: bucket + wg
+          histogram[bucket * 4 + wg] = bucket + wg;
+        }
+      }
+
+      const result = await scanModule.computeExclusivePrefixSum(histogram);
+
+      // CPU reference: sequential exclusive prefix sum over the same histogram
+      const expected = new Uint32Array(histogram.length);
+      let sum = 0;
+      for (let i = 0; i < histogram.length; i++) {
+        expected[i] = sum;
+        sum += histogram[i];
+      }
+
+      expect(result).toEqual(expected);
+    });
+  });
+});

From 66f8ae4cecb79f85e21426af544ffe817143e523 Mon Sep 17 00:00:00 2001
From: shijiashuai <jiashuai.shi@qq.com>
Date: Fri, 22 May 2026 16:41:51 +0800
Subject: [PATCH 2/2] chore: archive skipped changes and fix browser test setup

- Archive demo-benchmark-orchestration (over-engineering for demo project)
- Archive doc-implementation-sync (core work completed)
- Fix playwright.config.ts: add WebGPU flags and testMatch pattern
- Fix ScanModule: batch GPU commands in single command encoder
- Fix fixtures: navigate to Vite dev server before tests
- Update browser-e2e-testing tasks with blocker note

Browser tests fail on SwiftShader (headless Chromium) due to incomplete
WebGPU compute shader support. Node.js tests pass (103/103).
---
 .../.openspec.yaml                            |  2 +
 .../design.md                                 | 39 +++++++++++++++++++
 .../proposal.md                               | 27 +++++++++++++
 .../specs/demo-orchestration/spec.md          | 24 ++++++++++++
 .../specs/sorting/spec.md                     | 15 +++++++
 .../tasks.md                                  | 17 ++++++++
 .../proposal.md                               |  0
 .../2026-05-doc-implementation-sync}/tasks.md |  0
 openspec/changes/browser-e2e-testing/tasks.md |  3 +-
 playwright.config.ts                          | 15 ++++++-
 src/sorting/scan/ScanModule.ts                | 12 +++---
 test/browser/fixtures.ts                      |  5 +++
 12 files changed, 150 insertions(+), 9 deletions(-)
 create mode 100644 openspec/archive/2026-05-demo-benchmark-orchestration/.openspec.yaml
 create mode 100644 openspec/archive/2026-05-demo-benchmark-orchestration/design.md
 create mode 100644 openspec/archive/2026-05-demo-benchmark-orchestration/proposal.md
 create mode 100644 openspec/archive/2026-05-demo-benchmark-orchestration/specs/demo-orchestration/spec.md
 create mode 100644 openspec/archive/2026-05-demo-benchmark-orchestration/specs/sorting/spec.md
 create mode 100644 openspec/archive/2026-05-demo-benchmark-orchestration/tasks.md
 rename openspec/{changes/doc-implementation-sync => archive/2026-05-doc-implementation-sync}/proposal.md (100%)
 rename openspec/{changes/doc-implementation-sync => archive/2026-05-doc-implementation-sync}/tasks.md (100%)

diff --git a/openspec/archive/2026-05-demo-benchmark-orchestration/.openspec.yaml b/openspec/archive/2026-05-demo-benchmark-orchestration/.openspec.yaml
new file mode 100644
index 0000000..4a1c677
--- /dev/null
+++ b/openspec/archive/2026-05-demo-benchmark-orchestration/.openspec.yaml
@@ -0,0 +1,2 @@
+schema: spec-driven
+created: 2026-05-22
diff --git a/openspec/archive/2026-05-demo-benchmark-orchestration/design.md b/openspec/archive/2026-05-demo-benchmark-orchestration/design.md
new file mode 100644
index 0000000..e157aeb
--- /dev/null
+++ b/openspec/archive/2026-05-demo-benchmark-orchestration/design.md
@@ -0,0 +1,39 @@
+## Context
+
+The current demo path spans `src/main.ts` and `src/benchmark/Benchmark.ts`, but the seam is shallow. `Benchmark` owns both pure math and stateful execution, while `main.ts` owns DOM lookup, status updates, progress, validation, and direct sorter construction.
+
+This change deepens the orchestration path by making execution and presentation explicit modules with narrow interfaces.
+
+## Goals / Non-Goals
+
+**Goals:**
+
+- Make benchmark execution unit-testable without DOM or real WebGPU.
+- Make demo UI flow unit-testable without `document`.
+- Keep `main.ts` as a thin bootstrap file.
+
+**Non-Goals:**
+
+- Redesign `GPUContext` again; that lands in `injectable-gpu-runtime`.
+- Change benchmark outputs or visible UI behavior.
+- Extract radix prefix-sum in this change.
+
+## Decisions
+
+### 1. Split pure benchmark helpers from execution
+
+Keep `Benchmark` for `calculateSpeedup`, `calculateAverage`, and `formatResults`. Move `runSingle` and `runAll` into `BenchmarkRunner`.
+
+### 2. Add a `DemoController` seam
+
+Move run-button and run-all orchestration, progress updates, validation, and error handling into a controller with injected dependencies.
+
+### 3. Isolate DOM access behind `DomView`
+
+Create a concrete adapter for DOM reads/writes so tests can drive orchestration with fake views.
+
+## Risks / Trade-offs
+
+- **More files** -> Accept smaller modules to gain locality and test seams.
+- **Public benchmark API churn** -> Preserve helper exports and document runner extraction in the change.
+- **Controller/view split may feel verbose** -> The leverage is deterministic tests and a thinner bootstrap.
diff --git a/openspec/archive/2026-05-demo-benchmark-orchestration/proposal.md b/openspec/archive/2026-05-demo-benchmark-orchestration/proposal.md
new file mode 100644
index 0000000..4cd85de
--- /dev/null
+++ b/openspec/archive/2026-05-demo-benchmark-orchestration/proposal.md
@@ -0,0 +1,27 @@
+## Why
+
+`main.ts` and `Benchmark` still braid DOM wiring, benchmark sequencing, sorter lifecycle, validation, and presentation into shallow modules. That makes the demo flow hard to unit-test and keeps UI/runtime knowledge spread across multiple call sites.
+
+## What Changes
+
+- Extract benchmark execution into a dedicated `BenchmarkRunner` seam with injected sorter factory, random data provider, and clock.
+- Extract demo orchestration into a `DemoController` seam with a `DomView` adapter for DOM reads/writes.
+- Shrink `main.ts` to bootstrap only and keep `Benchmark` focused on pure formatting/math helpers.
+- Add unit tests for orchestration, progress/status flow, validation flow, and error handling without requiring real WebGPU.
+
+## Capabilities
+
+### New Capabilities
+
+- `demo-orchestration`: Deep orchestration seam for demo and benchmark flows, separating controller logic from DOM adapters and benchmark execution.
+
+### Modified Capabilities
+
+- `sorting`: Benchmark execution moves behind an injected runner while preserving existing benchmark behavior.
+
+## Impact
+
+- **Affected code:** `src/main.ts`, `src/benchmark/Benchmark.ts`, new `src/benchmark/BenchmarkRunner.ts`, new `src/demo/` files, browser/unit tests
+- **Affected APIs:** `Benchmark` likely loses execution methods; orchestration moves into new modules.
+- **Dependencies:** No new packages.
+- **Systems:** Demo UI, benchmark execution, validation flow, and public benchmark surface.
diff --git a/openspec/archive/2026-05-demo-benchmark-orchestration/specs/demo-orchestration/spec.md b/openspec/archive/2026-05-demo-benchmark-orchestration/specs/demo-orchestration/spec.md
new file mode 100644
index 0000000..dbbd627
--- /dev/null
+++ b/openspec/archive/2026-05-demo-benchmark-orchestration/specs/demo-orchestration/spec.md
@@ -0,0 +1,24 @@
+## ADDED Requirements
+
+### Requirement: Demo orchestration uses injected seams
+
+The demo benchmark flow SHALL execute through controller and view seams so orchestration can be tested without direct DOM or WebGPU dependencies.
+
+#### Scenario: Controller drives benchmark execution
+
+- **WHEN** a user starts a single benchmark or the full suite
+- **THEN** the controller SHALL coordinate progress, status, validation, and result reporting through injected runner and view interfaces
+
+#### Scenario: DOM access stays in the view adapter
+
+- **WHEN** the demo needs to read controls or update status, progress, and results
+- **THEN** those DOM operations SHALL be isolated behind a concrete view adapter
+
+### Requirement: Benchmark execution is a separate runner
+
+Benchmark execution SHALL live behind a runner seam distinct from pure formatting and math helpers.
+
+#### Scenario: Runner executes benchmark cases
+
+- **WHEN** benchmark execution is requested
+- **THEN** the runner SHALL use injected sorter factories, random data providers, and clocks to produce benchmark results
diff --git a/openspec/archive/2026-05-demo-benchmark-orchestration/specs/sorting/spec.md b/openspec/archive/2026-05-demo-benchmark-orchestration/specs/sorting/spec.md
new file mode 100644
index 0000000..53e6feb
--- /dev/null
+++ b/openspec/archive/2026-05-demo-benchmark-orchestration/specs/sorting/spec.md
@@ -0,0 +1,15 @@
+## MODIFIED Requirements
+
+### Requirement: Performance Benchmarking
+
+The benchmark system SHALL preserve its existing timing and reporting behavior while executing through a dedicated runner seam rather than the formatting helper module.
+
+#### Scenario: Runner preserves benchmark behavior
+
+- **WHEN** a caller runs a single benchmark or the default suite
+- **THEN** the system SHALL still measure native and GPU paths, average timings, and report speedups as before
+
+#### Scenario: Formatting stays available independently
+
+- **WHEN** code needs to format benchmark results
+- **THEN** it SHALL be able to do so without instantiating the execution runner
diff --git a/openspec/archive/2026-05-demo-benchmark-orchestration/tasks.md b/openspec/archive/2026-05-demo-benchmark-orchestration/tasks.md
new file mode 100644
index 0000000..62e1d73
--- /dev/null
+++ b/openspec/archive/2026-05-demo-benchmark-orchestration/tasks.md
@@ -0,0 +1,17 @@
+## 1. Test-first orchestration seams
+
+- [ ] 1.1 Add failing unit tests for `BenchmarkRunner` execution with injected sorter factory and random provider
+- [ ] 1.2 Add failing unit tests for `DemoController` progress, validation, and error flow with a fake view
+
+## 2. Implementation
+
+- [ ] 2.1 Extract pure helpers into a slim `Benchmark` module
+- [ ] 2.2 Add `BenchmarkRunner` and move benchmark execution there
+- [ ] 2.3 Add `DemoController` and `DomView`, then shrink `main.ts` to bootstrap only
+
+## 3. Validation
+
+- [ ] 3.1 Run `npm run typecheck`
+- [ ] 3.2 Run `npm run lint`
+- [ ] 3.3 Run `npm run test`
+- [ ] 3.4 Run `npm run build`
diff --git a/openspec/changes/doc-implementation-sync/proposal.md b/openspec/archive/2026-05-doc-implementation-sync/proposal.md
similarity index 100%
rename from openspec/changes/doc-implementation-sync/proposal.md
rename to openspec/archive/2026-05-doc-implementation-sync/proposal.md
diff --git a/openspec/changes/doc-implementation-sync/tasks.md b/openspec/archive/2026-05-doc-implementation-sync/tasks.md
similarity index 100%
rename from openspec/changes/doc-implementation-sync/tasks.md
rename to openspec/archive/2026-05-doc-implementation-sync/tasks.md
diff --git a/openspec/changes/browser-e2e-testing/tasks.md b/openspec/changes/browser-e2e-testing/tasks.md
index 943b01d..213a838 100644
--- a/openspec/changes/browser-e2e-testing/tasks.md
+++ b/openspec/changes/browser-e2e-testing/tasks.md
@@ -27,7 +27,7 @@
 - [x] Create `test/browser/` directory
 - [x] Create `playwright.config.ts`
 - [x] Add test scripts to `package.json`
-- [ ] Install Playwright browsers: `npx playwright install chromium`
+- [x] Install Playwright browsers: `npx playwright install chromium`
 
 ## Phase 2: Test Infrastructure
 
@@ -105,6 +105,7 @@ None.
 - Browser tests require a display (use xvfb in CI if needed)
 - WebGPU may not be available in all CI environments
 - Consider using `test.skip()` for GPU tests when WebGPU unavailable
+- **Blocker (2026-05-22)**: System lacks `libasound.so.2` and other libraries required by Playwright's Chromium. Installing requires sudo: `npx playwright install-deps chromium`
 
 ---
 
diff --git a/playwright.config.ts b/playwright.config.ts
index 073f352..d3c8408 100644
--- a/playwright.config.ts
+++ b/playwright.config.ts
@@ -5,6 +5,7 @@ import { defineConfig, devices } from '@playwright/test';
  */
 export default defineConfig({
   testDir: './test/browser',
+  testMatch: '**/*.e2e.ts',
   fullyParallel: true,
   forbidOnly: !!process.env.CI,
   retries: process.env.CI ? 2 : 0,
@@ -18,11 +19,21 @@ export default defineConfig({
   projects: [
     {
       name: 'chromium',
-      use: { ...devices['Desktop Chrome'] },
+      use: {
+        ...devices['Desktop Chrome'],
+        launchOptions: {
+          args: [
+            '--enable-unsafe-webgpu',
+            '--enable-features=Vulkan',
+            '--use-vulkan=swiftshader',
+            '--disable-gpu-sandbox',
+          ],
+        },
+      },
     },
   ],
   webServer: {
-    command: 'npm run dev',
+    command: 'npx vite --port 5173',
     url: 'http://localhost:5173',
     reuseExistingServer: !process.env.CI,
     timeout: 120000,
diff --git a/src/sorting/scan/ScanModule.ts b/src/sorting/scan/ScanModule.ts
index e5eb800..a4e36ee 100644
--- a/src/sorting/scan/ScanModule.ts
+++ b/src/sorting/scan/ScanModule.ts
@@ -208,6 +208,9 @@ export class ScanModule {
     const scanUniformData = new Uint32Array([dataSize, numScanBlocks, 0, 0]);
     this.device.queue.writeBuffer(uniformBuffer, 0, scanUniformData);
 
+    // Record every pass into one command encoder so the whole scan is submitted as a single batch
+    const commandEncoder = this.device.createCommandEncoder();
+
     // Step 1: Local Blelloch scan within each workgroup
     {
       const bindGroup = this.device.createBindGroup({
@@ -221,13 +224,11 @@ export class ScanModule {
         ],
       });
 
-      const commandEncoder = this.device.createCommandEncoder();
       const passEncoder = commandEncoder.beginComputePass();
       passEncoder.setPipeline(blellochPipeline);
       passEncoder.setBindGroup(0, bindGroup);
       passEncoder.dispatchWorkgroups(numScanBlocks);
       passEncoder.end();
-      this.device.queue.submit([commandEncoder.finish()]);
     }
 
     // Step 2: Scan the block sums (if more than one block)
@@ -243,13 +244,11 @@ export class ScanModule {
         ],
       });
 
-      const commandEncoder = this.device.createCommandEncoder();
       const passEncoder = commandEncoder.beginComputePass();
       passEncoder.setPipeline(scanBlockSumsPipeline);
       passEncoder.setBindGroup(0, bindGroup);
       passEncoder.dispatchWorkgroups(1);
       passEncoder.end();
-      this.device.queue.submit([commandEncoder.finish()]);
 
       // Step 3: Add block prefixes to each block's local results
       {
@@ -264,15 +263,16 @@ export class ScanModule {
           ],
         });
 
-        const commandEncoder = this.device.createCommandEncoder();
         const passEncoder = commandEncoder.beginComputePass();
         passEncoder.setPipeline(addBlockPrefixesPipeline);
         passEncoder.setBindGroup(0, bindGroup);
         passEncoder.dispatchWorkgroups(numScanBlocks);
         passEncoder.end();
-        this.device.queue.submit([commandEncoder.finish()]);
       }
     }
+
+    // Submit all commands together
+    this.device.queue.submit([commandEncoder.finish()]);
   }
 
   /**
diff --git a/test/browser/fixtures.ts b/test/browser/fixtures.ts
index 6453aa0..165a9ab 100644
--- a/test/browser/fixtures.ts
+++ b/test/browser/fixtures.ts
@@ -9,6 +9,11 @@ import { test as base, expect } from '@playwright/test';
 export const test = base.extend<{
   webgpuSupported: boolean;
 }>({
+  // Navigate to the Vite dev server before each test
+  page: async ({ page, baseURL }, use) => {
+    await page.goto(baseURL || 'http://localhost:5173');
+    await use(page);
+  },
   webgpuSupported: async ({ page }, use) => {
     const isSupported = await page.evaluate(() => {
       return 'gpu' in navigator;