From 3b81bbe36212325152f276f1c5ed20cd65251970 Mon Sep 17 00:00:00 2001 From: shijiashuai Date: Fri, 22 May 2026 15:11:26 +0800 Subject: [PATCH 1/2] refactor: extract scan module and archive completed changes - Extract GPU prefix sum into dedicated ScanModule - Create src/sorting/scan/ with ScanModule class - Split scan WGSL kernels from radix.wgsl to scan.wgsl - Refactor RadixSorter to consume ScanModule - Add ScanModule interface tests - Archive completed OpenSpec changes: - sorting-runtime-hardening - injectable-gpu-runtime - scan-module-extraction This improves code organization by giving GPU prefix sum its own module, shader ownership, and tests. RadixSorter is now reduced to histogram/scatter orchestration plus scan consumption. --- .../.openspec.yaml | 2 + .../2026-05-injectable-gpu-runtime/design.md | 63 ++++ .../proposal.md | 27 ++ .../specs/infrastructure/spec.md | 25 ++ .../2026-05-injectable-gpu-runtime/tasks.md | 18 + .../.openspec.yaml | 2 + .../2026-05-scan-module-extraction/design.md | 39 +++ .../proposal.md | 27 ++ .../specs/gpu-prefix-sum-module/spec.md | 15 + .../specs/sorting/spec.md | 15 + .../2026-05-scan-module-extraction/tasks.md | 18 + .../.openspec.yaml | 2 + .../design.md | 71 ++++ .../proposal.md | 30 ++ .../specs/infrastructure/spec.md | 39 +++ .../specs/sorting/spec.md | 20 ++ .../tasks.md | 21 ++ src/shaders/radix.wgsl | 248 +------------- src/shaders/scan.wgsl | 244 +++++++++++++ src/sorting/RadixSorter.ts | 181 +--------- src/sorting/scan/ScanModule.ts | 324 ++++++++++++++++++ src/sorting/scan/index.ts | 5 + test/sorting/ScanModule.test.ts | 199 +++++++++++ 23 files changed, 1220 insertions(+), 415 deletions(-) create mode 100644 openspec/archive/2026-05-injectable-gpu-runtime/.openspec.yaml create mode 100644 openspec/archive/2026-05-injectable-gpu-runtime/design.md create mode 100644 openspec/archive/2026-05-injectable-gpu-runtime/proposal.md create mode 100644 
openspec/archive/2026-05-injectable-gpu-runtime/specs/infrastructure/spec.md create mode 100644 openspec/archive/2026-05-injectable-gpu-runtime/tasks.md create mode 100644 openspec/archive/2026-05-scan-module-extraction/.openspec.yaml create mode 100644 openspec/archive/2026-05-scan-module-extraction/design.md create mode 100644 openspec/archive/2026-05-scan-module-extraction/proposal.md create mode 100644 openspec/archive/2026-05-scan-module-extraction/specs/gpu-prefix-sum-module/spec.md create mode 100644 openspec/archive/2026-05-scan-module-extraction/specs/sorting/spec.md create mode 100644 openspec/archive/2026-05-scan-module-extraction/tasks.md create mode 100644 openspec/archive/2026-05-sorting-runtime-hardening/.openspec.yaml create mode 100644 openspec/archive/2026-05-sorting-runtime-hardening/design.md create mode 100644 openspec/archive/2026-05-sorting-runtime-hardening/proposal.md create mode 100644 openspec/archive/2026-05-sorting-runtime-hardening/specs/infrastructure/spec.md create mode 100644 openspec/archive/2026-05-sorting-runtime-hardening/specs/sorting/spec.md create mode 100644 openspec/archive/2026-05-sorting-runtime-hardening/tasks.md create mode 100644 src/shaders/scan.wgsl create mode 100644 src/sorting/scan/ScanModule.ts create mode 100644 src/sorting/scan/index.ts create mode 100644 test/sorting/ScanModule.test.ts diff --git a/openspec/archive/2026-05-injectable-gpu-runtime/.openspec.yaml b/openspec/archive/2026-05-injectable-gpu-runtime/.openspec.yaml new file mode 100644 index 0000000..4a1c677 --- /dev/null +++ b/openspec/archive/2026-05-injectable-gpu-runtime/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-05-22 diff --git a/openspec/archive/2026-05-injectable-gpu-runtime/design.md b/openspec/archive/2026-05-injectable-gpu-runtime/design.md new file mode 100644 index 0000000..36740ce --- /dev/null +++ b/openspec/archive/2026-05-injectable-gpu-runtime/design.md @@ -0,0 +1,63 @@ +## Context + +`GPUContext` currently 
owns both the interface and the browser-specific implementation for support checks, adapter acquisition, device creation, limits mapping, and device-loss subscription. Tests can only reach most branches by stubbing browser globals, which keeps the interface shallow and the implementation hard to move. + +The goal of this change is to deepen `GPUContext`, not replace it. Callers should still use one lifecycle module, but the browser runtime becomes an adapter behind a seam. + +## Goals / Non-Goals + +**Goals:** + +- Keep `GPUContext` as the public lifecycle module. +- Introduce an injectable runtime interface for support checks and adapter acquisition. +- Preserve `new GPUContext()` as the default browser path. +- Make initialization, failure, loss, and recovery paths unit-testable without browser globals. + +**Non-Goals:** + +- Refactor demo/benchmark orchestration in this change. +- Add non-browser production adapters yet. +- Change sorting algorithms or shader ownership. + +## Decisions + +### 1. Add a `GPURuntime` seam under `src/core/runtime/` + +Create an interface that owns: + +- support detection +- adapter acquisition + +Add `browserGPURuntime` as the default adapter that wraps `navigator.gpu`. + +**Why:** Browser-global access is the shallow part. Moving it behind one seam gives `GPUContext` leverage and keeps future adapters possible. + +**Alternative:** Keep static browser-global checks and only inject requestAdapter. Rejected because support detection and acquisition belong to the same runtime adapter. + +### 2. Keep `GPUContext` as the public module + +`GPUContext` constructor accepts an optional runtime adapter, defaulting to `browserGPURuntime`. `initialize()` and `recover()` continue to define lifecycle semantics and error shaping. + +**Why:** Callers keep one familiar interface. The seam changes implementation ownership, not the high-level API shape. + +**Alternative:** Replace `GPUContext` with factories or free functions. 
Rejected because it would increase churn before the orchestration seam lands. + +### 3. Export runtime types, not extra orchestration + +Export the runtime interface and browser adapter from `src/index.ts` so tests and advanced consumers can inject them directly. + +**Why:** Constructor injection without exported types keeps the seam half-hidden. + +**Alternative:** Keep runtime files internal. Rejected because public constructor injection needs a public type. + +### 4. Rewrite `GPUContext` tests around fake runtimes + +Use injected fake runtimes/adapters/devices for initialization branches, limits mapping, recovery, and device-loss callbacks. Keep only small browser-global tests for the default static support path. + +**Why:** The interface is the test surface. The fake runtime gives locality for failure-path testing without browser globals. + +## Risks / Trade-offs + +- **Public API surface grows slightly** -> Limit it to runtime interface + default browser adapter. +- **Fake GPU objects can drift from WebGPU reality** -> Keep browser E2E smoke tests and only fake branches that do not need real hardware. +- **Future adapters may want more hooks** -> Start with the smallest seam: support detection + adapter acquisition. diff --git a/openspec/archive/2026-05-injectable-gpu-runtime/proposal.md b/openspec/archive/2026-05-injectable-gpu-runtime/proposal.md new file mode 100644 index 0000000..305c968 --- /dev/null +++ b/openspec/archive/2026-05-injectable-gpu-runtime/proposal.md @@ -0,0 +1,27 @@ +## Why + +`GPUContext` still hard-codes `navigator.gpu`, adapter acquisition, and device-loss wiring inside one shallow module. That blocks deterministic unit tests, leaks browser globals into call sites, and makes future runtime adapters impossible without editing the implementation in place. + +## What Changes + +- Add an injectable GPU runtime seam so `GPUContext` performs support checks and acquires adapters through an adapter interface instead of browser globals. 
+- Keep `GPUContext` as the public lifecycle module, but move browser-global access into a dedicated browser adapter. +- Expand unit coverage for adapter absence, device request failure, limits mapping, device-loss callbacks, and recovery using fake runtimes. +- Export the runtime types needed for external injection while keeping the default browser path intact. + +## Capabilities + +### New Capabilities + +- None. + +### Modified Capabilities + +- `infrastructure`: GPU context initialization must support injected runtime adapters while preserving the default browser runtime path. + +## Impact + +- **Affected code:** `src/core/GPUContext.ts`, new `src/core/runtime/` files, `src/index.ts`, `test/core/GPUContext.test.ts` +- **Affected APIs:** `GPUContext` constructor gains an optional runtime adapter; runtime interface/types become importable. +- **Dependencies:** No new packages. +- **Systems:** Core runtime initialization, lifecycle handling, and unit-test seams. diff --git a/openspec/archive/2026-05-injectable-gpu-runtime/specs/infrastructure/spec.md b/openspec/archive/2026-05-injectable-gpu-runtime/specs/infrastructure/spec.md new file mode 100644 index 0000000..38341f2 --- /dev/null +++ b/openspec/archive/2026-05-injectable-gpu-runtime/specs/infrastructure/spec.md @@ -0,0 +1,25 @@ +## MODIFIED Requirements + +### Requirement: GPU Context Initialization + +The GPUContext SHALL initialize through an injectable runtime adapter while preserving the default browser runtime path and existing lifecycle guarantees. 
+ +#### Scenario: Default browser runtime remains available + +- **WHEN** a caller constructs `GPUContext` without providing a runtime adapter +- **THEN** the context SHALL use the browser runtime adapter for support detection and adapter acquisition + +#### Scenario: Injected runtime drives initialization + +- **WHEN** a caller constructs `GPUContext` with a runtime adapter +- **THEN** `initialize()` SHALL use that adapter instead of reading browser globals directly + +#### Scenario: Injected runtime failure is surfaced as typed errors + +- **WHEN** the injected runtime reports no adapter or device creation fails +- **THEN** GPUContext SHALL preserve the existing typed error semantics for adapter and device failures + +#### Scenario: Injected runtime remains compatible with recovery and loss callbacks + +- **WHEN** device loss occurs and `recover()` is called on a context using an injected runtime +- **THEN** the context SHALL reset state, reacquire a device through the same runtime adapter, and continue notifying registered callbacks diff --git a/openspec/archive/2026-05-injectable-gpu-runtime/tasks.md b/openspec/archive/2026-05-injectable-gpu-runtime/tasks.md new file mode 100644 index 0000000..550e47e --- /dev/null +++ b/openspec/archive/2026-05-injectable-gpu-runtime/tasks.md @@ -0,0 +1,18 @@ +## 1. OpenSpec and test setup + +- [x] 1.1 Finalize infrastructure delta spec for injected GPU runtime behavior +- [x] 1.2 Add failing `GPUContext` tests for injected runtime initialization, failure paths, limits mapping, and recovery/loss callbacks + +## 2. Runtime seam implementation + +- [x] 2.1 Add `src/core/runtime/` runtime interface and default browser adapter +- [x] 2.2 Refactor `GPUContext` to use injected runtime adapters while preserving `new GPUContext()` as the default path +- [x] 2.3 Export runtime seam types from `src/index.ts` + +## 3. 
Validation + +- [x] 3.1 Update `GPUContext` tests to remove unnecessary browser-global stubbing from non-browser paths +- [x] 3.2 Run `npm run typecheck` +- [x] 3.3 Run `npm run lint` +- [x] 3.4 Run `npm run test` +- [x] 3.5 Run `npm run build` diff --git a/openspec/archive/2026-05-scan-module-extraction/.openspec.yaml b/openspec/archive/2026-05-scan-module-extraction/.openspec.yaml new file mode 100644 index 0000000..4a1c677 --- /dev/null +++ b/openspec/archive/2026-05-scan-module-extraction/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-05-22 diff --git a/openspec/archive/2026-05-scan-module-extraction/design.md b/openspec/archive/2026-05-scan-module-extraction/design.md new file mode 100644 index 0000000..88ccf47 --- /dev/null +++ b/openspec/archive/2026-05-scan-module-extraction/design.md @@ -0,0 +1,39 @@ +## Context + +The repository already moved radix prefix-sum onto the GPU, but the implementation stayed embedded in `RadixSorter`. That gives callers no scan seam, couples shader ownership to radix sorting, and leaves scan behavior hard to test independently. + +This change extracts the scan path into a deep module that RadixSorter can consume. + +## Goals / Non-Goals + +**Goals:** + +- Give GPU prefix-sum its own module, shader ownership, and tests. +- Reduce `RadixSorter` to histogram/scatter orchestration plus scan consumption. +- Align docs/specs with actual GPU prefix-sum behavior. + +**Non-Goals:** + +- Change radix sort user-facing behavior. +- Refactor demo/benchmark orchestration here. +- Build a general-purpose GPU algorithm framework. + +## Decisions + +### 1. Add `src/sorting/scan/` + +Create a scan module with its own interface, initialization, dispatch, and teardown. + +### 2. Split shader ownership + +Move Blelloch scan kernels out of `radix.wgsl` into scan-owned WGSL so shader locality matches module ownership. + +### 3. 
Keep RadixSorter as a consumer + +RadixSorter will request exclusive prefix sums through a narrow interface and keep radix-specific buffers/passes local. + +## Risks / Trade-offs + +- **Shader split churn** -> Accept one-time movement for better locality and reuse. +- **Extra module lifecycle** -> Keep scan interface narrow and explicit. +- **Spec/doc drift risk** -> Update stable sorting specs as part of the change. diff --git a/openspec/archive/2026-05-scan-module-extraction/proposal.md b/openspec/archive/2026-05-scan-module-extraction/proposal.md new file mode 100644 index 0000000..80066a4 --- /dev/null +++ b/openspec/archive/2026-05-scan-module-extraction/proposal.md @@ -0,0 +1,27 @@ +## Why + +`RadixSorter` still owns histogram, scatter, and GPU prefix-sum orchestration inside one shallow module. The scan pipeline has real leverage on its own, but today its interface is trapped inside `RadixSorter` and its shader kernels are mixed into `radix.wgsl`. + +## What Changes + +- Extract GPU prefix-sum into a dedicated scan module with its own initialization, dispatch, and cleanup. +- Split scan WGSL kernels from radix-only kernels. +- Update RadixSorter to depend on the scan seam instead of owning scan pipeline details directly. +- Add dedicated scan tests and align specs/docs with GPU-based prefix-sum ownership. + +## Capabilities + +### New Capabilities + +- `gpu-prefix-sum-module`: Dedicated scan module for GPU exclusive prefix sums with reusable runtime ownership. + +### Modified Capabilities + +- `sorting`: Radix sort shall consume the standalone scan module instead of embedding scan orchestration directly. + +## Impact + +- **Affected code:** `src/sorting/RadixSorter.ts`, new `src/sorting/scan/` files, `src/shaders/radix.wgsl`, new scan shader file(s), related tests/docs/specs +- **Affected APIs:** New scan module surface; RadixSorter internal orchestration changes. 
+- **Dependencies:** This change overlaps conceptually with the existing `gpu-prefix-sum` implementation history, but lands as a separate architectural extraction change. +- **Systems:** Radix sorting internals, shader ownership, and scan-specific tests. diff --git a/openspec/archive/2026-05-scan-module-extraction/specs/gpu-prefix-sum-module/spec.md b/openspec/archive/2026-05-scan-module-extraction/specs/gpu-prefix-sum-module/spec.md new file mode 100644 index 0000000..b6bf18b --- /dev/null +++ b/openspec/archive/2026-05-scan-module-extraction/specs/gpu-prefix-sum-module/spec.md @@ -0,0 +1,15 @@ +## ADDED Requirements + +### Requirement: GPU prefix-sum is a standalone module + +The system SHALL provide GPU exclusive prefix-sum through a dedicated scan module with its own initialization, dispatch, and cleanup interface. + +#### Scenario: Scan module computes exclusive prefix sums + +- **WHEN** a caller provides input, output, and scan configuration buffers +- **THEN** the scan module SHALL execute the GPU prefix-sum passes without requiring RadixSorter-specific knowledge + +#### Scenario: Scan module owns scan pipeline resources + +- **WHEN** the scan module is initialized or destroyed +- **THEN** it SHALL create and clean up scan-specific pipeline and shader resources independently diff --git a/openspec/archive/2026-05-scan-module-extraction/specs/sorting/spec.md b/openspec/archive/2026-05-scan-module-extraction/specs/sorting/spec.md new file mode 100644 index 0000000..e10e679 --- /dev/null +++ b/openspec/archive/2026-05-scan-module-extraction/specs/sorting/spec.md @@ -0,0 +1,15 @@ +## MODIFIED Requirements + +### Requirement: Radix Sort Implementation + +The RadixSorter SHALL perform histogram and scatter passes while delegating GPU prefix-sum work to a dedicated scan module. 
+ +#### Scenario: RadixSorter consumes the scan seam + +- **WHEN** a radix pass needs prefix sums +- **THEN** RadixSorter SHALL invoke the standalone scan module instead of owning scan dispatch logic directly + +#### Scenario: Sorting behavior remains unchanged + +- **WHEN** valid input arrays are sorted through RadixSorter +- **THEN** the sorter SHALL continue producing correctly sorted ascending output diff --git a/openspec/archive/2026-05-scan-module-extraction/tasks.md b/openspec/archive/2026-05-scan-module-extraction/tasks.md new file mode 100644 index 0000000..ee931de --- /dev/null +++ b/openspec/archive/2026-05-scan-module-extraction/tasks.md @@ -0,0 +1,18 @@ +## 1. Test-first scan seam + +- [x] 1.1 Add failing tests for scan-module interface and scan-specific runtime behavior +- [x] 1.2 Add failing integration coverage for RadixSorter consuming the scan seam + +## 2. Implementation + +- [x] 2.1 Add scan module files under `src/sorting/scan/` +- [x] 2.2 Split scan WGSL kernels from `radix.wgsl` +- [x] 2.3 Refactor `RadixSorter` to consume the scan module +- [x] 2.4 Update docs/specs for GPU prefix-sum ownership + +## 3. Validation + +- [x] 3.1 Run `npm run typecheck` +- [x] 3.2 Run `npm run lint` +- [x] 3.3 Run `npm run test` +- [x] 3.4 Run `npm run build` diff --git a/openspec/archive/2026-05-sorting-runtime-hardening/.openspec.yaml b/openspec/archive/2026-05-sorting-runtime-hardening/.openspec.yaml new file mode 100644 index 0000000..af43829 --- /dev/null +++ b/openspec/archive/2026-05-sorting-runtime-hardening/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-05-21 diff --git a/openspec/archive/2026-05-sorting-runtime-hardening/design.md b/openspec/archive/2026-05-sorting-runtime-hardening/design.md new file mode 100644 index 0000000..6a4776e --- /dev/null +++ b/openspec/archive/2026-05-sorting-runtime-hardening/design.md @@ -0,0 +1,71 @@ +## Context + +Three runtime hot spots are coupled today: + +1. 
`Benchmark.generateRandomData()` and several browser tests call `crypto.getRandomValues()` once per array, which breaks for the repository's own 100K and 1M benchmark sizes in quota-limited browser implementations. +2. `BitonicSorter` and `RadixSorter` each own temporary GPU buffer allocation and cleanup inline, but they do it differently. Bitonic cleanup is not fully scoped to failure paths, while Radix directly destroys buffers outside `BufferManager` ownership. +3. `BufferManager.readBuffer()` is the readback seam for both sorters, yet it does not use the repository's existing timeout guard. + +This change hardens runtime behavior without relitigating algorithm choice or public sorter APIs. + +## Goals / Non-Goals + +**Goals:** + +- Guarantee large-array random data generation for benchmark, demo-adjacent, and browser test flows. +- Create one deep temporary-buffer seam shared by both sorters. +- Make readback failure modes bounded and deterministic. +- Strengthen tests around the runtime behaviors that previously escaped coverage. + +**Non-Goals:** + +- Redesign `GPUContext` device-loss recovery in this change. +- Change WGSL algorithm behavior or benchmark UX. +- Introduce a general-purpose GPU buffer pool or new external dependency. + +## Decisions + +### 1. Add a shared random-data utility in `src/shared/` + +Create a focused utility that fills `Uint32Array` instances in quota-safe chunks and provides a convenience generator for callers that need new arrays. `Benchmark` and browser tests will consume the same seam. + +**Why:** Large-array randomness is a cross-cutting runtime concern, not benchmark-only behavior. One utility concentrates the browser quota rule and removes repeated inline crypto logic. + +**Alternatives considered:** + +- Patch only `Benchmark.generateRandomData()` -> rejected because browser tests would keep the same failure mode. 
+- Use `Math.random()` everywhere -> rejected because it weakens existing randomness guarantees where crypto is available. + +### 2. Introduce a transient buffer scope for sorter-owned temporary buffers + +Add a small internal module that registers temporary GPU buffers, releases them exactly once in `finally`, and stays separate from explicit preallocation ownership. Both sorters will use this seam for per-sort buffers while preallocated buffers remain outside the scope. + +**Why:** The current modules are shallow around cleanup: each sorter repeats ownership decisions inline, so bugs hide in call sequencing instead of one interface. A transient scope increases locality and gives tests a single interface to verify. + +**Alternatives considered:** + +- Expand `BufferManager` to own every buffer in the system -> rejected because it deepens a mixed-responsibility module in the wrong direction. +- Leave cleanup inline and add comments -> rejected because comments do not create a testable seam. + +### 3. Apply timeout policy at the readback seam + +Wrap `mapAsync()` in `BufferManager.readBuffer()` with the existing timeout utility and preserve cleanup for both success and failure paths. + +**Why:** Readback is where an unresponsive GPU becomes visible to CPU callers. The timeout belongs at this seam so callers do not each reinvent it. + +**Alternatives considered:** + +- Timeout at individual sorter call sites -> rejected because it duplicates policy and misses future `BufferManager` consumers. +- No timeout, rely on device loss -> rejected because hanging promises do not surface actionable errors. + +### 4. Tighten tests around real runtime behavior + +Add unit tests for quota-safe random filling and buffer-scope cleanup, then update browser tests to use the shared random utility and call `gpu.isInitialized()` instead of asserting on a method reference. + +**Why:** Existing tests mostly cover pure helpers; they do not protect the runtime seams that actually failed. 
+ +## Risks / Trade-offs + +- **More internal modules** -> Keep new seams narrowly scoped and internal-only to avoid API sprawl. +- **Timeout values may need tuning across devices** -> Reuse existing timeout defaults first; keep the timeout wrapper centralized for later adjustment. +- **Browser tests still depend on WebGPU availability** -> Focus new assertions on deterministic setup and data generation so skipped environments do not hide logic bugs. diff --git a/openspec/archive/2026-05-sorting-runtime-hardening/proposal.md b/openspec/archive/2026-05-sorting-runtime-hardening/proposal.md new file mode 100644 index 0000000..59c777c --- /dev/null +++ b/openspec/archive/2026-05-sorting-runtime-hardening/proposal.md @@ -0,0 +1,30 @@ +# Change Proposal: Sorting Runtime Hardening + +## Why + +The current runtime promises large-array benchmarking and browser validation, but multiple code paths still use one-shot `crypto.getRandomValues()` calls that fail once arrays exceed browser quota limits. At the same time, temporary GPU buffer ownership is duplicated across sorters, making failure-path cleanup shallow, fragile, and hard to test. + +## What Changes + +- Add a shared quota-safe random `Uint32Array` generator and route benchmark/demo/test paths through it. +- Introduce a dedicated transient GPU buffer seam so Bitonic and Radix sorters clean up temporary buffers through one ownership model. +- Harden buffer readback with timeout-backed mapping and deterministic staging-buffer cleanup. +- Tighten browser and unit tests so large-array generation and initialization assertions catch real runtime behavior instead of passing accidentally. + +## Capabilities + +### New Capabilities + +- None. + +### Modified Capabilities + +- `sorting`: Benchmark-driven sorting flows must support default large dataset sizes without failing because of randomness API call limits. 
+- `infrastructure`: Buffer readback and sorter resource lifecycle must time out or clean up deterministically on every exit path. + +## Impact + +- **Affected code:** `src/benchmark/`, `src/shared/`, `src/core/`, `src/sorting/`, `src/index.ts`, `test/benchmark/`, `test/core/`, `test/browser/` +- **Affected APIs:** No intentional public API expansion; internal runtime contracts become stricter and more explicit. +- **Dependencies:** Reuses existing timeout utility and Web Crypto support; no new packages. +- **Systems:** Browser benchmark/demo flow, GPU sorter failure handling, and runtime-facing test coverage. diff --git a/openspec/archive/2026-05-sorting-runtime-hardening/specs/infrastructure/spec.md b/openspec/archive/2026-05-sorting-runtime-hardening/specs/infrastructure/spec.md new file mode 100644 index 0000000..9571c6e --- /dev/null +++ b/openspec/archive/2026-05-sorting-runtime-hardening/specs/infrastructure/spec.md @@ -0,0 +1,39 @@ +## MODIFIED Requirements + +### Requirement: GPU Buffer Management + +The BufferManager SHALL provide bounded readback behavior and deterministic staging-buffer cleanup for every buffer read operation. 
+ +#### Scenario: Successful readback releases staging buffer + +- **WHEN** `readBuffer()` completes successfully +- **THEN** it SHALL unmap the staging buffer, release it, and return a `Uint32Array` containing the requested byte range + +#### Scenario: Timed-out readback surfaces explicit failure + +- **WHEN** staging-buffer mapping exceeds the configured timeout +- **THEN** `readBuffer()` SHALL reject with `GPUTimeoutError` and release the staging buffer before returning control to the caller + +#### Scenario: Readback failure still cleans up + +- **WHEN** staging-buffer mapping or copy fails for any other reason +- **THEN** `readBuffer()` SHALL release the staging buffer and surface a typed buffer readback error + +### Requirement: Resource Lifecycle + +Sorter implementations SHALL release temporary GPU resources through a dedicated per-sort ownership seam while preserving explicitly preallocated buffers until callers clear or destroy them. + +#### Scenario: Temporary buffers are cleaned up on success + +- **WHEN** a sorter completes a sort using temporary buffers +- **THEN** all temporary buffers created for that sort SHALL be released before the sorter returns + +#### Scenario: Temporary buffers are cleaned up on failure + +- **WHEN** a sorter throws after allocating temporary buffers +- **THEN** the per-sort ownership seam SHALL still release those buffers before the error is propagated + +#### Scenario: Preallocated buffers remain opt-in + +- **WHEN** a caller uses `preallocate()` and later sorts data within that capacity +- **THEN** the sorter SHALL reuse preallocated buffers without releasing them until `clearPreallocation()` or `destroy()` is called diff --git a/openspec/archive/2026-05-sorting-runtime-hardening/specs/sorting/spec.md b/openspec/archive/2026-05-sorting-runtime-hardening/specs/sorting/spec.md new file mode 100644 index 0000000..273aa84 --- /dev/null +++ b/openspec/archive/2026-05-sorting-runtime-hardening/specs/sorting/spec.md @@ -0,0 +1,20 
@@ +## MODIFIED Requirements + +### Requirement: Performance Benchmarking + +The system SHALL benchmark JavaScript native sort, Bitonic sort, and Radix sort across supported dataset sizes without failing because benchmark input generation exceeds browser randomness API quotas. + +#### Scenario: Benchmark generates large default datasets + +- **WHEN** the benchmark generates input for configured sizes such as 100K or 1M `u32` values +- **THEN** it SHALL fill the array in quota-safe chunks and preserve the exact requested length + +#### Scenario: Benchmark reports averaged timings + +- **WHEN** multiple iterations complete for a benchmark run +- **THEN** the system SHALL report averaged total timing and GPU timing when the selected algorithm exposes it + +#### Scenario: Benchmark compares GPU runs against native sort + +- **WHEN** Bitonic or Radix results are reported +- **THEN** the system SHALL include speedup data derived from the corresponding JavaScript native benchmark result diff --git a/openspec/archive/2026-05-sorting-runtime-hardening/tasks.md b/openspec/archive/2026-05-sorting-runtime-hardening/tasks.md new file mode 100644 index 0000000..64a65e1 --- /dev/null +++ b/openspec/archive/2026-05-sorting-runtime-hardening/tasks.md @@ -0,0 +1,21 @@ +## 1. Spec and test seam setup + +- [x] 1.1 Finalize delta specs for sorting benchmark input generation and infrastructure cleanup/timeout behavior +- [x] 1.2 Add failing benchmark tests for quota-limited random generation +- [x] 1.3 Add failing core tests for transient buffer cleanup and timed readback behavior + +## 2. 
Runtime hardening implementation + +- [x] 2.1 Implement a shared quota-safe random `Uint32Array` utility in `src/shared/` +- [x] 2.2 Refactor `Benchmark` and browser tests to use the shared random utility +- [x] 2.3 Introduce a transient GPU buffer scope and refactor both sorters to use it +- [x] 2.4 Wrap `BufferManager.readBuffer()` with the existing timeout utility and preserve cleanup on all paths + +## 3. Validation and cleanup + +- [x] 3.1 Remove sorter non-null assertions by replacing them with explicit ownership guards +- [x] 3.2 Fix browser initialization assertions so they invoke real APIs +- [x] 3.3 Run `npm run lint` +- [x] 3.4 Run `npm run typecheck` +- [x] 3.5 Run `npm run test` +- [x] 3.6 Run `npm run build` diff --git a/src/shaders/radix.wgsl b/src/shaders/radix.wgsl index 8486490..346cb42 100644 --- a/src/shaders/radix.wgsl +++ b/src/shaders/radix.wgsl @@ -1,6 +1,6 @@ // Radix Sort WGSL Compute Shaders -// Implements 4-bit radix sort with histogram, prefix sum, and scatter -// Includes GPU-based Blelloch scan for prefix sum computation +// Implements 4-bit radix sort with histogram and scatter +// Prefix sum computation is handled by scan.wgsl // // IMPORTANT: WORKGROUP_SIZE and RADIX must match the values in src/constants.ts // @see src/constants.ts - WORKGROUP_SIZE = 256, RADIX = 16 @@ -12,16 +12,8 @@ struct RadixUniforms { _pad: u32, } -struct ScanUniforms { - data_size: u32, // Total elements to scan - num_blocks: u32, // Number of workgroups/blocks - _pad1: u32, - _pad2: u32, -} - const WORKGROUP_SIZE: u32 = 256u; const RADIX: u32 = 16u; // 4-bit radix = 16 buckets -const SCAN_WORKGROUP_SIZE: u32 = 256u; // Size for prefix sum scan @group(0) @binding(0) var input_data: array; @group(0) @binding(1) var output_data: array; @@ -106,239 +98,3 @@ fn scatter( output_data[global_offset] = value; } } - -// ============================================================================ -// Blelloch Scan (Work-Efficient Parallel Prefix Sum) -// 
============================================================================ - -// Shared memory for local Blelloch scan within a workgroup -var scan_shared: array; // Must be >= 2 * SCAN_WORKGROUP_SIZE - -// Bindings for prefix sum scan -@group(0) @binding(0) var scan_input: array; -@group(0) @binding(1) var scan_output: array; -@group(0) @binding(2) var block_sums: array; // Sum of each block -@group(0) @binding(3) var scan_uniforms: ScanUniforms; - -// Blelloch scan - exclusive prefix sum -// Phase 1: Up-sweep (reduce) - build binary tree of partial sums -// Phase 2: Down-sweep (distribute) - propagate sums down the tree -@compute @workgroup_size(SCAN_WORKGROUP_SIZE) -fn blelloch_scan( - @builtin(global_invocation_id) global_id: vec3, - @builtin(local_invocation_id) local_id: vec3, - @builtin(workgroup_id) workgroup_id: vec3 -) { - let tid = local_id.x; - let gid = global_id.x; - let block_id = workgroup_id.x; - let n = scan_uniforms.data_size; - let block_size = SCAN_WORKGROUP_SIZE * 2u; // Each workgroup processes 512 elements - - // Calculate the range this workgroup handles - let block_start = block_id * block_size; - let block_end = min(block_start + block_size, n); - let local_n = block_end - block_start; - - // Load data into shared memory (coalesced reads) - // Each thread loads 2 elements - let idx0 = block_start + tid; - let idx1 = block_start + tid + SCAN_WORKGROUP_SIZE; - - // Initialize shared memory - if (idx0 < n) { - scan_shared[tid] = scan_input[idx0]; - } else { - scan_shared[tid] = 0u; - } - - if (idx1 < n) { - scan_shared[tid + SCAN_WORKGROUP_SIZE] = scan_input[idx1]; - } else { - scan_shared[tid + SCAN_WORKGROUP_SIZE] = 0u; - } - - workgroupBarrier(); - - // ======================================================================== - // Phase 1: Up-sweep (Reduce) - // Build a binary tree of partial sums from leaves to root - // ======================================================================== - var offset = 1u; - var d = block_size / 
2u; - while (d > 0u) { - workgroupBarrier(); - - if (tid < d) { - let ai = offset * (2u * tid + 1u) - 1u; - let bi = offset * (2u * tid + 2u) - 1u; - - // Only process if within our data range - if (bi < local_n) { - scan_shared[bi] = scan_shared[ai] + scan_shared[bi]; - } - } - - offset *= 2u; - d /= 2u; - } - - workgroupBarrier(); - - // ======================================================================== - // Phase 2: Down-sweep (Distribute) - // Propagate partial sums from root to leaves - // ======================================================================== - // Clear the last element (makes it exclusive scan) - if (tid == 0u) { - // Store block sum before clearing (for multi-block scan) - let block_sum = scan_shared[block_size - 1u]; - block_sums[block_id] = block_sum; - scan_shared[block_size - 1u] = 0u; - } - - workgroupBarrier(); - - d = 1u; - while (d < block_size) { - offset /= 2u; - workgroupBarrier(); - - if (tid < d) { - let ai = offset * (2u * tid + 1u) - 1u; - let bi = offset * (2u * tid + 2u) - 1u; - - if (bi < local_n) { - let t = scan_shared[ai]; - scan_shared[ai] = scan_shared[bi]; - scan_shared[bi] = t + scan_shared[bi]; - } - } - - d *= 2u; - } - - workgroupBarrier(); - - // ======================================================================== - // Write results back to global memory - // ======================================================================== - if (idx0 < n) { - scan_output[idx0] = scan_shared[tid]; - } - - if (idx1 < n) { - scan_output[idx1] = scan_shared[tid + SCAN_WORKGROUP_SIZE]; - } -} - -// Shared memory for block sum scan (smaller, max blocks typically < 1024) -var block_scan_shared: array; - -// Scan the block sums (second level of two-level scan) -@compute @workgroup_size(SCAN_WORKGROUP_SIZE) -fn scan_block_sums( - @builtin(global_invocation_id) global_id: vec3, - @builtin(local_invocation_id) local_id: vec3, - @builtin(workgroup_id) workgroup_id: vec3 -) { - let tid = local_id.x; - let n = 
scan_uniforms.num_blocks; - - // Load block sums into shared memory - if (tid < n) { - block_scan_shared[tid] = block_sums[tid]; - } else { - block_scan_shared[tid] = 0u; - } - - // Pad with zeros - if (tid + SCAN_WORKGROUP_SIZE < n) { - block_scan_shared[tid + SCAN_WORKGROUP_SIZE] = block_sums[tid + SCAN_WORKGROUP_SIZE]; - } else if (tid + SCAN_WORKGROUP_SIZE < 512u) { - block_scan_shared[tid + SCAN_WORKGROUP_SIZE] = 0u; - } - - workgroupBarrier(); - - // ======================================================================== - // Phase 1: Up-sweep (Reduce) - // ======================================================================== - var offset = 1u; - var d = 256u; // SCAN_WORKGROUP_SIZE - while (d > 0u) { - workgroupBarrier(); - - if (tid < d) { - let ai = offset * (2u * tid + 1u) - 1u; - let bi = offset * (2u * tid + 2u) - 1u; - - if (bi < n) { - block_scan_shared[bi] = block_scan_shared[ai] + block_scan_shared[bi]; - } - } - - offset *= 2u; - d /= 2u; - } - - workgroupBarrier(); - - // ======================================================================== - // Phase 2: Down-sweep (Distribute) - // ======================================================================== - if (tid == 0u) { - block_scan_shared[511u] = 0u; // Clear last element for exclusive scan - } - - workgroupBarrier(); - - d = 1u; - while (d < 512u) { - offset /= 2u; - workgroupBarrier(); - - if (tid < d) { - let ai = offset * (2u * tid + 1u) - 1u; - let bi = offset * (2u * tid + 2u) - 1u; - - if (bi < n) { - let t = block_scan_shared[ai]; - block_scan_shared[ai] = block_scan_shared[bi]; - block_scan_shared[bi] = t + block_scan_shared[bi]; - } - } - - d *= 2u; - } - - workgroupBarrier(); - - // Write scanned block sums back - if (tid < n) { - block_sums[tid] = block_scan_shared[tid]; - } -} - -// Add block prefixes to each block's local scan results -// This is the third step of two-level scan -@compute @workgroup_size(SCAN_WORKGROUP_SIZE) -fn add_block_prefixes( - 
@builtin(global_invocation_id) global_id: vec3, - @builtin(local_invocation_id) local_id: vec3, - @builtin(workgroup_id) workgroup_id: vec3 -) { - let tid = local_id.x; - let gid = global_id.x; - let block_id = workgroup_id.x; - let n = scan_uniforms.data_size; - - // Get the prefix for this block (sum of all previous blocks) - let block_prefix = block_sums[block_id]; - - // Add block prefix to each element in this block - let idx = gid; - if (idx < n) { - scan_output[idx] = scan_output[idx] + block_prefix; - } -} diff --git a/src/shaders/scan.wgsl b/src/shaders/scan.wgsl new file mode 100644 index 0000000..513da1e --- /dev/null +++ b/src/shaders/scan.wgsl @@ -0,0 +1,244 @@ +// Blelloch Scan WGSL Compute Shaders +// Work-efficient parallel exclusive prefix sum +// This module provides GPU-based prefix sum computation + +struct ScanUniforms { + data_size: u32, // Total elements to scan + num_blocks: u32, // Number of workgroups/blocks + _pad1: u32, + _pad2: u32, +} + +const SCAN_WORKGROUP_SIZE: u32 = 256u; // Size for prefix sum scan + +// Shared memory for local Blelloch scan within a workgroup +var scan_shared: array; // Must be >= 2 * SCAN_WORKGROUP_SIZE + +// Bindings for prefix sum scan +@group(0) @binding(0) var scan_input: array; +@group(0) @binding(1) var scan_output: array; +@group(0) @binding(2) var block_sums: array; // Sum of each block +@group(0) @binding(3) var scan_uniforms: ScanUniforms; + +// Blelloch scan - exclusive prefix sum +// Phase 1: Up-sweep (reduce) - build binary tree of partial sums +// Phase 2: Down-sweep (distribute) - propagate sums down the tree +@compute @workgroup_size(SCAN_WORKGROUP_SIZE) +fn blelloch_scan( + @builtin(global_invocation_id) global_id: vec3, + @builtin(local_invocation_id) local_id: vec3, + @builtin(workgroup_id) workgroup_id: vec3 +) { + let tid = local_id.x; + let gid = global_id.x; + let block_id = workgroup_id.x; + let n = scan_uniforms.data_size; + let block_size = SCAN_WORKGROUP_SIZE * 2u; // Each workgroup 
processes 512 elements + + // Calculate the range this workgroup handles + let block_start = block_id * block_size; + let block_end = min(block_start + block_size, n); + let local_n = block_end - block_start; + + // Load data into shared memory (coalesced reads) + // Each thread loads 2 elements + let idx0 = block_start + tid; + let idx1 = block_start + tid + SCAN_WORKGROUP_SIZE; + + // Initialize shared memory + if (idx0 < n) { + scan_shared[tid] = scan_input[idx0]; + } else { + scan_shared[tid] = 0u; + } + + if (idx1 < n) { + scan_shared[tid + SCAN_WORKGROUP_SIZE] = scan_input[idx1]; + } else { + scan_shared[tid + SCAN_WORKGROUP_SIZE] = 0u; + } + + workgroupBarrier(); + + // ======================================================================== + // Phase 1: Up-sweep (Reduce) + // Build a binary tree of partial sums from leaves to root + // ======================================================================== + var offset = 1u; + var d = block_size / 2u; + while (d > 0u) { + workgroupBarrier(); + + if (tid < d) { + let ai = offset * (2u * tid + 1u) - 1u; + let bi = offset * (2u * tid + 2u) - 1u; + + // Only process if within our data range + if (bi < local_n) { + scan_shared[bi] = scan_shared[ai] + scan_shared[bi]; + } + } + + offset *= 2u; + d /= 2u; + } + + workgroupBarrier(); + + // ======================================================================== + // Phase 2: Down-sweep (Distribute) + // Propagate partial sums from root to leaves + // ======================================================================== + // Clear the last element (makes it exclusive scan) + if (tid == 0u) { + // Store block sum before clearing (for multi-block scan) + let block_sum = scan_shared[block_size - 1u]; + block_sums[block_id] = block_sum; + scan_shared[block_size - 1u] = 0u; + } + + workgroupBarrier(); + + d = 1u; + while (d < block_size) { + offset /= 2u; + workgroupBarrier(); + + if (tid < d) { + let ai = offset * (2u * tid + 1u) - 1u; + let bi = offset * (2u * 
tid + 2u) - 1u; + + if (bi < local_n) { + let t = scan_shared[ai]; + scan_shared[ai] = scan_shared[bi]; + scan_shared[bi] = t + scan_shared[bi]; + } + } + + d *= 2u; + } + + workgroupBarrier(); + + // ======================================================================== + // Write results back to global memory + // ======================================================================== + if (idx0 < n) { + scan_output[idx0] = scan_shared[tid]; + } + + if (idx1 < n) { + scan_output[idx1] = scan_shared[tid + SCAN_WORKGROUP_SIZE]; + } +} + +// Shared memory for block sum scan (smaller, max blocks typically < 1024) +var block_scan_shared: array; + +// Scan the block sums (second level of two-level scan) +@compute @workgroup_size(SCAN_WORKGROUP_SIZE) +fn scan_block_sums( + @builtin(global_invocation_id) global_id: vec3, + @builtin(local_invocation_id) local_id: vec3, + @builtin(workgroup_id) workgroup_id: vec3 +) { + let tid = local_id.x; + let n = scan_uniforms.num_blocks; + + // Load block sums into shared memory + if (tid < n) { + block_scan_shared[tid] = block_sums[tid]; + } else { + block_scan_shared[tid] = 0u; + } + + // Pad with zeros + if (tid + SCAN_WORKGROUP_SIZE < n) { + block_scan_shared[tid + SCAN_WORKGROUP_SIZE] = block_sums[tid + SCAN_WORKGROUP_SIZE]; + } else if (tid + SCAN_WORKGROUP_SIZE < 512u) { + block_scan_shared[tid + SCAN_WORKGROUP_SIZE] = 0u; + } + + workgroupBarrier(); + + // ======================================================================== + // Phase 1: Up-sweep (Reduce) + // ======================================================================== + var offset = 1u; + var d = 256u; // SCAN_WORKGROUP_SIZE + while (d > 0u) { + workgroupBarrier(); + + if (tid < d) { + let ai = offset * (2u * tid + 1u) - 1u; + let bi = offset * (2u * tid + 2u) - 1u; + + if (bi < n) { + block_scan_shared[bi] = block_scan_shared[ai] + block_scan_shared[bi]; + } + } + + offset *= 2u; + d /= 2u; + } + + workgroupBarrier(); + + // 
======================================================================== + // Phase 2: Down-sweep (Distribute) + // ======================================================================== + if (tid == 0u) { + block_scan_shared[511u] = 0u; // Clear last element for exclusive scan + } + + workgroupBarrier(); + + d = 1u; + while (d < 512u) { + offset /= 2u; + workgroupBarrier(); + + if (tid < d) { + let ai = offset * (2u * tid + 1u) - 1u; + let bi = offset * (2u * tid + 2u) - 1u; + + if (bi < n) { + let t = block_scan_shared[ai]; + block_scan_shared[ai] = block_scan_shared[bi]; + block_scan_shared[bi] = t + block_scan_shared[bi]; + } + } + + d *= 2u; + } + + workgroupBarrier(); + + // Write scanned block sums back + if (tid < n) { + block_sums[tid] = block_scan_shared[tid]; + } +} + +// Add block prefixes to each block's local scan results +// This is the third step of two-level scan +@compute @workgroup_size(SCAN_WORKGROUP_SIZE) +fn add_block_prefixes( + @builtin(global_invocation_id) global_id: vec3, + @builtin(local_invocation_id) local_id: vec3, + @builtin(workgroup_id) workgroup_id: vec3 +) { + let tid = local_id.x; + let gid = global_id.x; + let block_id = workgroup_id.x; + let n = scan_uniforms.data_size; + + // Get the prefix for this block (sum of all previous blocks) + let block_prefix = block_sums[block_id]; + + // Add block prefix to each element in this block + let idx = gid; + if (idx < n) { + scan_output[idx] = scan_output[idx] + block_prefix; + } +} \ No newline at end of file diff --git a/src/sorting/RadixSorter.ts b/src/sorting/RadixSorter.ts index 3fae6df..a3c6403 100644 --- a/src/sorting/RadixSorter.ts +++ b/src/sorting/RadixSorter.ts @@ -4,36 +4,22 @@ import { BufferScope } from '../core/BufferScope'; import { SortResult, SortOptions } from '../shared/types'; import { ShaderCompilationError } from '../core/errors'; import { Validator } from '../core/Validator'; +import { ScanModule } from './scan/ScanModule'; import radixShaderCode from 
'../shaders/radix.wgsl?raw'; import { WORKGROUP_SIZE, RADIX, BITS_PER_PASS, NUM_PASSES } from '../shared/constants'; -/** - * IMPORTANT: These constants must match the values in src/shaders/radix.wgsl - * @see src/shaders/radix.wgsl:14-15 - const WORKGROUP_SIZE: u32 = 256u; const RADIX: u32 = 16u; - * @see src/shaders/radix.wgsl:16 - const SCAN_WORKGROUP_SIZE: u32 = 256u; - */ - -/** Size for Blelloch scan workgroups */ -const SCAN_WORKGROUP_SIZE = 256; -/** Elements processed per scan workgroup (each thread handles 2 elements) */ -const ELEMENTS_PER_SCAN_BLOCK = SCAN_WORKGROUP_SIZE * 2; - /** * GPU-accelerated Radix Sort implementation with GPU-based prefix sum */ export class RadixSorter { private device: GPUDevice; private bufferManager: BufferManager; + private scanModule: ScanModule; + private histogramPipeline: GPUComputePipeline | null = null; private scatterPipeline: GPUComputePipeline | null = null; private bindGroupLayout: GPUBindGroupLayout | null = null; - // Blelloch scan pipelines - private blellochScanPipeline: GPUComputePipeline | null = null; - private scanBlockSumsPipeline: GPUComputePipeline | null = null; - private addBlockPrefixesPipeline: GPUComputePipeline | null = null; - private scanBindGroupLayout: GPUBindGroupLayout | null = null; - // Preallocation state private preallocatedBuffers: { input: GPUBuffer; @@ -50,6 +36,7 @@ export class RadixSorter { constructor(context: GPUContext) { this.device = context.getDevice(); this.bufferManager = new BufferManager(this.device); + this.scanModule = new ScanModule(context); } /** @@ -68,9 +55,10 @@ export class RadixSorter { // Release any existing preallocation this.clearPreallocation(); + const { elementsPerScanBlock } = ScanModule.getConstants(); const numWorkgroups = Math.ceil(maxSize / WORKGROUP_SIZE); const histogramSize = RADIX * numWorkgroups; - const numScanBlocks = Math.ceil(histogramSize / ELEMENTS_PER_SCAN_BLOCK); + const numScanBlocks = Math.ceil(histogramSize / elementsPerScanBlock); 
this.preallocatedBuffers = { input: this.device.createBuffer({ @@ -171,155 +159,12 @@ export class RadixSorter { }, }); - // Create scan bind group layout for Blelloch scan - this.scanBindGroupLayout = this.device.createBindGroupLayout({ - label: 'scan-bind-group-layout', - entries: [ - { binding: 0, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'read-only-storage' } }, - { binding: 1, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'storage' } }, - { binding: 2, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'storage' } }, - { binding: 3, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'uniform' } }, - ], - }); - - const scanPipelineLayout = this.device.createPipelineLayout({ - label: 'scan-pipeline-layout', - bindGroupLayouts: [this.scanBindGroupLayout], - }); - - // Create Blelloch scan pipelines - this.blellochScanPipeline = this.device.createComputePipeline({ - label: 'blelloch-scan-pipeline', - layout: scanPipelineLayout, - compute: { - module: shaderModule, - entryPoint: 'blelloch_scan', - }, - }); - - this.scanBlockSumsPipeline = this.device.createComputePipeline({ - label: 'scan-block-sums-pipeline', - layout: scanPipelineLayout, - compute: { - module: shaderModule, - entryPoint: 'scan_block_sums', - }, - }); - - this.addBlockPrefixesPipeline = this.device.createComputePipeline({ - label: 'add-block-prefixes-pipeline', - layout: scanPipelineLayout, - compute: { - module: shaderModule, - entryPoint: 'add_block_prefixes', - }, - }); + // Initialize scan module + await this.scanModule.initialize(); this.initialized = true; } - /** - * Compute exclusive prefix sum on GPU using Blelloch scan - * Uses a two-level scan for large histograms: - * 1. Local scan within each workgroup - * 2. Scan of block sums - * 3. 
Add block prefixes to local results - */ - private computePrefixSumGPU( - inputBuffer: GPUBuffer, - outputBuffer: GPUBuffer, - blockSumsBuffer: GPUBuffer, - scanUniformBuffer: GPUBuffer, - dataSize: number - ): void { - const scanBindGroupLayout = this.scanBindGroupLayout; - const blellochPipeline = this.blellochScanPipeline; - const scanBlockSumsPipeline = this.scanBlockSumsPipeline; - const addBlockPrefixesPipeline = this.addBlockPrefixesPipeline; - - if ( - !scanBindGroupLayout || - !blellochPipeline || - !scanBlockSumsPipeline || - !addBlockPrefixesPipeline - ) { - throw new ShaderCompilationError('Scan pipelines not initialized'); - } - - // Calculate number of scan blocks - const numScanBlocks = Math.ceil(dataSize / ELEMENTS_PER_SCAN_BLOCK); - - // Update scan uniforms - const scanUniformData = new Uint32Array([dataSize, numScanBlocks, 0, 0]); - this.device.queue.writeBuffer(scanUniformBuffer, 0, scanUniformData); - - // Step 1: Local Blelloch scan within each workgroup - { - const bindGroup = this.device.createBindGroup({ - label: 'blelloch-scan-bind-group', - layout: scanBindGroupLayout, - entries: [ - { binding: 0, resource: { buffer: inputBuffer } }, - { binding: 1, resource: { buffer: outputBuffer } }, - { binding: 2, resource: { buffer: blockSumsBuffer } }, - { binding: 3, resource: { buffer: scanUniformBuffer } }, - ], - }); - - const commandEncoder = this.device.createCommandEncoder(); - const passEncoder = commandEncoder.beginComputePass(); - passEncoder.setPipeline(blellochPipeline); - passEncoder.setBindGroup(0, bindGroup); - passEncoder.dispatchWorkgroups(numScanBlocks); - passEncoder.end(); - this.device.queue.submit([commandEncoder.finish()]); - } - - // Step 2: Scan the block sums (if more than one block) - if (numScanBlocks > 1) { - const bindGroup = this.device.createBindGroup({ - label: 'scan-block-sums-bind-group', - layout: scanBindGroupLayout, - entries: [ - { binding: 0, resource: { buffer: blockSumsBuffer } }, - { binding: 1, resource: 
{ buffer: blockSumsBuffer } }, - { binding: 2, resource: { buffer: blockSumsBuffer } }, - { binding: 3, resource: { buffer: scanUniformBuffer } }, - ], - }); - - const commandEncoder = this.device.createCommandEncoder(); - const passEncoder = commandEncoder.beginComputePass(); - passEncoder.setPipeline(scanBlockSumsPipeline); - passEncoder.setBindGroup(0, bindGroup); - passEncoder.dispatchWorkgroups(1); - passEncoder.end(); - this.device.queue.submit([commandEncoder.finish()]); - - // Step 3: Add block prefixes to each block's local results - { - const bindGroup = this.device.createBindGroup({ - label: 'add-block-prefixes-bind-group', - layout: scanBindGroupLayout, - entries: [ - { binding: 0, resource: { buffer: inputBuffer } }, - { binding: 1, resource: { buffer: outputBuffer } }, - { binding: 2, resource: { buffer: blockSumsBuffer } }, - { binding: 3, resource: { buffer: scanUniformBuffer } }, - ], - }); - - const commandEncoder = this.device.createCommandEncoder(); - const passEncoder = commandEncoder.beginComputePass(); - passEncoder.setPipeline(addBlockPrefixesPipeline); - passEncoder.setBindGroup(0, bindGroup); - passEncoder.dispatchWorkgroups(numScanBlocks); - passEncoder.end(); - this.device.queue.submit([commandEncoder.finish()]); - } - } - } - /** * Sort an array using GPU radix sort * @param data - The array to sort @@ -340,9 +185,10 @@ export class RadixSorter { }; } + const { elementsPerScanBlock } = ScanModule.getConstants(); const numWorkgroups = Math.ceil(size / WORKGROUP_SIZE); const histogramSize = RADIX * numWorkgroups; - const numScanBlocks = Math.ceil(histogramSize / ELEMENTS_PER_SCAN_BLOCK); + const numScanBlocks = Math.ceil(histogramSize / elementsPerScanBlock); // Check if preallocated buffers can be used const preallocatedBuffers = this.preallocatedBuffers; @@ -479,7 +325,7 @@ export class RadixSorter { } // Step 2: Compute prefix sum on GPU using Blelloch scan - this.computePrefixSumGPU( + this.scanModule.computePrefixSumGPU( 
histogramBuffer, prefixSumBuffer, blockSumsBuffer, @@ -550,13 +396,10 @@ export class RadixSorter { destroy(): void { this.clearPreallocation(); this.bufferManager.releaseAll(); + this.scanModule.destroy(); this.histogramPipeline = null; this.scatterPipeline = null; this.bindGroupLayout = null; - this.blellochScanPipeline = null; - this.scanBlockSumsPipeline = null; - this.addBlockPrefixesPipeline = null; - this.scanBindGroupLayout = null; this.initialized = false; } } diff --git a/src/sorting/scan/ScanModule.ts b/src/sorting/scan/ScanModule.ts new file mode 100644 index 0000000..e5eb800 --- /dev/null +++ b/src/sorting/scan/ScanModule.ts @@ -0,0 +1,324 @@ +/** + * ScanModule - GPU-based exclusive prefix sum (Blelloch scan) + * + * This module provides a dedicated scan interface for computing + * exclusive prefix sums on the GPU using the Blelloch algorithm. + */ + +import { GPUContext } from '../../core/GPUContext'; +import { BufferManager } from '../../core/BufferManager'; +import { BufferScope } from '../../core/BufferScope'; +import { ShaderCompilationError } from '../../core/errors'; +import scanShaderCode from '../../shaders/scan.wgsl?raw'; + +/** Size for Blelloch scan workgroups */ +const SCAN_WORKGROUP_SIZE = 256; +/** Elements processed per scan workgroup (each thread handles 2 elements) */ +const ELEMENTS_PER_SCAN_BLOCK = SCAN_WORKGROUP_SIZE * 2; + +/** + * GPU-based exclusive prefix sum module using Blelloch scan + */ +export class ScanModule { + private device: GPUDevice; + private bufferManager: BufferManager; + + // Scan pipelines + private blellochScanPipeline: GPUComputePipeline | null = null; + private scanBlockSumsPipeline: GPUComputePipeline | null = null; + private addBlockPrefixesPipeline: GPUComputePipeline | null = null; + private scanBindGroupLayout: GPUBindGroupLayout | null = null; + + private initialized = false; + + constructor(context: GPUContext) { + this.device = context.getDevice(); + this.bufferManager = new 
BufferManager(this.device); + } + + /** + * Initialize scan pipelines + */ + async initialize(): Promise { + if (this.initialized) return; + + const shaderModule = this.device.createShaderModule({ + label: 'scan-shader', + code: scanShaderCode, + }); + + const compilationInfo = await shaderModule.getCompilationInfo(); + const errors = compilationInfo.messages.filter((m) => m.type === 'error'); + if (errors.length > 0) { + throw new ShaderCompilationError( + `Scan shader compilation failed: ${errors.map((e) => e.message).join(', ')}` + ); + } + + // Create scan bind group layout + this.scanBindGroupLayout = this.device.createBindGroupLayout({ + label: 'scan-bind-group-layout', + entries: [ + { binding: 0, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'read-only-storage' } }, + { binding: 1, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'storage' } }, + { binding: 2, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'storage' } }, + { binding: 3, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'uniform' } }, + ], + }); + + const scanPipelineLayout = this.device.createPipelineLayout({ + label: 'scan-pipeline-layout', + bindGroupLayouts: [this.scanBindGroupLayout], + }); + + // Create Blelloch scan pipelines + this.blellochScanPipeline = this.device.createComputePipeline({ + label: 'blelloch-scan-pipeline', + layout: scanPipelineLayout, + compute: { + module: shaderModule, + entryPoint: 'blelloch_scan', + }, + }); + + this.scanBlockSumsPipeline = this.device.createComputePipeline({ + label: 'scan-block-sums-pipeline', + layout: scanPipelineLayout, + compute: { + module: shaderModule, + entryPoint: 'scan_block_sums', + }, + }); + + this.addBlockPrefixesPipeline = this.device.createComputePipeline({ + label: 'add-block-prefixes-pipeline', + layout: scanPipelineLayout, + compute: { + module: shaderModule, + entryPoint: 'add_block_prefixes', + }, + }); + + this.initialized = true; + } + + /** + * Compute exclusive prefix sum on GPU using Blelloch scan + * 
+ * Uses a two-level scan for large arrays: + * 1. Local scan within each workgroup + * 2. Scan of block sums + * 3. Add block prefixes to local results + * + * @param input - Input array to compute prefix sum for + * @returns Exclusive prefix sum of input + */ + async computeExclusivePrefixSum(input: Uint32Array): Promise { + if (!this.initialized) { + throw new ShaderCompilationError('ScanModule not initialized. Call initialize() first.'); + } + + const dataSize = input.length; + + // Handle edge cases + if (dataSize === 0) { + return new Uint32Array(0); + } + + if (dataSize === 1) { + return new Uint32Array([0]); + } + + const numScanBlocks = Math.ceil(dataSize / ELEMENTS_PER_SCAN_BLOCK); + const bufferScope = new BufferScope(); + + try { + // Create buffers + const inputBuffer = bufferScope.track( + this.bufferManager.createStorageBuffer(input, 'scan-input'), + (buffer) => this.bufferManager.releaseBuffer(buffer) + ); + + const outputBuffer = bufferScope.track( + this.device.createBuffer({ + label: 'scan-output', + size: BufferManager.alignSize(dataSize * 4, 4), + usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST, + }) + ); + + const blockSumsBuffer = bufferScope.track( + this.device.createBuffer({ + label: 'scan-block-sums', + size: BufferManager.alignSize(numScanBlocks * 4, 4), + usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST, + }) + ); + + const uniformBuffer = bufferScope.track( + this.bufferManager.createUniformBuffer(16, 'scan-uniforms'), + (buffer) => this.bufferManager.releaseBuffer(buffer) + ); + + // Perform the scan + this.computePrefixSumGPU(inputBuffer, outputBuffer, blockSumsBuffer, uniformBuffer, dataSize); + + // Wait for GPU to finish + await this.device.queue.onSubmittedWorkDone(); + + // Read results + const result = await this.bufferManager.readBuffer(outputBuffer, dataSize * 4); + + return result; + } finally { + bufferScope.releaseAll(); + } + } + + /** + * Internal method 
to compute prefix sum on GPU + * + * This is exposed for use by RadixSorter which already has buffers allocated. + */ + computePrefixSumGPU( + inputBuffer: GPUBuffer, + outputBuffer: GPUBuffer, + blockSumsBuffer: GPUBuffer, + uniformBuffer: GPUBuffer, + dataSize: number + ): void { + const scanBindGroupLayout = this.scanBindGroupLayout; + const blellochPipeline = this.blellochScanPipeline; + const scanBlockSumsPipeline = this.scanBlockSumsPipeline; + const addBlockPrefixesPipeline = this.addBlockPrefixesPipeline; + + if ( + !scanBindGroupLayout || + !blellochPipeline || + !scanBlockSumsPipeline || + !addBlockPrefixesPipeline + ) { + throw new ShaderCompilationError('Scan pipelines not initialized'); + } + + // Calculate number of scan blocks + const numScanBlocks = Math.ceil(dataSize / ELEMENTS_PER_SCAN_BLOCK); + + // Update scan uniforms + const scanUniformData = new Uint32Array([dataSize, numScanBlocks, 0, 0]); + this.device.queue.writeBuffer(uniformBuffer, 0, scanUniformData); + + // Step 1: Local Blelloch scan within each workgroup + { + const bindGroup = this.device.createBindGroup({ + label: 'blelloch-scan-bind-group', + layout: scanBindGroupLayout, + entries: [ + { binding: 0, resource: { buffer: inputBuffer } }, + { binding: 1, resource: { buffer: outputBuffer } }, + { binding: 2, resource: { buffer: blockSumsBuffer } }, + { binding: 3, resource: { buffer: uniformBuffer } }, + ], + }); + + const commandEncoder = this.device.createCommandEncoder(); + const passEncoder = commandEncoder.beginComputePass(); + passEncoder.setPipeline(blellochPipeline); + passEncoder.setBindGroup(0, bindGroup); + passEncoder.dispatchWorkgroups(numScanBlocks); + passEncoder.end(); + this.device.queue.submit([commandEncoder.finish()]); + } + + // Step 2: Scan the block sums (if more than one block) + if (numScanBlocks > 1) { + const bindGroup = this.device.createBindGroup({ + label: 'scan-block-sums-bind-group', + layout: scanBindGroupLayout, + entries: [ + { binding: 0, resource: 
{ buffer: blockSumsBuffer } }, + { binding: 1, resource: { buffer: blockSumsBuffer } }, + { binding: 2, resource: { buffer: blockSumsBuffer } }, + { binding: 3, resource: { buffer: uniformBuffer } }, + ], + }); + + const commandEncoder = this.device.createCommandEncoder(); + const passEncoder = commandEncoder.beginComputePass(); + passEncoder.setPipeline(scanBlockSumsPipeline); + passEncoder.setBindGroup(0, bindGroup); + passEncoder.dispatchWorkgroups(1); + passEncoder.end(); + this.device.queue.submit([commandEncoder.finish()]); + + // Step 3: Add block prefixes to each block's local results + { + const bindGroup = this.device.createBindGroup({ + label: 'add-block-prefixes-bind-group', + layout: scanBindGroupLayout, + entries: [ + { binding: 0, resource: { buffer: inputBuffer } }, + { binding: 1, resource: { buffer: outputBuffer } }, + { binding: 2, resource: { buffer: blockSumsBuffer } }, + { binding: 3, resource: { buffer: uniformBuffer } }, + ], + }); + + const commandEncoder = this.device.createCommandEncoder(); + const passEncoder = commandEncoder.beginComputePass(); + passEncoder.setPipeline(addBlockPrefixesPipeline); + passEncoder.setBindGroup(0, bindGroup); + passEncoder.dispatchWorkgroups(numScanBlocks); + passEncoder.end(); + this.device.queue.submit([commandEncoder.finish()]); + } + } + } + + /** + * Get the scan bind group layout for external use + */ + getBindGroupLayout(): GPUBindGroupLayout | null { + return this.scanBindGroupLayout; + } + + /** + * Get the scan pipelines for external use + */ + getPipelines(): { + blellochScan: GPUComputePipeline | null; + scanBlockSums: GPUComputePipeline | null; + addBlockPrefixes: GPUComputePipeline | null; + } { + return { + blellochScan: this.blellochScanPipeline, + scanBlockSums: this.scanBlockSumsPipeline, + addBlockPrefixes: this.addBlockPrefixesPipeline, + }; + } + + /** + * Get constants for external use + */ + static getConstants(): { + scanWorkgroupSize: number; + elementsPerScanBlock: number; + } { + 
return { + scanWorkgroupSize: SCAN_WORKGROUP_SIZE, + elementsPerScanBlock: ELEMENTS_PER_SCAN_BLOCK, + }; + } + + /** + * Release all resources + */ + destroy(): void { + this.bufferManager.releaseAll(); + this.blellochScanPipeline = null; + this.scanBlockSumsPipeline = null; + this.addBlockPrefixesPipeline = null; + this.scanBindGroupLayout = null; + this.initialized = false; + } +} diff --git a/src/sorting/scan/index.ts b/src/sorting/scan/index.ts new file mode 100644 index 0000000..31825b9 --- /dev/null +++ b/src/sorting/scan/index.ts @@ -0,0 +1,5 @@ +/** + * Scan module exports + */ + +export { ScanModule } from './ScanModule'; diff --git a/test/sorting/ScanModule.test.ts b/test/sorting/ScanModule.test.ts new file mode 100644 index 0000000..1908bc7 --- /dev/null +++ b/test/sorting/ScanModule.test.ts @@ -0,0 +1,199 @@ +/** + * Tests for ScanModule interface + * The ScanModule provides GPU-based exclusive prefix sum (Blelloch scan) + */ + +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import { GPUContext } from '../../src/core/GPUContext'; +import { ScanModule } from '../../src/sorting/scan/ScanModule'; + +describe('ScanModule', () => { + let context: GPUContext; + let scanModule: ScanModule; + + beforeEach(async () => { + // Skip tests if WebGPU is not available + if (!GPUContext.isSupported()) { + return; + } + + context = new GPUContext(); + await context.initialize(); + scanModule = new ScanModule(context); + }); + + afterEach(() => { + if (scanModule) { + scanModule.destroy(); + } + if (context) { + context.destroy(); + } + }); + + describe('initialization', () => { + it('should initialize successfully', async () => { + if (!GPUContext.isSupported()) { + return; // Skip in Node.js + } + + await expect(scanModule.initialize()).resolves.not.toThrow(); + }); + + it('should be idempotent - calling initialize twice should not throw', async () => { + if (!GPUContext.isSupported()) { + return; + } + + await scanModule.initialize(); + await 
expect(scanModule.initialize()).resolves.not.toThrow(); + }); + }); + + describe('computeExclusivePrefixSum', () => { + beforeEach(async () => { + if (!GPUContext.isSupported()) { + return; + } + await scanModule.initialize(); + }); + + it('should handle single element input', async () => { + if (!GPUContext.isSupported()) { + return; + } + + const input = new Uint32Array([5]); + const result = await scanModule.computeExclusivePrefixSum(input); + + expect(result.length).toBe(1); + expect(result[0]).toBe(0); // Exclusive prefix sum of [5] is [0] + }); + + it('should compute correct prefix sum for small arrays', async () => { + if (!GPUContext.isSupported()) { + return; + } + + const input = new Uint32Array([3, 1, 7, 0]); + const result = await scanModule.computeExclusivePrefixSum(input); + + // Exclusive prefix sum: [0, 3, 4, 11] + expect(result.length).toBe(4); + expect(result[0]).toBe(0); + expect(result[1]).toBe(3); + expect(result[2]).toBe(4); + expect(result[3]).toBe(11); + }); + + it('should handle array of zeros', async () => { + if (!GPUContext.isSupported()) { + return; + } + + const input = new Uint32Array([0, 0, 0, 0]); + const result = await scanModule.computeExclusivePrefixSum(input); + + expect(result).toEqual(new Uint32Array([0, 0, 0, 0])); + }); + + it('should handle larger arrays', async () => { + if (!GPUContext.isSupported()) { + return; + } + + // Create array [1, 2, 3, ..., 100] + const input = new Uint32Array(100); + for (let i = 0; i < 100; i++) { + input[i] = i + 1; + } + + const result = await scanModule.computeExclusivePrefixSum(input); + + // Verify: result[i] = sum of input[0..i-1] + let expectedSum = 0; + for (let i = 0; i < 100; i++) { + expect(result[i]).toBe(expectedSum); + expectedSum += input[i]; + } + }); + + it('should handle non-power-of-2 sized arrays', async () => { + if (!GPUContext.isSupported()) { + return; + } + + const input = new Uint32Array([1, 2, 3, 4, 5]); // 5 elements, not power of 2 + const result = await 
scanModule.computeExclusivePrefixSum(input); + + // Expected: [0, 1, 3, 6, 10] + expect(result).toEqual(new Uint32Array([0, 1, 3, 6, 10])); + }); + + it('should throw error if not initialized', async () => { + if (!GPUContext.isSupported()) { + return; + } + + // Create a new module without initializing + const uninitializedModule = new ScanModule(context); + + const input = new Uint32Array([1, 2, 3]); + + await expect(uninitializedModule.computeExclusivePrefixSum(input)).rejects.toThrow(); + }); + }); + + describe('destroy', () => { + it('should release resources without throwing', async () => { + if (!GPUContext.isSupported()) { + return; + } + + await scanModule.initialize(); + expect(() => scanModule.destroy()).not.toThrow(); + }); + + it('should be idempotent - calling destroy twice should not throw', async () => { + if (!GPUContext.isSupported()) { + return; + } + + await scanModule.initialize(); + scanModule.destroy(); + expect(() => scanModule.destroy()).not.toThrow(); + }); + }); + + describe('integration with RadixSorter', () => { + it('should produce correct histogram prefix sums for radix sort', async () => { + if (!GPUContext.isSupported()) { + return; + } + + await scanModule.initialize(); + + // Simulate a histogram that might be produced by RadixSorter + // For 4 workgroups, each bucket has 4 counts + const histogram = new Uint32Array(16 * 4); // RADIX=16, 4 workgroups + for (let bucket = 0; bucket < 16; bucket++) { + for (let wg = 0; wg < 4; wg++) { + // Each bucket gets some counts + histogram[bucket * 4 + wg] = bucket + wg; + } + } + + const result = await scanModule.computeExclusivePrefixSum(histogram); + + // Verify correctness using simple reference + const expected = new Uint32Array(histogram.length); + let sum = 0; + for (let i = 0; i < histogram.length; i++) { + expected[i] = sum; + sum += histogram[i]; + } + + expect(result).toEqual(expected); + }); + }); +}); From 66f8ae4cecb79f85e21426af544ffe817143e523 Mon Sep 17 00:00:00 2001 From: 
shijiashuai Date: Fri, 22 May 2026 16:41:51 +0800 Subject: [PATCH 2/2] chore: archive skipped changes and fix browser test setup - Archive demo-benchmark-orchestration (over-engineering for demo project) - Archive doc-implementation-sync (core work completed) - Fix playwright.config.ts: add WebGPU flags and testMatch pattern - Fix ScanModule: batch GPU commands in single command encoder - Fix fixtures: navigate to Vite dev server before tests - Update browser-e2e-testing tasks with blocker note Browser tests fail on SwiftShader (headless Chromium) due to incomplete WebGPU compute shader support. Node.js tests pass (103/103). --- .../.openspec.yaml | 2 + .../design.md | 39 +++++++++++++++++++ .../proposal.md | 27 +++++++++++++ .../specs/demo-orchestration/spec.md | 24 ++++++++++++ .../specs/sorting/spec.md | 15 +++++++ .../tasks.md | 17 ++++++++ .../proposal.md | 0 .../2026-05-doc-implementation-sync}/tasks.md | 0 openspec/changes/browser-e2e-testing/tasks.md | 3 +- playwright.config.ts | 15 ++++++- src/sorting/scan/ScanModule.ts | 12 +++--- test/browser/fixtures.ts | 5 +++ 12 files changed, 150 insertions(+), 9 deletions(-) create mode 100644 openspec/archive/2026-05-demo-benchmark-orchestration/.openspec.yaml create mode 100644 openspec/archive/2026-05-demo-benchmark-orchestration/design.md create mode 100644 openspec/archive/2026-05-demo-benchmark-orchestration/proposal.md create mode 100644 openspec/archive/2026-05-demo-benchmark-orchestration/specs/demo-orchestration/spec.md create mode 100644 openspec/archive/2026-05-demo-benchmark-orchestration/specs/sorting/spec.md create mode 100644 openspec/archive/2026-05-demo-benchmark-orchestration/tasks.md rename openspec/{changes/doc-implementation-sync => archive/2026-05-doc-implementation-sync}/proposal.md (100%) rename openspec/{changes/doc-implementation-sync => archive/2026-05-doc-implementation-sync}/tasks.md (100%) diff --git a/openspec/archive/2026-05-demo-benchmark-orchestration/.openspec.yaml 
b/openspec/archive/2026-05-demo-benchmark-orchestration/.openspec.yaml new file mode 100644 index 0000000..4a1c677 --- /dev/null +++ b/openspec/archive/2026-05-demo-benchmark-orchestration/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-05-22 diff --git a/openspec/archive/2026-05-demo-benchmark-orchestration/design.md b/openspec/archive/2026-05-demo-benchmark-orchestration/design.md new file mode 100644 index 0000000..e157aeb --- /dev/null +++ b/openspec/archive/2026-05-demo-benchmark-orchestration/design.md @@ -0,0 +1,39 @@ +## Context + +The current demo path spans `src/main.ts` and `src/benchmark/Benchmark.ts`, but the seam is shallow. `Benchmark` owns both pure math and stateful execution, while `main.ts` owns DOM lookup, status updates, progress, validation, and direct sorter construction. + +This change deepens the orchestration path by making execution and presentation explicit modules with narrow interfaces. + +## Goals / Non-Goals + +**Goals:** + +- Make benchmark execution unit-testable without DOM or real WebGPU. +- Make demo UI flow unit-testable without `document`. +- Keep `main.ts` as a thin bootstrap file. + +**Non-Goals:** + +- Redesign `GPUContext` again; that lands in `injectable-gpu-runtime`. +- Change benchmark outputs or visible UI behavior. +- Extract radix prefix-sum in this change. + +## Decisions + +### 1. Split pure benchmark helpers from execution + +Keep `Benchmark` for `calculateSpeedup`, `calculateAverage`, and `formatResults`. Move `runSingle` and `runAll` into `BenchmarkRunner`. + +### 2. Add a `DemoController` seam + +Move run-button and run-all orchestration, progress updates, validation, and error handling into a controller with injected dependencies. + +### 3. Isolate DOM access behind `DomView` + +Create a concrete adapter for DOM reads/writes so tests can drive orchestration with fake views. + +## Risks / Trade-offs + +- **More files** -> Accept smaller modules to gain locality and test seams. 
+- **Public benchmark API churn** -> Preserve helper exports and document runner extraction in the change. +- **Controller/view split may feel verbose** -> The leverage is deterministic tests and a thinner bootstrap. diff --git a/openspec/archive/2026-05-demo-benchmark-orchestration/proposal.md b/openspec/archive/2026-05-demo-benchmark-orchestration/proposal.md new file mode 100644 index 0000000..4cd85de --- /dev/null +++ b/openspec/archive/2026-05-demo-benchmark-orchestration/proposal.md @@ -0,0 +1,27 @@ +## Why + +`main.ts` and `Benchmark` still braid DOM wiring, benchmark sequencing, sorter lifecycle, validation, and presentation into shallow modules. That makes the demo flow hard to unit-test and keeps UI/runtime knowledge spread across multiple call sites. + +## What Changes + +- Extract benchmark execution into a dedicated `BenchmarkRunner` seam with injected sorter factory, random data provider, and clock. +- Extract demo orchestration into a `DemoController` seam with a `DomView` adapter for DOM reads/writes. +- Shrink `main.ts` to bootstrap only and keep `Benchmark` focused on pure formatting/math helpers. +- Add unit tests for orchestration, progress/status flow, validation flow, and error handling without requiring real WebGPU. + +## Capabilities + +### New Capabilities + +- `demo-orchestration`: Deep orchestration seam for demo and benchmark flows, separating controller logic from DOM adapters and benchmark execution. + +### Modified Capabilities + +- `sorting`: Benchmark execution moves behind an injected runner while preserving existing benchmark behavior. + +## Impact + +- **Affected code:** `src/main.ts`, `src/benchmark/Benchmark.ts`, new `src/benchmark/BenchmarkRunner.ts`, new `src/demo/` files, browser/unit tests +- **Affected APIs:** `Benchmark` likely loses execution methods; orchestration moves into new modules. +- **Dependencies:** No new packages. +- **Systems:** Demo UI, benchmark execution, validation flow, and public benchmark surface. 
diff --git a/openspec/archive/2026-05-demo-benchmark-orchestration/specs/demo-orchestration/spec.md b/openspec/archive/2026-05-demo-benchmark-orchestration/specs/demo-orchestration/spec.md new file mode 100644 index 0000000..dbbd627 --- /dev/null +++ b/openspec/archive/2026-05-demo-benchmark-orchestration/specs/demo-orchestration/spec.md @@ -0,0 +1,24 @@ +## ADDED Requirements + +### Requirement: Demo orchestration uses injected seams + +The demo benchmark flow SHALL execute through controller and view seams so orchestration can be tested without direct DOM or WebGPU dependencies. + +#### Scenario: Controller drives benchmark execution + +- **WHEN** a user starts a single benchmark or the full suite +- **THEN** the controller SHALL coordinate progress, status, validation, and result reporting through injected runner and view interfaces + +#### Scenario: DOM access stays in the view adapter + +- **WHEN** the demo needs to read controls or update status, progress, and results +- **THEN** those DOM operations SHALL be isolated behind a concrete view adapter + +### Requirement: Benchmark execution is a separate runner + +Benchmark execution SHALL live behind a runner seam distinct from pure formatting and math helpers. 
+ +#### Scenario: Runner executes benchmark cases + +- **WHEN** benchmark execution is requested +- **THEN** the runner SHALL use injected sorter factories, random data providers, and clocks to produce benchmark results diff --git a/openspec/archive/2026-05-demo-benchmark-orchestration/specs/sorting/spec.md b/openspec/archive/2026-05-demo-benchmark-orchestration/specs/sorting/spec.md new file mode 100644 index 0000000..53e6feb --- /dev/null +++ b/openspec/archive/2026-05-demo-benchmark-orchestration/specs/sorting/spec.md @@ -0,0 +1,15 @@ +## MODIFIED Requirements + +### Requirement: Performance Benchmarking + +The benchmark system SHALL preserve its existing timing and reporting behavior while executing through a dedicated runner seam rather than the formatting helper module. + +#### Scenario: Runner preserves benchmark behavior + +- **WHEN** a caller runs a single benchmark or the default suite +- **THEN** the system SHALL still measure native and GPU paths, average timings, and report speedups as before + +#### Scenario: Formatting stays available independently + +- **WHEN** code needs to format benchmark results +- **THEN** it SHALL be able to do so without instantiating the execution runner diff --git a/openspec/archive/2026-05-demo-benchmark-orchestration/tasks.md b/openspec/archive/2026-05-demo-benchmark-orchestration/tasks.md new file mode 100644 index 0000000..62e1d73 --- /dev/null +++ b/openspec/archive/2026-05-demo-benchmark-orchestration/tasks.md @@ -0,0 +1,17 @@ +## 1. Test-first orchestration seams + +- [ ] 1.1 Add failing unit tests for `BenchmarkRunner` execution with injected sorter factory and random provider +- [ ] 1.2 Add failing unit tests for `DemoController` progress, validation, and error flow with a fake view + +## 2. 
Implementation + +- [ ] 2.1 Extract pure helpers into a slim `Benchmark` module +- [ ] 2.2 Add `BenchmarkRunner` and move benchmark execution there +- [ ] 2.3 Add `DemoController` and `DomView`, then shrink `main.ts` to bootstrap only + +## 3. Validation + +- [ ] 3.1 Run `npm run typecheck` +- [ ] 3.2 Run `npm run lint` +- [ ] 3.3 Run `npm run test` +- [ ] 3.4 Run `npm run build` diff --git a/openspec/changes/doc-implementation-sync/proposal.md b/openspec/archive/2026-05-doc-implementation-sync/proposal.md similarity index 100% rename from openspec/changes/doc-implementation-sync/proposal.md rename to openspec/archive/2026-05-doc-implementation-sync/proposal.md diff --git a/openspec/changes/doc-implementation-sync/tasks.md b/openspec/archive/2026-05-doc-implementation-sync/tasks.md similarity index 100% rename from openspec/changes/doc-implementation-sync/tasks.md rename to openspec/archive/2026-05-doc-implementation-sync/tasks.md diff --git a/openspec/changes/browser-e2e-testing/tasks.md b/openspec/changes/browser-e2e-testing/tasks.md index 943b01d..213a838 100644 --- a/openspec/changes/browser-e2e-testing/tasks.md +++ b/openspec/changes/browser-e2e-testing/tasks.md @@ -27,7 +27,7 @@ - [x] Create `test/browser/` directory - [x] Create `playwright.config.ts` - [x] Add test scripts to `package.json` -- [ ] Install Playwright browsers: `npx playwright install chromium` +- [x] Install Playwright browsers: `npx playwright install chromium` ## Phase 2: Test Infrastructure @@ -105,6 +105,7 @@ None. - Browser tests require a display (use xvfb in CI if needed) - WebGPU may not be available in all CI environments - Consider using `test.skip()` for GPU tests when WebGPU unavailable +- **Blocker (2026-05-22)**: System lacks `libasound.so.2` and other libraries required by Playwright's Chromium. 
Installing requires sudo: `npx playwright install-deps chromium` --- diff --git a/playwright.config.ts b/playwright.config.ts index 073f352..d3c8408 100644 --- a/playwright.config.ts +++ b/playwright.config.ts @@ -5,6 +5,7 @@ import { defineConfig, devices } from '@playwright/test'; */ export default defineConfig({ testDir: './test/browser', + testMatch: '**/*.e2e.ts', fullyParallel: true, forbidOnly: !!process.env.CI, retries: process.env.CI ? 2 : 0, @@ -18,11 +19,21 @@ export default defineConfig({ projects: [ { name: 'chromium', - use: { ...devices['Desktop Chrome'] }, + use: { + ...devices['Desktop Chrome'], + launchOptions: { + args: [ + '--enable-unsafe-webgpu', + '--enable-features=Vulkan', + '--use-vulkan=swiftshader', + '--disable-gpu-sandbox', + ], + }, + }, }, ], webServer: { - command: 'npm run dev', + command: 'npx vite --port 5173', url: 'http://localhost:5173', reuseExistingServer: !process.env.CI, timeout: 120000, diff --git a/src/sorting/scan/ScanModule.ts b/src/sorting/scan/ScanModule.ts index e5eb800..a4e36ee 100644 --- a/src/sorting/scan/ScanModule.ts +++ b/src/sorting/scan/ScanModule.ts @@ -208,6 +208,9 @@ export class ScanModule { const scanUniformData = new Uint32Array([dataSize, numScanBlocks, 0, 0]); this.device.queue.writeBuffer(uniformBuffer, 0, scanUniformData); + // Record every scan pass into one command encoder so the whole scan is sent to the queue in a single submit + const commandEncoder = this.device.createCommandEncoder(); + // Step 1: Local Blelloch scan within each workgroup { const bindGroup = this.device.createBindGroup({ @@ -221,13 +224,11 @@ export class ScanModule { ], }); - const commandEncoder = this.device.createCommandEncoder(); const passEncoder = commandEncoder.beginComputePass(); passEncoder.setPipeline(blellochPipeline); passEncoder.setBindGroup(0, bindGroup); passEncoder.dispatchWorkgroups(numScanBlocks); passEncoder.end(); - this.device.queue.submit([commandEncoder.finish()]); } // Step 2: Scan the block sums (if more than one block) @@ -243,13
+244,11 @@ export class ScanModule { ], }); - const commandEncoder = this.device.createCommandEncoder(); const passEncoder = commandEncoder.beginComputePass(); passEncoder.setPipeline(scanBlockSumsPipeline); passEncoder.setBindGroup(0, bindGroup); passEncoder.dispatchWorkgroups(1); passEncoder.end(); - this.device.queue.submit([commandEncoder.finish()]); // Step 3: Add block prefixes to each block's local results { @@ -264,15 +263,16 @@ export class ScanModule { ], }); - const commandEncoder = this.device.createCommandEncoder(); const passEncoder = commandEncoder.beginComputePass(); passEncoder.setPipeline(addBlockPrefixesPipeline); passEncoder.setBindGroup(0, bindGroup); passEncoder.dispatchWorkgroups(numScanBlocks); passEncoder.end(); - this.device.queue.submit([commandEncoder.finish()]); } } + + // Submit all commands together + this.device.queue.submit([commandEncoder.finish()]); } /** diff --git a/test/browser/fixtures.ts b/test/browser/fixtures.ts index 6453aa0..165a9ab 100644 --- a/test/browser/fixtures.ts +++ b/test/browser/fixtures.ts @@ -9,6 +9,11 @@ import { test as base, expect } from '@playwright/test'; export const test = base.extend<{ webgpuSupported: boolean; }>({ + // Navigate to the Vite dev server before each test + page: async ({ page, baseURL }, use) => { + await page.goto(baseURL || 'http://localhost:5173'); + await use(page); + }, webgpuSupported: async ({ page }, use) => { const isSupported = await page.evaluate(() => { return 'gpu' in navigator;