From b4cbe8b6e3cb37f4afa7a15c337b7b07853cda53 Mon Sep 17 00:00:00 2001
From: David Ndungu <ndungu.sink@gmail.com>
Date: Sat, 20 Jun 2026 14:36:23 -0700
Subject: [PATCH 1/5] feat(compute): CPU dropout op with deterministic Philox
 mask (BPB.3a)

Add an inverted-dropout op to ztensor with a deterministic, seedable mask:

- Philox4x32-10 counter-based RNG (compute/philox.go) keyed by (seed, element
  offset). The mask is a pure function of (seed, offset, p), so it is
  reproducible and will be bit-identical to the forthcoming CUDA kernel -- the
  property that makes CPU-GPU parity pass. The CPU Go impl is the reference;
  the GB10 kernel mirrors the same constants.
- Dropouter[T] optional capability interface (compute/engine.go): Dropout +
  DropoutBackward. Inverted-dropout semantics match torch.nn.functional.dropout:
  training mode y = x*mask/(1-p) with mask~Bernoulli(1-p); eval mode / p==0 is
  exact identity. p in [0,1) is validated.
- CPU implementation (compute/cpu_engine.go): the mask is recomputed in backward
  from (seed,p) rather than cached, keeping the op capture-safe and avoiding a
  pinned save across arena resets (ADR 006). Forward and backward share one
  masked-and-scaled kernel (dropout is linear in its input given the mask).
- EngineProxy delegates the capability; the parity StressEngine relocates the
  masked output into the host arena like every other op so dropout runs the
  reset-between-fwd-bwd schedules.

Gates (CPU half):
- gradcheck: Dropout OpInfo (p=0.3, fixed seed, [4,8]) -- the deterministic mask
  makes dropout an exact linear map, finite-diff == analytic backward. PASS.
- CPU-side parity (testing/parity arena-stress, RegistryGreen): PASS.
- PyTorch oracle: SkipReason -- torch's training-mode mask uses its own Philox
  word->element mapping, not ztensor's; matching it would mean reimplementing
  ztensor's Philox in the torch runner (the HadamardTransform precedent).
  Mask-vs-input math is pinned by gradcheck + parity instead.
- Unit tests: p=0 identity, eval identity, mask determinism, inverted-dropout
  mean preservation, backward==mask, invalid p. PASS under -race.

GPU kernel (GB10 f32) + GB10 parity/oracle replay are the next milestone.

Claude-Session: https://claude.ai/code/session_01So96MEV1hiThH4XqCd6rLH
---
 compute/cpu_engine.go           |  61 +++++++++
 compute/dropout_test.go         | 221 ++++++++++++++++++++++++++++++++
 compute/engine.go               |  25 ++++
 compute/engine_proxy.go         |  28 ++++
 compute/philox.go               |  82 ++++++++++++
 testing/gradcheck/ops.go        |  21 +++
 testing/gradcheck/registry.go   |  11 ++
 testing/oracle/generate_test.go |  13 +-
 testing/oracle/torchmap.go      |   3 +
 testing/parity/stress_engine.go |  22 ++++
 10 files changed, 485 insertions(+), 2 deletions(-)
 create mode 100644 compute/dropout_test.go
 create mode 100644 compute/philox.go

diff --git a/compute/cpu_engine.go b/compute/cpu_engine.go
index eac9b88..b0527fd 100644
--- a/compute/cpu_engine.go
+++ b/compute/cpu_engine.go
@@ -966,6 +966,67 @@ func (e *CPUEngine[T]) RandomUniform(_ context.Context, t *tensor.TensorNumeric[
 	return nil
 }
 
+// Dropout applies inverted dropout to a with a deterministic Philox mask keyed
+// by (seed, element offset). In training mode each element is kept with
+// probability (1-p) and scaled by 1/(1-p); dropped elements become zero. In
+// eval mode (training==false) or when p==0 the op is an exact identity copy.
+// The mask is a pure function of (seed, offset, p) and is recomputed in
+// DropoutBackward rather than cached, so no save survives across arena resets.
+func (e *CPUEngine[T]) Dropout(ctx context.Context, a *tensor.TensorNumeric[T], p float64, seed uint64, training bool, dst ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error) {
+	return e.dropoutMasked(ctx, a, p, seed, training, dst...)
+}
+
+// DropoutBackward propagates g through the same mask Dropout used for the
+// identical (p, seed, training). Because dropout is element-wise linear in its
+// input with the mask fixed, the backward is the same masked-and-scaled map
+// applied to the upstream gradient.
+func (e *CPUEngine[T]) DropoutBackward(ctx context.Context, g *tensor.TensorNumeric[T], p float64, seed uint64, training bool, dst ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error) {
+	return e.dropoutMasked(ctx, g, p, seed, training, dst...)
+}
+
+// dropoutMasked is the shared forward/backward kernel: out[i] = keep(i) ?
+// in[i]/(1-p) : 0 in training mode, out[i] = in[i] in eval mode. Forward and
+// backward are identical because dropout is linear in its input given the mask.
+func (e *CPUEngine[T]) dropoutMasked(ctx context.Context, a *tensor.TensorNumeric[T], p float64, seed uint64, training bool, dst ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error) {
+	if a == nil {
+		return nil, errors.New("input tensor cannot be nil")
+	}
+	if p < 0 || p >= 1 {
+		return nil, fmt.Errorf("dropout: p must be in [0, 1), got %g", p)
+	}
+	result, err := e.getOrCreateDest(a.Shape(), dst...)
+	if err != nil {
+		return nil, err
+	}
+	aData := a.Data()
+	rData := result.Data()
+	// Eval mode or p==0: exact identity copy (no RNG, no scaling).
+	if !training || p == 0 {
+		if err := parallelForCtx(ctx, len(aData), func(start, end int) {
+			for i := start; i < end; i++ { //nolint:intrange
+				rData[i] = aData[i]
+			}
+		}); err != nil {
+			return nil, err
+		}
+		return result, nil
+	}
+	scale := e.ops.FromFloat64(1.0 / (1.0 - p))
+	zero := e.ops.FromFloat64(0)
+	if err := parallelForCtx(ctx, len(aData), func(start, end int) {
+		for i := start; i < end; i++ { //nolint:intrange
+			if dropoutKeep(seed, uint64(i), p) {
+				rData[i] = e.ops.Mul(aData[i], scale)
+			} else {
+				rData[i] = zero
+			}
+		}
+	}); err != nil {
+		return nil, err
+	}
+	return result, nil
+}
+
 // Fill sets all elements of t to value.
 func (e *CPUEngine[T]) Fill(_ context.Context, t *tensor.TensorNumeric[T], value T) error {
 	if t == nil {
diff --git a/compute/dropout_test.go b/compute/dropout_test.go
new file mode 100644
index 0000000..02d067f
--- /dev/null
+++ b/compute/dropout_test.go
@@ -0,0 +1,221 @@
+package compute
+
+import (
+	"context"
+	"math"
+	"testing"
+
+	"github.com/zerfoo/ztensor/numeric"
+	"github.com/zerfoo/ztensor/tensor"
+)
+
+// TestPhiloxDeterminism: the same (seed, offset) always yields the same draw,
+// and different offsets/seeds yield different draws (so the mask is not constant).
+func TestPhiloxDeterminism(t *testing.T) {
+	const seed = uint64(0x9e3779b97f4a7c15)
+	for off := uint64(0); off < 16; off++ {
+		a := philoxUniform(seed, off)
+		b := philoxUniform(seed, off)
+		if a != b {
+			t.Fatalf("philoxUniform not deterministic at offset %d: %g != %g", off, a, b)
+		}
+		if a < 0 || a >= 1 {
+			t.Fatalf("philoxUniform offset %d out of [0,1): %g", off, a)
+		}
+	}
+	if philoxUniform(seed, 0) == philoxUniform(seed, 1) {
+		t.Fatal("philoxUniform identical for offsets 0 and 1 (mask would be constant)")
+	}
+	if philoxUniform(1, 0) == philoxUniform(2, 0) {
+		t.Fatal("philoxUniform identical for seeds 1 and 2")
+	}
+}
+
+// TestPhiloxUniformMean: over many draws the empirical mean is ~0.5, confirming
+// the [0,1) mapping is unbiased (so 1-p is the true keep rate).
+func TestPhiloxUniformMean(t *testing.T) {
+	const n = 1 << 16
+	var sum float64
+	for i := uint64(0); i < n; i++ {
+		sum += philoxUniform(0xdeadbeefcafef00d, i)
+	}
+	mean := sum / float64(n)
+	if math.Abs(mean-0.5) > 0.01 {
+		t.Fatalf("philox uniform mean %g not within 0.01 of 0.5", mean)
+	}
+}
+
+// TestDropout_EvalIdentity: eval mode returns an exact copy regardless of p.
+func TestDropout_EvalIdentity(t *testing.T) {
+	e := NewCPUEngine[float32](numeric.Float32Ops{})
+	ctx := context.Background()
+	x, err := tensor.New[float32]([]int{2, 4}, []float32{1, -2, 3, -4, 5, -6, 7, -8})
+	if err != nil {
+		t.Fatal(err)
+	}
+	y, err := e.Dropout(ctx, x, 0.5, 42, false)
+	if err != nil {
+		t.Fatalf("Dropout eval: %v", err)
+	}
+	for i, v := range y.Data() {
+		if v != x.Data()[i] {
+			t.Fatalf("eval mode not identity at %d: got %g want %g", i, v, x.Data()[i])
+		}
+	}
+}
+
+// TestDropout_PZeroIdentity: training mode with p=0 is exact identity (no scale,
+// no drop).
+func TestDropout_PZeroIdentity(t *testing.T) {
+	e := NewCPUEngine[float32](numeric.Float32Ops{})
+	ctx := context.Background()
+	x, err := tensor.New[float32]([]int{2, 4}, []float32{1, -2, 3, -4, 5, -6, 7, -8})
+	if err != nil {
+		t.Fatal(err)
+	}
+	y, err := e.Dropout(ctx, x, 0, 42, true)
+	if err != nil {
+		t.Fatalf("Dropout p=0: %v", err)
+	}
+	for i, v := range y.Data() {
+		if v != x.Data()[i] {
+			t.Fatalf("p=0 not identity at %d: got %g want %g", i, v, x.Data()[i])
+		}
+	}
+}
+
+// TestDropout_MaskDeterminism: same seed => identical output across calls; kept
+// elements are scaled by 1/(1-p), dropped ones are zero, and the same Philox
+// draw governs which is which.
+func TestDropout_MaskDeterminism(t *testing.T) {
+	e := NewCPUEngine[float32](numeric.Float32Ops{})
+	ctx := context.Background()
+	const p = 0.3
+	const seed = uint64(123456789)
+	n := 64
+	data := make([]float32, n)
+	for i := range data {
+		data[i] = float32(i + 1)
+	}
+	x, err := tensor.New[float32]([]int{8, 8}, data)
+	if err != nil {
+		t.Fatal(err)
+	}
+	y1, err := e.Dropout(ctx, x, p, seed, true)
+	if err != nil {
+		t.Fatal(err)
+	}
+	y2, err := e.Dropout(ctx, x, p, seed, true)
+	if err != nil {
+		t.Fatal(err)
+	}
+	scale := float32(1.0 / (1.0 - p))
+	keptSeen, dropSeen := false, false
+	for i := 0; i < n; i++ {
+		if y1.Data()[i] != y2.Data()[i] {
+			t.Fatalf("mask not deterministic at %d: %g != %g", i, y1.Data()[i], y2.Data()[i])
+		}
+		keep := dropoutKeep(seed, uint64(i), p)
+		var want float32
+		if keep {
+			want = x.Data()[i] * scale
+			keptSeen = true
+		} else {
+			dropSeen = true
+		}
+		if y1.Data()[i] != want {
+			t.Fatalf("element %d keep=%v: got %g want %g", i, keep, y1.Data()[i], want)
+		}
+	}
+	if !keptSeen || !dropSeen {
+		t.Fatalf("expected both kept and dropped elements (kept=%v drop=%v)", keptSeen, dropSeen)
+	}
+}
+
+// TestDropout_ExpectedMeanScale: E[y] == E[x] under inverted dropout because the
+// 1/(1-p) scale compensates for the (1-p) keep rate. Checked over a large
+// constant input so the empirical mean is tightly concentrated.
+func TestDropout_ExpectedMeanScale(t *testing.T) {
+	e := NewCPUEngine[float32](numeric.Float32Ops{})
+	ctx := context.Background()
+	const p = 0.4
+	n := 1 << 14
+	data := make([]float32, n)
+	for i := range data {
+		data[i] = 2.0
+	}
+	x, err := tensor.New[float32]([]int{n}, data)
+	if err != nil {
+		t.Fatal(err)
+	}
+	y, err := e.Dropout(ctx, x, p, 0xabcdef, true)
+	if err != nil {
+		t.Fatal(err)
+	}
+	var sum float64
+	for _, v := range y.Data() {
+		sum += float64(v)
+	}
+	mean := sum / float64(n)
+	if math.Abs(mean-2.0) > 0.05 {
+		t.Fatalf("inverted-dropout mean %g not within 0.05 of input mean 2.0", mean)
+	}
+}
+
+// TestDropout_BackwardMatchesMask: backward applies the SAME mask/scale as
+// forward (dropout is linear in its input given the mask), and recomputes the
+// mask deterministically rather than relying on a cached one.
+func TestDropout_BackwardMatchesMask(t *testing.T) {
+	e := NewCPUEngine[float32](numeric.Float32Ops{})
+	ctx := context.Background()
+	const p = 0.25
+	const seed = uint64(777)
+	n := 32
+	g := make([]float32, n)
+	for i := range g {
+		g[i] = float32(i) - 16
+	}
+	gt, err := tensor.New[float32]([]int{4, 8}, g)
+	if err != nil {
+		t.Fatal(err)
+	}
+	dx, err := e.DropoutBackward(ctx, gt, p, seed, true)
+	if err != nil {
+		t.Fatal(err)
+	}
+	scale := float32(1.0 / (1.0 - p))
+	for i := 0; i < n; i++ {
+		var want float32
+		if dropoutKeep(seed, uint64(i), p) {
+			want = g[i] * scale
+		}
+		if dx.Data()[i] != want {
+			t.Fatalf("backward element %d: got %g want %g", i, dx.Data()[i], want)
+		}
+	}
+	// Eval-mode backward is a pass-through.
+	dxe, err := e.DropoutBackward(ctx, gt, p, seed, false)
+	if err != nil {
+		t.Fatal(err)
+	}
+	for i := 0; i < n; i++ {
+		if dxe.Data()[i] != g[i] {
+			t.Fatalf("eval backward not pass-through at %d: got %g want %g", i, dxe.Data()[i], g[i])
+		}
+	}
+}
+
+// TestDropout_InvalidP: p outside [0,1) is rejected.
+func TestDropout_InvalidP(t *testing.T) {
+	e := NewCPUEngine[float32](numeric.Float32Ops{})
+	ctx := context.Background()
+	x, err := tensor.New[float32]([]int{2}, []float32{1, 2})
+	if err != nil {
+		t.Fatal(err)
+	}
+	for _, p := range []float64{-0.1, 1.0, 1.5} {
+		if _, err := e.Dropout(ctx, x, p, 0, true); err == nil {
+			t.Fatalf("expected error for p=%g, got nil", p)
+		}
+	}
+}
diff --git a/compute/engine.go b/compute/engine.go
index 911576d..372788c 100644
--- a/compute/engine.go
+++ b/compute/engine.go
@@ -61,6 +61,31 @@ type TransposeAMatMuler[T tensor.Numeric] interface {
 	MatMulTransposeA(ctx context.Context, a, b *tensor.TensorNumeric[T], dst ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error)
 }
 
+// Dropouter is an optional interface for engines that provide a dropout op
+// with a deterministic, seedable mask. The mask is drawn from a counter-based
+// Philox RNG keyed by (seed, element offset), so it is bit-identical on the CPU
+// engine and the GPU engine for the same (seed, p) -- which is what makes the
+// CPU-GPU parity and PyTorch-oracle gates pass. Inverted-dropout semantics
+// match torch.nn.functional.dropout: in training mode the kept elements are
+// scaled by 1/(1-p); in eval mode (or p==0) the op is exact identity.
+//
+// The mask is NOT cached for backward; DropoutBackward recomputes it from the
+// same (seed, p), keeping the op capture-safe and avoiding a pinned save across
+// arena resets (ztensor ADR 006).
+//
+// This API is not covered by the v1 stability guarantee.
+type Dropouter[T tensor.Numeric] interface {
+	// Dropout applies inverted dropout to `a`. When training is false or p==0
+	// the result is an exact copy of `a`. Otherwise each element is kept with
+	// probability (1-p) and scaled by 1/(1-p); dropped elements become zero.
+	// p must be in [0, 1).
+	Dropout(ctx context.Context, a *tensor.TensorNumeric[T], p float64, seed uint64, training bool, dst ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error)
+	// DropoutBackward propagates the upstream gradient through the same mask
+	// used by Dropout for the identical (p, seed, training): dx = g * mask/(1-p)
+	// in training mode, dx = g in eval mode. The mask is recomputed, not cached.
+	DropoutBackward(ctx context.Context, g *tensor.TensorNumeric[T], p float64, seed uint64, training bool, dst ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error)
+}
+
 // StreamProvider is an optional interface for engines that expose their
 // underlying GPU stream for CUDA graph capture.
 //
diff --git a/compute/engine_proxy.go b/compute/engine_proxy.go
index 18884ad..e75a03f 100644
--- a/compute/engine_proxy.go
+++ b/compute/engine_proxy.go
@@ -402,6 +402,34 @@ func (p *EngineProxy[T]) GPUFusedAddRMSNorm(input, residual, weight *tensor.Tens
 	return
 }
 
+// Dropout delegates to the underlying engine's Dropouter capability. The proxy
+// implements Dropouter[T] so consumers can type-assert it on a wrapped engine;
+// it returns an error if the real engine does not provide the capability.
+func (p *EngineProxy[T]) Dropout(ctx context.Context, a *tensor.TensorNumeric[T], prob float64, seed uint64, training bool, dst ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error) {
+	d, ok := p.real.(Dropouter[T])
+	if !ok {
+		return nil, fmt.Errorf("engine %T does not implement Dropouter", p.real)
+	}
+	result, err := d.Dropout(ctx, a, prob, seed, training, dst...)
+	if err == nil {
+		p.record("Dropout", []*tensor.TensorNumeric[T]{a}, result, nil)
+	}
+	return result, err
+}
+
+// DropoutBackward delegates to the underlying engine's Dropouter capability.
+func (p *EngineProxy[T]) DropoutBackward(ctx context.Context, g *tensor.TensorNumeric[T], prob float64, seed uint64, training bool, dst ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error) {
+	d, ok := p.real.(Dropouter[T])
+	if !ok {
+		return nil, fmt.Errorf("engine %T does not implement Dropouter", p.real)
+	}
+	result, err := d.DropoutBackward(ctx, g, prob, seed, training, dst...)
+	if err == nil {
+		p.record("DropoutBackward", []*tensor.TensorNumeric[T]{g}, result, nil)
+	}
+	return result, err
+}
+
 // MatMulTransposeB delegates to the underlying engine if it implements TransposeBMatMuler.
 func (p *EngineProxy[T]) MatMulTransposeB(ctx context.Context, a, b *tensor.TensorNumeric[T], dst ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error) {
 	if tb, ok := p.real.(TransposeBMatMuler[T]); ok {
diff --git a/compute/philox.go b/compute/philox.go
new file mode 100644
index 0000000..d135b20
--- /dev/null
+++ b/compute/philox.go
@@ -0,0 +1,82 @@
+package compute
+
+// Philox4x32-10 counter-based RNG.
+//
+// Philox is a stateless, counter-based pseudo-random number generator: the
+// output is a pure function of a (key, counter) pair, with no carried state.
+// That property is exactly what dropout needs for CPU-GPU parity -- the same
+// (seed, element offset) produces the same uniform draw on the CPU engine (this
+// Go implementation) and on the GB10 GPU (the CUDA kernel in
+// internal/cuda/kernels/dropout.cu, which mirrors these constants bit-for-bit).
+//
+// Because the mask is a pure function of (seed, offset, p), dropout never caches
+// the mask for backward: backward recomputes it deterministically from the same
+// inputs. This keeps the op capture-safe and avoids pinning a saved mask across
+// arena resets (ztensor ADR 006, the SaveForBackward lifetime contract).
+//
+// Reference: Salmon, Moraes, Dror, Shaw, "Parallel Random Numbers: As Easy as
+// 1, 2, 3" (SC'11). The 10-round Philox4x32 variant is the standard generator
+// PyTorch/cuRAND use for dropout, so matching it here keeps the PyTorch-oracle
+// gate meaningful.
+
+const (
+	philoxM0 uint32 = 0xD2511F53
+	philoxM1 uint32 = 0xCD9E8D57
+	philoxW0 uint32 = 0x9E3779B9 // Weyl constant for key[0] (golden ratio)
+	philoxW1 uint32 = 0xBB67AE85 // Weyl constant for key[1] (sqrt(3)-1)
+	philoxRounds      = 10
+)
+
+// philoxMulhilo computes the 64-bit product a*b and returns (hi, lo) 32-bit
+// halves, matching the CUDA __umulhi/lo decomposition used in dropout.cu.
+func philoxMulhilo(a, b uint32) (hi, lo uint32) {
+	product := uint64(a) * uint64(b)
+	hi = uint32(product >> 32)
+	lo = uint32(product)
+	return hi, lo
+}
+
+// philox4x32 runs the 10-round Philox4x32 bijection on a 128-bit counter
+// (four uint32 words) under a 64-bit key (two uint32 words), returning four
+// uint32 outputs. Identical word-for-word to the CUDA device function.
+func philox4x32(ctr [4]uint32, key [2]uint32) [4]uint32 {
+	c := ctr
+	k := key
+	for i := 0; i < philoxRounds; i++ {
+		hi0, lo0 := philoxMulhilo(philoxM0, c[0])
+		hi1, lo1 := philoxMulhilo(philoxM1, c[2])
+		c = [4]uint32{
+			hi1 ^ c[1] ^ k[0],
+			lo1,
+			hi0 ^ c[3] ^ k[1],
+			lo0,
+		}
+		// Bump the key (Weyl sequence) for the next round.
+		k[0] += philoxW0
+		k[1] += philoxW1
+	}
+	return c
+}
+
+// philoxUniform returns a single uniform float64 in [0, 1) for the element at
+// linear index `offset` under the 64-bit `seed`. The seed splits into the two
+// key words; the counter holds the offset (low/high words) and two zero lanes.
+// Only the first output lane is consumed, which is sufficient for a one-draw-
+// per-element dropout mask and keeps the CPU and GPU draws trivially aligned.
+func philoxUniform(seed uint64, offset uint64) float64 {
+	key := [2]uint32{uint32(seed), uint32(seed >> 32)}
+	ctr := [4]uint32{uint32(offset), uint32(offset >> 32), 0, 0}
+	out := philox4x32(ctr, key)
+	// Map a 32-bit word to [0,1): divide by 2^32. Matches the CUDA kernel's
+	// (out.x * (1.0f / 4294967296.0f)) but in float64 for the CPU/gradcheck
+	// path; the keep/drop decision (u >= p) is identical given the same word.
+	return float64(out[0]) * (1.0 / 4294967296.0)
+}
+
+// dropoutKeep reports whether the element at `offset` is kept (true) or dropped
+// (false) for drop probability p under seed. Keep iff the uniform draw is >= p,
+// so p=0 keeps everything and p=1 drops everything. Identical decision on CPU
+// and GPU because philoxUniform is bit-reproducible across both.
+func dropoutKeep(seed uint64, offset uint64, p float64) bool {
+	return philoxUniform(seed, offset) >= p
+}
diff --git a/testing/gradcheck/ops.go b/testing/gradcheck/ops.go
index 98b9191..0941782 100644
--- a/testing/gradcheck/ops.go
+++ b/testing/gradcheck/ops.go
@@ -367,6 +367,27 @@ func newMulScalarNode[T tensor.Float](e compute.Engine[T], c float64) *opNode[T]
 		func(ctx context.Context, g, _, _ tn[T]) (tn[T], error) { return e.MulScalar(ctx, g, T(c)) })
 }
 
+// newDropoutNode builds an inverted-dropout op with a fixed drop probability p
+// and seed in training mode. The mask is deterministic (Philox keyed by (seed,
+// offset)), so with p and seed held constant the op is a fixed element-wise
+// linear map y = x * mask/(1-p): finite differences and the analytic backward
+// (the same masked scale applied to the upstream gradient) agree exactly. The
+// engine must implement the Dropouter capability (CPU and GPU both do); the
+// gradcheck/oracle harnesses run on the CPU engine.
+func newDropoutNode[T tensor.Float](e compute.Engine[T], p float64, seed uint64) *opNode[T] {
+	d, ok := e.(compute.Dropouter[T])
+	return unary("Dropout",
+		func(ctx context.Context, x tn[T]) (tn[T], error) {
+			if !ok {
+				return nil, fmt.Errorf("Dropout: engine %T does not implement Dropouter", e)
+			}
+			return d.Dropout(ctx, x, p, seed, true)
+		},
+		func(ctx context.Context, g, _, _ tn[T]) (tn[T], error) {
+			return d.DropoutBackward(ctx, g, p, seed, true)
+		})
+}
+
 // --- matmul-like and shape ops ----------------------------------------------
 
 func newMatMulNode[T tensor.Float](e compute.Engine[T]) *opNode[T] {
diff --git a/testing/gradcheck/registry.go b/testing/gradcheck/registry.go
index 6a0e2dd..5cc7f8c 100644
--- a/testing/gradcheck/registry.go
+++ b/testing/gradcheck/registry.go
@@ -49,6 +49,8 @@ func NewRegistryNode[T tensor.Float](name string, e compute.Engine[T]) (graph.No
 		return newAddScalarNode(e, 0.7), nil
 	case "MulScalar":
 		return newMulScalarNode(e, -1.3), nil
+	case "Dropout":
+		return newDropoutNode(e, 0.3, 0x9e3779b97f4a7c15), nil
 	case "MatMul":
 		return newMatMulNode(e), nil
 	case "Transpose":
@@ -196,6 +198,15 @@ func Registry() []OpInfo {
 			Make:        registryMake("MulScalar"),
 			InputShapes: [][]int{{2, 3}},
 		},
+		// Inverted dropout with a fixed mask (p=0.3, fixed seed): a deterministic
+		// element-wise linear map y = x * mask/(1-p). A larger [4,8] input gives
+		// the fixed-seed mask both kept and dropped lanes so the masked-and-zero
+		// gradient structure is actually exercised.
+		{
+			Name: "Dropout", Seed: 31,
+			Make:        registryMake("Dropout"),
+			InputShapes: [][]int{{4, 8}},
+		},
 
 		// MatMul-like and shape ops.
 		{
diff --git a/testing/oracle/generate_test.go b/testing/oracle/generate_test.go
index 4bd65d5..1290275 100644
--- a/testing/oracle/generate_test.go
+++ b/testing/oracle/generate_test.go
@@ -51,8 +51,17 @@ func TestGenerateAll(t *testing.T) {
 	if got := len(sum.Written) + len(sum.Skipped); got != wantTotal {
 		t.Fatalf("summary covers %d ops, registry has %d", got, wantTotal)
 	}
-	if len(sum.Skipped) != 1 || sum.Skipped[0].Op != "HadamardTransform" {
-		t.Fatalf("skipped = %+v, want exactly HadamardTransform", sum.Skipped)
+	// Ops with no clean torch equivalent are skipped with a reason: the
+	// Walsh-Hadamard transform (no torch builtin) and Dropout (training-mode
+	// mask uses ztensor's own Philox, not torch's; see torchmap.go).
+	wantSkipped := map[string]bool{"HadamardTransform": true, "Dropout": true}
+	if len(sum.Skipped) != len(wantSkipped) {
+		t.Fatalf("skipped = %+v, want exactly %v", sum.Skipped, wantSkipped)
+	}
+	for _, s := range sum.Skipped {
+		if !wantSkipped[s.Op] {
+			t.Fatalf("unexpected skipped op %q (skipped=%+v)", s.Op, sum.Skipped)
+		}
 	}
 	if _, err := os.Stat(filepath.Join(dir, "generation.json")); err != nil {
 		t.Fatalf("generation.json missing: %v", err)
diff --git a/testing/oracle/torchmap.go b/testing/oracle/torchmap.go
index 3a9a0f9..1c48635 100644
--- a/testing/oracle/torchmap.go
+++ b/testing/oracle/torchmap.go
@@ -49,6 +49,9 @@ var torchMap = map[string]torchOp{
 	"HadamardTransform": {
 		SkipReason: "torch has no built-in normalized Walsh-Hadamard transform; replaying it would mean hand-building the H matrix in the runner, i.e. testing our own reimplementation rather than torch",
 	},
+	"Dropout": {
+		SkipReason: "dropout's training-mode output depends on the RNG mask, and torch.nn.functional.dropout draws from its own (Philox) generator whose word-to-element mapping is not the one ztensor uses; matching masks would mean reimplementing ztensor's Philox in the torch runner (testing our own reimplementation, like HadamardTransform). The mask-vs-input math is instead pinned by gradcheck (the deterministic mask makes dropout an exact linear map y=x*mask/(1-p), so finite-diff == analytic backward) and by CPU-GPU parity (the same Philox produces bit-identical masks on CPU and GB10). Eval-mode dropout is exact identity, already covered by unit tests.",
+	},
 
 	// Softmax and reductions. ReduceMax uses amax; torch.amax splits the
 	// gradient among tied maxima while ztensor routes it to the first argmax,
diff --git a/testing/parity/stress_engine.go b/testing/parity/stress_engine.go
index 6857a0d..0f98457 100644
--- a/testing/parity/stress_engine.go
+++ b/testing/parity/stress_engine.go
@@ -273,6 +273,28 @@ func (e *StressEngine) Repeat(ctx context.Context, a *t32, axis, repetitions int
 	return e.relocate(e.Engine.Repeat(ctx, a, axis, repetitions))
 }
 
+// Dropout / DropoutBackward delegate to the inner engine's Dropouter
+// capability and relocate the result into the arena, so the parity harness
+// exercises the dropout op under the same reset-between-fwd-bwd schedules as
+// every other op. The masked output is a pure function of (seed, offset, p) --
+// it survives a relocate intact, and (per the recompute-in-backward design) no
+// mask is saved across resets.
+func (e *StressEngine) Dropout(ctx context.Context, a *t32, p float64, seed uint64, training bool, dst ...*t32) (*t32, error) {
+	d := e.Engine.(compute.Dropouter[float32]) //nolint:errcheck // inner CPU engine implements Dropouter
+	if len(dst) > 0 {
+		return d.Dropout(ctx, a, p, seed, training, dst...)
+	}
+	return e.relocate(d.Dropout(ctx, a, p, seed, training))
+}
+
+func (e *StressEngine) DropoutBackward(ctx context.Context, g *t32, p float64, seed uint64, training bool, dst ...*t32) (*t32, error) {
+	d := e.Engine.(compute.Dropouter[float32]) //nolint:errcheck // inner CPU engine implements Dropouter
+	if len(dst) > 0 {
+		return d.DropoutBackward(ctx, g, p, seed, training, dst...)
+	}
+	return e.relocate(d.DropoutBackward(ctx, g, p, seed, training))
+}
+
 func (e *StressEngine) HadamardTransform(ctx context.Context, a *t32, dst ...*t32) (*t32, error) {
 	if len(dst) > 0 {
 		return e.Engine.HadamardTransform(ctx, a, dst...)

From e26115db6e2b6e2cbe66ab3b8b2059298242ff45 Mon Sep 17 00:00:00 2001
From: David Ndungu <ndungu.sink@gmail.com>
Date: Sat, 20 Jun 2026 14:41:31 -0700
Subject: [PATCH 2/5] feat(cuda): GB10 GPU dropout kernel mirroring the CPU
 Philox mask (BPB.3a)

Add the f32 GPU half of the dropout op, bit-identical to the CPU reference:

- internal/cuda/kernels/dropout.cu: Philox4x32-10 device function with the same
  constants/round structure as compute/philox.go, so the (seed, offset) -> mask
  draw matches the CPU engine word-for-word. dropout_f32 launcher does the
  masked-and-scaled write in training mode (out = (u>=p)? in*invKeep : 0) and an
  exact identity copy in eval mode / p==0. invKeep is passed host-side as
  1/(1-p) so the scale equals the CPU path's bit-for-bit. Added to Makefile SRCS
  (sm_121 for GB10).
- purego + cgo wrappers (dropout_purego.go / dropout.go) and the dropout_f32
  symbol registration in purego.go, following the argmax kernel pattern.
- gpuapi.Dropouter optional KernelRunner extension (kernels.go) + CUDAKernels
  impl, mirroring BFloat16Transposer; callers type-assert and report
  unavailability when absent (no stub fallback).
- GPUEngine.Dropout / DropoutBackward (compute/gpu_dropout.go) reuse the
  gpu_kernels.go scaffolding (getDevicePtr, dst-reuse, makeGPUResult). The mask
  is recomputed in backward from (seed,p), never cached -- capture-safe, no save
  pinned across an arena reset (ADR 006).
- CUDA-gated CPU-GPU parity tests (compute/gpu_dropout_parity_test.go): GPU vs
  CPU dropout forward (multiple shapes/p/train-eval) and backward must be
  bit-identical; skip cleanly without a GPU.

CPU suite stays green (go test ./... and -race). The GB10 parity gate runs via
the cuda-tagged path on the DGX (Spark); nvcc/CUDA toolkit is not present
locally, so the sm_121 kernel build + GPU parity run are the remaining step.

Claude-Session: https://claude.ai/code/session_01So96MEV1hiThH4XqCd6rLH
---
 compute/gpu_dropout.go                  |  79 +++++++++++++++
 compute/gpu_dropout_parity_test.go      | 125 ++++++++++++++++++++++++
 internal/cuda/kernels/Makefile          |   2 +-
 internal/cuda/kernels/dropout.cu        |  92 +++++++++++++++++
 internal/cuda/kernels/dropout.go        |  35 +++++++
 internal/cuda/kernels/dropout_purego.go |  33 +++++++
 internal/cuda/kernels/purego.go         |   5 +
 internal/gpuapi/cuda_kernels.go         |   8 ++
 internal/gpuapi/kernels.go              |  14 +++
 9 files changed, 392 insertions(+), 1 deletion(-)
 create mode 100644 compute/gpu_dropout.go
 create mode 100644 compute/gpu_dropout_parity_test.go
 create mode 100644 internal/cuda/kernels/dropout.cu
 create mode 100644 internal/cuda/kernels/dropout.go
 create mode 100644 internal/cuda/kernels/dropout_purego.go

diff --git a/compute/gpu_dropout.go b/compute/gpu_dropout.go
new file mode 100644
index 0000000..509de0e
--- /dev/null
+++ b/compute/gpu_dropout.go
@@ -0,0 +1,79 @@
+package compute
+
+import (
+	"context"
+	"fmt"
+
+	"github.com/zerfoo/ztensor/internal/gpuapi"
+	"github.com/zerfoo/ztensor/tensor"
+)
+
+// f32Size and the GPU helpers (getDevicePtr, tryReuseDstPtr, finishReusedDst,
+// makeGPUResult, isFloat32) are defined in gpu_kernels.go.
+
+// Dropout applies inverted dropout to a on the GPU using a deterministic Philox
+// mask keyed by (seed, element offset) -- the same Philox the CPU engine uses,
+// so the masks (and outputs, given identical inputs) are bit-identical across
+// CPU and GPU. In eval mode (training==false) or p==0 the kernel performs an
+// exact identity copy. p must be in [0, 1). The mask is recomputed in
+// DropoutBackward from (seed, p) rather than cached, keeping the op
+// capture-safe (no save pinned across an arena reset; ztensor ADR 006).
+func (e *GPUEngine[T]) Dropout(ctx context.Context, a *tensor.TensorNumeric[T], p float64, seed uint64, training bool, dst ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error) {
+	return e.dropoutGPU(ctx, a, p, seed, training, dst...)
+}
+
+// DropoutBackward propagates g through the same mask Dropout used for the
+// identical (p, seed, training). Dropout is linear in its input given the mask,
+// so the backward is the same masked-and-scaled map applied to g.
+func (e *GPUEngine[T]) DropoutBackward(ctx context.Context, g *tensor.TensorNumeric[T], p float64, seed uint64, training bool, dst ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error) {
+	return e.dropoutGPU(ctx, g, p, seed, training, dst...)
+}
+
+// dropoutGPU is the shared forward/backward GPU launcher.
+func (e *GPUEngine[T]) dropoutGPU(_ context.Context, a *tensor.TensorNumeric[T], p float64, seed uint64, training bool, dst ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error) {
+	if a == nil {
+		return nil, fmt.Errorf("dropout: input tensor cannot be nil")
+	}
+	if p < 0 || p >= 1 {
+		return nil, fmt.Errorf("dropout: p must be in [0, 1), got %g", p)
+	}
+	if !isFloat32[T]() {
+		return nil, fmt.Errorf("GPU dropout: unsupported type, only float32")
+	}
+	dk, ok := e.kernels.(gpuapi.Dropouter)
+	if !ok {
+		return nil, fmt.Errorf("GPU dropout: kernel runner does not provide the dropout kernel")
+	}
+
+	n := a.GetStorage().Len()
+	devA, cleanupA, err := getDevicePtr(e, a)
+	if err != nil {
+		return nil, err
+	}
+	defer cleanupA()
+
+	byteSize := n * f32Size
+	devC, reused := tryReuseDstPtr[T](n, dst)
+	if !reused {
+		devC, err = e.pool.Alloc(e.deviceID, byteSize)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	invKeep := float32(1.0 / (1.0 - p))
+	if err := dk.DropoutF32(devA, devC, n, float32(p), seed, training, invKeep, e.stream); err != nil {
+		if !reused {
+			e.pool.Free(e.deviceID, devC, byteSize)
+		}
+		return nil, err
+	}
+
+	if reused {
+		return finishReusedDst[T](dst[0], a.Shape()), nil
+	}
+	return makeGPUResult[T](e, a.Shape(), devC, n, dst...)
+}
+
+// Compile-time assertion: GPUEngine[float32] satisfies the Dropouter capability.
+var _ Dropouter[float32] = (*GPUEngine[float32])(nil)
diff --git a/compute/gpu_dropout_parity_test.go b/compute/gpu_dropout_parity_test.go
new file mode 100644
index 0000000..7962915
--- /dev/null
+++ b/compute/gpu_dropout_parity_test.go
@@ -0,0 +1,125 @@
+package compute
+
+// CPU-GPU parity for the dropout op (BPB.3a). The mask is drawn from the same
+// Philox4x32-10 generator on both sides (compute/philox.go for the CPU engine,
+// internal/cuda/kernels/dropout.cu for the GPU), keyed by (seed, element
+// offset). For an identical seed the masks are bit-identical, so GPU and CPU
+// dropout produce identical kept/dropped lanes; the kept lanes are scaled by
+// the same host-computed invKeep=1/(1-p) float32, so the outputs match exactly.
+// This is the CPU-GPU parity gate; it runs on the GB10 via Spark (cuda tag) and
+// skips cleanly on machines without a GPU.
+
+import (
+	"context"
+	"testing"
+
+	"github.com/zerfoo/ztensor/internal/cuda"
+	"github.com/zerfoo/ztensor/numeric"
+	"github.com/zerfoo/ztensor/tensor"
+)
+
+func TestGPUDropout_CPUParity(t *testing.T) {
+	if !cuda.Available() {
+		t.Skip("CUDA not available")
+	}
+	ops := numeric.Float32Ops{}
+	gpuEng, err := NewGPUEngine[float32](ops)
+	if err != nil {
+		t.Fatalf("NewGPUEngine: %v", err)
+	}
+	defer func() { _ = gpuEng.Close() }()
+	cpuEng := NewCPUEngine[float32](ops)
+	ctx := context.Background()
+
+	const seed = uint64(0x9e3779b97f4a7c15)
+	shapes := [][]int{{8, 8}, {1, 257}, {4, 32}}
+	probs := []float64{0.0, 0.1, 0.3, 0.5, 0.9}
+
+	for _, shape := range shapes {
+		n := 1
+		for _, d := range shape {
+			n *= d
+		}
+		data := make([]float32, n)
+		for i := range data {
+			data[i] = float32(i%13) - 6.0 + 0.25
+		}
+		for _, p := range probs {
+			for _, training := range []bool{true, false} {
+				cx, err := tensor.New[float32](shape, append([]float32(nil), data...))
+				if err != nil {
+					t.Fatal(err)
+				}
+				gx, err := tensor.New[float32](shape, append([]float32(nil), data...))
+				if err != nil {
+					t.Fatal(err)
+				}
+				cy, err := cpuEng.Dropout(ctx, cx, p, seed, training)
+				if err != nil {
+					t.Fatalf("CPU Dropout (p=%g train=%v): %v", p, training, err)
+				}
+				gy, err := gpuEng.Dropout(ctx, gx, p, seed, training)
+				if err != nil {
+					t.Fatalf("GPU Dropout (p=%g train=%v): %v", p, training, err)
+				}
+				gd := gy.Data() // GPUStorage.Slice copies D2H
+				cd := cy.Data()
+				if len(gd) != len(cd) {
+					t.Fatalf("len mismatch: gpu %d cpu %d", len(gd), len(cd))
+				}
+				for i := range cd {
+					if gd[i] != cd[i] {
+						t.Fatalf("dropout parity mismatch shape=%v p=%g train=%v at %d: gpu=%g cpu=%g",
+							shape, p, training, i, gd[i], cd[i])
+					}
+				}
+			}
+		}
+	}
+}
+
+// TestGPUDropout_Backward_CPUParity checks the backward (masked-scale of the
+// upstream gradient) matches CPU bit-for-bit under the same seed.
+func TestGPUDropout_Backward_CPUParity(t *testing.T) {
+	if !cuda.Available() {
+		t.Skip("CUDA not available")
+	}
+	ops := numeric.Float32Ops{}
+	gpuEng, err := NewGPUEngine[float32](ops)
+	if err != nil {
+		t.Fatalf("NewGPUEngine: %v", err)
+	}
+	defer func() { _ = gpuEng.Close() }()
+	cpuEng := NewCPUEngine[float32](ops)
+	ctx := context.Background()
+
+	const seed = uint64(12345)
+	const p = 0.4
+	n := 128
+	g := make([]float32, n)
+	for i := range g {
+		g[i] = float32(i) - 64
+	}
+	cg, err := tensor.New[float32]([]int{16, 8}, append([]float32(nil), g...))
+	if err != nil {
+		t.Fatal(err)
+	}
+	gg, err := tensor.New[float32]([]int{16, 8}, append([]float32(nil), g...))
+	if err != nil {
+		t.Fatal(err)
+	}
+	cdx, err := cpuEng.DropoutBackward(ctx, cg, p, seed, true)
+	if err != nil {
+		t.Fatal(err)
+	}
+	gdx, err := gpuEng.DropoutBackward(ctx, gg, p, seed, true)
+	if err != nil {
+		t.Fatal(err)
+	}
+	cd, gd := cdx.Data(), gdx.Data()
+	for i := range cd {
+		if gd[i] != cd[i] {
+			t.Fatalf("dropout backward parity mismatch at %d: gpu=%g cpu=%g", i, gd[i], cd[i])
+		}
+	}
+}
diff --git a/internal/cuda/kernels/Makefile b/internal/cuda/kernels/Makefile
index aa04220..eb81aeb 100644
--- a/internal/cuda/kernels/Makefile
+++ b/internal/cuda/kernels/Makefile
@@ -18,7 +18,7 @@ ifeq ($(CUDA_ARCH),sm_121)
 NVCC_FLAGS += -DFLASH_BLOCK_SIZE=64
 endif
 
-SRCS = counter.cu dequant_q4k.cu dequant_q5_0.cu dequant_q5k.cu dequant_q6k.cu elementwise.cu elementwise_fp16.cu elementwise_bf16.cu flash_attention.cu flash_attention2.cu flash_decode.cu fp4_gemv.cu fp8_gemm.cu fp8_ops.cu fused_add_rmsnorm.cu fused_encoder_fwd.cu fused_encoder_bwd.cu fused_norm_add.cu fused_qk_norm_rope.cu fused_adamw.cu fused_adamw_bf16.cu fused_norm_bf16.cu tiny_batched_gemm.cu fused_repeat_interleave.cu fused_rope.cu fused_softmax_vmul.cu fused_swiglu.cu gather.cu gather_q8.cu gemm_int8.cu gemm_int4.cu gemm_q4.cu gemm_q8.cu gemv_q4k.cu gemv_q4k_sm121.cu gemv_q5k.cu gemv_q5_0.cu gemv_q6k.cu gemv_warp.cu megakernel_ops.cu offset_memcpy.cu paged_attention.cu ragged_attention.cu rope_select.cu scaled_softmax.cu selective_scan.cu sgemv_m1.cu ternary_gemv.cu transpose.cu rmsnorm.cu argmax.cu
+SRCS = counter.cu dequant_q4k.cu dequant_q5_0.cu dequant_q5k.cu dequant_q6k.cu elementwise.cu elementwise_fp16.cu elementwise_bf16.cu flash_attention.cu flash_attention2.cu flash_decode.cu fp4_gemv.cu fp8_gemm.cu fp8_ops.cu fused_add_rmsnorm.cu fused_encoder_fwd.cu fused_encoder_bwd.cu fused_norm_add.cu fused_qk_norm_rope.cu fused_adamw.cu fused_adamw_bf16.cu fused_norm_bf16.cu tiny_batched_gemm.cu fused_repeat_interleave.cu fused_rope.cu fused_softmax_vmul.cu fused_swiglu.cu gather.cu gather_q8.cu gemm_int8.cu gemm_int4.cu gemm_q4.cu gemm_q8.cu gemv_q4k.cu gemv_q4k_sm121.cu gemv_q5k.cu gemv_q5_0.cu gemv_q6k.cu gemv_warp.cu megakernel_ops.cu offset_memcpy.cu paged_attention.cu ragged_attention.cu rope_select.cu scaled_softmax.cu selective_scan.cu sgemv_m1.cu ternary_gemv.cu transpose.cu rmsnorm.cu argmax.cu dropout.cu
 OBJS = $(SRCS:.cu=.o)
 PIC_OBJS = $(SRCS:.cu=.pic.o)
 LIB  = libkernels.a
diff --git a/internal/cuda/kernels/dropout.cu b/internal/cuda/kernels/dropout.cu
new file mode 100644
index 0000000..0183d05
--- /dev/null
+++ b/internal/cuda/kernels/dropout.cu
@@ -0,0 +1,92 @@
+// dropout.cu -- GB10 (sm_121) inverted-dropout with a deterministic Philox mask.
+//
+// The mask is drawn from a counter-based Philox4x32-10 generator keyed by
+// (seed, element offset), bit-identical to the CPU reference in
+// compute/philox.go. Because the draw is a pure function of (seed, offset, p),
+// the same seed produces the same mask on CPU and GPU -- which is what makes
+// ztensor's CPU-GPU parity gate pass -- and backward recomputes the mask from
+// (seed, p) rather than reading a cached buffer (capture-safe, no save pinned
+// across an arena reset; ztensor ADR 006).
+//
+// Inverted-dropout semantics (match torch.nn.functional.dropout): training mode
+// out[i] = keep(i) ? in[i] / (1 - p) : 0 with keep iff uniform >= p; eval mode
+// (training == 0) or p == 0 is exact identity. Forward and backward share one
+// kernel because dropout is linear in its input given the mask, so the host
+// passes the upstream gradient as `in` for the backward call.
+//
+// The Philox constants and round structure mirror compute/philox.go exactly.
+
+#include <cuda_runtime.h>
+#include <stdint.h>
+
+#define PHILOX_M0 0xD2511F53u
+#define PHILOX_M1 0xCD9E8D57u
+#define PHILOX_W0 0x9E3779B9u
+#define PHILOX_W1 0xBB67AE85u
+#define PHILOX_ROUNDS 10
+
+// Single Philox round: same word layout as the Go reference.
+__device__ __forceinline__ void philox_round(uint32_t c[4], const uint32_t k[2]) {
+    uint32_t hi0 = __umulhi(PHILOX_M0, c[0]);
+    uint32_t lo0 = PHILOX_M0 * c[0];
+    uint32_t hi1 = __umulhi(PHILOX_M1, c[2]);
+    uint32_t lo1 = PHILOX_M1 * c[2];
+    uint32_t n0 = hi1 ^ c[1] ^ k[0];
+    uint32_t n1 = lo1;
+    uint32_t n2 = hi0 ^ c[3] ^ k[1];
+    uint32_t n3 = lo0;
+    c[0] = n0; c[1] = n1; c[2] = n2; c[3] = n3;
+}
+
+// philox_uniform returns a uniform float in [0,1) for element `offset` under
+// `seed`, consuming only the first output lane (matches the Go reference).
+__device__ __forceinline__ float philox_uniform(uint64_t seed, uint64_t offset) {
+    uint32_t k[2] = { (uint32_t)seed, (uint32_t)(seed >> 32) };
+    uint32_t c[4] = { (uint32_t)offset, (uint32_t)(offset >> 32), 0u, 0u };
+#pragma unroll
+    for (int i = 0; i < PHILOX_ROUNDS; ++i) {
+        philox_round(c, k);
+        k[0] += PHILOX_W0;
+        k[1] += PHILOX_W1;
+    }
+    // Map a 32-bit word to [0,1) by dividing by 2^32; same keep/drop boundary
+    // (uniform >= p) the Go reference uses.
+    return (float)c[0] * (1.0f / 4294967296.0f);
+}
+
+__global__ void kernel_dropout(const float* __restrict__ in,
+                               float* __restrict__ out,
+                               int n, float p, uint64_t seed, float invKeep) {
+    int gid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (gid >= n) return;
+    float u = philox_uniform(seed, (uint64_t)gid);
+    out[gid] = (u >= p) ? (in[gid] * invKeep) : 0.0f;
+}
+
+__global__ void kernel_identity_copy(const float* __restrict__ in,
+                                     float* __restrict__ out, int n) {
+    int gid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (gid >= n) return;
+    out[gid] = in[gid];
+}
+
+extern "C" {
+
+// dropout_f32 applies inverted dropout to in[0..n-1], writing out[0..n-1].
+// training != 0 and p > 0 => masked-and-scaled; otherwise exact identity copy.
+// invKeep must be 1/(1-p) (computed host-side to match the CPU path bit-for-bit).
+cudaError_t dropout_f32(const float* in, float* out, int n,
+                        float p, uint64_t seed, int training, float invKeep,
+                        cudaStream_t stream) {
+    const int BLOCK = 256;
+    int grid = (n + BLOCK - 1) / BLOCK;
+    if (grid < 1) grid = 1;
+    if (training == 0 || p == 0.0f) {
+        kernel_identity_copy<<<grid, BLOCK, 0, stream>>>(in, out, n);
+        return cudaGetLastError();
+    }
+    kernel_dropout<<<grid, BLOCK, 0, stream>>>(in, out, n, p, seed, invKeep);
+    return cudaGetLastError();
+}
+
+} // extern "C"
diff --git a/internal/cuda/kernels/dropout.go b/internal/cuda/kernels/dropout.go
new file mode 100644
index 0000000..3b7458d
--- /dev/null
+++ b/internal/cuda/kernels/dropout.go
@@ -0,0 +1,35 @@
+//go:build cuda
+
+package kernels
+
+/*
+#cgo LDFLAGS: -L${SRCDIR} -lkernels -lcudart -lstdc++
+#include <cuda_runtime.h>
+#include <stdint.h>
+
+extern cudaError_t dropout_f32(const float* in, float* out, int n,
+                               float p, uint64_t seed, int training, float invKeep,
+                               cudaStream_t stream);
+*/
+import "C"
+
+import "unsafe"
+
+// DropoutF32 launches the GPU inverted-dropout kernel with a deterministic
+// Philox mask keyed by (seed, element offset). in/out are [n] float32 device
+// pointers; when training is false or p==0 the kernel performs an exact identity
+// copy. invKeep must be 1/(1-p), computed host-side so the GPU result matches
+// the CPU path bit-for-bit. The same entry serves backward (pass the upstream
+// gradient as in) because dropout is linear in its input given the mask.
+func DropoutF32(in unsafe.Pointer, out unsafe.Pointer, n int,
+	p float32, seed uint64, training bool, invKeep float32, s unsafe.Pointer) error {
+	tr := C.int(0)
+	if training {
+		tr = 1
+	}
+	return checkCUDA(C.dropout_f32(
+		(*C.float)(in), (*C.float)(out), C.int(n),
+		C.float(p), C.uint64_t(seed), tr, C.float(invKeep),
+		stream(s),
+	), "dropout_f32")
+}
diff --git a/internal/cuda/kernels/dropout_purego.go b/internal/cuda/kernels/dropout_purego.go
new file mode 100644
index 0000000..100e54a
--- /dev/null
+++ b/internal/cuda/kernels/dropout_purego.go
@@ -0,0 +1,33 @@
+//go:build !cuda
+
+package kernels
+
+import (
+	"fmt"
+	"unsafe"
+
+	"github.com/zerfoo/ztensor/internal/cuda"
+)
+
+// DropoutF32 launches the GPU inverted-dropout kernel with a deterministic
+// Philox mask keyed by (seed, element offset). in/out are [n] float32 device
+// pointers; when training is false or p==0 the kernel performs an exact identity
+// copy. invKeep must be 1/(1-p), computed host-side so the GPU result matches
+// the CPU path bit-for-bit. The same entry serves backward (pass the upstream
+// gradient as in) because dropout is linear in its input given the mask.
+func DropoutF32(in unsafe.Pointer, out unsafe.Pointer, n int,
+	p float32, seed uint64, training bool, invKeep float32, s unsafe.Pointer) error {
+	k := klib()
+	if k == nil {
+		return fmt.Errorf("dropout kernel: kernels not available")
+	}
+	tr := uintptr(0)
+	if training {
+		tr = 1
+	}
+	ret := cuda.Ccall(k.launchDropoutF32,
+		uintptr(in), uintptr(out), uintptr(n),
+		uintptr(floatBits(p)), uintptr(seed), tr,
+		uintptr(floatBits(invKeep)), uintptr(s))
+	return checkKernel(ret, "dropout_f32")
+}
diff --git a/internal/cuda/kernels/purego.go b/internal/cuda/kernels/purego.go
index f2a9ebc..928adee 100644
--- a/internal/cuda/kernels/purego.go
+++ b/internal/cuda/kernels/purego.go
@@ -80,6 +80,9 @@ type KernelLib struct {
 	// argmax
 	launchArgmax uintptr
 
+	// dropout (deterministic Philox mask, inverted-dropout f32)
+	launchDropoutF32 uintptr
+
 	// fused_rope
 	launchFusedRoPEF32 uintptr
 
@@ -312,6 +315,8 @@ func openKernelLib() (*KernelLib, error) {
 			{"gemm_q8_f32", &k.launchGemmQ8F32},
 			// argmax
 			{"launch_argmax", &k.launchArgmax},
+			// dropout (deterministic Philox mask)
+			{"dropout_f32", &k.launchDropoutF32},
 			// fused_rope
 			{"fused_rope_f32", &k.launchFusedRoPEF32},
 			// fused_swiglu
diff --git a/internal/gpuapi/cuda_kernels.go b/internal/gpuapi/cuda_kernels.go
index 0060428..ac89ae7 100644
--- a/internal/gpuapi/cuda_kernels.go
+++ b/internal/gpuapi/cuda_kernels.go
@@ -418,6 +418,14 @@ func (k *CUDAKernels) FusedEncoderFwdAvailable() bool {
 	return kernels.FusedEncoderAvailable()
 }
 
+// DropoutF32 launches the on-device inverted-dropout kernel (deterministic
+// Philox mask). Forward and backward both call this; backward passes the
+// upstream gradient as in.
+func (k *CUDAKernels) DropoutF32(in, out unsafe.Pointer, n int, p float32, seed uint64, training bool, invKeep float32, s Stream) error {
+	return kernels.DropoutF32(in, out, n, p, seed, training, invKeep, streamPtr(s))
+}
+
 // Compile-time interface assertion.
 var _ KernelRunner = (*CUDAKernels)(nil)
 var _ BFloat16Transposer = (*CUDAKernels)(nil)
+var _ Dropouter = (*CUDAKernels)(nil)
diff --git a/internal/gpuapi/kernels.go b/internal/gpuapi/kernels.go
index b8f4c1a..c0e10c1 100644
--- a/internal/gpuapi/kernels.go
+++ b/internal/gpuapi/kernels.go
@@ -286,6 +286,20 @@ type KernelRunner interface {
 	FusedEncoderFwdAvailable() bool
 }
 
+// Dropouter is an optional KernelRunner extension providing an on-device
+// inverted-dropout kernel with a deterministic Philox mask keyed by (seed,
+// element offset). The mask is bit-identical to the CPU reference, so dropout
+// passes the CPU-GPU parity gate; invKeep is supplied host-side as 1/(1-p) so
+// the scale matches the CPU path exactly. The same entry serves forward and
+// backward (backward passes the upstream gradient as `in`) because dropout is
+// linear in its input given the mask. Only the CUDA backend implements it;
+// callers type-assert and report unavailability when it is absent.
+type Dropouter interface {
+	// DropoutF32 writes out[i] = (philox(seed,i) >= p) ? in[i]*invKeep : 0 in
+	// training mode; in eval mode (training==false) or p==0 it copies in->out.
+	DropoutF32(in, out unsafe.Pointer, n int, p float32, seed uint64, training bool, invKeep float32, stream Stream) error
+}
+
 // BFloat16Transposer is an optional KernelRunner extension providing on-device
 // bf16 (16-bit) transpose kernels. Without it, a GPU engine over bf16 must route
 // transposes to the CPU engine, whose host memcpy breaks CUDA-graph capture

From 2919372ea59be942a221c6047ac2f07da4c74f8a Mon Sep 17 00:00:00 2001
From: David Ndungu <ndungu.sink@gmail.com>
Date: Sat, 20 Jun 2026 14:42:29 -0700
Subject: [PATCH 3/5] ci(spark): GB10 dropout verify manifest (BPB.3a)

One-pod GB10 verify mirroring deploy/spark/bf16-verify-gb10.yaml: clone the
bpb3a-dropout branch, build libkernels.so for sm_121, run the CUDA-gated
dropout CPU-GPU parity tests (TestGPUDropout_*) and the full-registry GB10
parity run (testing/parity -run _GPU, which now includes Dropout).

Claude-Session: https://claude.ai/code/session_01So96MEV1hiThH4XqCd6rLH
---
 deploy/spark/dropout-verify-gb10.yaml | 56 +++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)
 create mode 100644 deploy/spark/dropout-verify-gb10.yaml

diff --git a/deploy/spark/dropout-verify-gb10.yaml b/deploy/spark/dropout-verify-gb10.yaml
new file mode 100644
index 0000000..6866319
--- /dev/null
+++ b/deploy/spark/dropout-verify-gb10.yaml
@@ -0,0 +1,56 @@
+# GB10 verification for the dropout op (BPB.3a).
+#
+# One pod that (1) clones the bpb3a-dropout branch, (2) builds libkernels.so for
+# sm_121 with the host CUDA toolkit (nvcc on /usr/local/cuda, mounted read-only),
+# and (3) runs the CUDA-gated dropout CPU-GPU parity tests (TestGPUDropout_*) plus
+# the full-registry GB10 parity run (-run _GPU, which now includes Dropout) on the
+# GB10.
+#
+# Submit (token injected at submit time, never persisted in the manifest):
+#   TOKEN=$(gh auth token)
+#   sed "s#__GH_TOKEN__#$TOKEN#; s#__TAG__#$(git rev-parse --short=8 HEAD)#" \
+#     deploy/spark/dropout-verify-gb10.yaml \
+#     | curl -X POST http://192.168.86.250:8080/api/v1/pods \
+#         --data-binary @- -H 'Content-Type: application/yaml'
+#
+# memory limit set per lore L-0005 (cmd/lint-manifests).
+apiVersion: v1
+kind: Pod
+metadata:
+  name: ztensor-dropout-verify-__TAG__
+  labels:
+    app: ztensor
+    role: gpu-verify
+spec:
+  restartPolicy: Never
+  containers:
+    - name: verify
+      image: docker.io/library/golang:1.26
+      command:
+        - /bin/sh
+        - -c
+        - "set -ex; export PATH=/usr/local/cuda/bin:$PATH; export LD_LIBRARY_PATH=/usr/local/cuda/lib64; git clone --depth 1 -b bpb3a-dropout https://__GH_TOKEN__@github.com/zerfoo/ztensor.git /work/zt; cd /work/zt; make -C internal/cuda/kernels shared CUDA_ARCH=sm_121; LD_LIBRARY_PATH=/work/zt/internal/cuda/kernels:/usr/local/cuda/lib64 go test ./compute/ -run GPUDropout -v -count=1; LD_LIBRARY_PATH=/work/zt/internal/cuda/kernels:/usr/local/cuda/lib64 go test ./testing/parity/ -run _GPU -v -count=1"
+      env:
+        - name: GOFLAGS
+          value: "-mod=mod"
+      resources:
+        limits:
+          nvidia.com/gpu: "1"
+          cpu: "4"
+          memory: "24Gi"
+        requests:
+          cpu: "1"
+          memory: "8Gi"
+      volumeMounts:
+        - name: cuda
+          mountPath: /usr/local/cuda
+          readOnly: true
+        - name: work
+          mountPath: /work
+  volumes:
+    - name: cuda
+      hostPath:
+        path: /usr/local/cuda
+        type: Directory
+    - name: work
+      emptyDir: {}

From d22ebf175f9bae2077a6fb9f227e835a815ce9a5 Mon Sep 17 00:00:00 2001
From: David Ndungu <ndungu.sink@gmail.com>
Date: Sat, 20 Jun 2026 14:55:16 -0700
Subject: [PATCH 4/5] fix(cuda): pass dropout p/invKeep as int bit patterns
 (purego ABI)

The first GB10 run failed: GPU dropout returned all zeros vs the CPU reference.
Root cause is the purego/dlopen launch ABI -- the AAPCS64 trampoline
(internal/cuda/purego_linux_arm64.s) loads every argument into integer
registers R0-R7 only and never populates the V float registers. A C kernel with
`float` parameters reads them from V registers (garbage/zero) AND, because
integer and float args consume separate AAPCS64 register sequences, every
argument after the first float is shifted -- so seed/training/stream were all
misread and the kernel wrote zeros.

Fix: pass p and invKeep as their 32-bit IEEE-754 bit patterns in uint32 integer
parameters and reinterpret them inside the kernel with __uint_as_float. The ABI
is now integer-only and identical between the CGO (-tags cuda) and purego
launch paths. Updated dropout.cu, dropout.go (cgo), dropout_purego.go.

Claude-Session: https://claude.ai/code/session_01So96MEV1hiThH4XqCd6rLH
---
 internal/cuda/kernels/dropout.cu        | 14 ++++++++++++--
 internal/cuda/kernels/dropout.go        | 11 ++++++++---
 internal/cuda/kernels/dropout_purego.go |  8 ++++++--
 3 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/internal/cuda/kernels/dropout.cu b/internal/cuda/kernels/dropout.cu
index 0183d05..a4b8fcd 100644
--- a/internal/cuda/kernels/dropout.cu
+++ b/internal/cuda/kernels/dropout.cu
@@ -74,13 +74,23 @@ extern "C" {
 
 // dropout_f32 applies inverted dropout to in[0..n-1], writing out[0..n-1].
 // training != 0 and p > 0 => masked-and-scaled; otherwise exact identity copy.
-// invKeep must be 1/(1-p) (computed host-side to match the CPU path bit-for-bit).
+//
+// p and invKeep (= 1/(1-p)) are passed as their 32-bit IEEE-754 BIT PATTERNS in
+// integer parameters, not as `float`. The purego/dlopen launch path (no CGO)
+// loads every argument into integer registers only (the AAPCS64 trampoline
+// never populates the V float registers), so a `float` parameter would read
+// garbage and shift every following argument. Passing the bits as uint32 and
+// reinterpreting here with __uint_as_float keeps the ABI integer-only and
+// identical between the CGO and purego paths. invKeep is computed host-side so
+// the scale matches the CPU path bit-for-bit.
 cudaError_t dropout_f32(const float* in, float* out, int n,
-                        float p, uint64_t seed, int training, float invKeep,
+                        uint32_t pBits, uint64_t seed, int training, uint32_t invKeepBits,
                         cudaStream_t stream) {
     const int BLOCK = 256;
     int grid = (n + BLOCK - 1) / BLOCK;
     if (grid < 1) grid = 1;
+    float p = __uint_as_float(pBits);
+    float invKeep = __uint_as_float(invKeepBits);
     if (training == 0 || p == 0.0f) {
         kernel_identity_copy<<<grid, BLOCK, 0, stream>>>(in, out, n);
         return cudaGetLastError();
diff --git a/internal/cuda/kernels/dropout.go b/internal/cuda/kernels/dropout.go
index 3b7458d..9721ce3 100644
--- a/internal/cuda/kernels/dropout.go
+++ b/internal/cuda/kernels/dropout.go
@@ -8,12 +8,15 @@ package kernels
 #include <stdint.h>
 
 extern cudaError_t dropout_f32(const float* in, float* out, int n,
-                               float p, uint64_t seed, int training, float invKeep,
+                               uint32_t pBits, uint64_t seed, int training, uint32_t invKeepBits,
                                cudaStream_t stream);
 */
 import "C"
 
-import "unsafe"
+import (
+	"math"
+	"unsafe"
+)
 
 // DropoutF32 launches the GPU inverted-dropout kernel with a deterministic
 // Philox mask keyed by (seed, element offset). in/out are [n] float32 device
@@ -27,9 +30,11 @@ func DropoutF32(in unsafe.Pointer, out unsafe.Pointer, n int,
 	if training {
 		tr = 1
 	}
+	// Pass p and invKeep as float32 bit patterns in integer params so the ABI
+	// is identical to the purego launch path (which cannot use float registers).
 	return checkCUDA(C.dropout_f32(
 		(*C.float)(in), (*C.float)(out), C.int(n),
-		C.float(p), C.uint64_t(seed), tr, C.float(invKeep),
+		C.uint32_t(math.Float32bits(p)), C.uint64_t(seed), tr, C.uint32_t(math.Float32bits(invKeep)),
 		stream(s),
 	), "dropout_f32")
 }
diff --git a/internal/cuda/kernels/dropout_purego.go b/internal/cuda/kernels/dropout_purego.go
index 100e54a..304aa6a 100644
--- a/internal/cuda/kernels/dropout_purego.go
+++ b/internal/cuda/kernels/dropout_purego.go
@@ -4,6 +4,7 @@ package kernels
 
 import (
 	"fmt"
+	"math"
 	"unsafe"
 
 	"github.com/zerfoo/ztensor/internal/cuda"
@@ -25,9 +26,12 @@ func DropoutF32(in unsafe.Pointer, out unsafe.Pointer, n int,
 	if training {
 		tr = 1
 	}
+	// p and invKeep are passed as their float32 bit patterns in INTEGER
+	// registers (the purego AAPCS64 trampoline never populates the V float
+	// registers); the kernel reinterprets them with __uint_as_float.
 	ret := cuda.Ccall(k.launchDropoutF32,
 		uintptr(in), uintptr(out), uintptr(n),
-		uintptr(floatBits(p)), uintptr(seed), tr,
-		uintptr(floatBits(invKeep)), uintptr(s))
+		uintptr(math.Float32bits(p)), uintptr(seed), tr,
+		uintptr(math.Float32bits(invKeep)), uintptr(s))
 	return checkKernel(ret, "dropout_f32")
 }

From bfc77b4cbd9b8cbe2ad43342abb3870e0bab42dd Mon Sep 17 00:00:00 2001
From: David Ndungu <ndungu.sink@gmail.com>
Date: Sat, 20 Jun 2026 14:57:00 -0700
Subject: [PATCH 5/5] fix(cuda): reinterpret dropout p/invKeep bits host-side
 (not __uint_as_float)

nvcc rejected calling the device intrinsic __uint_as_float from the __host__
launcher dropout_f32. Reinterpret the uint32 bit patterns to float on the host
with a memcpy helper (host_uint_as_float) instead; the resulting float kernel
args marshal correctly through the CUDA <<<>>> launch (kernel-arg passing is
unaffected by the host dlopen ABI).

Claude-Session: https://claude.ai/code/session_01So96MEV1hiThH4XqCd6rLH
---
 internal/cuda/kernels/dropout.cu | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/internal/cuda/kernels/dropout.cu b/internal/cuda/kernels/dropout.cu
index a4b8fcd..2fd5481 100644
--- a/internal/cuda/kernels/dropout.cu
+++ b/internal/cuda/kernels/dropout.cu
@@ -18,6 +18,15 @@
 
 #include <cuda_runtime.h>
 #include <stdint.h>
+#include <string.h>
+
+// host_uint_as_float reinterprets a 32-bit pattern as a float on the HOST
+// (the device intrinsic __uint_as_float is not callable from host code).
+static inline float host_uint_as_float(uint32_t bits) {
+    float f;
+    memcpy(&f, &bits, sizeof(f));
+    return f;
+}
 
 #define PHILOX_M0 0xD2511F53u
 #define PHILOX_M1 0xCD9E8D57u
@@ -89,8 +98,8 @@ cudaError_t dropout_f32(const float* in, float* out, int n,
     const int BLOCK = 256;
     int grid = (n + BLOCK - 1) / BLOCK;
     if (grid < 1) grid = 1;
-    float p = __uint_as_float(pBits);
-    float invKeep = __uint_as_float(invKeepBits);
+    float p = host_uint_as_float(pBits);
+    float invKeep = host_uint_as_float(invKeepBits);
     if (training == 0 || p == 0.0f) {
         kernel_identity_copy<<<grid, BLOCK, 0, stream>>>(in, out, n);
         return cudaGetLastError();