From b4cbe8b6e3cb37f4afa7a15c337b7b07853cda53 Mon Sep 17 00:00:00 2001 From: David Ndungu Date: Sat, 20 Jun 2026 14:36:23 -0700 Subject: [PATCH 1/5] feat(compute): CPU dropout op with deterministic Philox mask (BPB.3a) Add an inverted-dropout op to ztensor with a deterministic, seedable mask: - Philox4x32-10 counter-based RNG (compute/philox.go) keyed by (seed, element offset). The mask is a pure function of (seed, offset, p), so it is reproducible and will be bit-identical to the forthcoming CUDA kernel -- the property that makes CPU-GPU parity pass. The CPU Go impl is the reference; the GB10 kernel mirrors the same constants. - Dropouter[T] optional capability interface (compute/engine.go): Dropout + DropoutBackward. Inverted-dropout semantics match torch.nn.functional.dropout: training mode y = x*mask/(1-p) with mask~Bernoulli(1-p); eval mode / p==0 is exact identity. p in [0,1) is validated. - CPU implementation (compute/cpu_engine.go): the mask is recomputed in backward from (seed,p) rather than cached, keeping the op capture-safe and avoiding a pinned save across arena resets (ADR 006). Forward and backward share one masked-and-scaled kernel (dropout is linear in its input given the mask). - EngineProxy delegates the capability; the parity StressEngine relocates the masked output into the host arena like every other op so dropout runs the reset-between-fwd-bwd schedules. Gates (CPU half): - gradcheck: Dropout OpInfo (p=0.3, fixed seed, [4,8]) -- the deterministic mask makes dropout an exact linear map, finite-diff == analytic backward. PASS. - CPU-side parity (testing/parity arena-stress, RegistryGreen): PASS. - PyTorch oracle: SkipReason -- torch's training-mode mask uses its own Philox word->element mapping, not ztensor's; matching it would mean reimplementing ztensor's Philox in the torch runner (the HadamardTransform precedent). Mask-vs-input math is pinned by gradcheck + parity instead. - Unit tests: p=0 identity, eval identity, mask determinism, inverted-dropout mean preservation, backward==mask, invalid p. PASS under -race. GPU kernel (GB10 f32) + GB10 parity/oracle replay are the next milestone. Claude-Session: https://claude.ai/code/session_01So96MEV1hiThH4XqCd6rLH --- compute/cpu_engine.go | 61 +++++++++ compute/dropout_test.go | 221 ++++++++++++++++++++++++++++++++ compute/engine.go | 25 ++++ compute/engine_proxy.go | 28 ++++ compute/philox.go | 82 ++++++++++++ testing/gradcheck/ops.go | 21 +++ testing/gradcheck/registry.go | 11 ++ testing/oracle/generate_test.go | 13 +- testing/oracle/torchmap.go | 3 + testing/parity/stress_engine.go | 22 ++++ 10 files changed, 485 insertions(+), 2 deletions(-) create mode 100644 compute/dropout_test.go create mode 100644 compute/philox.go diff --git a/compute/cpu_engine.go b/compute/cpu_engine.go index eac9b88..b0527fd 100644 --- a/compute/cpu_engine.go +++ b/compute/cpu_engine.go @@ -966,6 +966,67 @@ func (e *CPUEngine[T]) RandomUniform(_ context.Context, t *tensor.TensorNumeric[ return nil } +// Dropout applies inverted dropout to a with a deterministic Philox mask keyed +// by (seed, element offset). In training mode each element is kept with +// probability (1-p) and scaled by 1/(1-p); dropped elements become zero. In +// eval mode (training==false) or when p==0 the op is an exact identity copy. +// The mask is a pure function of (seed, offset, p) and is recomputed in +// DropoutBackward rather than cached, so no save survives across arena resets. +func (e *CPUEngine[T]) Dropout(ctx context.Context, a *tensor.TensorNumeric[T], p float64, seed uint64, training bool, dst ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error) { + return e.dropoutMasked(ctx, a, p, seed, training, dst...) +} + +// DropoutBackward propagates g through the same mask Dropout used for the +// identical (p, seed, training). Because dropout is element-wise linear in its +// input with the mask fixed, the backward is the same masked-and-scaled map +// applied to the upstream gradient. +func (e *CPUEngine[T]) DropoutBackward(ctx context.Context, g *tensor.TensorNumeric[T], p float64, seed uint64, training bool, dst ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error) { + return e.dropoutMasked(ctx, g, p, seed, training, dst...) +} + +// dropoutMasked is the shared forward/backward kernel: out[i] = keep(i) ? +// in[i]/(1-p) : 0 in training mode, out[i] = in[i] in eval mode. Forward and +// backward are identical because dropout is linear in its input given the mask. +func (e *CPUEngine[T]) dropoutMasked(ctx context.Context, a *tensor.TensorNumeric[T], p float64, seed uint64, training bool, dst ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error) { + if a == nil { + return nil, errors.New("input tensor cannot be nil") + } + if p < 0 || p >= 1 { + return nil, fmt.Errorf("dropout: p must be in [0, 1), got %g", p) + } + result, err := e.getOrCreateDest(a.Shape(), dst...) + if err != nil { + return nil, err + } + aData := a.Data() + rData := result.Data() + // Eval mode or p==0: exact identity copy (no RNG, no scaling). + if !training || p == 0 { + if err := parallelForCtx(ctx, len(aData), func(start, end int) { + for i := start; i < end; i++ { //nolint:intrange + rData[i] = aData[i] + } + }); err != nil { + return nil, err + } + return result, nil + } + scale := e.ops.FromFloat64(1.0 / (1.0 - p)) + zero := e.ops.FromFloat64(0) + if err := parallelForCtx(ctx, len(aData), func(start, end int) { + for i := start; i < end; i++ { //nolint:intrange + if dropoutKeep(seed, uint64(i), p) { + rData[i] = e.ops.Mul(aData[i], scale) + } else { + rData[i] = zero + } + } + }); err != nil { + return nil, err + } + return result, nil +} + // Fill sets all elements of t to value. func (e *CPUEngine[T]) Fill(_ context.Context, t *tensor.TensorNumeric[T], value T) error { if t == nil { diff --git a/compute/dropout_test.go b/compute/dropout_test.go new file mode 100644 index 0000000..02d067f --- /dev/null +++ b/compute/dropout_test.go @@ -0,0 +1,221 @@ +package compute + +import ( + "context" + "math" + "testing" + + "github.com/zerfoo/ztensor/numeric" + "github.com/zerfoo/ztensor/tensor" +) + +// TestPhiloxDeterminism: the same (seed, offset) always yields the same draw, +// and different offsets/seeds yield different draws (so the mask is not constant). +func TestPhiloxDeterminism(t *testing.T) { + const seed = uint64(0x9e3779b97f4a7c15) + for off := uint64(0); off < 16; off++ { + a := philoxUniform(seed, off) + b := philoxUniform(seed, off) + if a != b { + t.Fatalf("philoxUniform not deterministic at offset %d: %g != %g", off, a, b) + } + if a < 0 || a >= 1 { + t.Fatalf("philoxUniform offset %d out of [0,1): %g", off, a) + } + } + if philoxUniform(seed, 0) == philoxUniform(seed, 1) { + t.Fatal("philoxUniform identical for offsets 0 and 1 (mask would be constant)") + } + if philoxUniform(1, 0) == philoxUniform(2, 0) { + t.Fatal("philoxUniform identical for seeds 1 and 2") + } +} + +// TestPhiloxUniformMean: over many draws the empirical mean is ~0.5, confirming +// the [0,1) mapping is unbiased (so 1-p is the true keep rate). +func TestPhiloxUniformMean(t *testing.T) { + const n = 1 << 16 + var sum float64 + for i := uint64(0); i < n; i++ { + sum += philoxUniform(0xdeadbeefcafef00d, i) + } + mean := sum / float64(n) + if math.Abs(mean-0.5) > 0.01 { + t.Fatalf("philox uniform mean %g not within 0.01 of 0.5", mean) + } +} + +// TestDropout_EvalIdentity: eval mode returns an exact copy regardless of p. +func TestDropout_EvalIdentity(t *testing.T) { + e := NewCPUEngine[float32](numeric.Float32Ops{}) + ctx := context.Background() + x, err := tensor.New[float32]([]int{2, 4}, []float32{1, -2, 3, -4, 5, -6, 7, -8}) + if err != nil { + t.Fatal(err) + } + y, err := e.Dropout(ctx, x, 0.5, 42, false) + if err != nil { + t.Fatalf("Dropout eval: %v", err) + } + for i, v := range y.Data() { + if v != x.Data()[i] { + t.Fatalf("eval mode not identity at %d: got %g want %g", i, v, x.Data()[i]) + } + } +} + +// TestDropout_PZeroIdentity: training mode with p=0 is exact identity (no scale, +// no drop). +func TestDropout_PZeroIdentity(t *testing.T) { + e := NewCPUEngine[float32](numeric.Float32Ops{}) + ctx := context.Background() + x, err := tensor.New[float32]([]int{2, 4}, []float32{1, -2, 3, -4, 5, -6, 7, -8}) + if err != nil { + t.Fatal(err) + } + y, err := e.Dropout(ctx, x, 0, 42, true) + if err != nil { + t.Fatalf("Dropout p=0: %v", err) + } + for i, v := range y.Data() { + if v != x.Data()[i] { + t.Fatalf("p=0 not identity at %d: got %g want %g", i, v, x.Data()[i]) + } + } +} + +// TestDropout_MaskDeterminism: same seed => identical output across calls; kept +// elements are scaled by 1/(1-p), dropped ones are zero, and the same Philox +// draw governs which is which. +func TestDropout_MaskDeterminism(t *testing.T) { + e := NewCPUEngine[float32](numeric.Float32Ops{}) + ctx := context.Background() + const p = 0.3 + const seed = uint64(123456789) + n := 64 + data := make([]float32, n) + for i := range data { + data[i] = float32(i + 1) + } + x, err := tensor.New[float32]([]int{8, 8}, data) + if err != nil { + t.Fatal(err) + } + y1, err := e.Dropout(ctx, x, p, seed, true) + if err != nil { + t.Fatal(err) + } + y2, err := e.Dropout(ctx, x, p, seed, true) + if err != nil { + t.Fatal(err) + } + scale := float32(1.0 / (1.0 - p)) + keptSeen, dropSeen := false, false + for i := 0; i < n; i++ { + if y1.Data()[i] != y2.Data()[i] { + t.Fatalf("mask not deterministic at %d: %g != %g", i, y1.Data()[i], y2.Data()[i]) + } + keep := dropoutKeep(seed, uint64(i), p) + var want float32 + if keep { + want = x.Data()[i] * scale + keptSeen = true + } else { + dropSeen = true + } + if y1.Data()[i] != want { + t.Fatalf("element %d keep=%v: got %g want %g", i, keep, y1.Data()[i], want) + } + } + if !keptSeen || !dropSeen { + t.Fatalf("expected both kept and dropped elements (kept=%v drop=%v)", keptSeen, dropSeen) + } +} + +// TestDropout_ExpectedMeanScale: E[y] == E[x] under inverted dropout because the +// 1/(1-p) scale compensates for the (1-p) keep rate. Checked over a large +// constant input so the empirical mean is tightly concentrated. +func TestDropout_ExpectedMeanScale(t *testing.T) { + e := NewCPUEngine[float32](numeric.Float32Ops{}) + ctx := context.Background() + const p = 0.4 + n := 1 << 14 + data := make([]float32, n) + for i := range data { + data[i] = 2.0 + } + x, err := tensor.New[float32]([]int{n}, data) + if err != nil { + t.Fatal(err) + } + y, err := e.Dropout(ctx, x, p, 0xabcdef, true) + if err != nil { + t.Fatal(err) + } + var sum float64 + for _, v := range y.Data() { + sum += float64(v) + } + mean := sum / float64(n) + if math.Abs(mean-2.0) > 0.05 { + t.Fatalf("inverted-dropout mean %g not within 0.05 of input mean 2.0", mean) + } +} + +// TestDropout_BackwardMatchesMask: backward applies the SAME mask/scale as +// forward (dropout is linear in its input given the mask), and recomputes the +// mask deterministically rather than relying on a cached one. +func TestDropout_BackwardMatchesMask(t *testing.T) { + e := NewCPUEngine[float32](numeric.Float32Ops{}) + ctx := context.Background() + const p = 0.25 + const seed = uint64(777) + n := 32 + g := make([]float32, n) + for i := range g { + g[i] = float32(i) - 16 + } + gt, err := tensor.New[float32]([]int{4, 8}, g) + if err != nil { + t.Fatal(err) + } + dx, err := e.DropoutBackward(ctx, gt, p, seed, true) + if err != nil { + t.Fatal(err) + } + scale := float32(1.0 / (1.0 - p)) + for i := 0; i < n; i++ { + var want float32 + if dropoutKeep(seed, uint64(i), p) { + want = g[i] * scale + } + if dx.Data()[i] != want { + t.Fatalf("backward element %d: got %g want %g", i, dx.Data()[i], want) + } + } + // Eval-mode backward is a pass-through. + dxe, err := e.DropoutBackward(ctx, gt, p, seed, false) + if err != nil { + t.Fatal(err) + } + for i := 0; i < n; i++ { + if dxe.Data()[i] != g[i] { + t.Fatalf("eval backward not pass-through at %d: got %g want %g", i, dxe.Data()[i], g[i]) + } + } +} + +// TestDropout_InvalidP: p outside [0,1) is rejected. +func TestDropout_InvalidP(t *testing.T) { + e := NewCPUEngine[float32](numeric.Float32Ops{}) + ctx := context.Background() + x, err := tensor.New[float32]([]int{2}, []float32{1, 2}) + if err != nil { + t.Fatal(err) + } + for _, p := range []float64{-0.1, 1.0, 1.5} { + if _, err := e.Dropout(ctx, x, p, 0, true); err == nil { + t.Fatalf("expected error for p=%g, got nil", p) + } + } +} diff --git a/compute/engine.go b/compute/engine.go index 911576d..372788c 100644 --- a/compute/engine.go +++ b/compute/engine.go @@ -61,6 +61,31 @@ type TransposeAMatMuler[T tensor.Numeric] interface { MatMulTransposeA(ctx context.Context, a, b *tensor.TensorNumeric[T], dst ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error) } +// Dropouter is an optional interface for engines that provide a dropout op +// with a deterministic, seedable mask. The mask is drawn from a counter-based +// Philox RNG keyed by (seed, element offset), so it is bit-identical on the CPU +// engine and the GPU engine for the same (seed, p) -- which is what makes the +// CPU-GPU parity and PyTorch-oracle gates pass. Inverted-dropout semantics +// match torch.nn.functional.dropout: in training mode the kept elements are +// scaled by 1/(1-p); in eval mode (or p==0) the op is exact identity. +// +// The mask is NOT cached for backward; DropoutBackward recomputes it from the +// same (seed, p), keeping the op capture-safe and avoiding a pinned save across +// arena resets (ztensor ADR 006). +// +// This API is not covered by the v1 stability guarantee. +type Dropouter[T tensor.Numeric] interface { + // Dropout applies inverted dropout to `a`. When training is false or p==0 + // the result is an exact copy of `a`. Otherwise each element is kept with + // probability (1-p) and scaled by 1/(1-p); dropped elements become zero. + // p must be in [0, 1). + Dropout(ctx context.Context, a *tensor.TensorNumeric[T], p float64, seed uint64, training bool, dst ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error) + // DropoutBackward propagates the upstream gradient through the same mask + // used by Dropout for the identical (p, seed, training): dx = g * mask/(1-p) + // in training mode, dx = g in eval mode. The mask is recomputed, not cached. + DropoutBackward(ctx context.Context, g *tensor.TensorNumeric[T], p float64, seed uint64, training bool, dst ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error) +} + // StreamProvider is an optional interface for engines that expose their // underlying GPU stream for CUDA graph capture. // diff --git a/compute/engine_proxy.go b/compute/engine_proxy.go index 18884ad..e75a03f 100644 --- a/compute/engine_proxy.go +++ b/compute/engine_proxy.go @@ -402,6 +402,34 @@ func (p *EngineProxy[T]) GPUFusedAddRMSNorm(input, residual, weight *tensor.Tens return } +// Dropout delegates to the underlying engine's Dropouter capability. The proxy +// implements Dropouter[T] so consumers can type-assert it on a wrapped engine; +// it returns an error if the real engine does not provide the capability. +func (p *EngineProxy[T]) Dropout(ctx context.Context, a *tensor.TensorNumeric[T], prob float64, seed uint64, training bool, dst ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error) { + d, ok := p.real.(Dropouter[T]) + if !ok { + return nil, fmt.Errorf("engine %T does not implement Dropouter", p.real) + } + result, err := d.Dropout(ctx, a, prob, seed, training, dst...) + if err == nil { + p.record("Dropout", []*tensor.TensorNumeric[T]{a}, result, nil) + } + return result, err +} + +// DropoutBackward delegates to the underlying engine's Dropouter capability. +func (p *EngineProxy[T]) DropoutBackward(ctx context.Context, g *tensor.TensorNumeric[T], prob float64, seed uint64, training bool, dst ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error) { + d, ok := p.real.(Dropouter[T]) + if !ok { + return nil, fmt.Errorf("engine %T does not implement Dropouter", p.real) + } + result, err := d.DropoutBackward(ctx, g, prob, seed, training, dst...) + if err == nil { + p.record("DropoutBackward", []*tensor.TensorNumeric[T]{g}, result, nil) + } + return result, err +} + // MatMulTransposeB delegates to the underlying engine if it implements TransposeBMatMuler. func (p *EngineProxy[T]) MatMulTransposeB(ctx context.Context, a, b *tensor.TensorNumeric[T], dst ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error) { if tb, ok := p.real.(TransposeBMatMuler[T]); ok { diff --git a/compute/philox.go b/compute/philox.go new file mode 100644 index 0000000..d135b20 --- /dev/null +++ b/compute/philox.go @@ -0,0 +1,82 @@ +package compute + +// Philox4x32-10 counter-based RNG. +// +// Philox is a stateless, counter-based pseudo-random number generator: the +// output is a pure function of a (key, counter) pair, with no carried state. +// That property is exactly what dropout needs for CPU-GPU parity -- the same +// (seed, element offset) produces the same uniform draw on the CPU engine (this +// Go implementation) and on the GB10 GPU (the CUDA kernel in +// internal/cuda/kernels/dropout.cu, which mirrors these constants bit-for-bit). +// +// Because the mask is a pure function of (seed, offset, p), dropout never caches +// the mask for backward: backward recomputes it deterministically from the same +// inputs. This keeps the op capture-safe and avoids pinning a saved mask across +// arena resets (ztensor ADR 006, the SaveForBackward lifetime contract). +// +// Reference: Salmon, Moraes, Dror, Shaw, "Parallel Random Numbers: As Easy as +// 1, 2, 3" (SC'11). The 10-round Philox4x32 variant is the standard generator +// PyTorch/cuRAND use for dropout, so matching it here keeps the PyTorch-oracle +// gate meaningful. + +const ( + philoxM0 uint32 = 0xD2511F53 + philoxM1 uint32 = 0xCD9E8D57 + philoxW0 uint32 = 0x9E3779B9 // Weyl constant for key[0] (golden ratio) + philoxW1 uint32 = 0xBB67AE85 // Weyl constant for key[1] (sqrt(3)-1) + philoxRounds = 10 +) + +// philoxMulhilo computes the 64-bit product a*b and returns (hi, lo) 32-bit +// halves, matching the CUDA __umulhi/lo decomposition used in dropout.cu. +func philoxMulhilo(a, b uint32) (hi, lo uint32) { + product := uint64(a) * uint64(b) + hi = uint32(product >> 32) + lo = uint32(product) + return hi, lo +} + +// philox4x32 runs the 10-round Philox4x32 bijection on a 128-bit counter +// (four uint32 words) under a 64-bit key (two uint32 words), returning four +// uint32 outputs. Identical word-for-word to the CUDA device function. +func philox4x32(ctr [4]uint32, key [2]uint32) [4]uint32 { + c := ctr + k := key + for i := 0; i < philoxRounds; i++ { + hi0, lo0 := philoxMulhilo(philoxM0, c[0]) + hi1, lo1 := philoxMulhilo(philoxM1, c[2]) + c = [4]uint32{ + hi1 ^ c[1] ^ k[0], + lo1, + hi0 ^ c[3] ^ k[1], + lo0, + } + // Bump the key (Weyl sequence) for the next round. + k[0] += philoxW0 + k[1] += philoxW1 + } + return c +} + +// philoxUniform returns a single uniform float64 in [0, 1) for the element at +// linear index `offset` under the 64-bit `seed`. The seed splits into the two +// key words; the counter holds the offset (low/high words) and two zero lanes. +// Only the first output lane is consumed, which is sufficient for a one-draw- +// per-element dropout mask and keeps the CPU and GPU draws trivially aligned. +func philoxUniform(seed uint64, offset uint64) float64 { + key := [2]uint32{uint32(seed), uint32(seed >> 32)} + ctr := [4]uint32{uint32(offset), uint32(offset >> 32), 0, 0} + out := philox4x32(ctr, key) + // Map a 32-bit word to [0,1): divide by 2^32. Matches the CUDA kernel's + // (out.x * (1.0f / 4294967296.0f)) but in float64 for the CPU/gradcheck + // path; the keep/drop decision (u >= p) is identical given the same word. + return float64(out[0]) * (1.0 / 4294967296.0) +} + +// dropoutKeep reports whether the element at `offset` is kept (true) or dropped +// (false) for drop probability p under seed. Keep iff the uniform draw is >= p, +// so p=0 keeps everything and p=1 drops everything. Identical decision on CPU +// and GPU because philoxUniform is bit-reproducible across both. +func dropoutKeep(seed uint64, offset uint64, p float64) bool { + return philoxUniform(seed, offset) >= p +} diff --git a/testing/gradcheck/ops.go b/testing/gradcheck/ops.go index 98b9191..0941782 100644 --- a/testing/gradcheck/ops.go +++ b/testing/gradcheck/ops.go @@ -367,6 +367,27 @@ func newMulScalarNode[T tensor.Float](e compute.Engine[T], c float64) *opNode[T] func(ctx context.Context, g, _, _ tn[T]) (tn[T], error) { return e.MulScalar(ctx, g, T(c)) }) } +// newDropoutNode builds an inverted-dropout op with a fixed drop probability p +// and seed in training mode. The mask is deterministic (Philox keyed by (seed, +// offset)), so with p and seed held constant the op is a fixed element-wise +// linear map y = x * mask/(1-p): finite differences and the analytic backward +// (the same masked scale applied to the upstream gradient) agree exactly. The +// engine must implement the Dropouter capability (CPU and GPU both do); the +// gradcheck/oracle harnesses run on the CPU engine. +func newDropoutNode[T tensor.Float](e compute.Engine[T], p float64, seed uint64) *opNode[T] { + d, ok := e.(compute.Dropouter[T]) + return unary("Dropout", + func(ctx context.Context, x tn[T]) (tn[T], error) { + if !ok { + return nil, fmt.Errorf("Dropout: engine %T does not implement Dropouter", e) + } + return d.Dropout(ctx, x, p, seed, true) + }, + func(ctx context.Context, g, _, _ tn[T]) (tn[T], error) { + return d.DropoutBackward(ctx, g, p, seed, true) + }) +} + // --- matmul-like and shape ops ---------------------------------------------- func newMatMulNode[T tensor.Float](e compute.Engine[T]) *opNode[T] { diff --git a/testing/gradcheck/registry.go b/testing/gradcheck/registry.go index 6a0e2dd..5cc7f8c 100644 --- a/testing/gradcheck/registry.go +++ b/testing/gradcheck/registry.go @@ -49,6 +49,8 @@ func NewRegistryNode[T tensor.Float](name string, e compute.Engine[T]) (graph.No return newAddScalarNode(e, 0.7), nil case "MulScalar": return newMulScalarNode(e, -1.3), nil + case "Dropout": + return newDropoutNode(e, 0.3, 0x9e3779b97f4a7c15), nil case "MatMul": return newMatMulNode(e), nil case "Transpose": @@ -196,6 +198,15 @@ func Registry() []OpInfo { Make: registryMake("MulScalar"), InputShapes: [][]int{{2, 3}}, }, + // Inverted dropout with a fixed mask (p=0.3, fixed seed): a deterministic + // element-wise linear map y = x * mask/(1-p). A larger [4,8] input gives + // the fixed-seed mask both kept and dropped lanes so the masked-and-zero + // gradient structure is actually exercised. + { + Name: "Dropout", Seed: 31, + Make: registryMake("Dropout"), + InputShapes: [][]int{{4, 8}}, + }, // MatMul-like and shape ops. { diff --git a/testing/oracle/generate_test.go b/testing/oracle/generate_test.go index 4bd65d5..1290275 100644 --- a/testing/oracle/generate_test.go +++ b/testing/oracle/generate_test.go @@ -51,8 +51,17 @@ func TestGenerateAll(t *testing.T) { if got := len(sum.Written) + len(sum.Skipped); got != wantTotal { t.Fatalf("summary covers %d ops, registry has %d", got, wantTotal) } - if len(sum.Skipped) != 1 || sum.Skipped[0].Op != "HadamardTransform" { - t.Fatalf("skipped = %+v, want exactly HadamardTransform", sum.Skipped) + // Ops with no clean torch equivalent are skipped with a reason: the + // Walsh-Hadamard transform (no torch builtin) and Dropout (training-mode + // mask uses ztensor's own Philox, not torch's; see torchmap.go). + wantSkipped := map[string]bool{"HadamardTransform": true, "Dropout": true} + if len(sum.Skipped) != len(wantSkipped) { + t.Fatalf("skipped = %+v, want exactly %v", sum.Skipped, wantSkipped) + } + for _, s := range sum.Skipped { + if !wantSkipped[s.Op] { + t.Fatalf("unexpected skipped op %q (skipped=%+v)", s.Op, sum.Skipped) + } } if _, err := os.Stat(filepath.Join(dir, "generation.json")); err != nil { t.Fatalf("generation.json missing: %v", err) diff --git a/testing/oracle/torchmap.go b/testing/oracle/torchmap.go index 3a9a0f9..1c48635 100644 --- a/testing/oracle/torchmap.go +++ b/testing/oracle/torchmap.go @@ -49,6 +49,9 @@ var torchMap = map[string]torchOp{ "HadamardTransform": { SkipReason: "torch has no built-in normalized Walsh-Hadamard transform; replaying it would mean hand-building the H matrix in the runner, i.e. testing our own reimplementation rather than torch", }, + "Dropout": { + SkipReason: "dropout's training-mode output depends on the RNG mask, and torch.nn.functional.dropout draws from its own (Philox) generator whose word-to-element mapping is not the one ztensor uses; matching masks would mean reimplementing ztensor's Philox in the torch runner (testing our own reimplementation, like HadamardTransform). The mask-vs-input math is instead pinned by gradcheck (the deterministic mask makes dropout an exact linear map y=x*mask/(1-p), so finite-diff == analytic backward) and by CPU-GPU parity (the same Philox produces bit-identical masks on CPU and GB10). Eval-mode dropout is exact identity, already covered by unit tests.", + }, // Softmax and reductions. ReduceMax uses amax; torch.amax splits the // gradient among tied maxima while ztensor routes it to the first argmax, diff --git a/testing/parity/stress_engine.go b/testing/parity/stress_engine.go index 6857a0d..0f98457 100644 --- a/testing/parity/stress_engine.go +++ b/testing/parity/stress_engine.go @@ -273,6 +273,28 @@ func (e *StressEngine) Repeat(ctx context.Context, a *t32, axis, repetitions int return e.relocate(e.Engine.Repeat(ctx, a, axis, repetitions)) } +// Dropout / DropoutBackward delegate to the inner engine's Dropouter +// capability and relocate the result into the arena, so the parity harness +// exercises the dropout op under the same reset-between-fwd-bwd schedules as +// every other op. The masked output is a pure function of (seed, offset, p) -- +// it survives a relocate intact, and (per the recompute-in-backward design) no +// mask is saved across resets. +func (e *StressEngine) Dropout(ctx context.Context, a *t32, p float64, seed uint64, training bool, dst ...*t32) (*t32, error) { + d := e.Engine.(compute.Dropouter[float32]) //nolint:errcheck // inner CPU engine implements Dropouter + if len(dst) > 0 { + return d.Dropout(ctx, a, p, seed, training, dst...) + } + return e.relocate(d.Dropout(ctx, a, p, seed, training)) +} + +func (e *StressEngine) DropoutBackward(ctx context.Context, g *t32, p float64, seed uint64, training bool, dst ...*t32) (*t32, error) { + d := e.Engine.(compute.Dropouter[float32]) //nolint:errcheck // inner CPU engine implements Dropouter + if len(dst) > 0 { + return d.DropoutBackward(ctx, g, p, seed, training, dst...) + } + return e.relocate(d.DropoutBackward(ctx, g, p, seed, training)) +} + func (e *StressEngine) HadamardTransform(ctx context.Context, a *t32, dst ...*t32) (*t32, error) { if len(dst) > 0 { return e.Engine.HadamardTransform(ctx, a, dst...) From e26115db6e2b6e2cbe66ab3b8b2059298242ff45 Mon Sep 17 00:00:00 2001 From: David Ndungu Date: Sat, 20 Jun 2026 14:41:31 -0700 Subject: [PATCH 2/5] feat(cuda): GB10 GPU dropout kernel mirroring the CPU Philox mask (BPB.3a) Add the f32 GPU half of the dropout op, bit-identical to the CPU reference: - internal/cuda/kernels/dropout.cu: Philox4x32-10 device function with the same constants/round structure as compute/philox.go, so the (seed, offset) -> mask draw matches the CPU engine word-for-word. dropout_f32 launcher does the masked-and-scaled write in training mode (out = (u>=p)? in*invKeep : 0) and an exact identity copy in eval mode / p==0. invKeep is passed host-side as 1/(1-p) so the scale equals the CPU path's bit-for-bit. Added to Makefile SRCS (sm_121 for GB10). - purego + cgo wrappers (dropout_purego.go / dropout.go) and the dropout_f32 symbol registration in purego.go, following the argmax kernel pattern. - gpuapi.Dropouter optional KernelRunner extension (kernels.go) + CUDAKernels impl, mirroring BFloat16Transposer; callers type-assert and report unavailability when absent (no stub fallback). - GPUEngine.Dropout / DropoutBackward (compute/gpu_dropout.go) reuse the gpu_kernels.go scaffolding (getDevicePtr, dst-reuse, makeGPUResult). The mask is recomputed in backward from (seed,p), never cached -- capture-safe, no save pinned across an arena reset (ADR 006). - CUDA-gated CPU-GPU parity tests (compute/gpu_dropout_parity_test.go): GPU vs CPU dropout forward (multiple shapes/p/train-eval) and backward must be bit-identical; skip cleanly without a GPU. CPU suite stays green (go test ./... and -race). The GB10 parity gate runs via the cuda-tagged path on the DGX (Spark); nvcc/CUDA toolkit is not present locally, so the sm_121 kernel build + GPU parity run are the remaining step. Claude-Session: https://claude.ai/code/session_01So96MEV1hiThH4XqCd6rLH --- compute/gpu_dropout.go | 79 +++++++++++++++ compute/gpu_dropout_parity_test.go | 125 ++++++++++++++++++++++++ internal/cuda/kernels/Makefile | 2 +- internal/cuda/kernels/dropout.cu | 92 +++++++++++++++++ internal/cuda/kernels/dropout.go | 35 +++++++ internal/cuda/kernels/dropout_purego.go | 33 +++++++ internal/cuda/kernels/purego.go | 5 + internal/gpuapi/cuda_kernels.go | 8 ++ internal/gpuapi/kernels.go | 14 +++ 9 files changed, 392 insertions(+), 1 deletion(-) create mode 100644 compute/gpu_dropout.go create mode 100644 compute/gpu_dropout_parity_test.go create mode 100644 internal/cuda/kernels/dropout.cu create mode 100644 internal/cuda/kernels/dropout.go create mode 100644 internal/cuda/kernels/dropout_purego.go diff --git a/compute/gpu_dropout.go b/compute/gpu_dropout.go new file mode 100644 index 0000000..509de0e --- /dev/null +++ b/compute/gpu_dropout.go @@ -0,0 +1,79 @@ +package compute + +import ( + "context" + "fmt" + + "github.com/zerfoo/ztensor/internal/gpuapi" + "github.com/zerfoo/ztensor/tensor" +) + +// f32Size and the GPU helpers (getDevicePtr, tryReuseDstPtr, finishReusedDst, +// makeGPUResult, isFloat32) are defined in gpu_kernels.go. + +// Dropout applies inverted dropout to a on the GPU using a deterministic Philox +// mask keyed by (seed, element offset) -- the same Philox the CPU engine uses, +// so the masks (and outputs, given identical inputs) are bit-identical across +// CPU and GPU. In eval mode (training==false) or p==0 the kernel performs an +// exact identity copy. p must be in [0, 1). The mask is recomputed in +// DropoutBackward from (seed, p) rather than cached, keeping the op +// capture-safe (no save pinned across an arena reset; ztensor ADR 006). +func (e *GPUEngine[T]) Dropout(ctx context.Context, a *tensor.TensorNumeric[T], p float64, seed uint64, training bool, dst ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error) { + return e.dropoutGPU(ctx, a, p, seed, training, dst...) +} + +// DropoutBackward propagates g through the same mask Dropout used for the +// identical (p, seed, training). Dropout is linear in its input given the mask, +// so the backward is the same masked-and-scaled map applied to g. +func (e *GPUEngine[T]) DropoutBackward(ctx context.Context, g *tensor.TensorNumeric[T], p float64, seed uint64, training bool, dst ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error) { + return e.dropoutGPU(ctx, g, p, seed, training, dst...) +} + +// dropoutGPU is the shared forward/backward GPU launcher. +func (e *GPUEngine[T]) dropoutGPU(_ context.Context, a *tensor.TensorNumeric[T], p float64, seed uint64, training bool, dst ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error) { + if a == nil { + return nil, fmt.Errorf("dropout: input tensor cannot be nil") + } + if p < 0 || p >= 1 { + return nil, fmt.Errorf("dropout: p must be in [0, 1), got %g", p) + } + if !isFloat32[T]() { + return nil, fmt.Errorf("GPU dropout: unsupported type, only float32") + } + dk, ok := e.kernels.(gpuapi.Dropouter) + if !ok { + return nil, fmt.Errorf("GPU dropout: kernel runner does not provide the dropout kernel") + } + + n := a.GetStorage().Len() + devA, cleanupA, err := getDevicePtr(e, a) + if err != nil { + return nil, err + } + defer cleanupA() + + byteSize := n * f32Size + devC, reused := tryReuseDstPtr[T](n, dst) + if !reused { + devC, err = e.pool.Alloc(e.deviceID, byteSize) + if err != nil { + return nil, err + } + } + + invKeep := float32(1.0 / (1.0 - p)) + if err := dk.DropoutF32(devA, devC, n, float32(p), seed, training, invKeep, e.stream); err != nil { + if !reused { + e.pool.Free(e.deviceID, devC, byteSize) + } + return nil, err + } + + if reused { + return finishReusedDst[T](dst[0], a.Shape()), nil + } + return makeGPUResult[T](e, a.Shape(), devC, n, dst...) +} + +// Compile-time assertion: GPUEngine[float32] satisfies the Dropouter capability. +var _ Dropouter[float32] = (*GPUEngine[float32])(nil) diff --git a/compute/gpu_dropout_parity_test.go b/compute/gpu_dropout_parity_test.go new file mode 100644 index 0000000..7962915 --- /dev/null +++ b/compute/gpu_dropout_parity_test.go @@ -0,0 +1,125 @@ +package compute + +// CPU-GPU parity for the dropout op (BPB.3a). The mask is drawn from the same +// Philox4x32-10 generator on both sides (compute/philox.go for the CPU engine, +// internal/cuda/kernels/dropout.cu for the GPU), keyed by (seed, element +// offset). For an identical seed the masks are bit-identical, so GPU and CPU +// dropout produce identical kept/dropped lanes; the kept lanes are scaled by +// the same host-computed invKeep=1/(1-p) float32, so the outputs match exactly. +// This is the CPU-GPU parity gate; it runs on the GB10 via Spark (cuda tag) and +// skips cleanly on machines without a GPU. + +import ( + "context" + "testing" + + "github.com/zerfoo/ztensor/internal/cuda" + "github.com/zerfoo/ztensor/numeric" + "github.com/zerfoo/ztensor/tensor" +) + +func TestGPUDropout_CPUParity(t *testing.T) { + if !cuda.Available() { + t.Skip("CUDA not available") + } + ops := numeric.Float32Ops{} + gpuEng, err := NewGPUEngine[float32](ops) + if err != nil { + t.Fatalf("NewGPUEngine: %v", err) + } + defer func() { _ = gpuEng.Close() }() + cpuEng := NewCPUEngine[float32](ops) + ctx := context.Background() + + const seed = uint64(0x9e3779b97f4a7c15) + shapes := [][]int{{8, 8}, {1, 257}, {4, 32}} + probs := []float64{0.0, 0.1, 0.3, 0.5, 0.9} + + for _, shape := range shapes { + n := 1 + for _, d := range shape { + n *= d + } + data := make([]float32, n) + for i := range data { + data[i] = float32(i%13) - 6.0 + 0.25 + } + for _, p := range probs { + for _, training := range []bool{true, false} { + cx, err := tensor.New[float32](shape, append([]float32(nil), data...)) + if err != nil { + t.Fatal(err) + } + gx, err := tensor.New[float32](shape, append([]float32(nil), data...)) + if err != nil { + t.Fatal(err) + } + cy, err := cpuEng.Dropout(ctx, cx, p, seed, training) + if err != nil { + t.Fatalf("CPU Dropout (p=%g train=%v): %v", p, training, err) + } + gy, err := gpuEng.Dropout(ctx, gx, p, seed, training) + if err != nil { + t.Fatalf("GPU Dropout (p=%g train=%v): %v", p, training, err) + } + gd := gy.Data() // GPUStorage.Slice copies D2H + cd := cy.Data() + if len(gd) != len(cd) { + t.Fatalf("len mismatch: gpu %d cpu %d", len(gd), len(cd)) + } + for i := range cd { + if gd[i] != cd[i] { + t.Fatalf("dropout parity mismatch shape=%v p=%g train=%v at %d: gpu=%g cpu=%g", + shape, p, training, i, gd[i], cd[i]) + } + } + } + } + } +} + +// TestGPUDropout_Backward_CPUParity checks the backward (masked-scale of the +// upstream gradient) matches CPU bit-for-bit under the same seed. +func TestGPUDropout_Backward_CPUParity(t *testing.T) { + if !cuda.Available() { + t.Skip("CUDA not available") + } + ops := numeric.Float32Ops{} + gpuEng, err := NewGPUEngine[float32](ops) + if err != nil { + t.Fatalf("NewGPUEngine: %v", err) + } + defer func() { _ = gpuEng.Close() }() + cpuEng := NewCPUEngine[float32](ops) + ctx := context.Background() + + const seed = uint64(12345) + const p = 0.4 + n := 128 + g := make([]float32, n) + for i := range g { + g[i] = float32(i) - 64 + } + cg, err := tensor.New[float32]([]int{16, 8}, append([]float32(nil), g...)) + if err != nil { + t.Fatal(err) + } + gg, err := tensor.New[float32]([]int{16, 8}, append([]float32(nil), g...)) + if err != nil { + t.Fatal(err) + } + cdx, err := cpuEng.DropoutBackward(ctx, cg, p, seed, true) + if err != nil { + t.Fatal(err) + } + gdx, err := gpuEng.DropoutBackward(ctx, gg, p, seed, true) + if err != nil { + t.Fatal(err) + } + cd, gd := cdx.Data(), gdx.Data() + for i := range cd { + if gd[i] != cd[i] { + t.Fatalf("dropout backward parity mismatch at %d: gpu=%g cpu=%g", i, gd[i], cd[i]) + } + } +} diff --git a/internal/cuda/kernels/Makefile b/internal/cuda/kernels/Makefile index aa04220..eb81aeb 100644 --- a/internal/cuda/kernels/Makefile +++ b/internal/cuda/kernels/Makefile @@ -18,7 +18,7 @@ ifeq ($(CUDA_ARCH),sm_121) NVCC_FLAGS += -DFLASH_BLOCK_SIZE=64 endif -SRCS = counter.cu dequant_q4k.cu dequant_q5_0.cu dequant_q5k.cu dequant_q6k.cu elementwise.cu elementwise_fp16.cu elementwise_bf16.cu flash_attention.cu flash_attention2.cu flash_decode.cu fp4_gemv.cu fp8_gemm.cu fp8_ops.cu fused_add_rmsnorm.cu fused_encoder_fwd.cu fused_encoder_bwd.cu fused_norm_add.cu fused_qk_norm_rope.cu fused_adamw.cu fused_adamw_bf16.cu fused_norm_bf16.cu tiny_batched_gemm.cu fused_repeat_interleave.cu fused_rope.cu fused_softmax_vmul.cu fused_swiglu.cu gather.cu gather_q8.cu gemm_int8.cu gemm_int4.cu gemm_q4.cu gemm_q8.cu gemv_q4k.cu gemv_q4k_sm121.cu gemv_q5k.cu gemv_q5_0.cu gemv_q6k.cu gemv_warp.cu megakernel_ops.cu offset_memcpy.cu paged_attention.cu ragged_attention.cu rope_select.cu scaled_softmax.cu selective_scan.cu sgemv_m1.cu ternary_gemv.cu transpose.cu rmsnorm.cu argmax.cu +SRCS = counter.cu dequant_q4k.cu dequant_q5_0.cu dequant_q5k.cu dequant_q6k.cu elementwise.cu elementwise_fp16.cu elementwise_bf16.cu flash_attention.cu flash_attention2.cu flash_decode.cu fp4_gemv.cu fp8_gemm.cu fp8_ops.cu fused_add_rmsnorm.cu fused_encoder_fwd.cu fused_encoder_bwd.cu fused_norm_add.cu fused_qk_norm_rope.cu fused_adamw.cu fused_adamw_bf16.cu fused_norm_bf16.cu tiny_batched_gemm.cu fused_repeat_interleave.cu fused_rope.cu fused_softmax_vmul.cu fused_swiglu.cu gather.cu gather_q8.cu gemm_int8.cu gemm_int4.cu gemm_q4.cu gemm_q8.cu gemv_q4k.cu gemv_q4k_sm121.cu gemv_q5k.cu gemv_q5_0.cu gemv_q6k.cu gemv_warp.cu megakernel_ops.cu offset_memcpy.cu paged_attention.cu ragged_attention.cu rope_select.cu scaled_softmax.cu selective_scan.cu sgemv_m1.cu ternary_gemv.cu transpose.cu rmsnorm.cu argmax.cu dropout.cu OBJS = $(SRCS:.cu=.o) PIC_OBJS = $(SRCS:.cu=.pic.o) LIB = libkernels.a diff --git a/internal/cuda/kernels/dropout.cu b/internal/cuda/kernels/dropout.cu new file mode 100644 index 0000000..0183d05 --- /dev/null +++ b/internal/cuda/kernels/dropout.cu @@ -0,0 +1,92 @@ +// dropout.cu -- GB10 (sm_121) inverted-dropout with a deterministic Philox mask. +// +// The mask is drawn from a counter-based Philox4x32-10 generator keyed by +// (seed, element offset), bit-identical to the CPU reference in +// compute/philox.go. Because the draw is a pure function of (seed, offset, p), +// the same seed produces the same mask on CPU and GPU -- which is what makes +// ztensor's CPU-GPU parity gate pass -- and backward recomputes the mask from +// (seed, p) rather than reading a cached buffer (capture-safe, no save pinned +// across an arena reset; ztensor ADR 006). +// +// Inverted-dropout semantics (match torch.nn.functional.dropout): training mode +// out[i] = keep(i) ? in[i] / (1 - p) : 0 with keep iff uniform >= p; eval mode +// (training == 0) or p == 0 is exact identity. Forward and backward share one +// kernel because dropout is linear in its input given the mask, so the host +// passes the upstream gradient as `in` for the backward call. +// +// The Philox constants and round structure mirror compute/philox.go exactly. + +#include +#include + +#define PHILOX_M0 0xD2511F53u +#define PHILOX_M1 0xCD9E8D57u +#define PHILOX_W0 0x9E3779B9u +#define PHILOX_W1 0xBB67AE85u +#define PHILOX_ROUNDS 10 + +// Single Philox round: same word layout as the Go reference. +__device__ __forceinline__ void philox_round(uint32_t c[4], const uint32_t k[2]) { + uint32_t hi0 = __umulhi(PHILOX_M0, c[0]); + uint32_t lo0 = PHILOX_M0 * c[0]; + uint32_t hi1 = __umulhi(PHILOX_M1, c[2]); + uint32_t lo1 = PHILOX_M1 * c[2]; + uint32_t n0 = hi1 ^ c[1] ^ k[0]; + uint32_t n1 = lo1; + uint32_t n2 = hi0 ^ c[3] ^ k[1]; + uint32_t n3 = lo0; + c[0] = n0; c[1] = n1; c[2] = n2; c[3] = n3; +} + +// philox_uniform returns a uniform float in [0,1) for element `offset` under +// `seed`, consuming only the first output lane (matches the Go reference). +__device__ __forceinline__ float philox_uniform(uint64_t seed, uint64_t offset) { + uint32_t k[2] = { (uint32_t)seed, (uint32_t)(seed >> 32) }; + uint32_t c[4] = { (uint32_t)offset, (uint32_t)(offset >> 32), 0u, 0u }; +#pragma unroll + for (int i = 0; i < PHILOX_ROUNDS; ++i) { + philox_round(c, k); + k[0] += PHILOX_W0; + k[1] += PHILOX_W1; + } + // Map a 32-bit word to [0,1) by dividing by 2^32; same keep/drop boundary + // (uniform >= p) the Go reference uses. + return (float)c[0] * (1.0f / 4294967296.0f); +} + +__global__ void kernel_dropout(const float* __restrict__ in, + float* __restrict__ out, + int n, float p, uint64_t seed, float invKeep) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + if (gid >= n) return; + float u = philox_uniform(seed, (uint64_t)gid); + out[gid] = (u >= p) ? (in[gid] * invKeep) : 0.0f; +} + +__global__ void kernel_identity_copy(const float* __restrict__ in, + float* __restrict__ out, int n) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + if (gid >= n) return; + out[gid] = in[gid]; +} + +extern "C" { + +// dropout_f32 applies inverted dropout to in[0..n-1], writing out[0..n-1]. +// training != 0 and p > 0 => masked-and-scaled; otherwise exact identity copy. +// invKeep must be 1/(1-p) (computed host-side to match the CPU path bit-for-bit). +cudaError_t dropout_f32(const float* in, float* out, int n, + float p, uint64_t seed, int training, float invKeep, + cudaStream_t stream) { + const int BLOCK = 256; + int grid = (n + BLOCK - 1) / BLOCK; + if (grid < 1) grid = 1; + if (training == 0 || p == 0.0f) { + kernel_identity_copy<<>>(in, out, n); + return cudaGetLastError(); + } + kernel_dropout<<>>(in, out, n, p, seed, invKeep); + return cudaGetLastError(); +} + +} // extern "C" diff --git a/internal/cuda/kernels/dropout.go b/internal/cuda/kernels/dropout.go new file mode 100644 index 0000000..3b7458d --- /dev/null +++ b/internal/cuda/kernels/dropout.go @@ -0,0 +1,35 @@ +//go:build cuda + +package kernels + +/* +#cgo LDFLAGS: -L${SRCDIR} -lkernels -lcudart -lstdc++ +#include +#include + +extern cudaError_t dropout_f32(const float* in, float* out, int n, + float p, uint64_t seed, int training, float invKeep, + cudaStream_t stream); +*/ +import "C" + +import "unsafe" + +// DropoutF32 launches the GPU inverted-dropout kernel with a deterministic +// Philox mask keyed by (seed, element offset). in/out are [n] float32 device +// pointers; when training is false or p==0 the kernel performs an exact identity +// copy. invKeep must be 1/(1-p), computed host-side so the GPU result matches +// the CPU path bit-for-bit. The same entry serves backward (pass the upstream +// gradient as in) because dropout is linear in its input given the mask. +func DropoutF32(in unsafe.Pointer, out unsafe.Pointer, n int, + p float32, seed uint64, training bool, invKeep float32, s unsafe.Pointer) error { + tr := C.int(0) + if training { + tr = 1 + } + return checkCUDA(C.dropout_f32( + (*C.float)(in), (*C.float)(out), C.int(n), + C.float(p), C.uint64_t(seed), tr, C.float(invKeep), + stream(s), + ), "dropout_f32") +} diff --git a/internal/cuda/kernels/dropout_purego.go b/internal/cuda/kernels/dropout_purego.go new file mode 100644 index 0000000..100e54a --- /dev/null +++ b/internal/cuda/kernels/dropout_purego.go @@ -0,0 +1,33 @@ +//go:build !cuda + +package kernels + +import ( + "fmt" + "unsafe" + + "github.com/zerfoo/ztensor/internal/cuda" +) + +// DropoutF32 launches the GPU inverted-dropout kernel with a deterministic +// Philox mask keyed by (seed, element offset). in/out are [n] float32 device +// pointers; when training is false or p==0 the kernel performs an exact identity +// copy. invKeep must be 1/(1-p), computed host-side so the GPU result matches +// the CPU path bit-for-bit. The same entry serves backward (pass the upstream +// gradient as in) because dropout is linear in its input given the mask. +func DropoutF32(in unsafe.Pointer, out unsafe.Pointer, n int, + p float32, seed uint64, training bool, invKeep float32, s unsafe.Pointer) error { + k := klib() + if k == nil { + return fmt.Errorf("dropout kernel: kernels not available") + } + tr := uintptr(0) + if training { + tr = 1 + } + ret := cuda.Ccall(k.launchDropoutF32, + uintptr(in), uintptr(out), uintptr(n), + uintptr(floatBits(p)), uintptr(seed), tr, + uintptr(floatBits(invKeep)), uintptr(s)) + return checkKernel(ret, "dropout_f32") +} diff --git a/internal/cuda/kernels/purego.go b/internal/cuda/kernels/purego.go index f2a9ebc..928adee 100644 --- a/internal/cuda/kernels/purego.go +++ b/internal/cuda/kernels/purego.go @@ -80,6 +80,9 @@ type KernelLib struct { // argmax launchArgmax uintptr + // dropout (deterministic Philox mask, inverted-dropout f32) + launchDropoutF32 uintptr + // fused_rope launchFusedRoPEF32 uintptr @@ -312,6 +315,8 @@ func openKernelLib() (*KernelLib, error) { {"gemm_q8_f32", &k.launchGemmQ8F32}, // argmax {"launch_argmax", &k.launchArgmax}, + // dropout (deterministic Philox mask) + {"dropout_f32", &k.launchDropoutF32}, // fused_rope {"fused_rope_f32", &k.launchFusedRoPEF32}, // fused_swiglu diff --git a/internal/gpuapi/cuda_kernels.go b/internal/gpuapi/cuda_kernels.go index 0060428..ac89ae7 100644 --- a/internal/gpuapi/cuda_kernels.go +++ b/internal/gpuapi/cuda_kernels.go @@ -418,6 +418,14 @@ func (k *CUDAKernels) FusedEncoderFwdAvailable() bool { return kernels.FusedEncoderAvailable() } +// DropoutF32 launches the on-device inverted-dropout kernel (deterministic +// Philox mask). Forward and backward both call this; backward passes the +// upstream gradient as in. +func (k *CUDAKernels) DropoutF32(in, out unsafe.Pointer, n int, p float32, seed uint64, training bool, invKeep float32, s Stream) error { + return kernels.DropoutF32(in, out, n, p, seed, training, invKeep, streamPtr(s)) +} + // Compile-time interface assertion. var _ KernelRunner = (*CUDAKernels)(nil) var _ BFloat16Transposer = (*CUDAKernels)(nil) +var _ Dropouter = (*CUDAKernels)(nil) diff --git a/internal/gpuapi/kernels.go b/internal/gpuapi/kernels.go index b8f4c1a..c0e10c1 100644 --- a/internal/gpuapi/kernels.go +++ b/internal/gpuapi/kernels.go @@ -286,6 +286,20 @@ type KernelRunner interface { FusedEncoderFwdAvailable() bool } +// Dropouter is an optional KernelRunner extension providing an on-device +// inverted-dropout kernel with a deterministic Philox mask keyed by (seed, +// element offset). The mask is bit-identical to the CPU reference, so dropout +// passes the CPU-GPU parity gate; invKeep is supplied host-side as 1/(1-p) so +// the scale matches the CPU path exactly. The same entry serves forward and +// backward (backward passes the upstream gradient as `in`) because dropout is +// linear in its input given the mask. Only the CUDA backend implements it; +// callers type-assert and report unavailability when it is absent. +type Dropouter interface { + // DropoutF32 writes out[i] = (philox(seed,i) >= p) ? in[i]*invKeep : 0 in + // training mode; in eval mode (training==false) or p==0 it copies in->out. + DropoutF32(in, out unsafe.Pointer, n int, p float32, seed uint64, training bool, invKeep float32, stream Stream) error +} + // BFloat16Transposer is an optional KernelRunner extension providing on-device // bf16 (16-bit) transpose kernels. Without it, a GPU engine over bf16 must route // transposes to the CPU engine, whose host memcpy breaks CUDA-graph capture From 2919372ea59be942a221c6047ac2f07da4c74f8a Mon Sep 17 00:00:00 2001 From: David Ndungu Date: Sat, 20 Jun 2026 14:42:29 -0700 Subject: [PATCH 3/5] ci(spark): GB10 dropout verify manifest (BPB.3a) One-pod GB10 verify mirroring deploy/spark/bf16-verify-gb10.yaml: clone the bpb3a-dropout branch, build libkernels.so for sm_121, run the CUDA-gated dropout CPU-GPU parity tests (TestGPUDropout_*) and the full-registry GB10 parity run (testing/parity -run _GPU, which now includes Dropout). Claude-Session: https://claude.ai/code/session_01So96MEV1hiThH4XqCd6rLH --- deploy/spark/dropout-verify-gb10.yaml | 56 +++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 deploy/spark/dropout-verify-gb10.yaml diff --git a/deploy/spark/dropout-verify-gb10.yaml b/deploy/spark/dropout-verify-gb10.yaml new file mode 100644 index 0000000..6866319 --- /dev/null +++ b/deploy/spark/dropout-verify-gb10.yaml @@ -0,0 +1,56 @@ +# GB10 verification for the dropout op (BPB.3a). +# +# One pod that (1) clones the bpb3a-dropout branch, (2) builds libkernels.so for +# sm_121 with the host CUDA toolkit (nvcc on /usr/local/cuda, mounted read-only), +# and (3) runs the CUDA-gated dropout CPU-GPU parity tests (TestGPUDropout_*) plus +# the full-registry GB10 parity run (-run _GPU, which now includes Dropout) on the +# GB10. +# +# Submit (token injected at submit time, never persisted in the manifest): +# TOKEN=$(gh auth token) +# sed "s#__GH_TOKEN__#$TOKEN#; s#__TAG__#$(git rev-parse --short=8 HEAD)#" \ +# deploy/spark/dropout-verify-gb10.yaml \ +# | curl -X POST http://192.168.86.250:8080/api/v1/pods \ +# --data-binary @- -H 'Content-Type: application/yaml' +# +# memory limit set per lore L-0005 (cmd/lint-manifests). +apiVersion: v1 +kind: Pod +metadata: + name: ztensor-dropout-verify-__TAG__ + labels: + app: ztensor + role: gpu-verify +spec: + restartPolicy: Never + containers: + - name: verify + image: docker.io/library/golang:1.26 + command: + - /bin/sh + - -c + - "set -ex; export PATH=/usr/local/cuda/bin:$PATH; export LD_LIBRARY_PATH=/usr/local/cuda/lib64; git clone --depth 1 -b bpb3a-dropout https://__GH_TOKEN__@github.com/zerfoo/ztensor.git /work/zt; cd /work/zt; make -C internal/cuda/kernels shared CUDA_ARCH=sm_121; LD_LIBRARY_PATH=/work/zt/internal/cuda/kernels:/usr/local/cuda/lib64 go test ./compute/ -run GPUDropout -v -count=1; LD_LIBRARY_PATH=/work/zt/internal/cuda/kernels:/usr/local/cuda/lib64 go test ./testing/parity/ -run _GPU -v -count=1" + env: + - name: GOFLAGS + value: "-mod=mod" + resources: + limits: + nvidia.com/gpu: "1" + cpu: "4" + memory: "24Gi" + requests: + cpu: "1" + memory: "8Gi" + volumeMounts: + - name: cuda + mountPath: /usr/local/cuda + readOnly: true + - name: work + mountPath: /work + volumes: + - name: cuda + hostPath: + path: /usr/local/cuda + type: Directory + - name: work + emptyDir: {} From d22ebf175f9bae2077a6fb9f227e835a815ce9a5 Mon Sep 17 00:00:00 2001 From: David Ndungu Date: Sat, 20 Jun 2026 14:55:16 -0700 Subject: [PATCH 4/5] fix(cuda): pass dropout p/invKeep as int bit patterns (purego ABI) The first GB10 run failed: GPU dropout returned all zeros vs the CPU reference. Root cause is the purego/dlopen launch ABI -- the AAPCS64 trampoline (internal/cuda/purego_linux_arm64.s) loads every argument into integer registers R0-R7 only and never populates the V float registers. A C kernel with `float` parameters reads them from V registers (garbage/zero) AND, because integer and float args consume separate AAPCS64 register sequences, every argument after the first float is shifted -- so seed/training/stream were all misread and the kernel wrote zeros. Fix: pass p and invKeep as their 32-bit IEEE-754 bit patterns in uint32 integer parameters and reinterpret them inside the kernel with __uint_as_float. The ABI is now integer-only and identical between the CGO (-tags cuda) and purego launch paths. Updated dropout.cu, dropout.go (cgo), dropout_purego.go. Claude-Session: https://claude.ai/code/session_01So96MEV1hiThH4XqCd6rLH --- internal/cuda/kernels/dropout.cu | 14 ++++++++++++-- internal/cuda/kernels/dropout.go | 11 ++++++++--- internal/cuda/kernels/dropout_purego.go | 8 ++++++-- 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/internal/cuda/kernels/dropout.cu b/internal/cuda/kernels/dropout.cu index 0183d05..a4b8fcd 100644 --- a/internal/cuda/kernels/dropout.cu +++ b/internal/cuda/kernels/dropout.cu @@ -74,13 +74,23 @@ extern "C" { // dropout_f32 applies inverted dropout to in[0..n-1], writing out[0..n-1]. // training != 0 and p > 0 => masked-and-scaled; otherwise exact identity copy. -// invKeep must be 1/(1-p) (computed host-side to match the CPU path bit-for-bit). +// +// p and invKeep (= 1/(1-p)) are passed as their 32-bit IEEE-754 BIT PATTERNS in +// integer parameters, not as `float`. The purego/dlopen launch path (no CGO) +// loads every argument into integer registers only (the AAPCS64 trampoline +// never populates the V float registers), so a `float` parameter would read +// garbage and shift every following argument. Passing the bits as uint32 and +// reinterpreting here with __uint_as_float keeps the ABI integer-only and +// identical between the CGO and purego paths. invKeep is computed host-side so +// the scale matches the CPU path bit-for-bit. cudaError_t dropout_f32(const float* in, float* out, int n, - float p, uint64_t seed, int training, float invKeep, + uint32_t pBits, uint64_t seed, int training, uint32_t invKeepBits, cudaStream_t stream) { const int BLOCK = 256; int grid = (n + BLOCK - 1) / BLOCK; if (grid < 1) grid = 1; + float p = __uint_as_float(pBits); + float invKeep = __uint_as_float(invKeepBits); if (training == 0 || p == 0.0f) { kernel_identity_copy<<>>(in, out, n); return cudaGetLastError(); diff --git a/internal/cuda/kernels/dropout.go b/internal/cuda/kernels/dropout.go index 3b7458d..9721ce3 100644 --- a/internal/cuda/kernels/dropout.go +++ b/internal/cuda/kernels/dropout.go @@ -8,12 +8,15 @@ package kernels #include extern cudaError_t dropout_f32(const float* in, float* out, int n, - float p, uint64_t seed, int training, float invKeep, + uint32_t pBits, uint64_t seed, int training, uint32_t invKeepBits, cudaStream_t stream); */ import "C" -import "unsafe" +import ( + "math" + "unsafe" +) // DropoutF32 launches the GPU inverted-dropout kernel with a deterministic // Philox mask keyed by (seed, element offset). in/out are [n] float32 device @@ -27,9 +30,11 @@ func DropoutF32(in unsafe.Pointer, out unsafe.Pointer, n int, if training { tr = 1 } + // Pass p and invKeep as float32 bit patterns in integer params so the ABI + // is identical to the purego launch path (which cannot use float registers). return checkCUDA(C.dropout_f32( (*C.float)(in), (*C.float)(out), C.int(n), - C.float(p), C.uint64_t(seed), tr, C.float(invKeep), + C.uint32_t(math.Float32bits(p)), C.uint64_t(seed), tr, C.uint32_t(math.Float32bits(invKeep)), stream(s), ), "dropout_f32") } diff --git a/internal/cuda/kernels/dropout_purego.go b/internal/cuda/kernels/dropout_purego.go index 100e54a..304aa6a 100644 --- a/internal/cuda/kernels/dropout_purego.go +++ b/internal/cuda/kernels/dropout_purego.go @@ -4,6 +4,7 @@ package kernels import ( "fmt" + "math" "unsafe" "github.com/zerfoo/ztensor/internal/cuda" @@ -25,9 +26,12 @@ func DropoutF32(in unsafe.Pointer, out unsafe.Pointer, n int, if training { tr = 1 } + // p and invKeep are passed as their float32 bit patterns in INTEGER + // registers (the purego AAPCS64 trampoline never populates the V float + // registers); the kernel reinterprets them with __uint_as_float. ret := cuda.Ccall(k.launchDropoutF32, uintptr(in), uintptr(out), uintptr(n), - uintptr(floatBits(p)), uintptr(seed), tr, - uintptr(floatBits(invKeep)), uintptr(s)) + uintptr(math.Float32bits(p)), uintptr(seed), tr, + uintptr(math.Float32bits(invKeep)), uintptr(s)) return checkKernel(ret, "dropout_f32") } From bfc77b4cbd9b8cbe2ad43342abb3870e0bab42dd Mon Sep 17 00:00:00 2001 From: David Ndungu Date: Sat, 20 Jun 2026 14:57:00 -0700 Subject: [PATCH 5/5] fix(cuda): reinterpret dropout p/invKeep bits host-side (not __uint_as_float) nvcc rejected calling the device intrinsic __uint_as_float from the __host__ launcher dropout_f32. Reinterpret the uint32 bit patterns to float on the host with a memcpy helper (host_uint_as_float) instead; the resulting float kernel args marshal correctly through the CUDA <<<>>> launch (kernel-arg passing is unaffected by the host dlopen ABI). Claude-Session: https://claude.ai/code/session_01So96MEV1hiThH4XqCd6rLH --- internal/cuda/kernels/dropout.cu | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/internal/cuda/kernels/dropout.cu b/internal/cuda/kernels/dropout.cu index a4b8fcd..2fd5481 100644 --- a/internal/cuda/kernels/dropout.cu +++ b/internal/cuda/kernels/dropout.cu @@ -18,6 +18,15 @@ #include #include +#include + +// host_uint_as_float reinterprets a 32-bit pattern as a float on the HOST +// (the device intrinsic __uint_as_float is not callable from host code). +static inline float host_uint_as_float(uint32_t bits) { + float f; + memcpy(&f, &bits, sizeof(f)); + return f; +} #define PHILOX_M0 0xD2511F53u #define PHILOX_M1 0xCD9E8D57u @@ -89,8 +98,8 @@ cudaError_t dropout_f32(const float* in, float* out, int n, const int BLOCK = 256; int grid = (n + BLOCK - 1) / BLOCK; if (grid < 1) grid = 1; - float p = __uint_as_float(pBits); - float invKeep = __uint_as_float(invKeepBits); + float p = host_uint_as_float(pBits); + float invKeep = host_uint_as_float(invKeepBits); if (training == 0 || p == 0.0f) { kernel_identity_copy<<>>(in, out, n); return cudaGetLastError();