Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions internal/router/healthz_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,15 @@ func TestHealthzShape(t *testing.T) {
reader := migrations.NewReader(sqlDB, 0, nil)

app := fiber.New()
// Pin the process-start instant 42 seconds in the past so the
// uptime_seconds assertion below has a stable expected value.
// router.go reads processStartFunc() per request — production
// captures it at package load via time.Now(), the test fixture
// here mirrors the same code shape with a fixed offset.
fixedStart := time.Now().Add(-42 * time.Second)
app.Get("/healthz", func(c *fiber.Ctx) error {
m := reader.Get(c.UserContext())
uptimeSeconds := int64(time.Since(fixedStart).Seconds())
return c.JSON(fiber.Map{
"ok": true,
"service": "instanode-api",
Expand All @@ -59,6 +66,9 @@ func TestHealthzShape(t *testing.T) {
// fixture aligned with prod so a future deletion of the
// router.go emit also fails here.
"now": time.Now().UTC().Format("2006-01-02T15:04:05.000Z"),
// BUG-P272 (QA 2026-05-29): mirror the router.go addition of
// uptime_seconds so the wire-shape contract is pinned here too.
"uptime_seconds": uptimeSeconds,
})
})

Expand Down Expand Up @@ -118,6 +128,23 @@ func TestHealthzShape(t *testing.T) {
require.Equal(t, float64(22), got["migration_count"]) // JSON numbers decode as float64
require.Equal(t, "ok", got["migration_status"])

// BUG-P272 (QA 2026-05-29): `uptime_seconds` is the new pod-liveness
// field. Must be a non-negative integer-valued JSON number. The
// fixture pinned the start instant to ~42s ago, so 41-44s is the
// tolerant range (in-process fiber.Test takes <50ms; ±2s is room
// to spare for slow CI runners). Drift outside that window is
// either a clock bug or a regression that swapped the offset.
require.NotNil(t, got["uptime_seconds"], "BUG-P272: /healthz must emit uptime_seconds (int64)")
uptimeRaw, ok := got["uptime_seconds"].(float64)
require.True(t, ok, "BUG-P272: uptime_seconds must be a JSON number (decoded as float64)")
require.GreaterOrEqual(t, uptimeRaw, 40.0, "BUG-P272: uptime_seconds floor — fixture pinned start 42s ago")
require.LessOrEqual(t, uptimeRaw, 45.0, "BUG-P272: uptime_seconds ceiling — fixture pinned start 42s ago")
// JSON-number must round-trip to an integer (no fractional seconds —
// sub-second jitter has no diagnostic value at this surface and
// would defeat HTTP-cache dedup for any proxy in front).
require.Equal(t, float64(int64(uptimeRaw)), uptimeRaw,
"BUG-P272: uptime_seconds must round to an integer; got %f", uptimeRaw)

require.NoError(t, mock.ExpectationsWereMet())
}

Expand Down
72 changes: 72 additions & 0 deletions internal/router/probe_options_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
package router

import (
"io"
"net/http/httptest"
"strings"
"testing"

"github.com/gofiber/fiber/v2"
"github.com/stretchr/testify/require"
)

// TestProbeOptionsHandlerShape pins the exact wire shape returned by
// probeOptionsHandler — 204 No Content, an `Allow` header carrying
// the allow set passed in, and an empty body. BUG-API-024 / BUG-API-025
// (QA 2026-05-29): a bare `OPTIONS /<probe>` from a curl /
// uptime-checker / SDK probe (no `Origin` header → fiberCORS skips)
// used to fall through to Fiber's "no route for verb" path and return
// 405. The handler closes that gap on the shallow probe surfaces
// (/livez, /healthz, /readyz, /openapi.json).
func TestProbeOptionsHandlerShape(t *testing.T) {
app := fiber.New()
app.Options("/probe", probeOptionsHandler("GET, HEAD, OPTIONS"))

resp, err := app.Test(httptest.NewRequest("OPTIONS", "/probe", nil))
require.NoError(t, err)
require.Equal(t, fiber.StatusNoContent, resp.StatusCode,
"BUG-API-024/025: bare OPTIONS without Origin must be 204, not 405")
require.Equal(t, "GET, HEAD, OPTIONS", resp.Header.Get("Allow"),
"BUG-API-024/025: Allow header must mirror the handler argument so HTTP-conformant clients see the same allow set whether they read it from a 405 envelope or a 204 OPTIONS body")

body, err := io.ReadAll(resp.Body)
require.NoError(t, err)
require.Equal(t, "", string(body),
"BUG-API-024/025: 204 body must be empty per RFC 7231 §6.3.5")
}

// TestProbeOptionsHandlerCoversAllShallowProbes is the registry-style
// guarantee (rule 18 — "registry-iterating regression tests, not
// hand-typed lists"). Every shallow probe surface registered in
// router.go MUST have an OPTIONS handler that returns 204 — adding a
// new probe surface without the matching OPTIONS shim is exactly the
// bug class BUG-API-024 / BUG-API-025 logged.
//
// The list below is the registry of shallow probe paths. It is kept
// in this test file so a future contributor adding a new probe MUST
// either add it here (and wire the OPTIONS handler) or document the
// exemption inline.
func TestProbeOptionsHandlerCoversAllShallowProbes(t *testing.T) {
app := fiber.New()
shallowProbes := []string{
"/livez",
"/healthz",
"/readyz",
"/openapi.json",
}
for _, p := range shallowProbes {
app.Options(p, probeOptionsHandler("GET, HEAD, OPTIONS"))
}

for _, p := range shallowProbes {
resp, err := app.Test(httptest.NewRequest("OPTIONS", p, nil))
require.NoError(t, err, "OPTIONS %s should not error", p)
require.Equal(t, fiber.StatusNoContent, resp.StatusCode,
"BUG-API-024/025: OPTIONS %s must return 204 (got %d)", p, resp.StatusCode)
allow := resp.Header.Get("Allow")
require.Contains(t, strings.Split(allow, ", "), "OPTIONS",
"BUG-API-024/025: Allow header on OPTIONS %s must list OPTIONS itself; got %q", p, allow)
require.Contains(t, strings.Split(allow, ", "), "GET",
"BUG-API-024/025: Allow header on OPTIONS %s must list GET; got %q", p, allow)
}
}
70 changes: 70 additions & 0 deletions internal/router/router.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,43 @@ func New(cfg *config.Config, db *sql.DB, rdb *redis.Client, geoDbs *middleware.G
return app
}

// processStart is captured once per process so /healthz can report
// `uptime_seconds` for liveness diagnostics without an external clock.
// Sampled via time.Now() at package-load time — there is no production
// reason to mutate it after first read. Tests that want to assert a
// specific uptime override it via processStartFunc.
//
// BUG-P272 (QA 2026-05-29): `/healthz` previously did not expose
// uptime, leaving canaries / agents unable to distinguish "freshly
// rolled pod" from "pod that's been alive for hours" without a
// separate metrics scrape. The new `uptime_seconds` field plus the
// existing `build_time` give a self-contained "when did this pod
// start and how long has it been up" signal on every shallow probe.
var processStart = time.Now()

// processStartFunc is the seam that lets tests pin the uptime value.
// Production code reads processStart directly; test code can swap
// processStartFunc to a fixed instant to assert the JSON field.
var processStartFunc = func() time.Time { return processStart }

// probeOptionsHandler returns a Fiber handler that responds 204 + an
// Allow header to bare `OPTIONS /<probe>` calls. Without it Fiber's
// "no route for verb" path returns 405 — fine for browser preflight
// (the CORS middleware lower in the chain handles Origin-bearing
// requests) but surprising for curl / uptime-checker / SDK probes
// that do not set Origin. Used by the shallow probe surfaces
// (/livez, /healthz, /readyz, /openapi.json) per BUG-API-024 /
// BUG-API-025. The Allow header mirrors the verbs the routed
// handlers expose so an HTTP-conformant client sees the same allow
// set whether it reads it from a 405 envelope or a 204 OPTIONS
// response.
func probeOptionsHandler(allow string) fiber.Handler {
return func(c *fiber.Ctx) error {
c.Set("Allow", allow)
return c.SendStatus(fiber.StatusNoContent)
}
}

// NewWithHooks is the production entrypoint — returns both the Fiber
// app and the ShutdownHooks needed for graceful shutdown.
func NewWithHooks(cfg *config.Config, db *sql.DB, rdb *redis.Client, geoDbs *middleware.GeoDBs, emailClient *email.Client, planRegistry *plans.Registry, provClient *provisioner.Client, nrApp *newrelic.Application) (*fiber.App, ShutdownHooks) {
Expand Down Expand Up @@ -152,6 +189,15 @@ func NewWithHooks(cfg *config.Config, db *sql.DB, rdb *redis.Client, geoDbs *mid
app.Get("/livez", func(c *fiber.Ctx) error {
return c.JSON(fiber.Map{"alive": true})
})
// BUG-API-025 (QA 2026-05-29): bare `OPTIONS /livez` (no Origin
// header, e.g. curl/uptime-checker style probes) used to fall
// through to Fiber's "no route" handler and return 405 because
// the CORS middleware (which is the usual OPTIONS responder)
// skips when Origin is unset. Browser preflight (Origin present)
// still flows through fiberCORS below — this OPTIONS shim only
// covers the no-Origin probe lane and returns 204 with the same
// Allow header set Fiber would have emitted on a routed 405.
app.Options("/livez", probeOptionsHandler("GET, HEAD, OPTIONS"))

// ── Middleware chain (order matters) ─────────────────────────────────────
// SecurityHeaders runs BEFORE RequestID so the static defense-in-depth
Expand Down Expand Up @@ -392,6 +438,13 @@ func NewWithHooks(cfg *config.Config, db *sql.DB, rdb *redis.Client, geoDbs *mid
// stamps benefit from the new value too — see handlers/readyz.go
// where worker/provisioner probes report "instanode-worker" and
// "instanode-provisioner" in sibling repos.
// BUG-P272 (QA 2026-05-29): expose `uptime_seconds` (int64) so
// canaries / agents can answer "how long has this pod been up?"
// without a separate metrics scrape. Computed from processStartFunc()
// (a test-overridable seam) and rounded to seconds — sub-second
// jitter has no diagnostic value at this surface and would defeat
// HTTP-cache deduplication if anyone proxies the response.
uptimeSeconds := int64(time.Since(processStartFunc()).Seconds())
return c.JSON(fiber.Map{
"ok": true,
"service": "instanode-api",
Expand All @@ -402,8 +455,15 @@ func NewWithHooks(cfg *config.Config, db *sql.DB, rdb *redis.Client, geoDbs *mid
"migration_count": mstate.Count,
"migration_status": mstate.Status,
"now": time.Now().UTC().Format("2006-01-02T15:04:05.000Z"),
"uptime_seconds": uptimeSeconds,
})
})
// BUG-API-025 (QA 2026-05-29): mirror the /livez OPTIONS shim on
// /healthz so curl/uptime-checker style probes that send a bare
// `OPTIONS /healthz` (no Origin) get a 204 instead of the 405 Fiber
// returns when no route matches the verb. Browser CORS preflight
// (Origin present) still flows through fiberCORS below.
app.Options("/healthz", probeOptionsHandler("GET, HEAD, OPTIONS"))

// /readyz — deep, component-by-component readiness probe wired to
// the k8s readinessProbe (NOT livenessProbe — see /healthz above
Expand All @@ -414,6 +474,8 @@ func NewWithHooks(cfg *config.Config, db *sql.DB, rdb *redis.Client, geoDbs *mid
// motivation (RETRO 2026-05-20).
readyzH := handlers.NewReadyzHandler(cfg, db, rdb, provClient)
app.Get("/readyz", readyzH.Get)
// BUG-API-025 (QA 2026-05-29): same OPTIONS shim as /healthz/livez.
app.Options("/readyz", probeOptionsHandler("GET, HEAD, OPTIONS"))

// OpenAPI spec — machine-readable description of the agent-facing API.
// T19 P0-1 (BugHunt 2026-05-20): pass ENVIRONMENT so the served spec
Expand All @@ -427,6 +489,14 @@ func NewWithHooks(cfg *config.Config, db *sql.DB, rdb *redis.Client, geoDbs *mid
// the session_token redirected there.
handlers.SetReturnToAllowsLocalhost(cfg.Environment != "production")
app.Get("/openapi.json", handlers.ServeOpenAPI)
// BUG-API-024 (QA 2026-05-29): bare `OPTIONS /openapi.json` (no
// Origin header — e.g. an SDK doing a preflight probe before a
// custom-header GET) used to return 405 because no route matched
// the verb. The CORS middleware below only responds when Origin
// is set. Register an explicit OPTIONS handler so the no-Origin
// probe lane returns 204 + the Allow header set the browser CORS
// preflight would otherwise see via fiberCORS.
app.Options("/openapi.json", probeOptionsHandler("GET, HEAD, OPTIONS"))

// /llms.txt — agent discovery doc, 302 to marketing where it's the
// source of truth. Agents that hit api.instanode.dev first land here
Expand Down
Loading