openshift · mkowalski · Jun 11, 2026
diff --git a/pkg/monitortests/network/onpremhaproxy/monitortest.go b/pkg/monitortests/network/onpremhaproxy/monitortest.go
@@ -5,6 +5,7 @@ import (
 	"errors"
 	"fmt"
 	"regexp"
+	"sort"
 	"strings"
 	"time"
 
@@ -211,22 +212,159 @@ func (*operatorLogAnalyzer) EvaluateTestsFromConstructedIntervals(ctx context.Co
 		testNameToFailures[testName] = append(testNameToFailures[testName], interval.String())
 	}
 
+	ret := []*junitapi.JUnitTestCase{}
 	if !somethingFailed {
-		return []*junitapi.JUnitTestCase{success}, nil
+		ret = append(ret, success)
+	} else {
+		failure := &junitapi.JUnitTestCase{
+			Name: testName,
+			FailureOutput: &junitapi.FailureOutput{
+				//Message: fmt.Sprint("something happened with haproxy"),
+				Output: "Haproxy detected some kubeapi-servers down. It's not necessarily an issue, it's expected over the course of installation. Go and check messages. Look at intervals in sippy to see a full graph of which haproxy instance detected which kubeapi-server as down. Plotted on a time axis, you will see if at any point in time all the kubeapi-servers were down. Only then, it is an issue.",
+			},
+			SystemOut: strings.Join(testNameToFailures[testName], "\n"),
+			//SystemErr: fmt.Sprintf("syserr; found %d lines in the failure map", len(testNameToFailures[testName])),
+		}
+
+		// Marked flaky until we have monitored it for consistency
+		ret = append(ret, failure, success)
+	}
+
+	ret = append(ret, evaluateFullAPIOutages(leaseIntervals)...)
+
+	return ret, nil
+}
+
+// fullOutageBackendThreshold is the number of distinct kube-apiserver backends that a single
+// haproxy instance must report down at the same time for it to count as a full API outage.
+// It is fixed at three on the premise that on-prem HA deployments run three control plane nodes.
+// NOTE(review): clusters with a different control plane size would need another value - confirm.
+const fullOutageBackendThreshold = 3
+
+// apiOutageWindow is a time range during which a single haproxy instance reported at least a
+// threshold number (see fullOutageBackendThreshold) of kube-apiserver backends down at once.
+type apiOutageWindow struct {
+	from time.Time // moment the count of down backends reached the threshold
+	to   time.Time // moment the count of down backends dropped below the threshold again
+}
+
+// findFullAPIOutageWindows groups the given haproxy "backend down" intervals by reporting node and
+// returns, keyed by that node, the merged time windows in which at least `threshold` intervals were
+// open at once. Nodes without any such window are absent; the returned map is never nil.
+func findFullAPIOutageWindows(downIntervals monitorapi.Intervals, threshold int) map[string][]apiOutageWindow {
+	type sweepEvent struct {
+		at    time.Time // timestamp of an interval boundary
+		delta int       // +1 where an interval opens, -1 where it closes
+	}
+
+	eventsPerNode := map[string][]sweepEvent{}
+	for _, interval := range downIntervals {
+		// The locator value has the form "<node running haproxy>___<kube-apiserver backend>".
+		pairKey := interval.Locator.Keys[monitorapi.LocatorOnPremKubeapiUnreachableFromHaproxyKey]
+		parts := strings.SplitN(pairKey, "___", 2)
+		if len(parts) != 2 {
+			continue // missing or malformed locator value: the interval is ignored
+		}
+		reportingNode := parts[0] // parts[1], the backend, is unused: only open intervals are counted
+		eventsPerNode[reportingNode] = append(eventsPerNode[reportingNode],
+			sweepEvent{at: interval.From, delta: 1},
+			sweepEvent{at: interval.To, delta: -1},
+		)
+	}
+
+	ret := map[string][]apiOutageWindow{}
+	for node, events := range eventsPerNode {
+		// Sort by time. On equal timestamps the closing events (delta -1) sort before the opening
+		// ones (delta +1), so a backend recovering at the very same instant another one goes down
+		// does not produce an artificial overlap.
+		sort.Slice(events, func(i, j int) bool {
+			if events[i].at.Equal(events[j].at) {
+				return events[i].delta < events[j].delta
+			}
+			return events[i].at.Before(events[j].at)
+		})
+
+		// Sweep over the events keeping a running count of open intervals. That count equals the
+		// number of distinct backends down only if intervals of a single backend never overlap.
+		// NOTE(review): this is assumed to hold by construction of the input - confirm with the producer.
+		windows := []apiOutageWindow{}
+		downCount := 0
+		inOutage := false
+		var outageStart time.Time
+		for _, event := range events {
+			downCount += event.delta
+			switch {
+			case !inOutage && downCount >= threshold: // threshold reached: a window opens here
+				inOutage = true
+				outageStart = event.at
+			case inOutage && downCount < threshold: // dropped below the threshold: the window closes here
+				inOutage = false
+				windows = append(windows, apiOutageWindow{from: outageStart, to: event.at})
+			}
+		}
+
+		// Merge windows that touch each other. Log timestamps have second granularity, so a backend
+		// recovering and another one going down within the same second would otherwise split a single
+		// outage into two. A window starting no later than the previous one ends extends it.
+		merged := []apiOutageWindow{}
+		for _, window := range windows {
+			if len(merged) > 0 && !window.from.After(merged[len(merged)-1].to) {
+				merged[len(merged)-1].to = window.to
+				continue
+			}
+			merged = append(merged, window)
+		}
+		if len(merged) > 0 {
+			ret[node] = merged
+		}
+	}
+
+	return ret
+}
+
+// evaluateFullAPIOutages returns exactly one junit test case: a pass when no haproxy instance saw
+// more than one full-outage window (fullOutageBackendThreshold or more backends down at once), a
+// failure otherwise. The first window of every haproxy instance is tolerated, on the premise that
+// all kube-apiservers are down when haproxy starts during installation; each later window is
+// listed, ordered by node name, in the SystemOut of the failure.
+func evaluateFullAPIOutages(downIntervals monitorapi.Intervals) []*junitapi.JUnitTestCase {
+	const testName = "[Jira: Networking / On-Prem Host Networking] Haproxy must not detect all kubeapi servers down simultaneously"
+
+	outagesPerNode := findFullAPIOutageWindows(downIntervals, fullOutageBackendThreshold)
+
+	nodes := make([]string, 0, len(outagesPerNode))
+	for node := range outagesPerNode {
+		nodes = append(nodes, node)
+	}
+	sort.Strings(nodes) // map iteration order is random; sort so the failure output is deterministic
+
+	failures := []string{}
+	for _, node := range nodes {
+		// Skip the first window of every haproxy instance, taken to be the installation-time state.
+		// NOTE(review): it is skipped regardless of when it occurred - confirm that is intended.
+		for _, window := range outagesPerNode[node][1:] {
+			failures = append(failures, fmt.Sprintf(
+				"haproxy on node %s reported %d or more kube-apiserver backends down at the same time between %s and %s (%s)",
+				node, fullOutageBackendThreshold, window.from.Format(time.RFC3339), window.to.Format(time.RFC3339), window.to.Sub(window.from)))
+		}
+	}
+
+	if len(failures) == 0 {
+		return []*junitapi.JUnitTestCase{{Name: testName}}
 	}
 
 	failure := &junitapi.JUnitTestCase{
 		Name: testName,
 		FailureOutput: &junitapi.FailureOutput{
-			//Message: fmt.Sprint("something happened with haproxy"),
-			Output: "Haproxy detected some kubeapi-servers down. It's not necessarily an issue, it's expected over the course of installation. Go and check messages. Look at intervals in sippy to see a full graph of which haproxy instance detected which kubeapi-server as down. Plotted on a time axis, you will see if at any point in time all the kubeapi-servers were down. Only then, it is an issue.",
+			Output: "Haproxy detected all kube-apiserver backends down at the same time after the initial startup window. " +
+				"The first occurrence for every haproxy instance is expected: when haproxy starts during the installation, all kube-apiservers are down until they come up for the first time. " +
+				"Any subsequent occurrence means the API was completely unreachable through the on-prem loadbalancer. " +
+				"Look at the onprem-haproxy rows in the intervals chart to see which haproxy instance detected which kube-apiserver as down.",
 		},
-		SystemOut: strings.Join(testNameToFailures[testName], "\n"),
-		//SystemErr: fmt.Sprintf("syserr; found %d lines in the failure map", len(testNameToFailures[testName])),
+		SystemOut: strings.Join(failures, "\n"),
 	}
 
-	// Marked flaky until we have monitored it for consistency
-	return []*junitapi.JUnitTestCase{failure, success}, nil
+	return []*junitapi.JUnitTestCase{failure} // failure only, with no paired success case
 }
 
 func (w *operatorLogAnalyzer) WriteContentToStorage(ctx context.Context, storageDir, timeSuffix string, finalIntervals monitorapi.Intervals, finalResourceState monitorapi.ResourcesMap) error {

diff --git a/pkg/monitortests/network/onpremhaproxy/monitortest_test.go b/pkg/monitortests/network/onpremhaproxy/monitortest_test.go
@@ -0,0 +1,217 @@
+package onpremhaproxy
+
+import (
+	"fmt"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+
+	"github.com/openshift/origin/pkg/monitor/monitorapi"
+)
+
+var testStart = time.Date(2024, time.October, 28, 7, 0, 0, 0, time.UTC) // fixed reference instant that at() offsets from
+
+// at returns testStart shifted by the given number of seconds (negative values go backwards).
+func at(seconds int) time.Time {
+	return testStart.Add(time.Duration(seconds) * time.Second)
+}
+
+// haproxyDownInterval builds an OnPremHaproxyDetectsDown interval from `from` to `to` whose locator
+// value is "<reportingNode>___<backend>"; it is meant to mirror what ConstructComputedIntervals emits.
+func haproxyDownInterval(reportingNode, backend string, from, to time.Time) monitorapi.Interval {
+	return monitorapi.NewInterval(monitorapi.SourceHaproxyMonitor, monitorapi.Info).
+		Locator(monitorapi.Locator{Keys: map[monitorapi.LocatorKey]string{
+			monitorapi.LocatorOnPremKubeapiUnreachableFromHaproxyKey: fmt.Sprintf("%s___%s", reportingNode, backend),
+		}}).
+		Message(monitorapi.NewMessage().Reason(monitorapi.OnPremHaproxyDetectsDown).
+			Constructed(monitorapi.ConstructionOwnerOnPremHaproxy).
+			HumanMessage(fmt.Sprintf("Kubeapi on %s is detected dead by %s", backend, reportingNode))).
+		Display().
+		Build(from, to)
+}
+
+func TestFindFullAPIOutageWindows(t *testing.T) { // table-driven; every case runs with fullOutageBackendThreshold
+	tests := []struct {
+		name      string
+		intervals monitorapi.Intervals         // input "backend down" intervals
+		expected  map[string][]apiOutageWindow // expected windows keyed by reporting node
+	}{
+		{
+			name:      "no intervals",
+			intervals: monitorapi.Intervals{},
+			expected:  map[string][]apiOutageWindow{},
+		},
+		{
+			name: "single backend flapping is not an outage",
+			intervals: monitorapi.Intervals{
+				haproxyDownInterval("master-0", "masters/master-1", at(0), at(60)),
+				haproxyDownInterval("master-0", "masters/master-1", at(120), at(180)),
+			},
+			expected: map[string][]apiOutageWindow{},
+		},
+		{
+			name: "two backends down at the same time is not a full outage",
+			intervals: monitorapi.Intervals{
+				haproxyDownInterval("master-0", "masters/master-1", at(0), at(60)),
+				haproxyDownInterval("master-0", "masters/master-2", at(30), at(90)),
+			},
+			expected: map[string][]apiOutageWindow{},
+		},
+		{
+			name: "three backends down at the same time",
+			intervals: monitorapi.Intervals{
+				haproxyDownInterval("master-0", "masters/master-0", at(0), at(50)),
+				haproxyDownInterval("master-0", "masters/master-1", at(10), at(60)),
+				haproxyDownInterval("master-0", "masters/master-2", at(20), at(40)),
+			},
+			expected: map[string][]apiOutageWindow{
+				"master-0": {{from: at(20), to: at(40)}}, // third interval opens at 20s, first close happens at 40s
+			},
+		},
+		{
+			name: "backends down on different haproxy instances do not add up",
+			intervals: monitorapi.Intervals{
+				haproxyDownInterval("master-0", "masters/master-0", at(0), at(60)),
+				haproxyDownInterval("master-1", "masters/master-1", at(0), at(60)),
+				haproxyDownInterval("master-2", "masters/master-2", at(0), at(60)),
+			},
+			expected: map[string][]apiOutageWindow{},
+		},
+		{
+			name: "two separate full outages",
+			intervals: monitorapi.Intervals{
+				haproxyDownInterval("master-0", "masters/master-0", at(0), at(60)),
+				haproxyDownInterval("master-0", "masters/master-1", at(0), at(60)),
+				haproxyDownInterval("master-0", "masters/master-2", at(0), at(60)),
+				haproxyDownInterval("master-0", "masters/master-0", at(600), at(630)),
+				haproxyDownInterval("master-0", "masters/master-1", at(600), at(630)),
+				haproxyDownInterval("master-0", "masters/master-2", at(600), at(630)),
+			},
+			expected: map[string][]apiOutageWindow{
+				"master-0": {
+					{from: at(0), to: at(60)},
+					{from: at(600), to: at(630)},
+				},
+			},
+		},
+		{
+			name: "recovery at the same second as another backend goes down is not an overlap",
+			intervals: monitorapi.Intervals{
+				haproxyDownInterval("master-0", "masters/master-0", at(0), at(20)),
+				haproxyDownInterval("master-0", "masters/master-1", at(0), at(20)),
+				haproxyDownInterval("master-0", "masters/master-2", at(20), at(40)), // opens at the instant the other two close
+			},
+			expected: map[string][]apiOutageWindow{},
+		},
+		{
+			name: "one backend recovering and going down within the outage keeps a single window",
+			intervals: monitorapi.Intervals{
+				haproxyDownInterval("master-0", "masters/master-0", at(0), at(100)),
+				haproxyDownInterval("master-0", "masters/master-1", at(0), at(100)),
+				haproxyDownInterval("master-0", "masters/master-2", at(0), at(50)),
+				haproxyDownInterval("master-0", "masters/master-2", at(50), at(100)), // reopens at the instant it closed; the touching windows get merged
+			},
+			expected: map[string][]apiOutageWindow{
+				"master-0": {{from: at(0), to: at(100)}},
+			},
+		},
+		{
+			name: "outages tracked separately per haproxy instance",
+			intervals: monitorapi.Intervals{
+				haproxyDownInterval("master-0", "masters/master-0", at(0), at(60)),
+				haproxyDownInterval("master-0", "masters/master-1", at(0), at(60)),
+				haproxyDownInterval("master-0", "masters/master-2", at(0), at(60)),
+				haproxyDownInterval("master-1", "masters/master-0", at(300), at(360)),
+				haproxyDownInterval("master-1", "masters/master-1", at(300), at(360)),
+				haproxyDownInterval("master-1", "masters/master-2", at(300), at(360)),
+			},
+			expected: map[string][]apiOutageWindow{
+				"master-0": {{from: at(0), to: at(60)}},
+				"master-1": {{from: at(300), to: at(360)}},
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			actual := findFullAPIOutageWindows(tt.intervals, fullOutageBackendThreshold)
+			assert.Equal(t, tt.expected, actual) // "no windows" cases expect an empty, non-nil map
+		})
+	}
+}
+
+func TestEvaluateFullAPIOutages(t *testing.T) {
+	// installOutage simulates the expected initial state: all three backends are reported down from
+	// 0s and recover one by one, so the full-outage window ends at 300s when the first one is back.
+	installOutage := func(reportingNode string) monitorapi.Intervals {
+		return monitorapi.Intervals{
+			haproxyDownInterval(reportingNode, "masters/master-0", at(0), at(300)),
+			haproxyDownInterval(reportingNode, "masters/master-1", at(0), at(360)),
+			haproxyDownInterval(reportingNode, "masters/master-2", at(0), at(420)),
+		}
+	}
+
+	tests := []struct {
+		name            string
+		intervals       monitorapi.Intervals
+		expectFailure   bool     // whether the returned test case must carry a FailureOutput
+		expectedOutputs []string // substrings that must appear in SystemOut; only checked on failure
+	}{
+		{
+			name:          "no intervals",
+			intervals:     monitorapi.Intervals{},
+			expectFailure: false,
+		},
+		{
+			name:          "only the installation outage",
+			intervals:     installOutage("master-0"),
+			expectFailure: false,
+		},
+		{
+			name: "full outage after the installation",
+			intervals: append(installOutage("master-0"),
+				haproxyDownInterval("master-0", "masters/master-0", at(3600), at(3630)),
+				haproxyDownInterval("master-0", "masters/master-1", at(3600), at(3630)),
+				haproxyDownInterval("master-0", "masters/master-2", at(3600), at(3630)),
+			),
+			expectFailure: true,
+			expectedOutputs: []string{
+				"haproxy on node master-0",
+				at(3600).Format(time.RFC3339),
+				at(3630).Format(time.RFC3339),
+			},
+		},
+		{
+			name:          "initial outage tolerated separately per haproxy instance",
+			intervals:     append(installOutage("master-0"), installOutage("master-1")...),
+			expectFailure: false,
+		},
+		{
+			name: "partial outage after the installation does not fail",
+			intervals: append(installOutage("master-0"),
+				haproxyDownInterval("master-0", "masters/master-0", at(3600), at(3630)),
+				haproxyDownInterval("master-0", "masters/master-1", at(3600), at(3630)),
+			),
+			expectFailure: false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			junits := evaluateFullAPIOutages(tt.intervals)
+			require.Len(t, junits, 1) // pass and failure are each reported as a single test case
+
+			if !tt.expectFailure {
+				assert.Nil(t, junits[0].FailureOutput, "expected the test to pass")
+				return
+			}
+
+			require.NotNil(t, junits[0].FailureOutput, "expected the test to fail")
+			for _, expectedOutput := range tt.expectedOutputs {
+				assert.Contains(t, junits[0].SystemOut, expectedOutput)
+			}
+		})
+	}
+}