diff --git a/pkg/monitortests/network/onpremhaproxy/monitortest.go b/pkg/monitortests/network/onpremhaproxy/monitortest.go index 694be0902350..eb2da803d681 100644 --- a/pkg/monitortests/network/onpremhaproxy/monitortest.go +++ b/pkg/monitortests/network/onpremhaproxy/monitortest.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "regexp" + "sort" "strings" "time" @@ -211,22 +212,159 @@ func (*operatorLogAnalyzer) EvaluateTestsFromConstructedIntervals(ctx context.Co testNameToFailures[testName] = append(testNameToFailures[testName], interval.String()) } + ret := []*junitapi.JUnitTestCase{} if !somethingFailed { - return []*junitapi.JUnitTestCase{success}, nil + ret = append(ret, success) + } else { + failure := &junitapi.JUnitTestCase{ + Name: testName, + FailureOutput: &junitapi.FailureOutput{ + //Message: fmt.Sprint("something happened with haproxy"), + Output: "Haproxy detected some kubeapi-servers down. It's not necessarily an issue, it's expected over the course of installation. Go and check messages. Look at intervals in sippy to see a full graph of which haproxy instance detected which kubeapi-server as down. Plotted on a time axis, you will see if at any point in time all the kubeapi-servers were down. Only then, it is an issue.", + }, + SystemOut: strings.Join(testNameToFailures[testName], "\n"), + //SystemErr: fmt.Sprintf("syserr; found %d lines in the failure map", len(testNameToFailures[testName])), + } + + // Marked flaky until we have monitored it for consistency + ret = append(ret, failure, success) + } + + ret = append(ret, evaluateFullAPIOutages(leaseIntervals)...) + + return ret, nil +} + +// fullOutageBackendThreshold is the number of distinct kube-apiserver backends that have to be +// reported down at the same time by a single haproxy instance to consider it a full API outage. +// On-prem HA deployments run three control plane nodes, so three backends down at the same time +// mean the API is not reachable through the loadbalancer at all. +const fullOutageBackendThreshold = 3 + +// apiOutageWindow is a time range during which a single haproxy instance considered at least +// fullOutageBackendThreshold kube-apiserver backends down at the same time. +type apiOutageWindow struct { + from time.Time + to time.Time +} + +// findFullAPIOutageWindows takes the constructed OnPremHaproxyDetectsDown intervals and returns, +// per node running haproxy, the time windows during which that haproxy instance reported at least +// `threshold` distinct kube-apiserver backends down at the same time. +func findFullAPIOutageWindows(downIntervals monitorapi.Intervals, threshold int) map[string][]apiOutageWindow { + type sweepEvent struct { + at time.Time + delta int + } + + eventsPerNode := map[string][]sweepEvent{} + for _, interval := range downIntervals { + // The locator key has the form "___". + pairKey := interval.Locator.Keys[monitorapi.LocatorOnPremKubeapiUnreachableFromHaproxyKey] + parts := strings.SplitN(pairKey, "___", 2) + if len(parts) != 2 { + continue + } + reportingNode := parts[0] + eventsPerNode[reportingNode] = append(eventsPerNode[reportingNode], + sweepEvent{at: interval.From, delta: 1}, + sweepEvent{at: interval.To, delta: -1}, + ) + } + + ret := map[string][]apiOutageWindow{} + for node, events := range eventsPerNode { + // Sort by time. On equal timestamps process the "backend recovered" events first so that a + // backend recovering at the very same second another one goes down does not produce an + // artificial overlap. + sort.Slice(events, func(i, j int) bool { + if events[i].at.Equal(events[j].at) { + return events[i].delta < events[j].delta + } + return events[i].at.Before(events[j].at) + }) + + // Sweep over the events counting how many backends are down at any given moment. Intervals of + // a single backend never overlap by construction, so the number of open intervals equals the + // number of distinct backends being down. + windows := []apiOutageWindow{} + downCount := 0 + inOutage := false + var outageStart time.Time + for _, event := range events { + downCount += event.delta + switch { + case !inOutage && downCount >= threshold: + inOutage = true + outageStart = event.at + case inOutage && downCount < threshold: + inOutage = false + windows = append(windows, apiOutageWindow{from: outageStart, to: event.at}) + } + } + + // Merge windows that touch each other. Log timestamps have second granularity, so a backend + // recovering and another one going down within the same second would otherwise split a single + // outage into two. + merged := []apiOutageWindow{} + for _, window := range windows { + if len(merged) > 0 && !window.from.After(merged[len(merged)-1].to) { + merged[len(merged)-1].to = window.to + continue + } + merged = append(merged, window) + } + if len(merged) > 0 { + ret[node] = merged + } + } + + return ret +} + +// evaluateFullAPIOutages produces a junit result failing whenever a single haproxy instance +// reported all kube-apiserver backends down at the same time. The first occurrence for every +// haproxy instance is tolerated: when haproxy starts during the installation, all kube-apiservers +// are expected to be down until they come up for the first time. Any later occurrence means the +// API was completely unreachable through the on-prem loadbalancer. +func evaluateFullAPIOutages(downIntervals monitorapi.Intervals) []*junitapi.JUnitTestCase { + const testName = "[Jira: Networking / On-Prem Host Networking] Haproxy must not detect all kubeapi servers down simultaneously" + + outagesPerNode := findFullAPIOutageWindows(downIntervals, fullOutageBackendThreshold) + + nodes := make([]string, 0, len(outagesPerNode)) + for node := range outagesPerNode { + nodes = append(nodes, node) + } + sort.Strings(nodes) + + failures := []string{} + for _, node := range nodes { + // The first full outage observed by every haproxy instance is the initial state: when haproxy + // starts during the installation, none of the kube-apiservers is up yet. + for _, window := range outagesPerNode[node][1:] { + failures = append(failures, fmt.Sprintf( + "haproxy on node %s reported %d or more kube-apiserver backends down at the same time between %s and %s (%s)", + node, fullOutageBackendThreshold, window.from.Format(time.RFC3339), window.to.Format(time.RFC3339), window.to.Sub(window.from))) + } + } + + if len(failures) == 0 { + return []*junitapi.JUnitTestCase{{Name: testName}} } failure := &junitapi.JUnitTestCase{ Name: testName, FailureOutput: &junitapi.FailureOutput{ - //Message: fmt.Sprint("something happened with haproxy"), - Output: "Haproxy detected some kubeapi-servers down. It's not necessarily an issue, it's expected over the course of installation. Go and check messages. Look at intervals in sippy to see a full graph of which haproxy instance detected which kubeapi-server as down. Plotted on a time axis, you will see if at any point in time all the kubeapi-servers were down. Only then, it is an issue.", + Output: "Haproxy detected all kube-apiserver backends down at the same time after the initial startup window. " + + "The first occurrence for every haproxy instance is expected: when haproxy starts during the installation, all kube-apiservers are down until they come up for the first time. " + + "Any subsequent occurrence means the API was completely unreachable through the on-prem loadbalancer. " + + "Look at the onprem-haproxy rows in the intervals chart to see which haproxy instance detected which kube-apiserver as down.", }, - SystemOut: strings.Join(testNameToFailures[testName], "\n"), - //SystemErr: fmt.Sprintf("syserr; found %d lines in the failure map", len(testNameToFailures[testName])), + SystemOut: strings.Join(failures, "\n"), } - // Marked flaky until we have monitored it for consistency - return []*junitapi.JUnitTestCase{failure, success}, nil + return []*junitapi.JUnitTestCase{failure} } func (w *operatorLogAnalyzer) WriteContentToStorage(ctx context.Context, storageDir, timeSuffix string, finalIntervals monitorapi.Intervals, finalResourceState monitorapi.ResourcesMap) error { diff --git a/pkg/monitortests/network/onpremhaproxy/monitortest_test.go b/pkg/monitortests/network/onpremhaproxy/monitortest_test.go new file mode 100644 index 000000000000..4730f2131463 --- /dev/null +++ b/pkg/monitortests/network/onpremhaproxy/monitortest_test.go @@ -0,0 +1,217 @@ +package onpremhaproxy + +import ( + "fmt" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/openshift/origin/pkg/monitor/monitorapi" +) + +var testStart = time.Date(2024, time.October, 28, 7, 0, 0, 0, time.UTC) + +// at is a helper returning a time relative to the beginning of the test run. +func at(seconds int) time.Time { + return testStart.Add(time.Duration(seconds) * time.Second) +} + +// haproxyDownInterval builds a constructed OnPremHaproxyDetectsDown interval the same way +// ConstructComputedIntervals does. +func haproxyDownInterval(reportingNode, backend string, from, to time.Time) monitorapi.Interval { + return monitorapi.NewInterval(monitorapi.SourceHaproxyMonitor, monitorapi.Info). + Locator(monitorapi.Locator{Keys: map[monitorapi.LocatorKey]string{ + monitorapi.LocatorOnPremKubeapiUnreachableFromHaproxyKey: fmt.Sprintf("%s___%s", reportingNode, backend), + }}). + Message(monitorapi.NewMessage().Reason(monitorapi.OnPremHaproxyDetectsDown). + Constructed(monitorapi.ConstructionOwnerOnPremHaproxy). + HumanMessage(fmt.Sprintf("Kubeapi on %s is detected dead by %s", backend, reportingNode))). + Display(). + Build(from, to) +} + +func TestFindFullAPIOutageWindows(t *testing.T) { + tests := []struct { + name string + intervals monitorapi.Intervals + expected map[string][]apiOutageWindow + }{ + { + name: "no intervals", + intervals: monitorapi.Intervals{}, + expected: map[string][]apiOutageWindow{}, + }, + { + name: "single backend flapping is not an outage", + intervals: monitorapi.Intervals{ + haproxyDownInterval("master-0", "masters/master-1", at(0), at(60)), + haproxyDownInterval("master-0", "masters/master-1", at(120), at(180)), + }, + expected: map[string][]apiOutageWindow{}, + }, + { + name: "two backends down at the same time is not a full outage", + intervals: monitorapi.Intervals{ + haproxyDownInterval("master-0", "masters/master-1", at(0), at(60)), + haproxyDownInterval("master-0", "masters/master-2", at(30), at(90)), + }, + expected: map[string][]apiOutageWindow{}, + }, + { + name: "three backends down at the same time", + intervals: monitorapi.Intervals{ + haproxyDownInterval("master-0", "masters/master-0", at(0), at(50)), + haproxyDownInterval("master-0", "masters/master-1", at(10), at(60)), + haproxyDownInterval("master-0", "masters/master-2", at(20), at(40)), + }, + expected: map[string][]apiOutageWindow{ + "master-0": {{from: at(20), to: at(40)}}, + }, + }, + { + name: "backends down on different haproxy instances do not add up", + intervals: monitorapi.Intervals{ + haproxyDownInterval("master-0", "masters/master-0", at(0), at(60)), + haproxyDownInterval("master-1", "masters/master-1", at(0), at(60)), + haproxyDownInterval("master-2", "masters/master-2", at(0), at(60)), + }, + expected: map[string][]apiOutageWindow{}, + }, + { + name: "two separate full outages", + intervals: monitorapi.Intervals{ + haproxyDownInterval("master-0", "masters/master-0", at(0), at(60)), + haproxyDownInterval("master-0", "masters/master-1", at(0), at(60)), + haproxyDownInterval("master-0", "masters/master-2", at(0), at(60)), + haproxyDownInterval("master-0", "masters/master-0", at(600), at(630)), + haproxyDownInterval("master-0", "masters/master-1", at(600), at(630)), + haproxyDownInterval("master-0", "masters/master-2", at(600), at(630)), + }, + expected: map[string][]apiOutageWindow{ + "master-0": { + {from: at(0), to: at(60)}, + {from: at(600), to: at(630)}, + }, + }, + }, + { + name: "recovery at the same second as another backend goes down is not an overlap", + intervals: monitorapi.Intervals{ + haproxyDownInterval("master-0", "masters/master-0", at(0), at(20)), + haproxyDownInterval("master-0", "masters/master-1", at(0), at(20)), + haproxyDownInterval("master-0", "masters/master-2", at(20), at(40)), + }, + expected: map[string][]apiOutageWindow{}, + }, + { + name: "one backend recovering and going down within the outage keeps a single window", + intervals: monitorapi.Intervals{ + haproxyDownInterval("master-0", "masters/master-0", at(0), at(100)), + haproxyDownInterval("master-0", "masters/master-1", at(0), at(100)), + haproxyDownInterval("master-0", "masters/master-2", at(0), at(50)), + haproxyDownInterval("master-0", "masters/master-2", at(50), at(100)), + }, + expected: map[string][]apiOutageWindow{ + "master-0": {{from: at(0), to: at(100)}}, + }, + }, + { + name: "outages tracked separately per haproxy instance", + intervals: monitorapi.Intervals{ + haproxyDownInterval("master-0", "masters/master-0", at(0), at(60)), + haproxyDownInterval("master-0", "masters/master-1", at(0), at(60)), + haproxyDownInterval("master-0", "masters/master-2", at(0), at(60)), + haproxyDownInterval("master-1", "masters/master-0", at(300), at(360)), + haproxyDownInterval("master-1", "masters/master-1", at(300), at(360)), + haproxyDownInterval("master-1", "masters/master-2", at(300), at(360)), + }, + expected: map[string][]apiOutageWindow{ + "master-0": {{from: at(0), to: at(60)}}, + "master-1": {{from: at(300), to: at(360)}}, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + actual := findFullAPIOutageWindows(tt.intervals, fullOutageBackendThreshold) + assert.Equal(t, tt.expected, actual) + }) + } +} + +func TestEvaluateFullAPIOutages(t *testing.T) { + // installOutage simulates the expected initial state: when haproxy starts during the + // installation, all kube-apiservers are down until they come up for the first time. + installOutage := func(reportingNode string) monitorapi.Intervals { + return monitorapi.Intervals{ + haproxyDownInterval(reportingNode, "masters/master-0", at(0), at(300)), + haproxyDownInterval(reportingNode, "masters/master-1", at(0), at(360)), + haproxyDownInterval(reportingNode, "masters/master-2", at(0), at(420)), + } + } + + tests := []struct { + name string + intervals monitorapi.Intervals + expectFailure bool + expectedOutputs []string + }{ + { + name: "no intervals", + intervals: monitorapi.Intervals{}, + expectFailure: false, + }, + { + name: "only the installation outage", + intervals: installOutage("master-0"), + expectFailure: false, + }, + { + name: "full outage after the installation", + intervals: append(installOutage("master-0"), + haproxyDownInterval("master-0", "masters/master-0", at(3600), at(3630)), + haproxyDownInterval("master-0", "masters/master-1", at(3600), at(3630)), + haproxyDownInterval("master-0", "masters/master-2", at(3600), at(3630)), + ), + expectFailure: true, + expectedOutputs: []string{ + "haproxy on node master-0", + at(3600).Format(time.RFC3339), + at(3630).Format(time.RFC3339), + }, + }, + { + name: "initial outage tolerated separately per haproxy instance", + intervals: append(installOutage("master-0"), installOutage("master-1")...), + expectFailure: false, + }, + { + name: "partial outage after the installation does not fail", + intervals: append(installOutage("master-0"), + haproxyDownInterval("master-0", "masters/master-0", at(3600), at(3630)), + haproxyDownInterval("master-0", "masters/master-1", at(3600), at(3630)), + ), + expectFailure: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + junits := evaluateFullAPIOutages(tt.intervals) + require.Len(t, junits, 1) + + if !tt.expectFailure { + assert.Nil(t, junits[0].FailureOutput, "expected the test to pass") + return + } + + require.NotNil(t, junits[0].FailureOutput, "expected the test to fail") + for _, expectedOutput := range tt.expectedOutputs { + assert.Contains(t, junits[0].SystemOut, expectedOutput) + } + }) + } +}