Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
152 changes: 145 additions & 7 deletions pkg/monitortests/network/onpremhaproxy/monitortest.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"errors"
"fmt"
"regexp"
"sort"
"strings"
"time"

Expand Down Expand Up @@ -211,22 +212,159 @@ func (*operatorLogAnalyzer) EvaluateTestsFromConstructedIntervals(ctx context.Co
testNameToFailures[testName] = append(testNameToFailures[testName], interval.String())
}

ret := []*junitapi.JUnitTestCase{}
if !somethingFailed {
return []*junitapi.JUnitTestCase{success}, nil
ret = append(ret, success)
} else {
failure := &junitapi.JUnitTestCase{
Name: testName,
FailureOutput: &junitapi.FailureOutput{
//Message: fmt.Sprint("something happened with haproxy"),
Output: "Haproxy detected some kubeapi-servers down. It's not necessarily an issue, it's expected over the course of installation. Go and check messages. Look at intervals in sippy to see a full graph of which haproxy instance detected which kubeapi-server as down. Plotted on a time axis, you will see if at any point in time all the kubeapi-servers were down. Only then, it is an issue.",
},
SystemOut: strings.Join(testNameToFailures[testName], "\n"),
//SystemErr: fmt.Sprintf("syserr; found %d lines in the failure map", len(testNameToFailures[testName])),
}

// Marked flaky until we have monitored it for consistency
ret = append(ret, failure, success)
}

ret = append(ret, evaluateFullAPIOutages(leaseIntervals)...)

return ret, nil
}

// fullOutageBackendThreshold is the number of distinct kube-apiserver backends that have to be
// reported down at the same time by a single haproxy instance to consider it a full API outage.
// On-prem HA deployments run three control plane nodes, so three backends down at the same time
// mean the API is not reachable through the loadbalancer at all.
// NOTE(review): topologies with a different number of control plane nodes would presumably need
// a different value here -- confirm whether such topologies run this monitor test.
const fullOutageBackendThreshold = 3

// apiOutageWindow describes one contiguous span of time in which a single haproxy instance
// considered at least fullOutageBackendThreshold kube-apiserver backends down simultaneously.
type apiOutageWindow struct {
	// from is the moment the outage began and to is the moment it ended.
	from, to time.Time
}

// findFullAPIOutageWindows takes the constructed OnPremHaproxyDetectsDown intervals and returns,
// per node running haproxy, the time windows during which that haproxy instance reported at least
// `threshold` distinct kube-apiserver backends down at the same time.
//
// Intervals whose locator does not carry a "<node running haproxy>___<kube-apiserver backend>"
// key are ignored. Nodes without any such window are absent from the returned map, which is never
// nil. The windows of a node are ordered by time, and windows that touch each other are merged.
func findFullAPIOutageWindows(downIntervals monitorapi.Intervals, threshold int) map[string][]apiOutageWindow {
	// sweepEvent marks a backend going down (delta +1) or recovering (delta -1) at a point in time.
	type sweepEvent struct {
		at      time.Time
		backend string
		delta   int
	}

	eventsPerNode := map[string][]sweepEvent{}
	for _, interval := range downIntervals {
		// The locator key has the form "<node running haproxy>___<kube-apiserver backend>".
		pairKey := interval.Locator.Keys[monitorapi.LocatorOnPremKubeapiUnreachableFromHaproxyKey]
		reportingNode, backend, found := strings.Cut(pairKey, "___")
		if !found {
			continue
		}
		eventsPerNode[reportingNode] = append(eventsPerNode[reportingNode],
			sweepEvent{at: interval.From, backend: backend, delta: 1},
			sweepEvent{at: interval.To, backend: backend, delta: -1},
		)
	}

	ret := map[string][]apiOutageWindow{}
	for node, events := range eventsPerNode {
		// Sort by time. On equal timestamps process the "backend recovered" events first so that a
		// backend recovering at the very same second another one goes down does not produce an
		// artificial overlap.
		sort.Slice(events, func(i, j int) bool {
			if events[i].at.Equal(events[j].at) {
				return events[i].delta < events[j].delta
			}
			return events[i].at.Before(events[j].at)
		})

		// Sweep over the events counting how many distinct backends are down at any given moment.
		// Open intervals are tracked per backend so that duplicated or overlapping intervals of the
		// same backend count as a single backend being down instead of inflating the total.
		openPerBackend := map[string]int{}
		downBackends := 0
		inOutage := false
		var outageStart time.Time
		var windows []apiOutageWindow
		for _, event := range events {
			before := openPerBackend[event.backend]
			after := before + event.delta
			openPerBackend[event.backend] = after
			switch {
			case before <= 0 && after > 0:
				downBackends++
			case before > 0 && after <= 0:
				downBackends--
			}

			switch {
			case !inOutage && downBackends >= threshold:
				inOutage = true
				outageStart = event.at
			case inOutage && downBackends < threshold:
				inOutage = false
				// Merge windows that touch each other. Log timestamps have second granularity, so a
				// backend recovering and another one going down within the same second would
				// otherwise split a single outage into two.
				if last := len(windows) - 1; last >= 0 && !outageStart.After(windows[last].to) {
					windows[last].to = event.at
				} else {
					windows = append(windows, apiOutageWindow{from: outageStart, to: event.at})
				}
			}
		}

		if len(windows) > 0 {
			ret[node] = windows
		}
	}

	return ret
}

// evaluateFullAPIOutages produces a junit result failing whenever a single haproxy instance
// reported all kube-apiserver backends down at the same time. The first occurrence for every
// haproxy instance is tolerated: when haproxy starts during the installation, all kube-apiservers
// are expected to be down until they come up for the first time. Any later occurrence means the
// API was completely unreachable through the on-prem loadbalancer.
//
// Exactly one test case is returned: a passing one when no haproxy instance observed more than
// one full outage, otherwise a failing one whose SystemOut lists every outage after the first,
// one per line, ordered by node name and then by time.
func evaluateFullAPIOutages(downIntervals monitorapi.Intervals) []*junitapi.JUnitTestCase {
	const testName = "[Jira: Networking / On-Prem Host Networking] Haproxy must not detect all kubeapi servers down simultaneously"

	outagesPerNode := findFullAPIOutageWindows(downIntervals, fullOutageBackendThreshold)

	// Map iteration order is random; sort the node names to keep the output deterministic.
	nodes := make([]string, 0, len(outagesPerNode))
	for node := range outagesPerNode {
		nodes = append(nodes, node)
	}
	sort.Strings(nodes)

	var failures []string
	for _, node := range nodes {
		windows := outagesPerNode[node]
		// The first full outage observed by every haproxy instance is the initial state: when haproxy
		// starts during the installation, none of the kube-apiservers is up yet. The length check also
		// keeps the slicing below from panicking should a node ever map to an empty list.
		if len(windows) < 2 {
			continue
		}
		for _, window := range windows[1:] {
			failures = append(failures, fmt.Sprintf(
				"haproxy on node %s reported %d or more kube-apiserver backends down at the same time between %s and %s (%s)",
				node, fullOutageBackendThreshold, window.from.Format(time.RFC3339), window.to.Format(time.RFC3339), window.to.Sub(window.from)))
		}
	}

	if len(failures) == 0 {
		return []*junitapi.JUnitTestCase{{Name: testName}}
	}

	failure := &junitapi.JUnitTestCase{
		Name: testName,
		FailureOutput: &junitapi.FailureOutput{
			Output: "Haproxy detected all kube-apiserver backends down at the same time after the initial startup window. " +
				"The first occurrence for every haproxy instance is expected: when haproxy starts during the installation, all kube-apiservers are down until they come up for the first time. " +
				"Any subsequent occurrence means the API was completely unreachable through the on-prem loadbalancer. " +
				"Look at the onprem-haproxy rows in the intervals chart to see which haproxy instance detected which kube-apiserver as down.",
		},
		SystemOut: strings.Join(failures, "\n"),
	}

	return []*junitapi.JUnitTestCase{failure}
}

func (w *operatorLogAnalyzer) WriteContentToStorage(ctx context.Context, storageDir, timeSuffix string, finalIntervals monitorapi.Intervals, finalResourceState monitorapi.ResourcesMap) error {
Expand Down
217 changes: 217 additions & 0 deletions pkg/monitortests/network/onpremhaproxy/monitortest_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
package onpremhaproxy

import (
"fmt"
"testing"
"time"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"github.com/openshift/origin/pkg/monitor/monitorapi"
)

var testStart = time.Date(2024, time.October, 28, 7, 0, 0, 0, time.UTC)

// at converts an offset in seconds from the beginning of the test run (testStart) into an
// absolute point in time.
func at(seconds int) time.Time {
	offset := time.Duration(seconds) * time.Second
	return testStart.Add(offset)
}

// haproxyDownInterval returns a constructed OnPremHaproxyDetectsDown interval lasting from `from`
// until `to`, in which haproxy on reportingNode considers the given kube-apiserver backend down.
// It is meant to have the same shape as the intervals built by ConstructComputedIntervals.
func haproxyDownInterval(reportingNode, backend string, from, to time.Time) monitorapi.Interval {
	// The locator value pairs the reporting node with the backend: "<node>___<backend>".
	locator := monitorapi.Locator{
		Keys: map[monitorapi.LocatorKey]string{
			monitorapi.LocatorOnPremKubeapiUnreachableFromHaproxyKey: fmt.Sprintf("%s___%s", reportingNode, backend),
		},
	}

	message := monitorapi.NewMessage().
		Reason(monitorapi.OnPremHaproxyDetectsDown).
		Constructed(monitorapi.ConstructionOwnerOnPremHaproxy).
		HumanMessage(fmt.Sprintf("Kubeapi on %s is detected dead by %s", backend, reportingNode))

	return monitorapi.NewInterval(monitorapi.SourceHaproxyMonitor, monitorapi.Info).
		Locator(locator).
		Message(message).
		Display().
		Build(from, to)
}

// TestFindFullAPIOutageWindows checks which combinations of per-backend down intervals are
// reported as full API outage windows, and for which haproxy instance.
func TestFindFullAPIOutageWindows(t *testing.T) {
	// down builds an interval in which haproxy on node saw masters/<backend> down between the
	// given offsets, in seconds, from the beginning of the test run.
	down := func(node, backend string, fromSec, toSec int) monitorapi.Interval {
		return haproxyDownInterval(node, "masters/"+backend, at(fromSec), at(toSec))
	}
	// span builds an expected outage window between the given offsets, in seconds.
	span := func(fromSec, toSec int) apiOutageWindow {
		return apiOutageWindow{from: at(fromSec), to: at(toSec)}
	}

	cases := []struct {
		desc  string
		input monitorapi.Intervals
		want  map[string][]apiOutageWindow
	}{
		{
			desc:  "no intervals",
			input: monitorapi.Intervals{},
			want:  map[string][]apiOutageWindow{},
		},
		{
			desc: "single backend flapping is not an outage",
			input: monitorapi.Intervals{
				down("master-0", "master-1", 0, 60),
				down("master-0", "master-1", 120, 180),
			},
			want: map[string][]apiOutageWindow{},
		},
		{
			desc: "two backends down at the same time is not a full outage",
			input: monitorapi.Intervals{
				down("master-0", "master-1", 0, 60),
				down("master-0", "master-2", 30, 90),
			},
			want: map[string][]apiOutageWindow{},
		},
		{
			desc: "three backends down at the same time",
			input: monitorapi.Intervals{
				down("master-0", "master-0", 0, 50),
				down("master-0", "master-1", 10, 60),
				down("master-0", "master-2", 20, 40),
			},
			want: map[string][]apiOutageWindow{
				"master-0": {span(20, 40)},
			},
		},
		{
			desc: "backends down on different haproxy instances do not add up",
			input: monitorapi.Intervals{
				down("master-0", "master-0", 0, 60),
				down("master-1", "master-1", 0, 60),
				down("master-2", "master-2", 0, 60),
			},
			want: map[string][]apiOutageWindow{},
		},
		{
			desc: "two separate full outages",
			input: monitorapi.Intervals{
				down("master-0", "master-0", 0, 60),
				down("master-0", "master-1", 0, 60),
				down("master-0", "master-2", 0, 60),
				down("master-0", "master-0", 600, 630),
				down("master-0", "master-1", 600, 630),
				down("master-0", "master-2", 600, 630),
			},
			want: map[string][]apiOutageWindow{
				"master-0": {
					span(0, 60),
					span(600, 630),
				},
			},
		},
		{
			desc: "recovery at the same second as another backend goes down is not an overlap",
			input: monitorapi.Intervals{
				down("master-0", "master-0", 0, 20),
				down("master-0", "master-1", 0, 20),
				down("master-0", "master-2", 20, 40),
			},
			want: map[string][]apiOutageWindow{},
		},
		{
			desc: "one backend recovering and going down within the outage keeps a single window",
			input: monitorapi.Intervals{
				down("master-0", "master-0", 0, 100),
				down("master-0", "master-1", 0, 100),
				down("master-0", "master-2", 0, 50),
				down("master-0", "master-2", 50, 100),
			},
			want: map[string][]apiOutageWindow{
				"master-0": {span(0, 100)},
			},
		},
		{
			desc: "outages tracked separately per haproxy instance",
			input: monitorapi.Intervals{
				down("master-0", "master-0", 0, 60),
				down("master-0", "master-1", 0, 60),
				down("master-0", "master-2", 0, 60),
				down("master-1", "master-0", 300, 360),
				down("master-1", "master-1", 300, 360),
				down("master-1", "master-2", 300, 360),
			},
			want: map[string][]apiOutageWindow{
				"master-0": {span(0, 60)},
				"master-1": {span(300, 360)},
			},
		},
	}

	for _, tc := range cases {
		t.Run(tc.desc, func(t *testing.T) {
			got := findFullAPIOutageWindows(tc.input, fullOutageBackendThreshold)
			assert.Equal(t, tc.want, got)
		})
	}
}

// TestEvaluateFullAPIOutages checks that the junit result only fails when a haproxy instance
// observes a full outage beyond the first one, and that the failure names the node and the
// boundaries of the offending window.
func TestEvaluateFullAPIOutages(t *testing.T) {
	// installOutage reproduces the expected initial state for one haproxy instance: right after
	// haproxy starts during the installation every kube-apiserver is down, and they come up one
	// after another.
	installOutage := func(reportingNode string) monitorapi.Intervals {
		return monitorapi.Intervals{
			haproxyDownInterval(reportingNode, "masters/master-0", at(0), at(300)),
			haproxyDownInterval(reportingNode, "masters/master-1", at(0), at(360)),
			haproxyDownInterval(reportingNode, "masters/master-2", at(0), at(420)),
		}
	}

	cases := []struct {
		desc            string
		input           monitorapi.Intervals
		wantFailure     bool
		wantInSystemOut []string
	}{
		{
			desc:  "no intervals",
			input: monitorapi.Intervals{},
		},
		{
			desc:  "only the installation outage",
			input: installOutage("master-0"),
		},
		{
			desc: "full outage after the installation",
			input: append(installOutage("master-0"),
				haproxyDownInterval("master-0", "masters/master-0", at(3600), at(3630)),
				haproxyDownInterval("master-0", "masters/master-1", at(3600), at(3630)),
				haproxyDownInterval("master-0", "masters/master-2", at(3600), at(3630)),
			),
			wantFailure: true,
			wantInSystemOut: []string{
				"haproxy on node master-0",
				at(3600).Format(time.RFC3339),
				at(3630).Format(time.RFC3339),
			},
		},
		{
			desc:  "initial outage tolerated separately per haproxy instance",
			input: append(installOutage("master-0"), installOutage("master-1")...),
		},
		{
			desc: "partial outage after the installation does not fail",
			input: append(installOutage("master-0"),
				haproxyDownInterval("master-0", "masters/master-0", at(3600), at(3630)),
				haproxyDownInterval("master-0", "masters/master-1", at(3600), at(3630)),
			),
		},
	}

	for _, tc := range cases {
		t.Run(tc.desc, func(t *testing.T) {
			results := evaluateFullAPIOutages(tc.input)
			require.Len(t, results, 1)
			result := results[0]

			if tc.wantFailure {
				require.NotNil(t, result.FailureOutput, "expected the test to fail")
				for _, fragment := range tc.wantInSystemOut {
					assert.Contains(t, result.SystemOut, fragment)
				}
				return
			}

			assert.Nil(t, result.FailureOutput, "expected the test to pass")
		})
	}
}