92 changes: 81 additions & 11 deletions test/extended/two_node/tnf_kubelet_disruption.go
@@ -3,23 +3,27 @@ package two_node
import (
"context"
"fmt"
"regexp"
"time"

g "github.com/onsi/ginkgo/v2"
o "github.com/onsi/gomega"
v1 "github.com/openshift/api/config/v1"
"github.com/openshift/origin/test/extended/etcd/helpers"
"github.com/openshift/origin/test/extended/two_node/utils"
"github.com/openshift/origin/test/extended/two_node/utils/services"
exutil "github.com/openshift/origin/test/extended/util"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
nodeutil "k8s.io/kubernetes/pkg/util/node"
"k8s.io/kubernetes/test/e2e/framework"
)

const (
-kubeletDisruptionTimeout = 10 * time.Minute // Timeout for kubelet disruption scenarios
-kubeletRestoreTimeout = 5 * time.Minute // Time to wait for kubelet service restore
-kubeletGracePeriod = 30 * time.Second // Grace period for kubelet to start/stop
+kubeletDisruptionTimeout = 10 * time.Minute // Timeout for kubelet disruption scenarios
+kubeletRestoreTimeout = 5 * time.Minute // Time to wait for kubelet service restore
+kubeletGracePeriod = 30 * time.Second // Grace period for kubelet to start/stop
+etcdStableDuringDisruption = 5 * time.Minute // Duration to assert etcd member stays healthy during disruption
+failureWindowClockSkewBuffer = 1 * time.Minute // Buffer for clock skew when checking resource failure history
)

var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:DualReplica][Suite:openshift/two-node][Serial][Slow][Disruptive] Two Node with Fencing cluster", func() {
@@ -80,12 +84,13 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
nodeList, _ := utils.GetNodes(oc, utils.AllNodes)
cleanupNode := nodeList.Items[1] // Use second node for cleanup commands

g.By(fmt.Sprintf("Cleanup: Clearing any kubelet resource bans using node %s", cleanupNode.Name))
cleanupErr := utils.RemoveConstraint(oc, cleanupNode.Name, "kubelet-clone")
if cleanupErr != nil {
framework.Logf("Warning: Failed to clear kubelet-clone resource: %v (expected if no bans were active)", cleanupErr)
} else {
framework.Logf("Successfully cleared kubelet-clone resource bans and failures")
g.By(fmt.Sprintf("Cleanup: Clearing any kubelet and etcd resource bans using node %s", cleanupNode.Name))
for _, resource := range []string{"kubelet-clone", "etcd-clone"} {
if cleanupErr := utils.RemoveConstraint(oc, cleanupNode.Name, resource); cleanupErr != nil {
framework.Logf("Warning: Failed to clear %s: %v (expected if no bans were active)", resource, cleanupErr)
} else {
framework.Logf("Successfully cleared %s resource bans and failures", resource)
}
}

g.By("Cleanup: Validating etcd cluster health")
@@ -136,15 +141,24 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
return !nodeutil.IsNodeReady(nodeObj)
}, kubeletDisruptionTimeout, utils.FiveSecondPollInterval).Should(o.BeTrue(), fmt.Sprintf("Node %s should become NotReady after kubelet resource ban is applied", targetNode.Name))

g.By("Verifying PacemakerHealthCheckDegraded condition reports kubelet failure on target node")
err = services.WaitForPacemakerHealthCheckDegraded(oc, "Kubelet", healthCheckDegradedTimeout, utils.FiveSecondPollInterval)
o.Expect(err).NotTo(o.HaveOccurred(), "Pacemaker health check should report degraded due to kubelet constraint")
// Assert degraded resource is Kubelet and that it is the node we banned (operator message format: "<node> node is unhealthy: Kubelet ...")
o.Expect(services.AssertPacemakerHealthCheckContains(oc, []string{"Kubelet", targetNode.Name})).To(o.Succeed())

g.By("Validating etcd cluster remains healthy with surviving node")
o.Consistently(func() error {
return helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, survivingNode.Name)
-}, 5*time.Minute, utils.FiveSecondPollInterval).ShouldNot(o.HaveOccurred(), fmt.Sprintf("etcd member %s should remain healthy during kubelet disruption", survivingNode.Name))
+}, etcdStableDuringDisruption, utils.FiveSecondPollInterval).ShouldNot(o.HaveOccurred(), fmt.Sprintf("etcd member %s should remain healthy during kubelet disruption", survivingNode.Name))

g.By("Clearing kubelet resource bans to allow normal operation")
err = utils.RemoveConstraint(oc, survivingNode.Name, "kubelet-clone")
o.Expect(err).To(o.BeNil(), "Expected to clear kubelet resource bans without errors")

g.By("Verifying PacemakerHealthCheckDegraded condition clears after recovery")
o.Expect(services.WaitForPacemakerHealthCheckHealthy(oc, healthCheckHealthyTimeout, utils.FiveSecondPollInterval)).To(o.Succeed())

g.By("Validating both nodes are Ready")
for _, node := range nodes {
o.Eventually(func() bool {
Expand Down Expand Up @@ -211,7 +225,7 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual

g.By("Verifying Pacemaker recorded the kubelet failure in operation history")
// Use a time window from when we stopped kubelet to now
-failureWindow := time.Since(stopTime) + time.Minute // Add buffer for clock skew
+failureWindow := time.Since(stopTime) + failureWindowClockSkewBuffer
hasFailure, failures, err := utils.HasRecentResourceFailure(oc, survivingNode.Name, "kubelet-clone", failureWindow)
o.Expect(err).To(o.BeNil(), "Expected to check resource failure history without errors")
o.Expect(hasFailure).To(o.BeTrue(), "Pacemaker should have recorded kubelet failure in operation history")
@@ -238,5 +252,61 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
return utils.ValidateEssentialOperatorsAvailable(oc)
}, kubeletRestoreTimeout, utils.FiveSecondPollInterval).ShouldNot(o.HaveOccurred(), "Essential operators should be available")
})
})

// Etcd constraint / health check test lives in a separate Describe without [OCPFeatureGate:DualReplica];
// we do not add new tests under the FeatureGate-gated suite.
var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][Suite:openshift/two-node][Serial][Slow][Disruptive] Two Node etcd constraint and health check", func() {
defer g.GinkgoRecover()

var (
oc = exutil.NewCLIWithoutNamespace("two-node-etcd-constraint").AsAdmin()
etcdClientFactory *helpers.EtcdClientFactoryImpl
)

g.BeforeEach(func() {
utils.SkipIfNotTopology(oc, v1.DualReplicaTopologyMode)
etcdClientFactory = helpers.NewEtcdClientFactory(oc.KubeClient())
utils.SkipIfClusterIsNotHealthy(oc, etcdClientFactory)
})

g.It("should recover from etcd resource location constraint with health check degraded then healthy", func() {
nodeList, err := utils.GetNodes(oc, utils.AllNodes)
o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error")
o.Expect(len(nodeList.Items)).To(o.Equal(2), "Expected to find exactly 2 nodes for two-node cluster")
nodes := nodeList.Items
targetNode := nodes[0]
survivingNode := nodes[1]

g.By("Ensuring both nodes are healthy before applying etcd constraint")
for _, node := range nodes {
o.Expect(nodeutil.IsNodeReady(&node)).To(o.BeTrue(), fmt.Sprintf("Node %s should be ready", node.Name))
}

g.By(fmt.Sprintf("Banning etcd resource from node %s (location constraint)", targetNode.Name))
err = utils.AddConstraint(oc, survivingNode.Name, "etcd-clone", targetNode.Name)
o.Expect(err).To(o.BeNil(), "Expected to ban etcd-clone from target node")
g.DeferCleanup(func() {
_ = utils.RemoveConstraint(oc, survivingNode.Name, "etcd-clone")
})

g.By("Verifying PacemakerHealthCheckDegraded condition reports etcd failure on target node")
// Operator message format: "<nodeName> node is unhealthy: Etcd has failed" (or "is stopped", etc.)
degradedPattern := regexp.QuoteMeta(targetNode.Name) + ` node is unhealthy: Etcd .*`
err = services.WaitForPacemakerHealthCheckDegraded(oc, degradedPattern, healthCheckDegradedTimeout, utils.FiveSecondPollInterval)
o.Expect(err).NotTo(o.HaveOccurred(), "Pacemaker health check should report degraded due to etcd constraint")
o.Expect(services.AssertPacemakerHealthCheckContains(oc, []string{"Etcd", targetNode.Name})).To(o.Succeed())

g.By("Removing etcd-clone constraint to restore normal operation")
err = utils.RemoveConstraint(oc, survivingNode.Name, "etcd-clone")
o.Expect(err).To(o.BeNil(), "Expected to clear etcd-clone constraint")

g.By("Verifying PacemakerHealthCheckDegraded condition clears after recovery")
o.Expect(services.WaitForPacemakerHealthCheckHealthy(oc, healthCheckHealthyTimeout, utils.FiveSecondPollInterval)).To(o.Succeed())

g.By("Validating etcd cluster is healthy")
o.Eventually(func() error {
return utils.LogEtcdClusterStatus(oc, "after etcd constraint removal", etcdClientFactory)
}, kubeletRestoreTimeout, utils.FiveSecondPollInterval).ShouldNot(o.HaveOccurred())
})
})
6 changes: 6 additions & 0 deletions test/extended/two_node/tnf_node_replacement.go
@@ -228,6 +228,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
g.By("Destroying the target VM")
destroyVM(&testConfig)

g.By("Verifying that a fencing event was recorded for the target node")
o.Expect(services.WaitForFencingEvent(oc, []string{testConfig.TargetNode.Name}, healthCheckDegradedTimeoutAfterFencing, utils.FiveSecondPollInterval)).To(o.Succeed())

// Wait for etcd to stop on the surviving node
g.By("Waiting for etcd to stop on the surviving node")
waitForEtcdToStop(&testConfig)
@@ -256,6 +259,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
g.By("Verifying the cluster is fully restored")
verifyRestoredCluster(&testConfig, oc)

g.By("Verifying PacemakerHealthCheckDegraded condition clears after recovery")
o.Expect(services.WaitForPacemakerHealthCheckHealthy(oc, healthCheckHealthyTimeoutAfterFencing, utils.FiveSecondPollInterval)).To(o.Succeed())

g.By("Successfully completed node replacement process")
e2e.Logf("Node replacement process completed. Backup files created in: %s", backupDir)
})
200 changes: 200 additions & 0 deletions test/extended/two_node/tnf_pacemaker_healthcheck.go
@@ -0,0 +1,200 @@
package two_node

import (
"context"
"fmt"
"math/rand"
"strings"
"sync"
"time"

g "github.com/onsi/ginkgo/v2"
o "github.com/onsi/gomega"
v1 "github.com/openshift/api/config/v1"
"github.com/openshift/origin/test/extended/etcd/helpers"
"github.com/openshift/origin/test/extended/two_node/utils"
"github.com/openshift/origin/test/extended/two_node/utils/services"
exutil "github.com/openshift/origin/test/extended/util"
corev1 "k8s.io/api/core/v1"
e2e "k8s.io/kubernetes/test/e2e/framework"
)

const (
// healthCheckUpdatedTimeout is the time to wait for the Pacemaker health check condition to update (degraded or healthy).
healthCheckUpdatedTimeout = 2 * time.Minute
healthCheckDegradedTimeout = healthCheckUpdatedTimeout
healthCheckHealthyTimeout = healthCheckUpdatedTimeout
// Longer timeouts for tests that trigger a fencing event (ungraceful shutdown, cold-boot, network disruption):
// API server can be slow to recover, so we wait up to 5 minutes before asserting PacemakerHealthCheckDegraded/Healthy.
healthCheckDegradedTimeoutAfterFencing = 5 * time.Minute
healthCheckHealthyTimeoutAfterFencing = 5 * time.Minute
// StatusUnknownDegradedThreshold and StatusStalenessThreshold in CEO are 5 minutes; we must block for at least this long before asserting degraded.
staleMinBlockDuration = 5 * time.Minute
// After blocking, allow time for healthcheck controller (30s resync) to observe degraded.
staleCRDegradedTimeout = 2 * time.Minute
staleTimestampDegradedTimeout = 2 * time.Minute
// Interval for background delete loops: delete as soon as resources appear (match aggressive manual watch cadence).
staleTestDeleteInterval = 2 * time.Second
pacemakerClusterCRName = "cluster"
statusCollectorLabel = "app.kubernetes.io/name=pacemaker-status-collector"
etcdNamespaceFencing = "openshift-etcd"
)

var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][Suite:openshift/two-node][Serial][Disruptive] Pacemaker health check disruptive scenarios", func() {
defer g.GinkgoRecover()

var (
oc = exutil.NewCLIWithoutNamespace("tnf-pacemaker-healthcheck").AsAdmin()
etcdClientFactory *helpers.EtcdClientFactoryImpl
peerNode corev1.Node
targetNode corev1.Node
)

g.BeforeEach(func() {
utils.SkipIfNotTopology(oc, v1.DualReplicaTopologyMode)
etcdClientFactory = helpers.NewEtcdClientFactory(oc.KubeClient())
utils.SkipIfClusterIsNotHealthy(oc, etcdClientFactory)

nodes, err := utils.GetNodes(oc, utils.AllNodes)
o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error")
randomIndex := rand.Intn(len(nodes.Items))
peerNode = nodes.Items[randomIndex]
targetNode = nodes.Items[(randomIndex+1)%len(nodes.Items)]
})

g.It("should report degraded when a node is in standby then healthy after unstandby", func() {
g.By(fmt.Sprintf("Putting %s in standby from %s", targetNode.Name, peerNode.Name))
o.Expect(utils.PcsNodeStandby(oc, peerNode.Name, targetNode.Name)).To(o.Succeed())

g.By("Verifying PacemakerHealthCheckDegraded condition reports target node in standby")
o.Expect(services.WaitForPacemakerHealthCheckDegraded(oc, "standby", healthCheckDegradedTimeout, utils.FiveSecondPollInterval)).To(o.Succeed())
o.Expect(services.AssertPacemakerHealthCheckContains(oc, []string{targetNode.Name, "standby"})).To(o.Succeed())

g.By(fmt.Sprintf("Bringing %s out of standby", targetNode.Name))
o.Expect(utils.PcsNodeUnstandby(oc, peerNode.Name, targetNode.Name)).To(o.Succeed())

g.By("Verifying PacemakerHealthCheckDegraded condition clears")
o.Expect(services.WaitForPacemakerHealthCheckHealthy(oc, healthCheckHealthyTimeout, utils.FiveSecondPollInterval)).To(o.Succeed())
})

g.It("should report degraded when cluster is in maintenance mode then healthy after clearing", func() {
g.By("Setting cluster maintenance mode")
o.Expect(utils.PcsPropertySetMaintenanceMode(oc, peerNode.Name, true)).To(o.Succeed())

g.By("Verifying PacemakerHealthCheckDegraded condition reports maintenance")
o.Expect(services.WaitForPacemakerHealthCheckDegraded(oc, "maintenance", healthCheckDegradedTimeout, utils.FiveSecondPollInterval)).To(o.Succeed())

g.By("Clearing cluster maintenance mode")
o.Expect(utils.PcsPropertySetMaintenanceMode(oc, peerNode.Name, false)).To(o.Succeed())

g.By("Verifying PacemakerHealthCheckDegraded condition clears")
o.Expect(services.WaitForPacemakerHealthCheckHealthy(oc, healthCheckHealthyTimeout, utils.FiveSecondPollInterval)).To(o.Succeed())
})

g.It("should report degraded when a node is in maintenance mode then healthy after unmaintenance", func() {
g.By(fmt.Sprintf("Putting %s in node maintenance from %s", targetNode.Name, peerNode.Name))
o.Expect(utils.PcsNodeMaintenance(oc, peerNode.Name, targetNode.Name)).To(o.Succeed())

g.By("Verifying PacemakerHealthCheckDegraded condition reports target node in maintenance")
o.Expect(services.WaitForPacemakerHealthCheckDegraded(oc, "maintenance", healthCheckDegradedTimeout, utils.FiveSecondPollInterval)).To(o.Succeed())
o.Expect(services.AssertPacemakerHealthCheckContains(oc, []string{targetNode.Name, "maintenance"})).To(o.Succeed())

g.By(fmt.Sprintf("Bringing %s out of node maintenance", targetNode.Name))
o.Expect(utils.PcsNodeUnmaintenance(oc, peerNode.Name, targetNode.Name)).To(o.Succeed())

g.By("Verifying PacemakerHealthCheckDegraded condition clears")
o.Expect(services.WaitForPacemakerHealthCheckHealthy(oc, healthCheckHealthyTimeout, utils.FiveSecondPollInterval)).To(o.Succeed())
})

})

var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][Suite:openshift/two-node][Serial] Pacemaker health check stale status scenarios", func() {
defer g.GinkgoRecover()

var (
oc = exutil.NewCLIWithoutNamespace("tnf-pacemaker-stale").AsAdmin()
etcdClientFactory *helpers.EtcdClientFactoryImpl
)

g.BeforeEach(func() {
utils.SkipIfNotTopology(oc, v1.DualReplicaTopologyMode)
etcdClientFactory = helpers.NewEtcdClientFactory(oc.KubeClient())
utils.SkipIfClusterIsNotHealthy(oc, etcdClientFactory)
})

g.It("should report degraded when PacemakerCluster CR is repeatedly deleted then healthy after CR is allowed to exist", func() {
ctx, cancel := context.WithCancel(context.Background())
var wg sync.WaitGroup
wg.Add(1)
go func() {
defer wg.Done()
ticker := time.NewTicker(staleTestDeleteInterval)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
out, err := oc.AsAdmin().Run("delete").Args("pacemakercluster", pacemakerClusterCRName, "--ignore-not-found").Output()
if err != nil {
e2e.Logf("Staleness CR delete loop: delete pacemakercluster/%s failed: %v (output: %q)", pacemakerClusterCRName, err, string(out))
} else if strings.TrimSpace(string(out)) != "" {
e2e.Logf("Staleness CR delete loop: %s", string(out))
}
}
}
}()

g.By("Deleting PacemakerCluster CR for 5 minutes so operator exceeds StatusUnknownDegradedThreshold")
time.Sleep(staleMinBlockDuration)

g.By("Waiting for PacemakerHealthCheckDegraded (CR not found)")
o.Expect(services.WaitForPacemakerHealthCheckDegraded(oc, "not found", staleCRDegradedTimeout, utils.FiveSecondPollInterval)).To(o.Succeed())

// Only stop the delete loop after asserting degraded; otherwise the operator could recreate the CR before we observe not found.
g.By("Stopping CR delete loop and allowing operator to recreate CR")
cancel()
wg.Wait()

g.By("Verifying PacemakerHealthCheckDegraded condition clears")
o.Expect(services.WaitForPacemakerHealthCheckHealthy(oc, healthCheckHealthyTimeout, utils.FiveSecondPollInterval)).To(o.Succeed())
})

g.It("should report degraded when status collector jobs are repeatedly deleted then healthy after jobs can run", func() {
ctx, cancel := context.WithCancel(context.Background())
var wg sync.WaitGroup
wg.Add(1)
go func() {
defer wg.Done()
ticker := time.NewTicker(staleTestDeleteInterval)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
out, err := oc.AsAdmin().Run("delete").Args("jobs", "-n", etcdNamespaceFencing, "-l", statusCollectorLabel, "--ignore-not-found").Output()
if err != nil {
e2e.Logf("Staleness job delete loop: delete jobs -l %s -n %s failed: %v (output: %q)", statusCollectorLabel, etcdNamespaceFencing, err, string(out))
} else if strings.TrimSpace(string(out)) != "" {
e2e.Logf("Staleness job delete loop: %s", string(out))
}
}
}
}()

g.By("Blocking status collector for 5 minutes so CR lastUpdated exceeds StatusStalenessThreshold")
time.Sleep(staleMinBlockDuration)

g.By("Waiting for PacemakerHealthCheckDegraded (stale status)")
o.Expect(services.WaitForPacemakerHealthCheckDegraded(oc, "stale", staleTimestampDegradedTimeout, utils.FiveSecondPollInterval)).To(o.Succeed())

// Only stop the delete loop after asserting degraded; otherwise a job could complete and update the CR before we observe stale.
g.By("Stopping job delete loop and allowing cronjob to run")
cancel()
wg.Wait()

g.By("Verifying PacemakerHealthCheckDegraded condition clears")
o.Expect(services.WaitForPacemakerHealthCheckHealthy(oc, healthCheckHealthyTimeout, utils.FiveSecondPollInterval)).To(o.Succeed())
})
})