92 changes: 81 additions & 11 deletions test/extended/two_node/tnf_kubelet_disruption.go
@@ -3,23 +3,27 @@ package two_node
import (
"context"
"fmt"
"regexp"
"time"

g "github.com/onsi/ginkgo/v2"
o "github.com/onsi/gomega"
v1 "github.com/openshift/api/config/v1"
"github.com/openshift/origin/test/extended/etcd/helpers"
"github.com/openshift/origin/test/extended/two_node/utils"
"github.com/openshift/origin/test/extended/two_node/utils/services"
exutil "github.com/openshift/origin/test/extended/util"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
nodeutil "k8s.io/kubernetes/pkg/util/node"
"k8s.io/kubernetes/test/e2e/framework"
)

const (
-kubeletDisruptionTimeout = 10 * time.Minute // Timeout for kubelet disruption scenarios
-kubeletRestoreTimeout = 5 * time.Minute // Time to wait for kubelet service restore
-kubeletGracePeriod = 30 * time.Second // Grace period for kubelet to start/stop
+kubeletDisruptionTimeout = 10 * time.Minute // Timeout for kubelet disruption scenarios
+kubeletRestoreTimeout = 5 * time.Minute // Time to wait for kubelet service restore
+kubeletGracePeriod = 30 * time.Second // Grace period for kubelet to start/stop
+etcdStableDuringDisruption = 5 * time.Minute // Duration to assert etcd member stays healthy during disruption
+failureWindowClockSkewBuffer = 1 * time.Minute // Buffer for clock skew when checking resource failure history
)

var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:DualReplica][Suite:openshift/two-node][Serial][Slow][Disruptive] Two Node with Fencing cluster", func() {
@@ -80,12 +84,13 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
nodeList, _ := utils.GetNodes(oc, utils.AllNodes)
cleanupNode := nodeList.Items[1] // Use second node for cleanup commands

g.By(fmt.Sprintf("Cleanup: Clearing any kubelet resource bans using node %s", cleanupNode.Name))
cleanupErr := utils.RemoveConstraint(oc, cleanupNode.Name, "kubelet-clone")
if cleanupErr != nil {
framework.Logf("Warning: Failed to clear kubelet-clone resource: %v (expected if no bans were active)", cleanupErr)
} else {
framework.Logf("Successfully cleared kubelet-clone resource bans and failures")
g.By(fmt.Sprintf("Cleanup: Clearing any kubelet and etcd resource bans using node %s", cleanupNode.Name))
for _, resource := range []string{"kubelet-clone", "etcd-clone"} {
if cleanupErr := utils.RemoveConstraint(oc, cleanupNode.Name, resource); cleanupErr != nil {
framework.Logf("Warning: Failed to clear %s: %v (expected if no bans were active)", resource, cleanupErr)
} else {
framework.Logf("Successfully cleared %s resource bans and failures", resource)
}
}

g.By("Cleanup: Validating etcd cluster health")
@@ -136,15 +141,24 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
return !nodeutil.IsNodeReady(nodeObj)
}, kubeletDisruptionTimeout, utils.FiveSecondPollInterval).Should(o.BeTrue(), fmt.Sprintf("Node %s should become NotReady after kubelet resource ban is applied", targetNode.Name))

g.By("Verifying PacemakerHealthCheckDegraded condition reports kubelet failure on target node")
err = services.WaitForPacemakerHealthCheckDegraded(oc, "Kubelet", healthCheckDegradedTimeout, utils.FiveSecondPollInterval)
o.Expect(err).NotTo(o.HaveOccurred(), "Pacemaker health check should report degraded due to kubelet constraint")
// Assert degraded resource is Kubelet and that it is the node we banned (operator message format: "<node> node is unhealthy: Kubelet ...")
o.Expect(services.AssertPacemakerHealthCheckContains(oc, []string{"Kubelet", targetNode.Name})).To(o.Succeed())

g.By("Validating etcd cluster remains healthy with surviving node")
o.Consistently(func() error {
return helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, survivingNode.Name)
-}, 5*time.Minute, utils.FiveSecondPollInterval).ShouldNot(o.HaveOccurred(), fmt.Sprintf("etcd member %s should remain healthy during kubelet disruption", survivingNode.Name))
+}, etcdStableDuringDisruption, utils.FiveSecondPollInterval).ShouldNot(o.HaveOccurred(), fmt.Sprintf("etcd member %s should remain healthy during kubelet disruption", survivingNode.Name))

g.By("Clearing kubelet resource bans to allow normal operation")
err = utils.RemoveConstraint(oc, survivingNode.Name, "kubelet-clone")
o.Expect(err).To(o.BeNil(), "Expected to clear kubelet resource bans without errors")

g.By("Verifying PacemakerHealthCheckDegraded condition clears after recovery")
o.Expect(services.WaitForPacemakerHealthCheckHealthy(oc, healthCheckHealthyTimeout, utils.FiveSecondPollInterval)).To(o.Succeed())

g.By("Validating both nodes are Ready")
for _, node := range nodes {
o.Eventually(func() bool {
Expand Down Expand Up @@ -211,7 +225,7 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual

g.By("Verifying Pacemaker recorded the kubelet failure in operation history")
// Use a time window from when we stopped kubelet to now
-failureWindow := time.Since(stopTime) + time.Minute // Add buffer for clock skew
+failureWindow := time.Since(stopTime) + failureWindowClockSkewBuffer
hasFailure, failures, err := utils.HasRecentResourceFailure(oc, survivingNode.Name, "kubelet-clone", failureWindow)
o.Expect(err).To(o.BeNil(), "Expected to check resource failure history without errors")
o.Expect(hasFailure).To(o.BeTrue(), "Pacemaker should have recorded kubelet failure in operation history")
@@ -238,5 +252,61 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
return utils.ValidateEssentialOperatorsAvailable(oc)
}, kubeletRestoreTimeout, utils.FiveSecondPollInterval).ShouldNot(o.HaveOccurred(), "Essential operators should be available")
})
})

// Etcd constraint / health check test lives in a separate Describe without [OCPFeatureGate:DualReplica];
// we do not add new tests under the FeatureGate-gated suite.
var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][Suite:openshift/two-node][Serial][Slow][Disruptive] Two Node etcd constraint and health check", func() {
defer g.GinkgoRecover()

var (
oc = exutil.NewCLIWithoutNamespace("two-node-etcd-constraint").AsAdmin()
etcdClientFactory *helpers.EtcdClientFactoryImpl
)

g.BeforeEach(func() {
utils.SkipIfNotTopology(oc, v1.DualReplicaTopologyMode)
etcdClientFactory = helpers.NewEtcdClientFactory(oc.KubeClient())
utils.SkipIfClusterIsNotHealthy(oc, etcdClientFactory)
})

g.It("should recover from etcd resource location constraint with health check degraded then healthy", func() {
nodeList, err := utils.GetNodes(oc, utils.AllNodes)
o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error")
o.Expect(len(nodeList.Items)).To(o.Equal(2), "Expected to find exactly 2 nodes for two-node cluster")
nodes := nodeList.Items
targetNode := nodes[0]
survivingNode := nodes[1]

g.By("Ensuring both nodes are healthy before applying etcd constraint")
for _, node := range nodes {
o.Expect(nodeutil.IsNodeReady(&node)).To(o.BeTrue(), fmt.Sprintf("Node %s should be ready", node.Name))
}

g.By(fmt.Sprintf("Banning etcd resource from node %s (location constraint)", targetNode.Name))
err = utils.AddConstraint(oc, survivingNode.Name, "etcd-clone", targetNode.Name)
o.Expect(err).To(o.BeNil(), "Expected to ban etcd-clone from target node")
g.DeferCleanup(func() {
_ = utils.RemoveConstraint(oc, survivingNode.Name, "etcd-clone")
})

g.By("Verifying PacemakerHealthCheckDegraded condition reports etcd failure on target node")
// Operator message format: "<nodeName> node is unhealthy: Etcd has failed" (or "is stopped", etc.)
degradedPattern := regexp.QuoteMeta(targetNode.Name) + ` node is unhealthy: Etcd .*`
err = services.WaitForPacemakerHealthCheckDegraded(oc, degradedPattern, healthCheckDegradedTimeout, utils.FiveSecondPollInterval)
o.Expect(err).NotTo(o.HaveOccurred(), "Pacemaker health check should report degraded due to etcd constraint")
o.Expect(services.AssertPacemakerHealthCheckContains(oc, []string{"Etcd", targetNode.Name})).To(o.Succeed())

g.By("Removing etcd-clone constraint to restore normal operation")
err = utils.RemoveConstraint(oc, survivingNode.Name, "etcd-clone")
o.Expect(err).To(o.BeNil(), "Expected to clear etcd-clone constraint")

g.By("Verifying PacemakerHealthCheckDegraded condition clears after recovery")
o.Expect(services.WaitForPacemakerHealthCheckHealthy(oc, healthCheckHealthyTimeout, utils.FiveSecondPollInterval)).To(o.Succeed())

g.By("Validating etcd cluster is healthy")
o.Eventually(func() error {
return utils.LogEtcdClusterStatus(oc, "after etcd constraint removal", etcdClientFactory)
}, kubeletRestoreTimeout, utils.FiveSecondPollInterval).ShouldNot(o.HaveOccurred())
})
})
6 changes: 6 additions & 0 deletions test/extended/two_node/tnf_node_replacement.go
@@ -228,6 +228,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
g.By("Destroying the target VM")
destroyVM(&testConfig)

g.By("Verifying that a fencing event was recorded for the target node")
o.Expect(services.WaitForFencingEvent(oc, []string{testConfig.TargetNode.Name}, healthCheckDegradedTimeoutAfterFencing, utils.FiveSecondPollInterval)).To(o.Succeed())

// Wait for etcd to stop on the surviving node
g.By("Waiting for etcd to stop on the surviving node")
waitForEtcdToStop(&testConfig)
@@ -256,6 +259,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
g.By("Verifying the cluster is fully restored")
verifyRestoredCluster(&testConfig, oc)

g.By("Verifying PacemakerHealthCheckDegraded condition clears after recovery")
o.Expect(services.WaitForPacemakerHealthCheckHealthy(oc, healthCheckHealthyTimeoutAfterFencing, utils.FiveSecondPollInterval)).To(o.Succeed())

g.By("Successfully completed node replacement process")
e2e.Logf("Node replacement process completed. Backup files created in: %s", backupDir)
})
200 changes: 200 additions & 0 deletions test/extended/two_node/tnf_pacemaker_healthcheck.go
@@ -0,0 +1,200 @@
package two_node

import (
"context"
"fmt"
"math/rand"
"strings"
"sync"
"time"

g "github.com/onsi/ginkgo/v2"
o "github.com/onsi/gomega"
v1 "github.com/openshift/api/config/v1"
"github.com/openshift/origin/test/extended/etcd/helpers"
"github.com/openshift/origin/test/extended/two_node/utils"
"github.com/openshift/origin/test/extended/two_node/utils/services"
exutil "github.com/openshift/origin/test/extended/util"
corev1 "k8s.io/api/core/v1"
e2e "k8s.io/kubernetes/test/e2e/framework"
)

const (
// healthCheckUpdatedTimeout is the time to wait for the Pacemaker health check condition to update (degraded or healthy).
healthCheckUpdatedTimeout = 2 * time.Minute
healthCheckDegradedTimeout = healthCheckUpdatedTimeout
healthCheckHealthyTimeout = healthCheckUpdatedTimeout
// Longer timeouts for tests that trigger a fencing event (ungraceful shutdown, cold-boot, network disruption):
// API server can be slow to recover, so we wait up to 5 minutes before asserting PacemakerHealthCheckDegraded/Healthy.
healthCheckDegradedTimeoutAfterFencing = 5 * time.Minute
healthCheckHealthyTimeoutAfterFencing = 5 * time.Minute
// StatusUnknownDegradedThreshold and StatusStalenessThreshold in CEO are 5 minutes; we must block for at least this long before asserting degraded.
staleMinBlockDuration = 5 * time.Minute
// After blocking, allow time for healthcheck controller (30s resync) to observe degraded.
staleCRDegradedTimeout = 2 * time.Minute
staleTimestampDegradedTimeout = 2 * time.Minute
// Interval for background delete loops: delete as soon as resources appear (match aggressive manual watch cadence).
staleTestDeleteInterval = 2 * time.Second
pacemakerClusterCRName = "cluster"
statusCollectorLabel = "app.kubernetes.io/name=pacemaker-status-collector"
etcdNamespaceFencing = "openshift-etcd"
)

var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][Suite:openshift/two-node][Serial][Disruptive] Pacemaker health check disruptive scenarios", func() {
defer g.GinkgoRecover()

var (
oc = exutil.NewCLIWithoutNamespace("tnf-pacemaker-healthcheck").AsAdmin()
etcdClientFactory *helpers.EtcdClientFactoryImpl
peerNode corev1.Node
targetNode corev1.Node
)

g.BeforeEach(func() {
utils.SkipIfNotTopology(oc, v1.DualReplicaTopologyMode)
etcdClientFactory = helpers.NewEtcdClientFactory(oc.KubeClient())
utils.SkipIfClusterIsNotHealthy(oc, etcdClientFactory)

nodes, err := utils.GetNodes(oc, utils.AllNodes)
o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error")
randomIndex := rand.Intn(len(nodes.Items))
peerNode = nodes.Items[randomIndex]
targetNode = nodes.Items[(randomIndex+1)%len(nodes.Items)]
})

g.It("should report degraded when a node is in standby then healthy after unstandby", func() {
g.By(fmt.Sprintf("Putting %s in standby from %s", targetNode.Name, peerNode.Name))
o.Expect(utils.PcsNodeStandby(oc, peerNode.Name, targetNode.Name)).To(o.Succeed())

g.By("Verifying PacemakerHealthCheckDegraded condition reports target node in standby")
o.Expect(services.WaitForPacemakerHealthCheckDegraded(oc, "standby", healthCheckDegradedTimeout, utils.FiveSecondPollInterval)).To(o.Succeed())
o.Expect(services.AssertPacemakerHealthCheckContains(oc, []string{targetNode.Name, "standby"})).To(o.Succeed())

g.By(fmt.Sprintf("Bringing %s out of standby", targetNode.Name))
o.Expect(utils.PcsNodeUnstandby(oc, peerNode.Name, targetNode.Name)).To(o.Succeed())

g.By("Verifying PacemakerHealthCheckDegraded condition clears")
o.Expect(services.WaitForPacemakerHealthCheckHealthy(oc, healthCheckHealthyTimeout, utils.FiveSecondPollInterval)).To(o.Succeed())
})

g.It("should report degraded when cluster is in maintenance mode then healthy after clearing", func() {
g.By("Setting cluster maintenance mode")
o.Expect(utils.PcsPropertySetMaintenanceMode(oc, peerNode.Name, true)).To(o.Succeed())

g.By("Verifying PacemakerHealthCheckDegraded condition reports maintenance")
o.Expect(services.WaitForPacemakerHealthCheckDegraded(oc, "maintenance", healthCheckDegradedTimeout, utils.FiveSecondPollInterval)).To(o.Succeed())

g.By("Clearing cluster maintenance mode")
o.Expect(utils.PcsPropertySetMaintenanceMode(oc, peerNode.Name, false)).To(o.Succeed())

g.By("Verifying PacemakerHealthCheckDegraded condition clears")
o.Expect(services.WaitForPacemakerHealthCheckHealthy(oc, healthCheckHealthyTimeout, utils.FiveSecondPollInterval)).To(o.Succeed())
})

g.It("should report degraded when a node is in maintenance mode then healthy after unmaintenance", func() {
g.By(fmt.Sprintf("Putting %s in node maintenance from %s", targetNode.Name, peerNode.Name))
o.Expect(utils.PcsNodeMaintenance(oc, peerNode.Name, targetNode.Name)).To(o.Succeed())

g.By("Verifying PacemakerHealthCheckDegraded condition reports target node in maintenance")
o.Expect(services.WaitForPacemakerHealthCheckDegraded(oc, "maintenance", healthCheckDegradedTimeout, utils.FiveSecondPollInterval)).To(o.Succeed())
o.Expect(services.AssertPacemakerHealthCheckContains(oc, []string{targetNode.Name, "maintenance"})).To(o.Succeed())

g.By(fmt.Sprintf("Bringing %s out of node maintenance", targetNode.Name))
o.Expect(utils.PcsNodeUnmaintenance(oc, peerNode.Name, targetNode.Name)).To(o.Succeed())

g.By("Verifying PacemakerHealthCheckDegraded condition clears")
o.Expect(services.WaitForPacemakerHealthCheckHealthy(oc, healthCheckHealthyTimeout, utils.FiveSecondPollInterval)).To(o.Succeed())
})

})

var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][Suite:openshift/two-node][Serial] Pacemaker health check stale status scenarios", func() {
defer g.GinkgoRecover()

var (
oc = exutil.NewCLIWithoutNamespace("tnf-pacemaker-stale").AsAdmin()
etcdClientFactory *helpers.EtcdClientFactoryImpl
)

g.BeforeEach(func() {
utils.SkipIfNotTopology(oc, v1.DualReplicaTopologyMode)
etcdClientFactory = helpers.NewEtcdClientFactory(oc.KubeClient())
utils.SkipIfClusterIsNotHealthy(oc, etcdClientFactory)
})

g.It("should report degraded when PacemakerCluster CR is repeatedly deleted then healthy after CR is allowed to exist", func() {
ctx, cancel := context.WithCancel(context.Background())
var wg sync.WaitGroup
wg.Add(1)
go func() {
defer wg.Done()
ticker := time.NewTicker(staleTestDeleteInterval)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
out, err := oc.AsAdmin().Run("delete").Args("pacemakercluster", pacemakerClusterCRName, "--ignore-not-found").Output()
if err != nil {
e2e.Logf("Staleness CR delete loop: delete pacemakercluster/%s failed: %v (output: %q)", pacemakerClusterCRName, err, string(out))
} else if strings.TrimSpace(string(out)) != "" {
e2e.Logf("Staleness CR delete loop: %s", string(out))
}
}
}
}()

g.By("Deleting PacemakerCluster CR for 5 minutes so operator exceeds StatusUnknownDegradedThreshold")
time.Sleep(staleMinBlockDuration)

g.By("Waiting for PacemakerHealthCheckDegraded (CR not found)")
o.Expect(services.WaitForPacemakerHealthCheckDegraded(oc, "not found", staleCRDegradedTimeout, utils.FiveSecondPollInterval)).To(o.Succeed())

// Only stop the delete loop after asserting degraded; otherwise the operator could recreate the CR before we observe not found.
g.By("Stopping CR delete loop and allowing operator to recreate CR")
cancel()
wg.Wait()

g.By("Verifying PacemakerHealthCheckDegraded condition clears")
o.Expect(services.WaitForPacemakerHealthCheckHealthy(oc, healthCheckHealthyTimeout, utils.FiveSecondPollInterval)).To(o.Succeed())
})

g.It("should report degraded when status collector jobs are repeatedly deleted then healthy after jobs can run", func() {
ctx, cancel := context.WithCancel(context.Background())
var wg sync.WaitGroup
wg.Add(1)
go func() {
defer wg.Done()
ticker := time.NewTicker(staleTestDeleteInterval)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
out, err := oc.AsAdmin().Run("delete").Args("jobs", "-n", etcdNamespaceFencing, "-l", statusCollectorLabel, "--ignore-not-found").Output()
if err != nil {
e2e.Logf("Staleness job delete loop: delete jobs -l %s -n %s failed: %v (output: %q)", statusCollectorLabel, etcdNamespaceFencing, err, string(out))
} else if strings.TrimSpace(string(out)) != "" {
e2e.Logf("Staleness job delete loop: %s", string(out))
}
}
}
}()

g.By("Blocking status collector for 5 minutes so CR lastUpdated exceeds StatusStalenessThreshold")
time.Sleep(staleMinBlockDuration)

g.By("Waiting for PacemakerHealthCheckDegraded (stale status)")
o.Expect(services.WaitForPacemakerHealthCheckDegraded(oc, "stale", staleTimestampDegradedTimeout, utils.FiveSecondPollInterval)).To(o.Succeed())

// Only stop the delete loop after asserting degraded; otherwise a job could complete and update the CR before we observe stale.
g.By("Stopping job delete loop and allowing cronjob to run")
cancel()
wg.Wait()

g.By("Verifying PacemakerHealthCheckDegraded condition clears")
o.Expect(services.WaitForPacemakerHealthCheckHealthy(oc, healthCheckHealthyTimeout, utils.FiveSecondPollInterval)).To(o.Succeed())
})
})