Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
0585c2a
Initial plan
Copilot Feb 22, 2026
4aa9e8b
Migrate NodeHealthAPI from @EndPoint to JAX-RS annotations
Copilot Feb 22, 2026
8e2ccd5
Add NodeHealthAPITest2: integration tests without Mockito mocks
Copilot Feb 22, 2026
a50f682
Restore logic to HealthCheckHandler, simplify NodeHealthAPI, update r…
Copilot Feb 22, 2026
b8159ab
restore docs
epugh Feb 28, 2026
01696e3
lint
epugh Feb 28, 2026
dac4e14
Keep the non mock version
epugh Mar 1, 2026
ec8252a
Responding to feedback
epugh Mar 1, 2026
b592322
lint
epugh Mar 1, 2026
ca88d6e
add changelog
epugh Mar 3, 2026
0809a34
Improve changelog
epugh Mar 9, 2026
55bf2ce
Use the name healthcheck
epugh Mar 9, 2026
abea29c
Use enum for status and proper HTTP SolrJ classes in testing.
epugh Mar 9, 2026
9c52897
WIP of hacking this guy up to work with enum :-(
epugh Mar 9, 2026
b47383b
Merge remote-tracking branch 'upstream/main' into copilot/migrate-nod…
epugh Mar 9, 2026
ff53a9d
Restore the exception raised on error behavior that v1 apis had
epugh Mar 9, 2026
2d0c50b
better title
epugh Mar 9, 2026
1452364
Flip delegation: move health-check logic to NodeHealthAPI, HealthChec…
Copilot Mar 9, 2026
71feb02
Migrate business logic into NodeHealthAPI, and fix enum issue
epugh Mar 10, 2026
2a81e9e
Follow the existing naming pattern.
epugh Mar 11, 2026
fc6ee9e
Add test for failure cases.
epugh Mar 12, 2026
1b7c709
Merge remote-tracking branch 'upstream/main' into copilot/migrate-nod…
epugh Mar 18, 2026
e9e6c02
Nicer coding style
epugh Mar 18, 2026
3f5e8ab
Nicer assert style.
epugh Mar 18, 2026
4a5fb4d
Updates from merging latest code in.
epugh Mar 18, 2026
ee3cc2a
use assertThat
epugh Mar 19, 2026
bec1201
Migrate to hamcrest, and use standalone not legacy terms.
epugh Mar 19, 2026
aecb703
Split up standalone versus cloud mode tests.
epugh Mar 19, 2026
3b48bdd
clearer changelog
epugh Mar 19, 2026
e44c779
Expose maxGenerationLag as a V2 query parameter on /api/node/health
Copilot Mar 19, 2026
a029f9c
docs: add maxGenerationLag monitoring section to replication ref guide
Copilot Mar 19, 2026
df5c6f9
better name
epugh Mar 19, 2026
3165c6e
Merge branch 'copilot/migrate-node-health-api' of github.com:epugh/so…
epugh Mar 19, 2026
fc7c7cf
Responding to feedback
epugh Mar 19, 2026
71a778d
Post discussion with jason...
epugh Mar 21, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions changelog/unreleased/SOLR-16458-migrate-node-health-api.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
title: "SolrJ now offers a SolrRequest class allowing users to perform v2 single-node healthchecks: NodeApi.Healthcheck"
type: added
authors:
- name: Eric Pugh
- name: Jason Gerlowski
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[Q] (General question; not PR specific)

Is it our convention to add reviewers here as well?

I'm all for that but it'd be great to document in dev-docs/changelog.adoc if that's a convention we'd like to cement!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added you because you went beyond review; I mean, this PR changed a lot! Just because you didn't type characters doesn't mean you didn't have a lot of input! I think you authored this as much as I did.

links:
- name: SOLR-16458
url: https://issues.apache.org/jira/browse/SOLR-16458
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.client.api.endpoint;

import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.Parameter;
import jakarta.ws.rs.GET;
import jakarta.ws.rs.Path;
import jakarta.ws.rs.QueryParam;
import org.apache.solr.client.api.model.NodeHealthResponse;

/**
 * V2 API definition for checking the health of a Solr node.
 *
 * <p>Served as {@code GET /node/health}. Both query parameters are boxed types, so an absent
 * parameter can be passed through as {@code null}.
 */
@Path("/node/health")
public interface NodeHealthApi {

  /**
   * Reports the health of the node that receives the request.
   *
   * @param requireHealthyCores when {@code true}, unhealthy cores on this node cause the check to
   *     fail; may be {@code null} when the parameter is not supplied
   * @param maxGenerationLag largest acceptable index-generation lag between a follower and its
   *     leader; may be {@code null} when the parameter is not supplied
   * @return the health status of the node, with an optional explanatory message
   */
  @GET
  @Operation(
      summary = "Determine the health of a Solr node.",
      tags = {"node"})
  NodeHealthResponse healthcheck(
      @Parameter(
              description =
                  "If true, the health check reports FAILURE unless all cores hosted on this"
                      + " node are healthy. Only relevant when running in SolrCloud mode.")
          @QueryParam("requireHealthyCores")
          Boolean requireHealthyCores,
      @Parameter(
              description =
                  "Maximum number of index generations a follower replica may lag behind its"
                      + " leader before the health check reports FAILURE. Only relevant when"
                      + " running in Standalone mode with leader/follower replication.")
          @QueryParam("maxGenerationLag")
          Integer maxGenerationLag);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.client.api.model;

import com.fasterxml.jackson.annotation.JsonProperty;

/**
 * Response body for the '/api/node/health' endpoint.
 *
 * <p>Plain data holder: every field is public and annotated for JSON (de)serialization. Fields are
 * boxed/reference types, so any of them may be left unset.
 */
public class NodeHealthResponse extends SolrJerseyResponse {

/** The possible health statuses for a Solr node. */
public enum NodeStatus {
// Reported when the node is considered healthy.
OK,
// Reported when a health check did not pass.
FAILURE
}

/** Overall outcome of the health check. */
@JsonProperty public NodeStatus status;

/** Human-readable detail accompanying {@link #status}. */
@JsonProperty public String message;

/**
 * Number of unhealthy cores found on the node; serialized under the JSON key
 * "num_cores_unhealthy" rather than the Java field name.
 *
 * <p>NOTE(review): presumably only populated when a cores check fails - confirm against the
 * implementing resource.
 */
@JsonProperty("num_cores_unhealthy")
public Integer numCoresUnhealthy;
}
264 changes: 23 additions & 241 deletions solr/core/src/java/org/apache/solr/handler/admin/HealthCheckHandler.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,39 +17,21 @@

package org.apache.solr.handler.admin;

import static org.apache.solr.common.params.CommonParams.FAILURE;
import static org.apache.solr.common.params.CommonParams.OK;
import static org.apache.solr.common.params.CommonParams.STATUS;
import static org.apache.solr.handler.admin.api.ReplicationAPIBase.GENERATION;

import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.stream.Collectors;
import org.apache.lucene.index.IndexCommit;
import org.apache.solr.api.AnnotatedApi;
import org.apache.solr.api.Api;
import org.apache.solr.api.JerseyResource;
import org.apache.solr.client.api.model.NodeHealthResponse;
import org.apache.solr.client.solrj.request.HealthCheckRequest;
import org.apache.solr.cloud.CloudDescriptor;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.Replica.State;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.CoreContainer;
import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.IndexFetcher;
import org.apache.solr.handler.ReplicationHandler;
import org.apache.solr.handler.RequestHandlerBase;
import org.apache.solr.handler.admin.api.NodeHealthAPI;
import org.apache.solr.handler.admin.api.NodeHealth;
import org.apache.solr.handler.api.V2ApiUtils;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.security.AuthorizationContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* Health Check Handler for reporting the health of a specific node.
Expand Down Expand Up @@ -77,12 +59,13 @@
* specify the acceptable generation lag follower should be with respect to its leader using the
* <code>maxGenerationLag=&lt;max_generation_lag&gt;</code> request parameter. If <code>
* maxGenerationLag</code> is not provided then health check would simply return OK.
*
* <p>All health-check logic lives in the v2 {@link NodeHealth}; this handler is a thin v1 bridge
* that extracts request parameters and delegates.
*/
public class HealthCheckHandler extends RequestHandlerBase {

private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private static final String PARAM_REQUIRE_HEALTHY_CORES = "requireHealthyCores";
private static final List<State> UNHEALTHY_STATES = Arrays.asList(State.DOWN, State.RECOVERING);

CoreContainer coreContainer;

Expand All @@ -100,224 +83,18 @@ public CoreContainer getCoreContainer() {
/**
 * Entry point for the v1 health-check request.
 *
 * <p>Disables HTTP caching on the response, rejects the request when the core container is
 * missing or shut down, and otherwise dispatches to the cloud-mode or legacy-mode check depending
 * on whether the container is ZooKeeper aware.
 */
@Override
public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception {
  rsp.setHttpCaching(false);

  // Defensive guard: the container is expected to be present and running.
  final boolean containerUsable = coreContainer != null && !coreContainer.isShutDown();
  if (!containerUsable) {
    final SolrException unusable =
        new SolrException(
            SolrException.ErrorCode.SERVER_ERROR,
            "CoreContainer is either not initialized or shutting down");
    rsp.setException(unusable);
    return;
  }

  if (coreContainer.isZooKeeperAware()) {
    if (log.isDebugEnabled()) {
      log.debug(
          "Invoked HealthCheckHandler in cloud mode on [{}]",
          this.coreContainer.getZkController().getNodeName());
    }
    healthCheckCloudMode(req, rsp);
    return;
  }

  if (log.isDebugEnabled()) {
    log.debug("Invoked HealthCheckHandler in legacy mode.");
  }
  healthCheckLegacyMode(req, rsp);
}

/**
 * Health check used when the node is ZooKeeper aware.
 *
 * <p>Fails with SERVICE_UNAVAILABLE when the ZooKeeper client is closed or disconnected, or when
 * this node is not in the live-nodes set. When the {@code requireHealthyCores} parameter is true
 * it additionally fails while core loading is incomplete or when any core is counted as unhealthy
 * by {@link #findUnhealthyCores}. Otherwise the status is reported as OK.
 */
private void healthCheckCloudMode(SolrQueryRequest req, SolrQueryResponse rsp) {
  final ZkStateReader stateReader = coreContainer.getZkController().getZkStateReader();
  final ClusterState state = stateReader.getClusterState();

  // The ZooKeeper client must be both open and connected.
  final boolean zkUsable =
      !stateReader.getZkClient().isClosed() && stateReader.getZkClient().isConnected();
  if (!zkUsable) {
    rsp.add(STATUS, FAILURE);
    rsp.setException(
        new SolrException(
            SolrException.ErrorCode.SERVICE_UNAVAILABLE,
            "Host Unavailable: Not connected to zk"));
    return;
  }

  // This node has to be listed among the live nodes.
  final boolean listedAsLive =
      state.getLiveNodes().contains(coreContainer.getZkController().getNodeName());
  if (!listedAsLive) {
    rsp.add(STATUS, FAILURE);
    rsp.setException(
        new SolrException(
            SolrException.ErrorCode.SERVICE_UNAVAILABLE,
            "Host Unavailable: Not in live nodes as per zk"));
    return;
  }

  // Core health is only inspected when the caller asks for it via 'requireHealthyCores=true'.
  final boolean mustHaveHealthyCores = req.getParams().getBool(PARAM_REQUIRE_HEALTHY_CORES, false);
  if (mustHaveHealthyCores) {
    if (!coreContainer.isStatusLoadComplete()) {
      rsp.add(STATUS, FAILURE);
      rsp.setException(
          new SolrException(
              SolrException.ErrorCode.SERVICE_UNAVAILABLE,
              "Host Unavailable: Core Loading not complete"));
      return;
    }

    final Collection<CloudDescriptor> cloudDescriptors =
        coreContainer.getCoreDescriptors().stream()
            .map(descriptor -> descriptor.getCloudDescriptor())
            .collect(Collectors.toList());
    final long unhealthyCount = findUnhealthyCores(cloudDescriptors, state);
    if (unhealthyCount > 0) {
      rsp.add(STATUS, FAILURE);
      rsp.add("num_cores_unhealthy", unhealthyCount);
      final String detail =
          unhealthyCount
              + " out of "
              + coreContainer.getNumAllCores()
              + " replicas are currently initializing or recovering";
      rsp.setException(new SolrException(SolrException.ErrorCode.SERVICE_UNAVAILABLE, detail));
      return;
    }
    rsp.add("message", "All cores are healthy");
  }

  // Every check passed.
  rsp.add(STATUS, OK);
}

/**
 * Health check used when the node is not ZooKeeper aware.
 *
 * <p>Reads the optional {@code maxGenerationLag} request parameter:
 *
 * <ul>
 *   <li>absent: no lag check is performed and the status is OK;
 *   <li>negative: the value is rejected and the status is FAILURE;
 *   <li>otherwise: every core whose replication handler is a follower is checked with {@link
 *       #isWithinGenerationLag}; the status is OK only if all of those checks return true.
 * </ul>
 *
 * @param req request carrying the optional {@code maxGenerationLag} parameter
 * @param rsp response that receives the "message" and status entries
 */
private void healthCheckLegacyMode(SolrQueryRequest req, SolrQueryResponse rsp) {
  Integer maxGenerationLag = req.getParams().getInt(HealthCheckRequest.PARAM_MAX_GENERATION_LAG);

  // If maxGenerationLag is not specified (is null) we aren't checking for lag.
  if (maxGenerationLag == null) {
    rsp.add(
        "message",
        "maxGenerationLag isn't specified. Followers aren't "
            + "checking for the generation lag from the leaders");
    rsp.add(STATUS, OK);
    return;
  }

  if (maxGenerationLag < 0) {
    log.error("Invalid value for maxGenerationLag:[{}]", maxGenerationLag);
    rsp.add(
        "message",
        String.format(Locale.ROOT, "Invalid value of maxGenerationLag:%s", maxGenerationLag));
    rsp.add(STATUS, FAILURE);
    // Stop here: previously execution fell through and also recorded an "in sync" message and
    // an OK status on top of the FAILURE above.
    return;
  }

  List<String> laggingCoresInfo = new ArrayList<>();
  boolean allCoresAreInSync = true;
  for (SolrCore core : coreContainer.getCores()) {
    ReplicationHandler replicationHandler =
        (ReplicationHandler) core.getRequestHandler(ReplicationHandler.PATH);
    // Only followers are checked against the leader.
    if (replicationHandler.isFollower()) {
      boolean isCoreInSync =
          isWithinGenerationLag(core, replicationHandler, maxGenerationLag, laggingCoresInfo);

      allCoresAreInSync &= isCoreInSync;
    }
  }

  if (allCoresAreInSync) {
    rsp.add(
        "message",
        String.format(
            Locale.ROOT,
            "All the followers are in sync with leader (within maxGenerationLag: %d) "
                + "or the cores are acting as leader",
            maxGenerationLag));
    rsp.add(STATUS, OK);
  } else {
    rsp.add(
        "message",
        String.format(
            Locale.ROOT,
            "Cores violating maxGenerationLag:%d.%n%s",
            maxGenerationLag,
            String.join(",\n", laggingCoresInfo)));
    rsp.add(STATUS, FAILURE);
  }
}

private boolean isWithinGenerationLag(
final SolrCore core,
ReplicationHandler replicationHandler,
int maxGenerationLag,
List<String> laggingCoresInfo) {
IndexFetcher indexFetcher = null;
final Boolean requireHealthyCores = req.getParams().getBool(PARAM_REQUIRE_HEALTHY_CORES);
final Integer maxGenerationLag =
req.getParams().getInt(HealthCheckRequest.PARAM_MAX_GENERATION_LAG);
try {
// may not be the best way to get leader's replicableCommit
NamedList<?> follower = (NamedList<?>) replicationHandler.getInitArgs().get("follower");

indexFetcher = new IndexFetcher(follower, replicationHandler, core);

NamedList<?> replicableCommitOnLeader = indexFetcher.getLatestVersion();
long leaderGeneration = (Long) replicableCommitOnLeader.get(GENERATION);

// Get our own commit and generation from the commit
IndexCommit commit = core.getDeletionPolicy().getLatestCommit();
if (commit != null) {
long followerGeneration = commit.getGeneration();
long generationDiff = leaderGeneration - followerGeneration;

// generationDiff shouldn't be negative except for some edge cases, log it. Some scenarios
// are
// 1) commit generation rolls over Long.MAX_VALUE (really unlikely)
// 2) Leader's index is wiped clean and the follower is still showing commit generation
// from the old index
if (generationDiff < 0) {
log.warn("core:[{}], generation lag:[{}] is negative.");
} else if (generationDiff < maxGenerationLag) {
log.info(
"core:[{}] generation lag is above acceptable threshold:[{}], "
+ "generation lag:[{}], leader generation:[{}], follower generation:[{}]",
core,
maxGenerationLag,
generationDiff,
leaderGeneration,
followerGeneration);

laggingCoresInfo.add(
String.format(
Locale.ROOT,
"Core %s is lagging by %d generations",
core.getName(),
generationDiff));
return true;
}
}
} catch (Exception e) {
log.error("Failed to check if the follower is in sync with the leader", e);
} finally {
if (indexFetcher != null) {
indexFetcher.destroy();
}
V2ApiUtils.squashIntoSolrResponseWithoutHeader(
rsp, new NodeHealth(coreContainer).healthcheck(requireHealthyCores, maxGenerationLag));
} catch (SolrException e) {
final NodeHealthResponse failureResponse = new NodeHealthResponse();
failureResponse.status = NodeHealthResponse.NodeStatus.FAILURE;
V2ApiUtils.squashIntoSolrResponseWithoutHeader(rsp, failureResponse);
rsp.setException(e);
}
return false;
}

/**
 * Counts the cores that look unhealthy locally and are still part of the cluster state.
 *
 * <p>A core is counted when all of the following hold: it has not registered or its last
 * published state is one of {@code UNHEALTHY_STATES}; its collection exists in the cluster state;
 * and its shard id is a key in that collection's active-slices map.
 *
 * @param cores list of core cloud descriptors to iterate
 * @param clusterState clusterstate from ZK
 * @return number of cores matching all of the conditions above
 */
static long findUnhealthyCores(Collection<CloudDescriptor> cores, ClusterState clusterState) {
  long unhealthy = 0;
  for (CloudDescriptor descriptor : cores) {
    // Local candidate: never registered, or last published an unhealthy state.
    final boolean suspect =
        !descriptor.hasRegistered() || UNHEALTHY_STATES.contains(descriptor.getLastPublished());
    if (!suspect) {
      continue;
    }
    // Only care about cores for actual collections.
    if (!clusterState.hasCollection(descriptor.getCollectionName())) {
      continue;
    }
    // ...and only when the core's shard is among the collection's active slices.
    final boolean shardIsActive =
        clusterState
            .getCollection(descriptor.getCollectionName())
            .getActiveSlicesMap()
            .containsKey(descriptor.getShardId());
    if (shardIsActive) {
      unhealthy++;
    }
  }
  return unhealthy;
}

@Override
Expand All @@ -337,7 +114,12 @@ public Boolean registerV2() {

@Override
public Collection<Api> getApis() {
return AnnotatedApi.getApis(new NodeHealthAPI(this));
return List.of();
}

/** Lists the Jersey resource classes for this handler: only {@link NodeHealth}. */
@Override
public Collection<Class<? extends JerseyResource>> getJerseyResources() {
  final List<Class<? extends JerseyResource>> resources = List.of(NodeHealth.class);
  return resources;
}

@Override
Expand Down
Loading
Loading