sap-contributions · alexanderstephan · May 8, 2026
diff --git a/reg-tests/mcli/mcli_master_socket_leak.vtc b/reg-tests/mcli/mcli_master_socket_leak.vtc
@@ -0,0 +1,66 @@
+varnishtest "Bug fix: master CLI connection slots freed on client disconnect"
+
+# Regression test for a master CLI socket connection leak in master-worker mode.
+#
+# mworker_proxy has a fixed maxconn of 10.  When clients connect to the master
+# socket, send a command that is forwarded to a busy worker, and then close
+# the connection due to a client-side receive timeout, haproxy must free each
+# slot as the client disconnects.
+#
+# Without the fix: slots remain occupied after the client disconnects, so the
+# master CLI becomes unreachable once all 10 slots are filled.
+# With the fix: slots are freed on client disconnect and the master CLI keeps
+# accepting new connections.
+#
+# The worker is made unresponsive using "debug dev delay" (expert-mode) with
+# nbthread 1 so a single delay blocks the entire worker CLI.  This avoids
+# using SIGSTOP/SIGCONT which can be unreliable in CI environments.
+
+#REGTEST_TYPE=bug
+
+feature cmd "command -v socat"
+feature cmd "command -v timeout"
+feature ignore_unknown_macro
+
+server s1 {
+} -start
+
+haproxy h1 -W -S -conf {
+    global
+        nbthread 1
+
+    defaults
+        mode http
+        timeout connect "${HAPROXY_TEST_TIMEOUT-5s}"
+        timeout client  "${HAPROXY_TEST_TIMEOUT-5s}"
+        timeout server  "${HAPROXY_TEST_TIMEOUT-5s}"
+
+    frontend fe
+        bind "fd@${fe}"
+        default_backend be
+
+    backend be
+        server s1 ${s1_addr}:${s1_port}
+} -start
+
+# Fill all 10 master CLI slots (mworker_proxy->maxconn is hardcoded to 10).
+# Each socat sends "expert-mode on" followed by "@1 debug dev delay 10000"
+# which blocks the single worker thread for 10 s.  After 2 s the timeout(1)
+# wrapper kills socat, simulating a client-side receive timeout.  "wait"
+# ensures all background processes have exited before proceeding.
+shell {
+    for i in $(seq 1 10); do
+        (printf "expert-mode on\n@1 debug dev delay 10000\n" \
+         | timeout 2 socat TCP:${h1_mcli_addr}:${h1_mcli_port} - 2>/dev/null) &
+    done
+    wait
+}
+
+# This is the key assertion: after all 10 clients have disconnected, a new
+# connection to the master CLI must succeed.  With the bug all 10 slots are
+# still marked occupied and this connect is refused or times out.
+haproxy h1 -mcli {
+    send "show version"
+    expect ~ "3."
+}
+
diff --git a/reg-tests/mcli/mcli_reload_no_timeout.lua b/reg-tests/mcli/mcli_reload_no_timeout.lua
@@ -0,0 +1,5 @@
+-- Delay worker readiness by 2s to test that serverfin does not
+-- kill the reload command while the new worker is starting.
+core.register_init(function()
+    os.execute("sleep 2")
+end)
diff --git a/reg-tests/mcli/mcli_reload_no_timeout.vtc b/reg-tests/mcli/mcli_reload_no_timeout.vtc
@@ -0,0 +1,65 @@
+varnishtest "Verify reload via master CLI is not affected by serverfin timeout"
+
+# Regression test: the timeout server-fin on the MASTER proxy must not
+# apply to locally-handled commands (applets).  A reload command is
+# forwarded to the master applet and may take longer than 1s while the
+# new worker is starting.  Without the fix, the pcli response analyser
+# would fire a read timeout and return "Can't connect to the target CLI!"
+# instead of the reload status.
+
+#REQUIRE_OPTIONS=LUA
+#REGTEST_TYPE=bug
+
+feature cmd "command -v socat"
+feature cmd "$HAPROXY_PROGRAM -vv | grep -q '+LUA'"
+feature ignore_unknown_macro
+
+server s1 {
+    rxreq
+    txresp
+} -start
+
+haproxy h1 -W -S -conf {
+    global
+        tune.lua.bool-sample-conversion normal
+        # Lua init hook that sleeps 2s, delaying worker readiness on reload
+        lua-load ${testdir}/mcli_reload_no_timeout.lua
+
+    defaults
+        mode http
+        timeout connect "${HAPROXY_TEST_TIMEOUT-5s}"
+        timeout client  "${HAPROXY_TEST_TIMEOUT-5s}"
+        timeout server  "${HAPROXY_TEST_TIMEOUT-5s}"
+
+    frontend fe
+        bind "fd@${fe}"
+        default_backend be
+
+    backend be
+        server s1 ${s1_addr}:${s1_port}
+} -start
+
+# Issue a reload via socat as a second command (after "show version").
+# The first command causes CF_AUTO_CLOSE to be set on the request channel.
+# When socat half-closes its write side after sending, process_stream()
+# triggers sc_shutdown(scb) -> sc_set_hcto() which applies serverfin=1s
+# to the backend applet.  Without the fix, the reload response (which comes
+# from the master applet) would be killed by this 1s timeout if the new
+# worker takes time to start, returning "Can't connect to the target CLI!".
+shell {
+    RESULT=$(printf "show version\nreload\n" | socat -t10 TCP:${h1_mcli_addr}:${h1_mcli_port} - 2>/dev/null)
+    echo "Got: $RESULT"
+    echo "$RESULT" | grep -q "Success=1" || {
+        echo "FAIL: reload did not succeed. Got: $RESULT"
+        exit 1
+    }
+}
+
+# Verify the master CLI is still functional after reload
+shell {
+    RESULT=$(printf "show version\n" | socat -t5 TCP:${h1_mcli_addr}:${h1_mcli_port} - 2>/dev/null)
+    echo "$RESULT" | grep -q "3\." || {
+        echo "FAIL: show version failed. Got: $RESULT"
+        exit 1
+    }
+}
diff --git a/src/cli.c b/src/cli.c
@@ -3484,8 +3484,25 @@ int pcli_wait_for_request(struct stream *s, struct channel *req, int an_bit)
 	 * current one. Just wait. At this stage, errors should be handled by
 	 * the response analyzer.
 	 */
-	if (s->res.analysers & AN_RES_WAIT_CLI)
+	if (s->res.analysers & AN_RES_WAIT_CLI) {
+		/* Prevent process_stream from auto-forwarding the client close to
+		 * the backend via the CF_AUTO_CLOSE check while we're waiting for
+		 * a response - we manage the connection lifetime ourselves.
+		 */
+		channel_dont_close(req);
+
+		/* If the client disconnected cleanly and we have no more commands
+		 * to send, arm the server-fin timer on the backend.  A stuck
+		 * worker that never responds will then be aborted after
+		 * timeout server-fin, freeing the connection slot (GH #3351).
+		 * When lra is TICK_ETERNITY (no data received yet) the timer never
+		 * fires, so this is safe to call on every wakeup.
+		 */
+		if ((s->scf->flags & SC_FL_EOS) && !ci_data(req))
+			sc_set_hcto(s->scb);
+
 		return 0;
+	}
 
 	pcli->flags &= ~PCLI_F_BIDIR; // only for one connection
 	if ((pcli->flags & ACCESS_LVL_MASK) == ACCESS_LVL_NONE)
@@ -3528,13 +3545,7 @@ int pcli_wait_for_request(struct stream *s, struct channel *req, int an_bit)
 		else
 			channel_forward_forever(req);
 
-		if (!(pcli->flags & PCLI_F_PAYLOAD)) {
-			/* we send only 1 command per request, and we write
-			 * close after it when not in full-duplex mode.
-			 */
-			if (!(pcli->flags & PCLI_F_BIDIR))
-				sc_schedule_shutdown(s->scb);
-		} else {
+		if (pcli->flags & PCLI_F_PAYLOAD) {
 			pcli_write_prompt(s);
 		}
 
@@ -3595,6 +3606,13 @@ int pcli_wait_for_request(struct stream *s, struct channel *req, int an_bit)
         if (s->scf->flags & (SC_FL_ABRT_DONE|SC_FL_EOS)) {
                 /* There is no more request or a only a partial one and we
                  * receive a close from the client, we can leave */
+		if (s->scf->flags & SC_FL_ABRT_DONE) {
+			/* Client sent RST: abort the backend immediately.
+			 * Note: when AN_RES_WAIT_CLI is set we never reach here
+			 * (early return above), so this handles the pre-connect case.
+			 */
+			sc_schedule_abort(s->scb);
+		}
 		sc_schedule_shutdown(s->scf);
                 s->req.analysers &= ~AN_REQ_WAIT_CLI;
                 return 1;
@@ -3834,8 +3852,13 @@ int mworker_cli_create_master_proxy(char **errmsg)
 	mworker_proxy->mode = PR_MODE_CLI;
 	/* default to 10 concurrent connections */
 	mworker_proxy->maxconn = 10;
-	/* no timeout */
-	mworker_proxy->timeout.client = 0;
+	mworker_proxy->timeout.client = 0; /* no timeout */
+	/* 1s server-fin timeout: armed only after the client disconnects cleanly
+	 * (scf EOS + no pending commands), so it never fires during normal command
+	 * processing.  It ensures a stuck backend releases its slot promptly once
+	 * the client is gone.
+	 */
+	mworker_proxy->timeout.serverfin = MS_TO_TICKS(1000);
 	mworker_proxy->conf.file = strdup("MASTER");
 	mworker_proxy->conf.line = 0;
 	mworker_proxy->accept = frontend_accept;