Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions reg-tests/mcli/mcli_master_socket_leak.vtc
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
varnishtest "Bug fix: master CLI connection slots freed on client disconnect"

# Regression test for a master CLI socket connection leak in master-worker mode.
#
# mworker_proxy has a fixed maxconn of 10. When clients connect to the master
# socket, send a command that is forwarded to a busy worker, and then close
# the connection due to a client-side receive timeout, haproxy must free each
# slot as the client disconnects.
#
# Without the fix: slots remain occupied after the client disconnects, so the
# master CLI becomes unreachable once all 10 slots are filled.
# With the fix: slots are freed on client disconnect and the master CLI keeps
# accepting new connections.
#
# The worker is made unresponsive using "debug dev delay" (expert-mode) with
# nbthread 1 so a single delay blocks the entire worker CLI. This avoids
# using SIGSTOP/SIGCONT which can be unreliable in CI environments.

#REGTEST_TYPE=bug

feature cmd "command -v socat"
feature cmd "command -v timeout"
feature ignore_unknown_macro

server s1 {
} -start

haproxy h1 -W -S -conf {
global
nbthread 1

defaults
mode http
timeout connect "${HAPROXY_TEST_TIMEOUT-5s}"
timeout client "${HAPROXY_TEST_TIMEOUT-5s}"
timeout server "${HAPROXY_TEST_TIMEOUT-5s}"

frontend fe
bind "fd@${fe}"
default_backend be

backend be
server s1 ${s1_addr}:${s1_port}
} -start

# Fill all 10 master CLI slots (mworker_proxy->maxconn is hardcoded to 10).
# Each socat sends "expert-mode on" followed by "@1 debug dev delay 10000"
# which blocks the single worker thread for 10 s. After 2 s the timeout(1)
# wrapper kills socat, simulating a client-side receive timeout. "wait"
# ensures all background processes have exited before proceeding.
shell {
for i in $(seq 1 10); do
(printf "expert-mode on\n@1 debug dev delay 10000\n" \
| timeout 2 socat TCP:${h1_mcli_addr}:${h1_mcli_port} - 2>/dev/null) &
done
wait
}

# This is the key assertion: after all 10 clients have disconnected, a new
# connection to the master CLI must succeed. With the bug all 10 slots are
# still marked occupied and this connect is refused or times out.
haproxy h1 -mcli {
send "show version"
expect ~ "3."
}

5 changes: 5 additions & 0 deletions reg-tests/mcli/mcli_reload_no_timeout.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
-- Delay worker readiness by 2s to test that serverfin does not
-- kill the reload command while the new worker is starting.
core.register_init(function()
os.execute("sleep 2")
end)
65 changes: 65 additions & 0 deletions reg-tests/mcli/mcli_reload_no_timeout.vtc
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
varnishtest "Verify reload via master CLI is not affected by serverfin timeout"

# Regression test: the timeout server-fin on the MASTER proxy must not
# apply to locally-handled commands (applets). A reload command is
# forwarded to the master applet and may take longer than 1s while the
# new worker is starting. Without the fix, the pcli response analyser
# would fire a read timeout and return "Can't connect to the target CLI!"
# instead of the reload status.

#REQUIRE_OPTIONS=LUA
#REGTEST_TYPE=bug

feature cmd "command -v socat"
feature cmd "$HAPROXY_PROGRAM -vv | grep -q '+LUA'"
feature ignore_unknown_macro

server s1 {
rxreq
txresp
} -start

haproxy h1 -W -S -conf {
global
tune.lua.bool-sample-conversion normal
# Lua init hook that sleeps 2s, delaying worker readiness on reload
lua-load ${testdir}/mcli_reload_no_timeout.lua

defaults
mode http
timeout connect "${HAPROXY_TEST_TIMEOUT-5s}"
timeout client "${HAPROXY_TEST_TIMEOUT-5s}"
timeout server "${HAPROXY_TEST_TIMEOUT-5s}"

frontend fe
bind "fd@${fe}"
default_backend be

backend be
server s1 ${s1_addr}:${s1_port}
} -start

# Issue a reload via socat as a second command (after "show version").
# The first command causes CF_AUTO_CLOSE to be set on the request channel.
# When socat half-closes its write side after sending, process_stream()
# triggers sc_shutdown(scb) -> sc_set_hcto() which applies serverfin=1s
# to the backend applet. Without the fix, the reload response (which comes
# from the master applet) would be killed by this 1s timeout if the new
# worker takes time to start, returning "Can't connect to the target CLI!".
shell {
RESULT=$(printf "show version\nreload\n" | socat -t10 TCP:${h1_mcli_addr}:${h1_mcli_port} - 2>/dev/null)
echo "Got: $RESULT"
echo "$RESULT" | grep -q "Success=1" || {
echo "FAIL: reload did not succeed. Got: $RESULT"
exit 1
}
}

# Verify the master CLI is still functional after reload
shell {
RESULT=$(printf "show version\n" | socat -t5 TCP:${h1_mcli_addr}:${h1_mcli_port} - 2>/dev/null)
echo "$RESULT" | grep -q "3\." || {
echo "FAIL: show version failed. Got: $RESULT"
exit 1
}
}
43 changes: 33 additions & 10 deletions src/cli.c
Original file line number Diff line number Diff line change
Expand Up @@ -3484,8 +3484,25 @@ int pcli_wait_for_request(struct stream *s, struct channel *req, int an_bit)
* current one. Just wait. At this stage, errors should be handled by
* the response analyzer.
*/
if (s->res.analysers & AN_RES_WAIT_CLI)
if (s->res.analysers & AN_RES_WAIT_CLI) {
/* Prevent process_stream from auto-forwarding the client close to
* the backend via the CF_AUTO_CLOSE check while we're waiting for
* a response - we manage the connection lifetime ourselves.
*/
channel_dont_close(req);

/* If the client disconnected cleanly and we have no more commands
* to send, arm the server-fin timer on the backend. A stuck
* worker that never responds will then be aborted after
* timeout server-fin, freeing the connection slot (GH #3351).
* When lra is TICK_ETERNITY (no data received yet) the timer never
* fires, so this is safe to call on every wakeup.
*/
if ((s->scf->flags & SC_FL_EOS) && !ci_data(req))
sc_set_hcto(s->scb);

return 0;
}

pcli->flags &= ~PCLI_F_BIDIR; // only for one connection
if ((pcli->flags & ACCESS_LVL_MASK) == ACCESS_LVL_NONE)
Expand Down Expand Up @@ -3528,13 +3545,7 @@ int pcli_wait_for_request(struct stream *s, struct channel *req, int an_bit)
else
channel_forward_forever(req);

if (!(pcli->flags & PCLI_F_PAYLOAD)) {
/* we send only 1 command per request, and we write
* close after it when not in full-duplex mode.
*/
if (!(pcli->flags & PCLI_F_BIDIR))
sc_schedule_shutdown(s->scb);
} else {
if (pcli->flags & PCLI_F_PAYLOAD) {
pcli_write_prompt(s);
}

Expand Down Expand Up @@ -3595,6 +3606,13 @@ int pcli_wait_for_request(struct stream *s, struct channel *req, int an_bit)
if (s->scf->flags & (SC_FL_ABRT_DONE|SC_FL_EOS)) {
/* There is no more request or a only a partial one and we
* receive a close from the client, we can leave */
if (s->scf->flags & SC_FL_ABRT_DONE) {
/* Client sent RST: abort the backend immediately.
* Note: when AN_RES_WAIT_CLI is set we never reach here
* (early return above), so this handles the pre-connect case.
*/
sc_schedule_abort(s->scb);
}
sc_schedule_shutdown(s->scf);
s->req.analysers &= ~AN_REQ_WAIT_CLI;
return 1;
Expand Down Expand Up @@ -3834,8 +3852,13 @@ int mworker_cli_create_master_proxy(char **errmsg)
mworker_proxy->mode = PR_MODE_CLI;
/* default to 10 concurrent connections */
mworker_proxy->maxconn = 10;
/* no timeout */
mworker_proxy->timeout.client = 0;
mworker_proxy->timeout.client = 0; /* no timeout */
/* 1s server-fin timeout: armed only after the client disconnects cleanly
* (scf EOS + no pending commands), so it never fires during normal command
* processing. It ensures a stuck backend releases its slot promptly once
* the client is gone.
*/
mworker_proxy->timeout.serverfin = MS_TO_TICKS(1000);
mworker_proxy->conf.file = strdup("MASTER");
mworker_proxy->conf.line = 0;
mworker_proxy->accept = frontend_accept;
Expand Down