Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 18 additions & 6 deletions doc/admin-guide/files/records.yaml.en.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1834,12 +1834,24 @@ Origin Server Connect Attempts
:overridable:

Controls what origin server connection failures contribute to marking a server down.
When set to ``2``, any connection failure during the TCP and TLS handshakes will
contribute to marking the server down. When set to ``1``, only TCP handshake failures
will contribute to marking a server down. When set to ``0``, no connection failures
will be used towards marking a server down. When set to ``3``, all failures covered
by ``2`` plus transaction inactive timeouts (server goes silent after connection is
established) will contribute to marking a server down.

+-------+-----------------------------------------------------------------------+
| Value | Behavior                                                              |
+=======+=======================================================================+
| ``0`` | No connection failures contribute to marking a server down.           |
+-------+-----------------------------------------------------------------------+
| ``1`` | TCP handshake failures (excluding TLS handshake failures) contribute  |
|       | to marking a server down.                                             |
+-------+-----------------------------------------------------------------------+
| ``2`` | Any connection failure during the TCP or TLS handshake contributes to |
|       | marking a server down.                                                |
+-------+-----------------------------------------------------------------------+
| ``3`` | All failures covered by ``2``, plus transaction inactive timeouts     |
|       | (server goes silent after the connection is established).             |
+-------+-----------------------------------------------------------------------+
| ``4`` | All failures covered by ``3``, plus cases where the origin closes the |
|       | connection before sending any response bytes.                         |
+-------+-----------------------------------------------------------------------+

.. ts:cv:: CONFIG proxy.config.http.server_max_connections INT 0
:reloadable:
Expand Down
45 changes: 31 additions & 14 deletions src/proxy/http/HttpSM.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4667,22 +4667,39 @@ HttpSM::do_hostdb_reverse_lookup()
bool
HttpSM::track_connect_fail() const
{
bool retval = false;
if (t_state.current.server->had_connect_fail()) {
// What does our policy say?
if (t_state.txn_conf->connect_down_policy == 2 ||
t_state.txn_conf->connect_down_policy == 3) { // Any connection error through TLS handshake
retval = true;
} else if (t_state.txn_conf->connect_down_policy == 1) { // Any connection error through TCP
retval = t_state.current.server->connect_result != -ENET_SSL_CONNECT_FAILED;
}
int const policy = t_state.txn_conf->connect_down_policy;

// Policy 1: any TCP-level connect error (excluding TLS handshake failures).
if (policy == 1 && t_state.current.server->had_connect_fail()) {
return t_state.current.server->connect_result != -ENET_SSL_CONNECT_FAILED;
}
// Policy 3 additionally marks the server down on transaction inactive timeout,
// even when had_connect_fail() is false (connect_result was cleared at CONNECTION_ALIVE).
if (!retval && t_state.txn_conf->connect_down_policy == 3) {
retval = (t_state.current.server->state == HttpTransact::INACTIVE_TIMEOUT);

// Policy 2+: any connect error including TLS handshake failures.
if (policy >= 2 && t_state.current.server->had_connect_fail()) {
return true;
}
return retval;

// Policy 3+: inactive timeout (connect_result was cleared at CONNECTION_ALIVE).
if (policy >= 3 && t_state.current.server->state == HttpTransact::INACTIVE_TIMEOUT) {
return true;
}

// Policy 4+: origin closed a fresh connection before sending any response bytes.
// Excludes two cases:
// - Reused keep-alive connection: there is a known race between ATS reusing and the origin closing it.
// - Multiplexed origins (HTTP/2): stream-level failure does not indicate a connection failure.
if (policy >= 4) {
bool multiplexed = false;
auto ssn = server_txn->get_proxy_ssn();
if (ssn != nullptr) {
multiplexed = static_cast<PoolableSession *>(ssn)->is_multiplexing();
}
if (!multiplexed && server_txn->is_first_transaction() && server_response_hdr_bytes == 0) {
return true;
}
}

return false;
}

void
Expand Down
19 changes: 3 additions & 16 deletions src/proxy/http/HttpTransact.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2902,23 +2902,10 @@ HttpTransact::HandleCacheOpenReadHit(State *s)

find_server_and_update_current_info(s);

// We do not want to try to revalidate documents if we think
// the server is down due to some previously reported problem
//
// Note: we only want to skip origin servers because 1)
// parent proxies have their own negative caching
// scheme & 2) If we skip down parents, every page
// we serve is potentially stale
//
if (s->current.request_to == ResolveInfo::ORIGIN_SERVER && is_server_negative_cached(s) && response_returnable == true &&
is_stale_cache_response_returnable(s) == true) {
server_up = false;
update_current_info(&s->current, nullptr, ResolveInfo::UNDEFINED_LOOKUP, true);
TxnDbg(dbg_ctl_http_trans, "CacheOpenReadHit - server_down, returning stale document");
}
// a parent lookup could come back as ParentResultType::FAIL if in parent.config, go_direct == false and
// For origin servers (ResolveInfo::ORIGIN_SERVER): OSDNSLookup handles serving stale when all origin servers are down.
// For parent proxies: a parent lookup could come back as ParentResultType::FAIL if in parent.config, go_direct == false and
// there are no available parents (all down).
else if (s->current.request_to == ResolveInfo::HOST_NONE && s->parent_result.result == ParentResultType::FAIL) {
if (s->current.request_to == ResolveInfo::HOST_NONE && s->parent_result.result == ParentResultType::FAIL) {
if (response_returnable == true && is_stale_cache_response_returnable(s) == true) {
server_up = false;
update_current_info(&s->current, nullptr, ResolveInfo::UNDEFINED_LOOKUP, true);
Expand Down
3 changes: 3 additions & 0 deletions tests/gold_tests/cache/proxy_serve_stale.test.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,6 @@

# Verify that stale content is served when the parent is down.
Test.ATSReplayTest(replay_file="replay/proxy_serve_stale.replay.yaml")

# Verify that stale content is served when the origin server is down.
Test.ATSReplayTest(replay_file="replay/proxy_serve_stale_origin_down.replay.yaml")
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#
# This replay file assumes that caching is enabled and
# proxy.config.http.cache.ignore_client_cc_max_age is set to 0 so that we can
# test max-age in the client requests.
#

meta:
version: "1.0"

autest:
description: 'Verify that stale content is served when the origin server is down'

dns:
name: 'dns-serve-stale-origin-down'
records:
backend.example.com: ["127.0.0.1"]


server:
name: 'server-serve-stale-origin-down'

client:
name: 'client-serve-stale-origin-down'

ats:
name: 'ts-serve-stale-origin-down'
process_config:
enable_cache: true

records_config:
proxy.config.diags.debug.enabled: 1
proxy.config.diags.debug.tags: 'cache|http|dns|hostdb'
proxy.config.http.cache.max_stale_age: 10
proxy.config.http.connect.down.policy: 4
proxy.config.http.connect_attempts_timeout: 1
proxy.config.http.connect_attempts_rr_retries: 0
proxy.config.http.connect_attempts_max_retries: 1
proxy.config.http.connect_attempts_max_retries_down_server: 0
proxy.config.http.down_server.cache_time: 5

remap_config:
- from: "http://example.com/"
to: "http://backend.example.com:{SERVER_HTTP_PORT}/"

plugin_config:
- 'xdebug.so --enable=x-cache,x-cache-key,via'

sessions:
- transactions:
# 1. Cache object
# cache-key: /path/a/
# cache: miss
# origin server: up
- client-request:
method: "GET"
version: "1.1"
url: /path/a/
headers:
fields:
- [ Host, example.com ]
- [ x-debug, "x-cache,x-cache-key,via" ]
- [ uuid, a-1 ]

server-response:
status: 200
headers:
fields:
- [ Content-Length, 16]
- [ Cache-Control, "public, max-age=1"]

proxy-response:
status: 200
headers:
fields:
- [ X-Cache, { value: miss, as: equal } ]

# 2. Make the origin server down
# cache-key: /path/b/
# cache: miss
# origin server: down
- client-request:
method: "GET"
version: "1.1"
url: /path/b/
headers:
fields:
- [ Host, example.com ]
- [ x-debug, "x-cache,x-cache-key,via" ]
- [ uuid, b-1 ]

server-response:
on_connect: reset

proxy-response:
status: 502
headers:
fields:
- [ X-Cache, { value: miss, as: equal } ]

# 3. Serve stale while within down_server.cache_time
# cache-key: /path/a/
# cache: stale
# origin server: down (cached)
- client-request:
delay: 2s
method: "GET"
version: "1.1"
url: /path/a/
headers:
fields:
- [ Host, example.com ]
- [ x-debug, "x-cache,x-cache-key,via" ]
- [ uuid, a-2 ]

proxy-request:
expect: absent

server-response:
on_connect: reset

proxy-response:
status: 200
headers:
fields:
- [ X-Cache, { value: hit-stale, as: equal } ]

# 4. Revalidate stale contents
# cache-key: /path/a/
# cache: stale
# origin server: up
- client-request:
delay: 5s
method: "GET"
version: "1.1"
url: /path/a/
headers:
fields:
- [ Host, example.com ]
- [ x-debug, "x-cache,x-cache-key,via" ]
- [ uuid, a-99 ]

server-response:
status: 304

proxy-response:
status: 200
headers:
fields:
- [ X-Cache, { value: hit-stale, as: equal } ]

Original file line number Diff line number Diff line change
Expand Up @@ -111,3 +111,6 @@ def run(self):

# Policy 2: inactive timeout should NOT mark the origin down.
ConnectDownPolicy3Test(policy=2, expect_mark_down=False).run()

# Policy 4: origin closes connection without sending any response.
Test.ATSReplayTest(replay_file="replay/connect_down_policy_4.replay.yaml")
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
`` Server closed connection while reading response header. (origin backend1.example.com:``/path/)
`` CONNECT : SUCCESS [0] connecting to 127.0.0.1:`` for host='one.example.com' url='http://backend1.example.com:``/path/' fail_count='1' marking down
`` DNS Error: no valid server http://backend1.example.com:``/path/
`` Server closed connection while reading response header. (origin backend2.example.com:``/path/)
`` CONNECT : SUCCESS [0] connecting to 127.0.0.1:`` for host='two.example.com' url='http://backend2.example.com:``/path/' fail_count='1' marking down
`` DNS Error: no valid server http://backend2.example.com:``/path/
Loading