26 commits
cf1e101
DAOS-18395 test: recovery/pool_list_consolidation.py test_lost_majori…
shimizukko Apr 10, 2026
0454e20
DAOS-18395 test: Fix pylint
shimizukko Apr 10, 2026
52ef33a
DAOS-18395 test: Add mount cleanup code
shimizukko Apr 10, 2026
60d15ac
DAOS-18395 test: Fix pylint
shimizukko Apr 10, 2026
490a6ea
DAOS-18395 test: Fix pylint
shimizukko Apr 10, 2026
419fad9
DAOS-18395 test: Fix pylint
shimizukko Apr 10, 2026
a9128f1
DAOS-18395 test: Fix pylint
shimizukko Apr 10, 2026
b3d2d3d
DAOS-18395 test: Fix pylint
shimizukko Apr 10, 2026
34622ef
Merge branch 'master' into makito/DAOS-18395
shimizukko Apr 12, 2026
33688d3
DAOS-18395 test: Add MD-on-SSD check before clean_mounts
shimizukko Apr 12, 2026
992d747
DAOS-18395 test: Fix pylint
shimizukko Apr 12, 2026
a2ac583
Merge branch 'master' into makito/DAOS-18395
shimizukko Apr 14, 2026
998e1e2
Merge branch 'master' into makito/DAOS-18395
shimizukko Apr 17, 2026
f870062
DAOS-18395 test: Adjust ddb command preparation by removing the setti…
shimizukko Apr 20, 2026
e3ab7a8
Merge branch 'master' into makito/DAOS-18395
shimizukko Apr 20, 2026
9773dc8
DAOS-18395 test: Update comment
shimizukko Apr 20, 2026
f7f98b4
Merge branch 'master' into makito/DAOS-18395
shimizukko Apr 20, 2026
b99fcca
Merge branch 'master' into makito/DAOS-18395
shimizukko Apr 21, 2026
2e062c3
Merge branch 'master' into makito/DAOS-18395
shimizukko Apr 22, 2026
621c2d5
Merge branch 'master' into makito/DAOS-18395
shimizukko Apr 23, 2026
fe6150c
Merge branch 'master' into makito/DAOS-18395
shimizukko Apr 24, 2026
0142bdd
Merge branch 'master' into makito/DAOS-18395
shimizukko May 7, 2026
4ee2af6
Merge branch 'master' into makito/DAOS-18395
shimizukko May 11, 2026
29984a6
DAOS-18395 test: Do not use --db_path indicator in DdbCommandBase con…
shimizukko May 11, 2026
4e70068
Merge branch 'master' into makito/DAOS-18395
shimizukko May 14, 2026
4d70410
DAOS-18395 test: Remove comment
shimizukko May 14, 2026
3 changes: 1 addition & 2 deletions src/tests/ftest/recovery/ddb.py
@@ -146,7 +146,6 @@ def prepare_ddb_command(self, md_on_ssd, pool):

Returns:
DdbCommand: DdbCommand object created based on the environment.

"""
if md_on_ssd:
vos_path = '""'
@@ -162,7 +161,7 @@ def prepare_ddb_command(self, md_on_ssd, pool):

if md_on_ssd:
self.log_step("MD-on-SSD: Create a directory to load pool data under /mnt.")
self.run_cmd_check_result(command=f"mkdir {self.daos_load_path}")
self.run_cmd_check_result(command=f"mkdir -p {self.daos_load_path}")

self.log_step(f"MD-on-SSD: Load pool dir to {self.daos_load_path}")
db_path = os.path.join(self.log_dir, "control_metadata", "daos_control", "engine0")
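The functional change in this file is the switch from mkdir to mkdir -p, which makes creation of the load directory idempotent when the mount point survives from a previous run. A minimal sketch of that pattern, assuming command_as_user and run_remote from run_utils behave as they do in the test code below; the make_load_dir name is illustrative and not part of the PR:

from run_utils import command_as_user, run_remote


def make_load_dir(log, hosts, load_path):
    """Create load_path as root on every host; safe to re-run."""
    # "-p" succeeds even if load_path already exists, so a leftover directory from a
    # previous run no longer fails the step.
    command = command_as_user(command=f"mkdir -p {load_path}", user="root")
    return run_remote(log=log, hosts=hosts, command=command).passed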
203 changes: 162 additions & 41 deletions src/tests/ftest/recovery/pool_list_consolidation.py
@@ -4,11 +4,13 @@

SPDX-License-Identifier: BSD-2-Clause-Patent
"""
import os
import time

from apricot import TestWithServers
from avocado.core.exceptions import TestFail
from ClusterShell.NodeSet import NodeSet
from ddb_utils import DdbCommand
from general_utils import check_file_exists, report_errors
from recovery_utils import wait_for_check_complete
from run_utils import command_as_user, run_remote
@@ -20,6 +22,11 @@ class PoolListConsolidationTest(TestWithServers):
:avocado: recursive
"""

def __init__(self, *args, **kwargs):
"""Initialize a DdbTest object."""
super().__init__(*args, **kwargs)
self.tmpfs_mounts = ["/mnt/daos2", "/mnt/daos3"]

def chk_dist_checker(self, inconsistency, policies=None):
"""Run DAOS checker with kinds of options.

@@ -99,6 +106,25 @@ def chk_dist_checker(self, inconsistency, policies=None):

return errors

def run_cmd_check_result(self, command):
"""Run given command as root and check its result.

Args:
command (str): Command to execute.
"""
command_root = command_as_user(command=command, user="root")
result = run_remote(log=self.log, hosts=self.hostlist_servers, command=command_root)
if not result.passed:
self.fail(f"{command} failed on {result.failed_hosts}!")

def clean_mounts(self):
"""Unmount and remove the tmpfs directory used for MD-on-SSD testing.
"""
self.log_step(f"MD-on-SSD: Clean {self.tmpfs_mounts} on {self.hostlist_servers}")
for tmpfs_mount in self.tmpfs_mounts:
self.run_cmd_check_result(command=f"umount {tmpfs_mount}")
self.run_cmd_check_result(command=f"rm -rf {tmpfs_mount}")

def test_dangling_pool(self):
"""Test dangling pool.

@@ -280,9 +306,8 @@ def test_lost_majority_ps_replicas(self):
:avocado: tags=recovery,cat_recov,pool_list_consolidation
:avocado: tags=PoolListConsolidationTest,test_lost_majority_ps_replicas
"""
if self.server_managers[0].manager.job.using_control_metadata:
self.log.info("MD-on-SSD cluster. It will be supported later.")
self.cancelForTicket('DAOS-18395')
hosts = list(set(self.server_managers[0].ranks.values()))
md_on_ssd = self.server_managers[0].manager.job.using_control_metadata

self.log_step("Create a pool with --nsvc=3.")
# We can generalize this test more. For example, use
@@ -293,41 +318,138 @@
# simple.
pool = self.get_pool(svcn=3)

self.log_step("Stop servers")
dmg_command = self.get_dmg_command()
dmg_command.system_stop()
# This is where we store the original rdb-pool paths that are created during dmg pool
# create. e.g.,
# /mnt/daos1/$POOL/rdb-pool
orig_rdb_pool_paths = []

self.log_step("Remove <scm_mount>/<pool_uuid>/rdb-pool from two ranks.")
rdb_pool_paths = []
for engine_params in self.server_managers[0].manager.job.yaml.engine_params:
scm_mount = engine_params.get_value('scm_mount')
rdb_pool_path = f"{scm_mount}/{pool.uuid.lower()}/rdb-pool"
rdb_pool_paths.append(rdb_pool_path)
self.log.info("rdb_pool_paths = %s", rdb_pool_paths)
hosts = list(set(self.server_managers[0].ranks.values()))
count = 0
# Iterate both pool mount points of both ranks. I.e., 4 ranks total.
for host in hosts:
for rdb_pool_path in rdb_pool_paths:
node = NodeSet(host)
check_out = check_file_exists(
hosts=node, filename=rdb_pool_path, sudo=True)
if check_out[0]:
command = f"rm {rdb_pool_path}"
command_root = command_as_user(command=command, user="root")
if not run_remote(log=self.log, hosts=node, command=command_root).passed:
self.fail(f'Failed to remove {rdb_pool_path} on {host}')
self.log.info("Remove %s from %s", rdb_pool_path, str(node))
count += 1
self.log_step("Stop servers")
self.get_dmg_command().system_stop()

if md_on_ssd:
# The MD-on-SSD case is more complex than the PMEM case because we need to first load the pool
# dir to the new mount points. Then we'll iterate the new mount points to search for
# rdb-pool. Removing rdb-pool is also more complex than PMEM because in PMEM, we'll just
# use rm <rdb-pool_path>, but in MD-on-SSD, we need to use "ddb rm_pool" and it takes
# db_path, so we need to keep the mapping between rdb-pool path and db_path.

# Prepare dictionaries to determine the arguments used in prov_mem and rm_pool.
# Used for --db_path in prov_mem and rm_pool.
db_path_0 = os.path.join(self.log_dir, "control_metadata", "daos_control", "engine0")
db_path_1 = os.path.join(self.log_dir, "control_metadata", "daos_control", "engine1")

# Create the map of original mount points to the new mount points where the pool will be
# loaded.
orig_load_mount = {}
for i, engine_params in enumerate(
self.server_managers[0].manager.job.yaml.engine_params):
scm_mount = engine_params.get_value('scm_mount')
orig_load_mount[scm_mount] = self.tmpfs_mounts[i]
self.log.info("orig_load_mount = %s", orig_load_mount)

# When we call rm_pool, we need to know the right --db_path value for a given rdb-pool
# path to remove, so prepare an intermediate dictionary.
mount_to_db_path = {self.tmpfs_mounts[0]: db_path_0, self.tmpfs_mounts[1]: db_path_1}
self.log.info("mount_to_db_path = %s", mount_to_db_path)

# This is where we store the new rdb-pool paths in the loaded dir. e.g.,
# /mnt/daos3/$POOL/rdb-pool
new_rdb_pool_paths = []

# When we call rm_pool, we'll check whether a particular rdb-pool path exists. If found,
# we'll use rm_pool to remove it. When we call the command, we also need to know db_path
# that maps to this rdb-pool path, so prepare a dictionary. e.g.,
# /mnt/daos3/$POOL/rdb-pool: /var/tmp/daos_testing/control_metadata/daos_control/engine1
new_rdb_to_db_path = {}

for engine_params in self.server_managers[0].manager.job.yaml.engine_params:
# Iterate the server config and get the scm_mount values. Usually /mnt/daos0 and
# /mnt/daos1.
scm_mount = engine_params.get_value('scm_mount')

# Determine the new rdb-pool path from the original scm_mount. e.g., if the original
# scm_mount is /mnt/daos1, the new rdb-pool path would be /mnt/daos3/$POOL/rdb-pool
new_mount = orig_load_mount[scm_mount]

# Create the mapping between the new rdb-pool path and the associated db_path. For
# example, if scm_mount is /mnt/daos1, the mapping would be
# /mnt/daos3/$POOL/rdb-pool:
# /var/tmp/daos_testing/control_metadata/daos_control/engine1
new_rdb_pool_path = f"{new_mount}/{pool.uuid.lower()}/rdb-pool"
new_rdb_pool_paths.append(new_rdb_pool_path)
orig_rdb_pool_path = f"{scm_mount}/{pool.uuid.lower()}/rdb-pool"
# orig_rdb_pool_paths is needed when we check if rdb-pools are recovered at the end.
orig_rdb_pool_paths.append(orig_rdb_pool_path)
new_rdb_to_db_path[new_rdb_pool_path] = mount_to_db_path[new_mount]

self.log.info("new_rdb_pool_paths = %s", new_rdb_pool_paths)
self.log.info("rdb_to_db_path = %s", new_rdb_to_db_path)

self.log_step(
"MD-on-SSD: Create a directory to load pool data under /mnt in all servers.")
command = "mkdir -p /mnt/daos2 /mnt/daos3"
result = run_remote(
log=self.log, hosts=self.hostlist_servers,
command=command_as_user(command=command, user="root"))
if not result.passed:
self.fail(f"{command} failed on {result.failed_hosts}!")

self.log_step("MD-on-SSD: Load pool dir to /mnt/daos2 and daos3 for all servers.")
for host in hosts:
# We need to call ddb prov_mem for all servers, so use new DdbCommand object with
# each host.
ddb_command = DdbCommand(server_host=host, path=self.bin, vos_path='""')
ddb_command.prov_mem(db_path=db_path_0, tmpfs_mount=self.tmpfs_mounts[0])
ddb_command.prov_mem(db_path=db_path_1, tmpfs_mount=self.tmpfs_mounts[1])

self.log_step("Remove rdb-pool from 2 out of 3 ranks from /mnt/daos2 and /mnt/daos3")
count = 0
# Iterate both pool mount points on both hosts, i.e., 4 ranks total.
for host in hosts:
for rdb_pool_path in new_rdb_pool_paths:
if count == 2:
break
if count == 2:
break
node = NodeSet(host)
# rdb_pool_path is something like "/mnt/daos2/$POOL/rdb-pool". Check if it
# exists in this host.
check_out = check_file_exists(hosts=node, filename=rdb_pool_path, sudo=True)
if check_out[0]:
# As with prov_mem, we're calling ddb rm_pool on this particular host, so use
# a new object created for this host.
ddb_command = DdbCommand(server_host=host, path=self.bin, vos_path=None)
# Get the corresponding db_path from the rdb-pool path we're removing.
db_path = new_rdb_to_db_path[rdb_pool_path]
self.log.info("Remove %s from %s", rdb_pool_path, str(node))
ddb_command.rm_pool(db_path=db_path, removing_path=rdb_pool_path)
count += 1

else:
# PMEM case.
self.log_step("Remove <scm_mount>/<pool_uuid>/rdb-pool from two ranks.")
for engine_params in self.server_managers[0].manager.job.yaml.engine_params:
scm_mount = engine_params.get_value('scm_mount')
rdb_pool_path = f"{scm_mount}/{pool.uuid.lower()}/rdb-pool"
orig_rdb_pool_paths.append(rdb_pool_path)
self.log.info("orig_rdb_pool_paths = %s", orig_rdb_pool_paths)
count = 0
# Iterate both pool mount points on both hosts, i.e., 4 ranks total.
for host in hosts:
for rdb_pool_path in orig_rdb_pool_paths:
if count == 2:
break
node = NodeSet(host)
check_out = check_file_exists(hosts=node, filename=rdb_pool_path, sudo=True)
if check_out[0]:
command = f"rm {rdb_pool_path}"
command_root = command_as_user(command=command, user="root")
if not run_remote(log=self.log, hosts=node, command=command_root).passed:
self.fail(f'Failed to remove {rdb_pool_path} on {host}')
self.log.info("Remove %s from %s", rdb_pool_path, str(node))
count += 1

self.log_step("Run DAOS checker under kinds of mode.")
errors = []
errors = self.chk_dist_checker(
inconsistency="corrupted pool without quorum")
errors = self.chk_dist_checker(inconsistency="corrupted pool without quorum")

self.log_step("Try creating a container. It should succeed.")
cont_create_success = False
@@ -338,22 +460,18 @@ def test_lost_majority_ps_replicas(self):
cont_create_success = True
break
except TestFail as error:
msg = f"Container create failed after running checker! error = {error}"
self.log.debug(msg)
self.log.info("Container create failed after running checker! error = %s", error)

if not cont_create_success:
errors.append("Container create failed after running checker!")

msg = ("Show that rdb-pool are recovered. i.e., three out of four ranks should "
"have rdb-pool.")
self.log_step(msg)
hosts = list(set(self.server_managers[0].ranks.values()))
self.log_step(
"Show that rdb-pool are recovered. i.e., three out of four ranks should have rdb-pool.")
count = 0
for host in hosts:
for rdb_pool_path in rdb_pool_paths:
for rdb_pool_path in orig_rdb_pool_paths:
node = NodeSet(host)
check_out = check_file_exists(
hosts=node, filename=rdb_pool_path, sudo=True)
check_out = check_file_exists(hosts=node, filename=rdb_pool_path, sudo=True)
if check_out[0]:
count += 1
self.log.info("rdb-pool found at %s: %s", str(node), rdb_pool_path)
@@ -362,6 +480,9 @@
if count != 3:
errors.append(f"Unexpected number of rdb-pool after repair! - {count} ranks")

if md_on_ssd:
self.clean_mounts()

report_errors(test=self, errors=errors)

def test_lost_all_rdb(self):
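As a rough sketch of the MD-on-SSD flow the new test code walks through (map each engine to its tmpfs load mount and ddb db_path, load the pool dir with prov_mem, then delete rdb-pool with rm_pool), assuming DdbCommand.prov_mem and DdbCommand.rm_pool take the arguments shown in the diff above. The remove_rdb_pools name, the single-host scope, and the omission of the check_file_exists guard used by the test are illustrative simplifications:

import os

from ddb_utils import DdbCommand


def remove_rdb_pools(log_dir, bin_dir, host, tmpfs_mounts, pool_uuid, limit=2):
    """Load each engine's pool dir onto a tmpfs mount and remove up to limit rdb-pool files."""
    removed = 0
    for index, tmpfs_mount in enumerate(tmpfs_mounts):
        # db_path of engine<index>, mirroring db_path_0/db_path_1 built in the test.
        db_path = os.path.join(log_dir, "control_metadata", "daos_control", f"engine{index}")

        # Load the pool dir from the engine metadata onto the tmpfs mount (ddb prov_mem).
        ddb_command = DdbCommand(server_host=host, path=bin_dir, vos_path='""')
        ddb_command.prov_mem(db_path=db_path, tmpfs_mount=tmpfs_mount)

        # Remove the rdb-pool file that the load exposes under the tmpfs mount (ddb rm_pool).
        rdb_pool_path = f"{tmpfs_mount}/{pool_uuid.lower()}/rdb-pool"
        if removed < limit:
            ddb_command.rm_pool(db_path=db_path, removing_path=rdb_pool_path)
            removed += 1
    return removed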