Skip to content

Commit f65a4e4

Browse files
authored
DAOS-18487 object: control EC rebuild resource consumption (#17441)
Add resource controls to limit scanner pressure and improve stability: Introduce resource control for rebuild scanning to prevent excessive load and contention during rebuild/reclaim operations. The scanner now enforces tighter pacing/limits (credits/yielding and related controls) so rebuild work progresses without overwhelming system resources. This improves cluster stability under heavy rebuild activity and reduces the likelihood of prolonged stalls caused by resource exhaustion or oversubscription, while preserving existing rebuild correctness and behavior. Signed-off-by: Liang Zhen <gnailzenh@gmail.com> Signed-off-by: Wang Shilong <shilong.wang@hpe.com> Signed-off-by: Mohamad Chaarawi <mohamad.chaarawi@hpe.com>
1 parent 1668519 commit f65a4e4

2 files changed

Lines changed: 734 additions & 307 deletions

File tree

src/object/srv_internal.h

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,29 @@
2626

2727
extern struct dss_module_key obj_module_key;
2828

29-
struct migr_res_manager;
29+
/* Anchor point of a resource waiter.
30+
* NB: resource control can be an independent library in the future.
31+
*/
32+
struct migr_res_waiter {
33+
struct migrate_pool_tls *rw_tls;
34+
/* link on the resource manager's chain of waiters */
35+
d_list_t rw_link;
36+
/* quantity of resource units being demanded by this waiter */
37+
uint64_t rw_units;
38+
/* time at which this waiter started waiting (NOTE(review): clock/unit not visible here) */
39+
uint64_t rw_wait_since;
40+
/* eventual (ABT_eventual) this waiter blocks on */
41+
ABT_eventual rw_eventual;
42+
/* used together with rw_eventual; NOTE(review): presumably points at its result code - confirm */
43+
int *rw_rc;
44+
};
45+
46+
/* resource handle; NOTE(review): presumably describes resource held by a requester - confirm */
47+
struct migr_res_handle {
48+
int rh_type; /* NOTE(review): presumably the resource type - confirm */
49+
int rh_bkt; /* NOTE(review): presumably a bucket index within the manager - confirm */
50+
uint64_t rh_units; /* NOTE(review): presumably the number of resource units - confirm */
51+
};
3052

3153
/* Per pool attached to the migrate tls(per xstream) */
3254
struct migrate_pool_tls {
@@ -80,8 +102,6 @@ struct migrate_pool_tls {
80102
/* The current in-flight data size */
81103
uint64_t mpt_inflight_size;
82104

83-
struct migr_res_manager *mpt_rmg;
84-
85105
/* reference count for the structure */
86106
uint64_t mpt_refcount;
87107
uint32_t mpt_opc;
@@ -94,6 +114,11 @@ struct migrate_pool_tls {
94114

95115
/* migration init error */
96116
int mpt_init_err;
117+
118+
/* Watchdog: track progress to detect complete rebuild hang */
119+
uint64_t mpt_last_progress_obj_count; /* obj_count at last check */
120+
uint64_t mpt_last_progress_rec_count; /* rec_count at last check */
121+
uint64_t mpt_last_progress_ts; /* time of last observed progress */
97122
};
98123

99124
struct migrate_cont_hdl {

0 commit comments

Comments
 (0)