Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 9 additions & 9 deletions ydb/core/tx/columnshard/columnshard__overload.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,27 +2,27 @@

namespace NKikimr::NColumnShard {

TColumnShard::EOverloadStatus TColumnShard::ResourcesStatusToOverloadStatus(const NOverload::EResourcesStatus status) const {
TColumnShard::TOverloadStatus TColumnShard::ResourcesStatusToOverloadStatus(const NOverload::EResourcesStatus status) const {
switch (status) {
case NOverload::EResourcesStatus::Ok:
return EOverloadStatus::None;
return TOverloadStatus{EOverloadStatus::None, {}};
case NOverload::EResourcesStatus::WritesInFlyLimitReached:
return EOverloadStatus::ShardWritesInFly;
return TOverloadStatus{EOverloadStatus::ShardWritesInFly, "The limit on the number of in-flight write requests to a shard has been exceeded. Please add more resources or reduce the database load."};
case NOverload::EResourcesStatus::WritesSizeInFlyLimitReached:
return EOverloadStatus::ShardWritesSizeInFly;
return TOverloadStatus{EOverloadStatus::ShardWritesSizeInFly, "The limit on the total size of in-flight write requests to the shard has been exceeded. Please add more resources or reduce the database load."};
Copy link

Copilot AI Dec 9, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This switch statement is missing a default case. If an unexpected EResourcesStatus value is passed, the function will fall through without returning a value, leading to undefined behavior. Consider adding a default case that either asserts or returns an appropriate error status.

Suggested change
return TOverloadStatus{EOverloadStatus::ShardWritesSizeInFly, "The limit on the total size of in-flight write requests to the shard has been exceeded. Please add more resources or reduce the database load."};
return TOverloadStatus{EOverloadStatus::ShardWritesSizeInFly, "The limit on the total size of in-flight write requests to the shard has been exceeded. Please add more resources or reduce the database load."};
default:
Y_FAIL("Unexpected EResourcesStatus value: %d", static_cast<int>(status));
return TOverloadStatus{EOverloadStatus::None, "Unknown resource status."};

Copilot uses AI. Check for mistakes.
}
}

TColumnShard::EOverloadStatus TColumnShard::CheckOverloadedImmediate(const TInternalPathId /* pathId */) const {
TColumnShard::TOverloadStatus TColumnShard::CheckOverloadedImmediate(const TInternalPathId /* pathId */) const {
if (IsAnyChannelYellowStop()) {
return EOverloadStatus::Disk;
return TOverloadStatus{EOverloadStatus::Disk, "Channels are overloaded (yellow), please rebalance groups or add new ones"};
Copy link

Copilot AI Dec 9, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Grammar issue: The comma splice creates a run-on sentence. Consider restructuring as: "Channels are overloaded (yellow). Please rebalance groups or add new ones." or "Channels are overloaded (yellow); please rebalance groups or add new ones."

Suggested change
return TOverloadStatus{EOverloadStatus::Disk, "Channels are overloaded (yellow), please rebalance groups or add new ones"};
return TOverloadStatus{EOverloadStatus::Disk, "Channels are overloaded (yellow). Please rebalance groups or add new ones"};

Copilot uses AI. Check for mistakes.
}
const ui64 txLimit = Settings.OverloadTxInFlight;

if (txLimit && Executor()->GetStats().TxInFly > txLimit) {
AFL_WARN(NKikimrServices::TX_COLUMNSHARD_WRITE)("event", "shard_overload")("reason", "tx_in_fly")("sum", Executor()->GetStats().TxInFly)(
"limit", txLimit);
return EOverloadStatus::ShardTxInFly;
return TOverloadStatus{EOverloadStatus::ShardTxInFly, TStringBuilder{} << "The local transaction limit has been exceeded " << Executor()->GetStats().TxInFly << " of " << txLimit << ". Please add more resources or reduce the database load."};
Copy link

Copilot AI Dec 9, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The formatting of this error message is awkward. The transaction counts appear mid-sentence without proper punctuation. Consider reformatting as: "The local transaction limit has been exceeded: " << Executor()->GetStats().TxInFly << " of " << txLimit << ". Please add more resources or reduce the database load." (adding a colon after "exceeded")

Suggested change
return TOverloadStatus{EOverloadStatus::ShardTxInFly, TStringBuilder{} << "The local transaction limit has been exceeded " << Executor()->GetStats().TxInFly << " of " << txLimit << ". Please add more resources or reduce the database load."};
return TOverloadStatus{EOverloadStatus::ShardTxInFly, TStringBuilder{} << "The local transaction limit has been exceeded: " << Executor()->GetStats().TxInFly << " of " << txLimit << ". Please add more resources or reduce the database load."};

Copilot uses AI. Check for mistakes.
}

if (AppData()->FeatureFlags.GetEnableOlapRejectProbability()) {
Expand All @@ -31,12 +31,12 @@ TColumnShard::EOverloadStatus TColumnShard::CheckOverloadedImmediate(const TInte
const float rnd = TAppData::RandomProvider->GenRandReal2();
if (rnd < rejectProbabilty) {
AFL_WARN(NKikimrServices::TX_COLUMNSHARD_WRITE)("event", "shard_overload")("reason", "reject_probality")("RP", rejectProbabilty);
return EOverloadStatus::RejectProbability;
return TOverloadStatus{EOverloadStatus::RejectProbability, "The local database is overloaded. Please add more resources or reduce the database load."};
}
}
}

return EOverloadStatus::None;
return TOverloadStatus{EOverloadStatus::None, {}};
}

void TColumnShard::Handle(TEvColumnShard::TEvOverloadUnsubscribe::TPtr& ev, const TActorContext&) {
Expand Down
14 changes: 7 additions & 7 deletions ydb/core/tx/columnshard/columnshard__write.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -528,25 +528,25 @@ void TColumnShard::Handle(NEvents::TDataEvents::TEvWrite::TPtr& ev, const TActor
if (outOfSpace) {
AFL_WARN(NKikimrServices::TX_COLUMNSHARD)("event", "skip_writing")("reason", "quota_exceeded")("source", "dataevent");
}
auto overloadStatus = outOfSpace ? EOverloadStatus::Disk : CheckOverloadedImmediate(*internalPathId);
if (overloadStatus == EOverloadStatus::None) {
auto overloadStatus = outOfSpace ? TOverloadStatus{EOverloadStatus::Disk, "The disk quota has been exhausted. Please increase the available database disk resources or delete unused data."} : CheckOverloadedImmediate(*internalPathId);
if (overloadStatus.Status == EOverloadStatus::None) {
overloadStatus = ResourcesStatusToOverloadStatus(Counters.GetWritesMonitor()->OnStartWrite(arrowData->GetSize()));
}
if (overloadStatus != EOverloadStatus::None) {
if (overloadStatus.Status != EOverloadStatus::None) {
LWPROBE(EvWriteResult, TabletID(), source.ToString(), record.GetTxId(), cookie, "immediate error", false, "overload data error");
LWPROBE(EvWrite, TabletID(), source.ToString(), cookie, record.GetTxId(), writeTimeout.value_or(TDuration::Max()), arrowData->GetSize(), "", false, operation.GetIsBulk(), ToString(NKikimrDataEvents::TEvWriteResult::STATUS_OVERLOADED), "overload data error " + ToString(overloadStatus));
LWPROBE(EvWrite, TabletID(), source.ToString(), cookie, record.GetTxId(), writeTimeout.value_or(TDuration::Max()), arrowData->GetSize(), "", false, operation.GetIsBulk(), ToString(NKikimrDataEvents::TEvWriteResult::STATUS_OVERLOADED), "overload data error " + ToString(overloadStatus.Status));
auto result = NEvents::TDataEvents::TEvWriteResult::BuildError(
TabletID(), 0, NKikimrDataEvents::TEvWriteResult::STATUS_OVERLOADED, "overload data error");
TabletID(), 0, NKikimrDataEvents::TEvWriteResult::STATUS_OVERLOADED, TStringBuilder{} << "Column shard " << TabletID() << " is overloaded. Reason: " << overloadStatus.Reason);

if ((overloadStatus == EOverloadStatus::ShardWritesSizeInFly || overloadStatus == EOverloadStatus::ShardWritesInFly) && record.HasOverloadSubscribe()) {
if ((overloadStatus.Status == EOverloadStatus::ShardWritesSizeInFly || overloadStatus.Status == EOverloadStatus::ShardWritesInFly) && record.HasOverloadSubscribe()) {
auto seqNo = record.GetOverloadSubscribe();
result->Record.SetOverloadSubscribed(seqNo);
Send(NOverload::TOverloadManagerServiceOperator::MakeServiceId(),
std::make_unique<NOverload::TEvOverloadSubscribe>(NOverload::TColumnShardInfo{.ColumnShardId = SelfId(), .TabletId = TabletID()},
NOverload::TPipeServerInfo{.PipeServerId = ev->Recipient, .InterconnectSessionId = PipeServersInterconnectSessions[ev->Recipient]},
NOverload::TOverloadSubscriberInfo{.PipeServerId = ev->Recipient, .OverloadSubscriberId = ev->Sender, .SeqNo = seqNo}));
}
OverloadWriteFail(overloadStatus,
OverloadWriteFail(overloadStatus.Status,
NEvWrite::TWriteMeta(0, pathId, source, {}, TGUID::CreateTimebased().AsGuidString(),
Counters.GetCSCounters().WritingCounters->GetWriteFlowCounters()),
arrowData->GetSize(), cookie, std::move(result), ctx);
Expand Down
5 changes: 3 additions & 2 deletions ydb/core/tx/columnshard/columnshard_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,7 @@ class TColumnShard: public TActor<TColumnShard>, public NTabletFlatExecutor::TTa
}

using EOverloadStatus = EOverloadStatus;
using TOverloadStatus = TOverloadStatus;

// For syslocks
void IncCounter(NDataShard::ECumulativeCounters counter, ui64 num = 1) const {
Expand Down Expand Up @@ -384,8 +385,8 @@ class TColumnShard: public TActor<TColumnShard>, public NTabletFlatExecutor::TTa
private:
void OverloadWriteFail(const EOverloadStatus overloadReason, const NEvWrite::TWriteMeta& writeMeta, const ui64 writeSize, const ui64 cookie,
std::unique_ptr<NActors::IEventBase>&& event, const TActorContext& ctx);
EOverloadStatus ResourcesStatusToOverloadStatus(const NOverload::EResourcesStatus status) const;
EOverloadStatus CheckOverloadedImmediate(const TInternalPathId tableId) const;
TOverloadStatus ResourcesStatusToOverloadStatus(const NOverload::EResourcesStatus status) const;
TOverloadStatus CheckOverloadedImmediate(const TInternalPathId tableId) const;
EOverloadStatus CheckOverloadedWait(const TInternalPathId tableId) const;

protected:
Expand Down
5 changes: 5 additions & 0 deletions ydb/core/tx/columnshard/counters/columnshard.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@ enum class EOverloadStatus {
RejectProbability,
};

struct TOverloadStatus {
EOverloadStatus Status;
TString Reason;
};

enum class EWriteFailReason {
Disabled /* "disabled" */ = 0,
PutBlob /* "put_blob" */,
Expand Down
2 changes: 1 addition & 1 deletion ydb/tests/functional/serverless/test_serverless.py
Original file line number Diff line number Diff line change
Expand Up @@ -537,7 +537,7 @@ def __init__(self, ts):
assert False, 'database did not move into Overloaded state'

logger.info("Upsert data whith SQL")
with pytest.raises(ydb.issues.Overloaded, match=r'.*overload data error.*'):
with pytest.raises(ydb.issues.Overloaded, match=r'.*Column shard.*is overloaded.*'):
qpool.execute_with_retries(
"UPSERT INTO `{}` (ts, value_string) VALUES(Timestamp('2020-01-01T00:00:00.000000Z'), 'xxx')".format(path),
retry_settings=RetrySettings(max_retries=0))
Expand Down
Loading