Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 68 additions & 1 deletion etc/kayobe/ansible/scripts/smartmon.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,65 @@ def load_dwpd_ratings(path=DWPD_RATINGS_PATH):
DWPD_RATINGS = load_dwpd_ratings()


# Helper: Identify historical temperature/airflow attribute failures
def is_historical_temperature_attr_failure(attribute):
    """
    Return True when a pySMART attribute failure represents only a historical
    temperature/airflow threshold breach.

    Some disks keep WHEN_FAILED=In_the_past forever after an overheating event.
    pySMART turns that into assessment=WARN, which is useful to expose, but it
    should not make the main smart_healthy metric look like an active disk
    failure.

    Args:
        attribute: Object that may expose ``when_failed`` and ``name``
            attributes. Missing or falsy values are treated as empty strings,
            so ``None`` or an unrelated object simply yields False.

    Returns:
        bool: True only when ``when_failed`` (trimmed, case-insensitive) equals
        ``"in_the_past"`` and the attribute name (trimmed, case-insensitive)
        contains ``"temperature"`` or ``"airflow"``.
    """
    when_failed = str(getattr(attribute, "when_failed", "") or "").strip().lower()
    if when_failed != "in_the_past":
        # Anything other than a past breach (never failed, failing now, unknown
        # value) is not a *historical* failure, whatever the attribute is.
        return False

    name = str(getattr(attribute, "name", "") or "").strip().lower()
    return "temperature" in name or "airflow" in name


def get_failed_smart_attributes(device):
    """
    Return pySMART attributes with a meaningful WHEN_FAILED value.

    Args:
        device: Object that may expose an ``attributes`` iterable. A missing,
            ``None`` or empty ``attributes`` value is treated as "no
            attributes".

    Returns:
        list: The entries of ``device.attributes``, in their original order,
        whose ``when_failed`` value (trimmed, case-insensitive) is non-empty
        and is not one of the placeholders ``"-"``, ``"none"`` or ``"never"``.
        Entries without a ``when_failed`` attribute (including ``None``
        entries) are skipped.
    """
    # Spellings that mean "this attribute has never crossed its threshold".
    never_failed_markers = {"-", "none", "never"}

    failed_attrs = []
    for attribute in getattr(device, "attributes", []) or []:
        when_failed = str(getattr(attribute, "when_failed", "") or "").strip().lower()
        if when_failed and when_failed not in never_failed_markers:
            failed_attrs.append(attribute)
    return failed_attrs


def smart_health_value(device, failed_attrs=None):
    """
    Convert pySMART assessment into the exported healthy metric.

    PASS is healthy. WARN is also treated as healthy only when every failed
    attribute is a historical temperature/airflow threshold breach. Other WARN
    states, FAIL states, current failures, and non-temperature historical
    failures remain unhealthy.

    Args:
        device: Object exposing ``assessment`` (and, when ``failed_attrs`` is
            not supplied, whatever get_failed_smart_attributes() reads).
        failed_attrs: Optional pre-computed result of
            get_failed_smart_attributes(device). Callers that already hold the
            list can pass it to avoid scanning the attributes a second time;
            when omitted (``None``) it is computed here, and only for WARN.

    Returns:
        int: 1 when the device should be reported as healthy, otherwise 0.
    """
    assessment = str(device.assessment or "").strip().upper()

    if assessment == "PASS":
        return 1

    if assessment != "WARN":
        return 0

    if failed_attrs is None:
        failed_attrs = get_failed_smart_attributes(device)

    if not failed_attrs:
        # WARN with no identifiable failed attribute: the cause is unknown, so
        # do not downgrade it to healthy.
        return 0

    if all(is_historical_temperature_attr_failure(attribute) for attribute in failed_attrs):
        return 1

    return 0
Comment on lines +189 to +213
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The smart_health_value function can be optimized by accepting failed_attrs as an argument. This avoids redundant calls to get_failed_smart_attributes when this function is used in conjunction with other logic that also requires the list of failed attributes, as seen in parse_device_info.

def smart_health_value(device, failed_attrs):
    """
    Map a pySMART assessment onto the exported healthy metric (1 or 0).

    A PASS assessment is healthy. A WARN assessment is healthy only when
    ``failed_attrs`` is non-empty and every entry is a historical
    temperature/airflow threshold breach. Everything else (FAIL, an empty or
    unknown assessment, WARN with no failed attributes, WARN with any other
    kind of failure) is unhealthy.

    ``failed_attrs`` is the caller's pre-computed list of failed attributes;
    it is only consulted for WARN.
    """
    verdict = str(device.assessment or "").strip().upper()

    if verdict == "PASS":
        return 1

    # Only a WARN that names at least one failed attribute can still qualify.
    if verdict != "WARN" or not failed_attrs:
        return 0

    only_historical_temperature = all(
        is_historical_temperature_attr_failure(failed) for failed in failed_attrs
    )
    return 1 if only_historical_temperature else 0



def get_rated_dwpd(model_name):
"""
Look up DWPD rating for the given model name, defaulting to 1.0.
Expand Down Expand Up @@ -224,6 +283,7 @@ def parse_device_info(device):
"device_model": device.model or "",
"serial_number": serial_number,
"firmware_version": device.firmware or "",
"assessment": device.assessment or "",
}
sorted_labels = sorted(labels.items())
label_str = ",".join(f'{k}="{v}"' for k, v in sorted_labels)
Expand All @@ -240,10 +300,17 @@ def parse_device_info(device):
f'smartmon_device_smart_enabled{{{metric_labels}}} {float(1) if device.smart_enabled else float(0)}'
)
if device.assessment:
is_healthy = 1 if device.assessment.upper() == "PASS" else 0
is_healthy = smart_health_value(device)
metrics.append(
f'smartmon_device_smart_healthy{{{metric_labels}}} {float(is_healthy)}'
)
failed_attrs = get_failed_smart_attributes(device)
historical_temperature_attr_failure = 1 if failed_attrs and all(
is_historical_temperature_attr_failure(attribute) for attribute in failed_attrs
) else 0
metrics.append(
f'smartmon_device_historical_temperature_failure{{{metric_labels}}} {float(historical_temperature_attr_failure)}'
)
Comment on lines +303 to +313
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The logic here can be simplified and optimized. By passing the already computed failed_attrs to the updated smart_health_value function, we avoid a redundant call. Additionally, the historical_temperature_attr_failure metric can be derived directly from the is_healthy status and the assessment string, as is_healthy is only 1 for a WARN state if all failures are historical temperature/airflow breaches.

            # Normalise once so the WARN comparison below ignores case and
            # surrounding whitespace, matching smart_health_value().
            assessment_upper = str(device.assessment).strip().upper()
            failed_attrs = get_failed_smart_attributes(device)
            is_healthy = smart_health_value(device, failed_attrs)
            # Triple braces: '{{' and '}}' emit the literal label braces and the
            # inner '{metric_labels}' interpolates the label string. With only
            # double braces the literal text "{metric_labels}" would be exported.
            metrics.append(
                f'smartmon_device_smart_healthy{{{metric_labels}}} {float(is_healthy)}'
            )
            # smart_health_value() returns 1 for a WARN assessment only when
            # every failed attribute is a historical temperature/airflow breach,
            # so "healthy and WARN" identifies exactly that case.
            historical_temperature_attr_failure = 1 if is_healthy == 1 and assessment_upper == "WARN" else 0
            metrics.append(
                f'smartmon_device_historical_temperature_failure{{{metric_labels}}} {float(historical_temperature_attr_failure)}'
            )


# Explicitly collect top-level temperature if available (fixes SCSI temperature issue)
# pySMART exposes 'temperature' as a top-level property which we can use for SCSI,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
---
features:
- |
The smartmon exporter now treats historical SMART temperature and airflow
threshold breaches as non-critical when calculating the
smartmon_device_smart_healthy metric. This prevents disks with only a past
over-temperature event from being reported as actively unhealthy.

A new smartmon_device_historical_temperature_failure metric is exported so
these historical temperature or airflow threshold breaches can still be
viewed and alerted on separately as a warning if required.
Loading