Skip to content

Commit 52bc95c

Browse files
committed
19472: Fix false CRIT for NetApp temperature sensors without transceivers
Only monitor sensors that report an actual temperature value and at least one threshold. SUP-28183 Change-Id: I9b68380a56a966350135a80950b6a440ad48986a
1 parent d20795b commit 52bc95c

3 files changed

Lines changed: 113 additions & 42 deletions

File tree

.werks/19472.md

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
[//]: # (werk v3)
2+
# Fix false CRIT for NetApp temperature sensors without transceivers
3+
4+
key | value
5+
---------- | ---
6+
date | 2026-04-01T16:17:31.156156+00:00
7+
version | 2.5.0b4
8+
class | fix
9+
edition | community
10+
component | checks
11+
level | 1
12+
compatible | yes
13+
14+
The `netapp_ontap_temp` check incorrectly reported CRIT for temperature sensors
15+
on NetApp ONTAP systems where DAC cables are used
16+
instead of optical SFP transceivers, or where SSDs have no temperature sensors
17+
installed.
18+
19+
NetApp firmware reports these ports as `installed: true` even when no transceiver
20+
is present, and returns `state: error` with `temperature: null` and all thresholds
21+
`null`. This seems to be a firmware/API limitation: the hardware detection does not
22+
distinguish between "not installed" and "error".
23+
24+
The check now only monitors sensors that report an actual temperature value and
25+
at least one threshold. DAC-only sensors are silently ignored, eliminating the
26+
false CRIT state.
27+
28+
If you have existing services showing CRIT for such sensors, they will disappear
29+
after the next check.

packages/cmk-plugins/cmk/plugins/netapp/models.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -542,6 +542,27 @@ class ShelfTemperatureModel(ShelfObjectModel):
542542
high_warning: int | None = None
543543
high_critical: int | None = None
544544

545+
def _has_sensor_data(self) -> bool:
    """Return True only if the sensor reports a real temperature AND at least one threshold.

    NetApp firmware reports installed=true for DAC-only ports (no SFP transceiver present),
    setting state=error and all values to null. These are not real errors and must be skipped.
    This appears to be a firmware/API bug in NetApp: the hardware detection does not
    differentiate between "not installed" and "error".

    Returns:
        False if ``temperature`` is None or if all four thresholds are None,
        True otherwise.
    """
    if self.temperature is None:
        return False

    # One configured threshold is enough. Compare against None explicitly so that
    # a legitimate threshold value of 0 still counts as "present".
    return any(
        threshold is not None
        for threshold in (
            self.low_warning,
            self.low_critical,
            self.high_warning,
            self.high_critical,
        )
    )
562+
563+
def consider_installed(self) -> bool:
    """Report the sensor as installed only if the parent class agrees and it has sensor data."""
    parent_verdict = super().consider_installed()
    # Same short-circuit as before: a falsy parent verdict is returned as-is and
    # the sensor-data check is not evaluated.
    return parent_verdict and self._has_sensor_data()
565+
545566

546567
class ShelfPsuModel(ShelfObjectModel):
547568
"""

packages/cmk-plugins/tests/cmk/plugins/netapp/agent_based/test_netapp_ontap_temp.py

Lines changed: 63 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from cmk.plugins.netapp.agent_based.netapp_ontap_temp import (
1919
_check_netapp_ontap_temp,
2020
discovery_netapp_ontap_temp,
21+
parse_netapp_ontap_temp,
2122
)
2223
from cmk.plugins.netapp.models import ShelfTemperatureModel
2324

@@ -32,79 +33,68 @@ class ShelfTemperatureModelFactory(ModelFactory):
3233
list_id="2",
3334
state="ok",
3435
id=111,
35-
temperature=20.0,
36+
temperature=20,
3637
ambient=True,
37-
low_warning=None,
38-
low_critical=None,
39-
high_warning=None,
40-
high_critical=None,
38+
low_warning=5,
39+
low_critical=0,
40+
high_warning=55,
41+
high_critical=60,
4142
),
4243
],
4344
"Internal Shelf 2": [
4445
ShelfTemperatureModelFactory.build(
4546
list_id="2",
4647
state="ok",
47-
temperature=50.0,
48+
temperature=50,
4849
ambient=False,
49-
low_warning=None,
50-
low_critical=None,
51-
high_warning=None,
52-
high_critical=None,
50+
low_warning=5,
51+
low_critical=0,
52+
high_warning=95,
53+
high_critical=105,
5354
),
5455
],
5556
"Ambient Shelf 1": [
5657
ShelfTemperatureModelFactory.build(
5758
list_id="1",
5859
state="ok",
59-
temperature=50.0,
60+
temperature=50,
6061
ambient=True,
61-
low_warning=None,
62-
low_critical=None,
63-
high_warning=None,
64-
high_critical=None,
62+
low_warning=5,
63+
low_critical=0,
64+
high_warning=95,
65+
high_critical=105,
6566
),
6667
ShelfTemperatureModelFactory.build(
6768
list_id="1",
6869
state="ok",
69-
temperature=10.0,
70+
temperature=10,
7071
ambient=True,
71-
low_warning=None,
72-
low_critical=None,
73-
high_warning=None,
74-
high_critical=None,
72+
low_warning=5,
73+
low_critical=0,
74+
high_warning=95,
75+
high_critical=105,
7576
),
7677
ShelfTemperatureModelFactory.build(
7778
list_id="1",
7879
state="ok",
79-
temperature=20.0,
80+
temperature=20,
8081
ambient=True,
81-
low_warning=None,
82-
low_critical=None,
83-
high_warning=None,
84-
high_critical=None,
85-
),
86-
ShelfTemperatureModelFactory.build(
87-
list_id="1",
88-
id=20,
89-
state="error",
90-
temperature=None,
91-
ambient=True,
92-
low_warning=None,
93-
low_critical=None,
94-
high_warning=None,
95-
high_critical=None,
82+
low_warning=5,
83+
low_critical=0,
84+
high_warning=95,
85+
high_critical=105,
9686
),
9787
],
9888
"Internal Shelf 1": [
9989
ShelfTemperatureModelFactory.build(
10090
list_id="1",
10191
state="ok",
102-
temperature=30.0,
92+
temperature=30,
10393
ambient=False,
104-
low_warning=None,
105-
low_critical=None,
106-
high_warning=None,
107-
high_critical=None,
94+
low_warning=5,
95+
low_critical=0,
96+
high_warning=95,
97+
high_critical=105,
10898
),
10999
],
110100
}
@@ -142,7 +132,6 @@ def test_check_netapp_ontap_temp_() -> None:
142132
Metric("temp", 50.0),
143133
Result(state=State.OK, summary="Average: 26.7 °C"),
144134
Result(state=State.OK, summary="Lowest: 10 °C"),
145-
Result(state=State.CRIT, summary="Additional failed sensors: 1 (1/20)"),
146135
]
147136

148137

@@ -218,3 +207,35 @@ def test_check_netapp_ontap_temp_trend(
218207
)
219208

220209
assert result[-1] == expected_trend_result
210+
211+
212+
@pytest.mark.parametrize(
    "json_line, expected_count",
    [
        # Valid sensor: temperature and thresholds present
        pytest.param(
            '{"list_id":"1","id":8,"state":"ok","installed":true,"temperature":57,"ambient":false,'
            '"low_warning":5,"low_critical":0,"high_warning":95,"high_critical":105}',
            1,
            id="valid sensor is kept",
        ),
        # DAC-only sensor: installed=true but no SFP — all values null (NetApp firmware bug)
        pytest.param(
            '{"list_id":"1","id":10,"state":"error","installed":true,"temperature":null,"ambient":false,'
            '"low_warning":null,"low_critical":null,"high_warning":null,"high_critical":null}',
            0,
            id="DAC sensor with null temperature and null thresholds is filtered",
        ),
        # Sensor with temperature but no thresholds is filtered
        pytest.param(
            '{"list_id":"1","id":12,"state":"ok","installed":true,"temperature":45,"ambient":false,'
            '"low_warning":null,"low_critical":null,"high_warning":null,"high_critical":null}',
            0,
            id="sensor with temperature but no thresholds is filtered",
        ),
    ],
)
def test_parse_netapp_ontap_temp_filters_dac_sensors(json_line: str, expected_count: int) -> None:
    """Parse a single agent line and check how many sensors end up in the section."""
    parsed = parse_netapp_ontap_temp([[json_line]])
    # Flatten all per-item sensor lists so the count is independent of the grouping.
    surviving = [sensor for group in parsed.values() for sensor in group]
    assert len(surviving) == expected_count

0 commit comments

Comments
 (0)