Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ buddyinfo | Exposes statistics of memory fragments as reported by /proc/buddyinf
cgroups | A summary of the number of active and enabled cgroups | Linux
cpu\_vulnerabilities | Exposes CPU vulnerability information from sysfs. | Linux
devstat | Exposes device statistics | Dragonfly, FreeBSD
dmmultipath | Exposes DM-multipath device and path metrics from `/sys/block/dm-*`. | Linux
drm | Exposes GPU metrics using sysfs / DRM; `amdgpu` is the only driver that exposes this information through DRM | Linux
drbd | Exposes Distributed Replicated Block Device statistics (to version 8.4) | Linux
ethtool | Exposes network interface information and network driver statistics equivalent to `ethtool`, `ethtool -S`, and `ethtool -i`. | Linux
Expand Down Expand Up @@ -339,6 +340,32 @@ echo 'role{role="application_server"} 1' > /path/to/directory/role.prom.$$
mv /path/to/directory/role.prom.$$ /path/to/directory/role.prom
```

### DM-Multipath Collector

The `dmmultipath` collector reads `/sys/block/dm-*` to discover Device Mapper
multipath devices and expose path health metrics. It identifies multipath
devices by checking that `dm/uuid` starts with `mpath-`, which distinguishes
them from LVM or other DM device types.

No special permissions are required — the collector reads only world-readable
sysfs attributes.

Enable it with `--collector.dmmultipath`.

#### Exposed metrics

| Metric | Type | Description |
|--------|------|-------------|
| `node_dmmultipath_device_info` | Gauge | Info metric (value is always 1) with labels `device`, `sysfs_name`, and `uuid`. The `uuid` label contains the WWID, which can be used for PV correlation. |
| `node_dmmultipath_device_active` | Gauge | Whether the DM device is active (1) or suspended (0). Labels: `device`, `sysfs_name`. |
| `node_dmmultipath_device_size_bytes` | Gauge | Size of the DM device in bytes. Labels: `device`, `sysfs_name`. |
| `node_dmmultipath_device_paths` | Gauge | Number of paths. Labels: `device`, `sysfs_name`. |
| `node_dmmultipath_device_paths_active` | Gauge | Number of paths in active state (SCSI `running` or NVMe `live`). Labels: `device`, `sysfs_name`. |
| `node_dmmultipath_device_paths_failed` | Gauge | Number of paths not in active state. Labels: `device`, `sysfs_name`. |
| `node_dmmultipath_path_state` | Gauge | Reports the underlying device state for each path; the value is always 1 and the state itself is carried in the `state` label. Labels: `device`, `path`, `state`. |

The `sysfs_name` label (e.g. `dm-0`) matches the `device` label in `node_disk_*` metrics, enabling direct correlation between multipath health and I/O statistics without recording rules.

### Filtering enabled collectors

The `node_exporter` will expose all metrics from enabled collectors by default. This is the recommended way to collect metrics to avoid errors when comparing metrics of different families.
Expand Down
143 changes: 143 additions & 0 deletions collector/dmmultipath_linux.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
// Copyright The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build !nodmmultipath

package collector

import (
"errors"
"fmt"
"log/slog"
"os"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/procfs/blockdevice"
)

// isPathActive reports whether state describes a healthy, usable path.
// Only the SCSI state "running" and the NVMe state "live" count as active;
// every other value, including the empty string, is reported as not active.
func isPathActive(state string) bool {
	switch state {
	case "running", "live":
		return true
	default:
		return false
	}
}

// dmMultipathCollector exposes DM-multipath device and path metrics.
// It holds the blockdevice filesystem handle used to discover devices, a
// logger, and one descriptor per exported metric.
type dmMultipathCollector struct {
	fs     blockdevice.FS
	logger *slog.Logger

	// Descriptors for per-device metrics.
	deviceInfo, deviceActive, deviceSizeBytes *prometheus.Desc

	// Descriptors for per-device path counts (total, active, not active).
	devicePaths, devicePathsActive, devicePathsFailed *prometheus.Desc

	// Descriptor for the per-path state metric.
	pathState *prometheus.Desc
}

// init registers the collector under the name "dmmultipath", passing
// defaultDisabled as its default state and NewDMMultipathCollector as its
// factory.
func init() {
registerCollector("dmmultipath", defaultDisabled, NewDMMultipathCollector)
}

// NewDMMultipathCollector builds the Collector that exposes Device Mapper
// multipath device metrics from /sys/block/dm-*.
//
// It opens a blockdevice filesystem handle from the configured procPath and
// sysPath and returns a wrapped error if that fails. On success it returns a
// collector holding one descriptor per exported metric.
func NewDMMultipathCollector(logger *slog.Logger) (Collector, error) {
	const subsystem = "dmmultipath"

	fs, err := blockdevice.NewFS(*procPath, *sysPath)
	if err != nil {
		return nil, fmt.Errorf("failed to open sysfs: %w", err)
	}

	// describe builds a descriptor in this collector's subsystem with the
	// given variable labels and no constant labels.
	describe := func(name, help string, labels []string) *prometheus.Desc {
		return prometheus.NewDesc(prometheus.BuildFQName(namespace, subsystem, name), help, labels, nil)
	}

	// Label set shared by every per-device metric except the info metric.
	perDevice := []string{"device", "sysfs_name"}

	collector := &dmMultipathCollector{fs: fs, logger: logger}
	collector.deviceInfo = describe(
		"device_info",
		"Non-numeric information about a DM-multipath device.",
		[]string{"device", "sysfs_name", "uuid"},
	)
	collector.deviceActive = describe(
		"device_active",
		"Whether the multipath device-mapper device is active (1) or suspended (0).",
		perDevice,
	)
	collector.deviceSizeBytes = describe(
		"device_size_bytes",
		"Size of the multipath device in bytes, read from /sys/block/<dm>/size.",
		perDevice,
	)
	collector.devicePaths = describe(
		"device_paths",
		"Number of paths for a multipath device.",
		perDevice,
	)
	collector.devicePathsActive = describe(
		"device_paths_active",
		"Number of paths in active state (SCSI running or NVMe live) for a multipath device.",
		perDevice,
	)
	collector.devicePathsFailed = describe(
		"device_paths_failed",
		"Number of paths not in active state for a multipath device.",
		perDevice,
	)
	collector.pathState = describe(
		"path_state",
		"Reports the underlying device state for a multipath path, as read from /sys/block/<dev>/device/state.",
		[]string{"device", "path", "state"},
	)

	return collector, nil
}

// Update discovers the DM-multipath devices and sends their metrics to ch.
//
// For every device it emits, in order: the info metric, the active flag, the
// size in bytes, one path_state sample per path, and then the total, active
// and not-active path counts.
//
// If discovery fails because a file does not exist or permission is denied,
// the error is logged at debug level and ErrNoData is returned. Any other
// discovery error is returned wrapped.
func (c *dmMultipathCollector) Update(ch chan<- prometheus.Metric) error {
	devices, err := c.fs.DMMultipathDevices()
	switch {
	case err == nil:
		// Discovery succeeded; fall through to metric emission.
	case errors.Is(err, os.ErrNotExist), errors.Is(err, os.ErrPermission):
		c.logger.Debug("Could not read DM-multipath devices", "err", err)
		return ErrNoData
	default:
		return fmt.Errorf("failed to scan DM-multipath devices: %w", err)
	}

	// emit sends a single gauge sample for desc with the given label values.
	emit := func(desc *prometheus.Desc, value float64, labelValues ...string) {
		ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, value, labelValues...)
	}

	for _, dev := range devices {
		emit(c.deviceInfo, 1, dev.Name, dev.SysfsName, dev.UUID)

		// A device counts as active unless it is suspended.
		active := 1.0
		if dev.Suspended {
			active = 0.0
		}
		emit(c.deviceActive, active, dev.Name, dev.SysfsName)
		emit(c.deviceSizeBytes, float64(dev.SizeBytes), dev.Name, dev.SysfsName)

		healthy := 0
		for _, p := range dev.Paths {
			if isPathActive(p.State) {
				healthy++
			}
			emit(c.pathState, 1, dev.Name, p.Device, p.State)
		}

		// Every path that is not active is counted as failed.
		total := len(dev.Paths)
		emit(c.devicePaths, float64(total), dev.Name, dev.SysfsName)
		emit(c.devicePathsActive, float64(healthy), dev.Name, dev.SysfsName)
		emit(c.devicePathsFailed, float64(total-healthy), dev.Name, dev.SysfsName)
	}

	return nil
}
151 changes: 151 additions & 0 deletions collector/dmmultipath_linux_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
// Copyright The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build !nodmmultipath

package collector

import (
"io"
"log/slog"
"strings"
"testing"

"github.com/prometheus/client_golang/prometheus"
dto "github.com/prometheus/client_model/go"
)

// TestDMMultipathMetrics runs the collector against the sysfs fixtures and
// checks the device-level and path-level gauges for mpathA and mpathB.
func TestDMMultipathMetrics(t *testing.T) {
	*procPath = "fixtures/proc"
	*sysPath = "fixtures/sys"

	collector, err := NewDMMultipathCollector(slog.New(slog.NewTextHandler(io.Discard, nil)))
	if err != nil {
		t.Fatal(err)
	}
	mp := collector.(*dmMultipathCollector)

	sink := make(chan prometheus.Metric, 200)
	if err := mp.Update(sink); err != nil {
		t.Fatal(err)
	}
	close(sink)

	// Group the collected samples by their descriptor string so that the
	// assertions can look them up by a substring of the metric name.
	byDesc := make(map[string][]*dto.Metric)
	for sample := range sink {
		written := new(dto.Metric)
		if err := sample.Write(written); err != nil {
			t.Fatal(err)
		}
		key := sample.Desc().String()
		byDesc[key] = append(byDesc[key], written)
	}

	mpathA := labelMap{"device": "mpathA", "sysfs_name": "dm-5"}
	mpathB := labelMap{"device": "mpathB", "sysfs_name": "dm-6"}

	checks := []struct {
		metric string
		labels labelMap
		want   float64
	}{
		{"device_active", mpathA, 1},
		{"device_active", mpathB, 1},
		{"device_size_bytes", mpathA, 53687091200},
		// The trailing quote keeps "device_paths" from also matching
		// "device_paths_active" and "device_paths_failed".
		{`device_paths"`, mpathA, 4},
		{`device_paths"`, mpathB, 2},

		// mpathA: sdi, sdj, sdk are running; sdl is offline → 3 active, 1 failed.
		{"device_paths_active", mpathA, 3},
		{"device_paths_failed", mpathA, 1},

		// mpathB: sdm, sdn are both running → 2 active, 0 failed.
		{"device_paths_active", mpathB, 2},
		{"device_paths_failed", mpathB, 0},

		{"path_state", labelMap{"device": "mpathA", "path": "sdi", "state": "running"}, 1},
		{"path_state", labelMap{"device": "mpathA", "path": "sdl", "state": "offline"}, 1},
	}
	for _, chk := range checks {
		assertGaugeValue(t, byDesc, chk.metric, chk.labels, chk.want)
	}
}

// TestDMMultipathNoDevices checks that Update returns ErrNoData when the
// sysfs root is an empty directory.
func TestDMMultipathNoDevices(t *testing.T) {
	// procPath and sysPath are package-level settings. Restore them when the
	// test ends so that later tests do not inherit a sysPath pointing at a
	// temporary directory that has already been removed.
	prevProc, prevSys := *procPath, *sysPath
	t.Cleanup(func() {
		*procPath, *sysPath = prevProc, prevSys
	})

	*procPath = "fixtures/proc"
	*sysPath = t.TempDir()

	logger := slog.New(slog.NewTextHandler(io.Discard, nil))
	coll, err := NewDMMultipathCollector(logger)
	if err != nil {
		t.Fatal(err)
	}

	c := coll.(*dmMultipathCollector)

	ch := make(chan prometheus.Metric, 200)
	err = c.Update(ch)
	close(ch)

	if err != ErrNoData {
		t.Fatalf("expected ErrNoData, got %v", err)
	}
}

// TestIsPathActive checks that only the "running" and "live" states are
// classified as active.
func TestIsPathActive(t *testing.T) {
	cases := []struct {
		state  string
		active bool
	}{
		{state: "running", active: true},
		{state: "live", active: true},
		{state: "offline", active: false},
		{state: "blocked", active: false},
		{state: "transport-offline", active: false},
		{state: "dead", active: false},
		{state: "unknown", active: false},
		{state: "", active: false},
	}
	for i := range cases {
		state, want := cases[i].state, cases[i].active
		if got := isPathActive(state); got != want {
			t.Errorf("isPathActive(%q) = %v, want %v", state, got, want)
		}
	}
}

// labelMap maps a label name to the value a metric is expected to carry for
// that label; the test helpers use it to select metrics.
type labelMap map[string]string

// assertGaugeValue looks for a metric whose descriptor string contains
// metricSubstring and whose labels satisfy matchLabels, then checks that its
// gauge value equals expected. Only the first matching metric is examined.
// A test error is reported when the value differs or when nothing matches.
func assertGaugeValue(t *testing.T, metrics map[string][]*dto.Metric, metricSubstring string, labels labelMap, expected float64) {
	t.Helper()
	for desc, candidates := range metrics {
		if strings.Contains(desc, metricSubstring) {
			for _, candidate := range candidates {
				if !matchLabels(candidate.GetLabel(), labels) {
					continue
				}
				if got := candidate.GetGauge().GetValue(); got != expected {
					t.Errorf("%s%v: got %v, want %v", metricSubstring, labels, got, expected)
				}
				return
			}
		}
	}
	t.Errorf("metric %s%v not found", metricSubstring, labels)
}

// matchLabels reports whether pairs satisfies want. A nil want matches only
// a metric without labels. Otherwise every entry of want must appear in pairs
// with the same value; labels in pairs that want does not mention are ignored.
func matchLabels(pairs []*dto.LabelPair, want labelMap) bool {
	if want == nil {
		return len(pairs) == 0
	}
	// Count down from the number of wanted labels; zero means all were seen.
	remaining := len(want)
	for _, pair := range pairs {
		wantValue, wanted := want[pair.GetName()]
		if wanted && wantValue == pair.GetValue() {
			remaining--
		}
	}
	return remaining == 0
}
Loading