Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
155 changes: 101 additions & 54 deletions collector/edac_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,10 @@ package collector
import (
"fmt"
"log/slog"
"os"
"path/filepath"
"regexp"
"strings"

"github.com/prometheus/client_golang/prometheus"
)
Expand All @@ -30,115 +32,160 @@ const (

var (
edacMemControllerRE = regexp.MustCompile(`.*devices/system/edac/mc/mc([0-9]*)`)
edacMemCsrowRE = regexp.MustCompile(`.*devices/system/edac/mc/mc[0-9]*/csrow([0-9]*)`)
edacMemDimmRE = regexp.MustCompile(`.*devices/system/edac/mc/mc[0-9]*/dimm([0-9]*)`)
)

type edacCollector struct {
ceCount *prometheus.Desc
ueCount *prometheus.Desc
csRowCECount *prometheus.Desc
csRowUECount *prometheus.Desc
logger *slog.Logger
ceCount *prometheus.Desc
ueCount *prometheus.Desc
channelCECount *prometheus.Desc
channelUECount *prometheus.Desc
logger *slog.Logger
}

func init() {
registerCollector("edac", defaultEnabled, NewEdacCollector)
}

// NewEdacCollector returns a new Collector exposing edac stats.
func NewEdacCollector(logger *slog.Logger) (Collector, error) {

return &edacCollector{

ceCount: prometheus.NewDesc(
prometheus.BuildFQName(namespace, edacSubsystem, "correctable_errors_total"),
"Total correctable memory errors.",
[]string{"controller"}, nil,
[]string{"controller"},
nil,
),

ueCount: prometheus.NewDesc(
prometheus.BuildFQName(namespace, edacSubsystem, "uncorrectable_errors_total"),
"Total uncorrectable memory errors.",
[]string{"controller"}, nil,
[]string{"controller"},
nil,
),
csRowCECount: prometheus.NewDesc(
prometheus.BuildFQName(namespace, edacSubsystem, "csrow_correctable_errors_total"),
"Total correctable memory errors for this csrow.",
[]string{"controller", "csrow"}, nil,

channelCECount: prometheus.NewDesc(
prometheus.BuildFQName(namespace, edacSubsystem, "channel_correctable_errors_total"),
"Total correctable memory errors for this channel.",
[]string{"controller", "csrow", "channel", "dimm_label"},
nil,
),
csRowUECount: prometheus.NewDesc(
prometheus.BuildFQName(namespace, edacSubsystem, "csrow_uncorrectable_errors_total"),
"Total uncorrectable memory errors for this csrow.",
[]string{"controller", "csrow"}, nil,

channelUECount: prometheus.NewDesc(
prometheus.BuildFQName(namespace, edacSubsystem, "channel_uncorrectable_errors_total"),
"Total uncorrectable memory errors for this channel.",
[]string{"controller", "csrow", "channel", "dimm_label"},
nil,
),

logger: logger,
}, nil
}

func (c *edacCollector) Update(ch chan<- prometheus.Metric) error {

memControllers, err := filepath.Glob(sysFilePath("devices/system/edac/mc/mc[0-9]*"))
if err != nil {
return err
}

for _, controller := range memControllers {

controllerMatch := edacMemControllerRE.FindStringSubmatch(controller)
if controllerMatch == nil {
return fmt.Errorf("controller string didn't match regexp: %s", controller)
}

controllerNumber := controllerMatch[1]

value, err := readUintFromFile(filepath.Join(controller, "ce_count"))
if err != nil {
return fmt.Errorf("couldn't get ce_count for controller %s: %w", controllerNumber, err)
}
ch <- prometheus.MustNewConstMetric(
c.ceCount, prometheus.CounterValue, float64(value), controllerNumber)

value, err = readUintFromFile(filepath.Join(controller, "ce_noinfo_count"))
if err != nil {
return fmt.Errorf("couldn't get ce_noinfo_count for controller %s: %w", controllerNumber, err)
if err == nil {
ch <- prometheus.MustNewConstMetric(
c.ceCount,
prometheus.CounterValue,
float64(value),
controllerNumber,
)
}
ch <- prometheus.MustNewConstMetric(
c.csRowCECount, prometheus.CounterValue, float64(value), controllerNumber, "unknown")

value, err = readUintFromFile(filepath.Join(controller, "ue_count"))
if err != nil {
return fmt.Errorf("couldn't get ue_count for controller %s: %w", controllerNumber, err)
}
ch <- prometheus.MustNewConstMetric(
c.ueCount, prometheus.CounterValue, float64(value), controllerNumber)

value, err = readUintFromFile(filepath.Join(controller, "ue_noinfo_count"))
if err != nil {
return fmt.Errorf("couldn't get ue_noinfo_count for controller %s: %w", controllerNumber, err)
if err == nil {
ch <- prometheus.MustNewConstMetric(
c.ueCount,
prometheus.CounterValue,
float64(value),
controllerNumber,
)
}
ch <- prometheus.MustNewConstMetric(
c.csRowUECount, prometheus.CounterValue, float64(value), controllerNumber, "unknown")

// For each controller, walk the csrow directories.
csrows, err := filepath.Glob(controller + "/csrow[0-9]*")

if err != nil {
return err
}

for _, csrow := range csrows {
csrowMatch := edacMemCsrowRE.FindStringSubmatch(csrow)
if csrowMatch == nil {
return fmt.Errorf("csrow string didn't match regexp: %s", csrow)
base := filepath.Base(csrow)

match := regexp.MustCompile(`csrow([0-9]+)`).FindStringSubmatch(base)
if match == nil {
continue
}
csrowNumber := csrowMatch[1]
csrowNumber := match[1]

value, err = readUintFromFile(filepath.Join(csrow, "ce_count"))
channelFiles, err := filepath.Glob(csrow + "/ch*_ce_count")
if err != nil {
return fmt.Errorf("couldn't get ce_count for controller/csrow %s/%s: %w", controllerNumber, csrowNumber, err)
return err
}
ch <- prometheus.MustNewConstMetric(
c.csRowCECount, prometheus.CounterValue, float64(value), controllerNumber, csrowNumber)

value, err = readUintFromFile(filepath.Join(csrow, "ue_count"))
if err != nil {
return fmt.Errorf("couldn't get ue_count for controller/csrow %s/%s: %w", controllerNumber, csrowNumber, err)
for _, chFile := range channelFiles {

base := filepath.Base(chFile)

match := regexp.MustCompile(`ch([0-9]+)_ce_count`).FindStringSubmatch(base)
if match == nil {
continue
}

channelNumber := match[1]
label := "unknown"
labelBytes, err := os.ReadFile(filepath.Join(csrow, "ch"+channelNumber+"_dimm_label"))
if err == nil {
label = strings.TrimSpace(string(labelBytes))
// format label
label = strings.ReplaceAll(label, "#", "")
label = strings.ReplaceAll(label, "csrow", "_csrow")
label = strings.ReplaceAll(label, "channel", "_channel")
}
value, err := readUintFromFile(chFile)
if err == nil {
ch <- prometheus.MustNewConstMetric(
c.channelCECount,
prometheus.CounterValue,
float64(value),
controllerNumber,
csrowNumber,
channelNumber,
label,
)
}

value, err = readUintFromFile(filepath.Join(csrow, "ch"+channelNumber+"_ue_count"))
if err == nil {
ch <- prometheus.MustNewConstMetric(
c.channelUECount,
prometheus.CounterValue,
float64(value),
controllerNumber,
csrowNumber,
channelNumber,
label,
)
}
}
ch <- prometheus.MustNewConstMetric(
c.csRowUECount, prometheus.CounterValue, float64(value), controllerNumber, csrowNumber)
}
}

return err
return nil
}
20 changes: 12 additions & 8 deletions collector/fixtures/e2e-64k-page-output.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1331,17 +1331,21 @@ node_drbd_remote_pending{device="drbd1"} 12346
# HELP node_drbd_remote_unacknowledged Number of requests received by the peer via the network connection, but that have not yet been answered.
# TYPE node_drbd_remote_unacknowledged gauge
node_drbd_remote_unacknowledged{device="drbd1"} 12347
# HELP node_edac_channel_correctable_errors_total Total correctable memory errors for this channel.
# TYPE node_edac_channel_correctable_errors_total counter
node_edac_channel_correctable_errors_total{channel="0",controller="0",csrow="0",dimm_label="mc0_csrow0_channel0"} 0
node_edac_channel_correctable_errors_total{channel="0",controller="0",csrow="1",dimm_label="mc0_csrow1_channel0"} 0
node_edac_channel_correctable_errors_total{channel="1",controller="0",csrow="0",dimm_label="mc0_csrow0_channel1"} 0
node_edac_channel_correctable_errors_total{channel="1",controller="0",csrow="1",dimm_label="mc0_csrow1_channel1"} 0
# HELP node_edac_channel_uncorrectable_errors_total Total uncorrectable memory errors for this channel.
# TYPE node_edac_channel_uncorrectable_errors_total counter
node_edac_channel_uncorrectable_errors_total{channel="0",controller="0",csrow="0",dimm_label="mc0_csrow0_channel0"} 2
node_edac_channel_uncorrectable_errors_total{channel="0",controller="0",csrow="1",dimm_label="mc0_csrow1_channel0"} 2
node_edac_channel_uncorrectable_errors_total{channel="1",controller="0",csrow="0",dimm_label="mc0_csrow0_channel1"} 2
node_edac_channel_uncorrectable_errors_total{channel="1",controller="0",csrow="1",dimm_label="mc0_csrow1_channel1"} 2
# HELP node_edac_correctable_errors_total Total correctable memory errors.
# TYPE node_edac_correctable_errors_total counter
node_edac_correctable_errors_total{controller="0"} 1
# HELP node_edac_csrow_correctable_errors_total Total correctable memory errors for this csrow.
# TYPE node_edac_csrow_correctable_errors_total counter
node_edac_csrow_correctable_errors_total{controller="0",csrow="0"} 3
node_edac_csrow_correctable_errors_total{controller="0",csrow="unknown"} 2
# HELP node_edac_csrow_uncorrectable_errors_total Total uncorrectable memory errors for this csrow.
# TYPE node_edac_csrow_uncorrectable_errors_total counter
node_edac_csrow_uncorrectable_errors_total{controller="0",csrow="0"} 4
node_edac_csrow_uncorrectable_errors_total{controller="0",csrow="unknown"} 6
# HELP node_edac_uncorrectable_errors_total Total uncorrectable memory errors.
# TYPE node_edac_uncorrectable_errors_total counter
node_edac_uncorrectable_errors_total{controller="0"} 5
Expand Down
20 changes: 12 additions & 8 deletions collector/fixtures/e2e-output.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1363,17 +1363,21 @@ node_drbd_remote_pending{device="drbd1"} 12346
# HELP node_drbd_remote_unacknowledged Number of requests received by the peer via the network connection, but that have not yet been answered.
# TYPE node_drbd_remote_unacknowledged gauge
node_drbd_remote_unacknowledged{device="drbd1"} 12347
# HELP node_edac_channel_correctable_errors_total Total correctable memory errors for this channel.
# TYPE node_edac_channel_correctable_errors_total counter
node_edac_channel_correctable_errors_total{channel="0",controller="0",csrow="0",dimm_label="mc0_csrow0_channel0"} 0
node_edac_channel_correctable_errors_total{channel="0",controller="0",csrow="1",dimm_label="mc0_csrow1_channel0"} 0
node_edac_channel_correctable_errors_total{channel="1",controller="0",csrow="0",dimm_label="mc0_csrow0_channel1"} 0
node_edac_channel_correctable_errors_total{channel="1",controller="0",csrow="1",dimm_label="mc0_csrow1_channel1"} 0
# HELP node_edac_channel_uncorrectable_errors_total Total uncorrectable memory errors for this channel.
# TYPE node_edac_channel_uncorrectable_errors_total counter
node_edac_channel_uncorrectable_errors_total{channel="0",controller="0",csrow="0",dimm_label="mc0_csrow0_channel0"} 2
node_edac_channel_uncorrectable_errors_total{channel="0",controller="0",csrow="1",dimm_label="mc0_csrow1_channel0"} 2
node_edac_channel_uncorrectable_errors_total{channel="1",controller="0",csrow="0",dimm_label="mc0_csrow0_channel1"} 2
node_edac_channel_uncorrectable_errors_total{channel="1",controller="0",csrow="1",dimm_label="mc0_csrow1_channel1"} 2
# HELP node_edac_correctable_errors_total Total correctable memory errors.
# TYPE node_edac_correctable_errors_total counter
node_edac_correctable_errors_total{controller="0"} 1
# HELP node_edac_csrow_correctable_errors_total Total correctable memory errors for this csrow.
# TYPE node_edac_csrow_correctable_errors_total counter
node_edac_csrow_correctable_errors_total{controller="0",csrow="0"} 3
node_edac_csrow_correctable_errors_total{controller="0",csrow="unknown"} 2
# HELP node_edac_csrow_uncorrectable_errors_total Total uncorrectable memory errors for this csrow.
# TYPE node_edac_csrow_uncorrectable_errors_total counter
node_edac_csrow_uncorrectable_errors_total{controller="0",csrow="0"} 4
node_edac_csrow_uncorrectable_errors_total{controller="0",csrow="unknown"} 6
# HELP node_edac_uncorrectable_errors_total Total uncorrectable memory errors.
# TYPE node_edac_uncorrectable_errors_total counter
node_edac_uncorrectable_errors_total{controller="0"} 5
Expand Down
63 changes: 63 additions & 0 deletions collector/fixtures/sys.ttar
Original file line number Diff line number Diff line change
Expand Up @@ -9174,11 +9174,54 @@ Mode: 644
Directory: sys/devices/system/edac/mc/mc0/csrow0
Mode: 755
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/devices/system/edac/mc/mc0/csrow0/ch0_ce_count
Lines: 1
0
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Directory: sys/devices/system/edac/mc/mc0/csrow1
Mode: 755
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/devices/system/edac/mc/mc0/csrow1/ch0_ce_count
Lines: 1
0
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/devices/system/edac/mc/mc0/csrow0/ch1_ce_count
Lines: 1
0
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/devices/system/edac/mc/mc0/csrow1/ch1_ce_count
Lines: 1
0
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/devices/system/edac/mc/mc0/csrow0/ce_count
Lines: 1
3
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/devices/system/edac/mc/mc0/csrow0/ch0_ue_count
Lines: 1
2
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/devices/system/edac/mc/mc0/csrow1/ch0_ue_count
Lines: 1
2
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/devices/system/edac/mc/mc0/csrow0/ch1_ue_count
Lines: 1
2
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/devices/system/edac/mc/mc0/csrow1/ch1_ue_count
Lines: 1
2
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/devices/system/edac/mc/mc0/csrow0/ue_count
Lines: 1
4
Expand All @@ -9194,6 +9237,26 @@ Lines: 1
6
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/devices/system/edac/mc/mc0/csrow0/ch0_dimm_label
Lines: 1
mc0csrow0channel0
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/devices/system/edac/mc/mc0/csrow0/ch1_dimm_label
Lines: 1
mc0csrow0channel1
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/devices/system/edac/mc/mc0/csrow1/ch0_dimm_label
Lines: 1
mc0csrow1channel0
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/devices/system/edac/mc/mc0/csrow1/ch1_dimm_label
Lines: 1
mc0csrow1channel1
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Directory: sys/devices/system/node
Mode: 755
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Expand Down