Skip to content

Commit b614bb3

Browse files
sradcocursoragent
andcommitted
collector: add nvmesubsystem collector for NVMe-oF path health
Add a new disabled-by-default collector that reads /sys/class/nvme-subsystem/ to expose NVMe over Fabrics subsystem path health metrics. Exposed metrics: - node_nvmesubsystem_info: subsystem identity (NQN, model, serial, iopolicy) - node_nvmesubsystem_namespace_info: maps namespace device to subsystem - node_nvmesubsystem_paths: total controller paths per subsystem - node_nvmesubsystem_paths_live: live controller paths per subsystem - node_nvmesubsystem_path_state: per-controller state (live, connecting, dead, etc.) The namespace_info metric enables precise correlation between NVMe block devices (e.g. nvme0n1) and their parent subsystems for workload-aware storage path health alerting. Enable with --collector.nvmesubsystem Depends on: prometheus/procfs#797 Signed-off-by: Shirly Radco <sradco@redhat.com> Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent 618342b commit b614bb3

6 files changed

Lines changed: 423 additions & 34 deletions

File tree

README.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,7 @@ logind | Exposes session counts from [logind](http://www.freedesktop.org/wiki/So
203203
meminfo\_numa | Exposes memory statistics from `/sys/devices/system/node/node[0-9]*/meminfo`, `/sys/devices/system/node/node[0-9]*/numastat`. | Linux
204204
mountstats | Exposes filesystem statistics from `/proc/self/mountstats`. Exposes detailed NFS client statistics. | Linux
205205
network_route | Exposes the routing table as metrics | Linux
206+
nvmesubsystem | Exposes NVMe-oF subsystem path health from `/sys/class/nvme-subsystem/`. | Linux
206207
pcidevice | Exposes pci devices' information including their link status and parent devices. | Linux
207208
perf | Exposes perf based metrics (Warning: Metrics are dependent on kernel configuration and settings). | Linux
208209
processes | Exposes aggregate process statistics from `/proc`. | Linux
@@ -366,6 +367,25 @@ Enable it with `--collector.dmmultipath`.
366367

367368
The `sysfs_name` label (e.g. `dm-0`) matches the `device` label in `node_disk_*` metrics, enabling direct correlation between multipath health and I/O statistics without recording rules.
368369

370+
### NVMe Subsystem Collector
371+
372+
The `nvmesubsystem` collector exposes NVMe-oF (NVMe over Fabrics) subsystem
373+
path health by reading `/sys/class/nvme-subsystem/`. It complements the
374+
existing `nvme` collector (which reports per-controller hardware stats) by
375+
monitoring the **connectivity layer** — how many controller paths are live,
376+
connecting, or dead for each NVMe subsystem.
377+
378+
Enable it with `--collector.nvmesubsystem`.
379+
380+
#### Exposed metrics
381+
382+
| Metric | Description |
383+
|--------|-------------|
384+
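| `node_nvmesubsystem_info` | Info metric with subsystem NQN, model, serial and I/O policy as labels. |
| `node_nvmesubsystem_namespace_info` | Info metric mapping an NVMe namespace block device (`device` label, e.g. `nvme0n1`) to its parent subsystem. |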
385+
| `node_nvmesubsystem_paths` | Number of controller paths for the subsystem. |
386+
| `node_nvmesubsystem_paths_live` | Number of controller paths currently in `live` state. |
387+
| `node_nvmesubsystem_path_state` | Per-controller path state (1 for the current state, 0 for others). |
388+
369389
### Filtering enabled collectors
370390

371391
The `node_exporter` will expose all metrics from enabled collectors by default. This is the recommended way to collect metrics to avoid errors when comparing metrics of different families.
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
// Copyright The Prometheus Authors
2+
// Licensed under the Apache License, Version 2.0 (the "License");
3+
// you may not use this file except in compliance with the License.
4+
// You may obtain a copy of the License at
5+
//
6+
// http://www.apache.org/licenses/LICENSE-2.0
7+
//
8+
// Unless required by applicable law or agreed to in writing, software
9+
// distributed under the License is distributed on an "AS IS" BASIS,
10+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
// See the License for the specific language governing permissions and
12+
// limitations under the License.
13+
14+
package collector
15+
16+
import (
17+
"strings"
18+
"testing"
19+
20+
dto "github.com/prometheus/client_model/go"
21+
)
22+
23+
// labelMap holds the label name -> label value pairs a test expects to find
// on a metric; it is the expectation type consumed by assertGaugeValue and
// matchLabels.
type labelMap map[string]string
24+
25+
func assertGaugeValue(t *testing.T, metrics map[string][]*dto.Metric, metricSubstring string, labels labelMap, expected float64) {
26+
t.Helper()
27+
for desc, ms := range metrics {
28+
if !strings.Contains(desc, metricSubstring) {
29+
continue
30+
}
31+
for _, m := range ms {
32+
if matchLabels(m.GetLabel(), labels) {
33+
got := m.GetGauge().GetValue()
34+
if got != expected {
35+
t.Errorf("%s%v: got %v, want %v", metricSubstring, labels, got, expected)
36+
}
37+
return
38+
}
39+
}
40+
}
41+
t.Errorf("metric %s%v not found", metricSubstring, labels)
42+
}
43+
44+
func matchLabels(pairs []*dto.LabelPair, want labelMap) bool {
45+
if want == nil {
46+
return len(pairs) == 0
47+
}
48+
found := 0
49+
for _, lp := range pairs {
50+
if v, ok := want[lp.GetName()]; ok && v == lp.GetValue() {
51+
found++
52+
}
53+
}
54+
return found == len(want)
55+
}

collector/dmmultipath_linux_test.go

Lines changed: 0 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ package collector
1818
import (
1919
"io"
2020
"log/slog"
21-
"strings"
2221
"testing"
2322

2423
"github.com/prometheus/client_golang/prometheus"
@@ -116,36 +115,3 @@ func TestIsPathActive(t *testing.T) {
116115
}
117116
}
118117

119-
type labelMap map[string]string
120-
121-
func assertGaugeValue(t *testing.T, metrics map[string][]*dto.Metric, metricSubstring string, labels labelMap, expected float64) {
122-
t.Helper()
123-
for desc, ms := range metrics {
124-
if !strings.Contains(desc, metricSubstring) {
125-
continue
126-
}
127-
for _, m := range ms {
128-
if matchLabels(m.GetLabel(), labels) {
129-
got := m.GetGauge().GetValue()
130-
if got != expected {
131-
t.Errorf("%s%v: got %v, want %v", metricSubstring, labels, got, expected)
132-
}
133-
return
134-
}
135-
}
136-
}
137-
t.Errorf("metric %s%v not found", metricSubstring, labels)
138-
}
139-
140-
func matchLabels(pairs []*dto.LabelPair, want labelMap) bool {
141-
if want == nil {
142-
return len(pairs) == 0
143-
}
144-
found := 0
145-
for _, lp := range pairs {
146-
if v, ok := want[lp.GetName()]; ok && v == lp.GetValue() {
147-
found++
148-
}
149-
}
150-
return found == len(want)
151-
}

collector/fixtures/sys.ttar

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2423,6 +2423,104 @@ Lines: 1
24232423
4096
24242424
Mode: 644
24252425
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2426+
Directory: sys/class/nvme-subsystem
2427+
Mode: 755
2428+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2429+
Directory: sys/class/nvme-subsystem/nvme-subsys0
2430+
Mode: 755
2431+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2432+
Path: sys/class/nvme-subsystem/nvme-subsys0/iopolicy
2433+
Lines: 1
2434+
round-robinEOF
2435+
Mode: 644
2436+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2437+
Path: sys/class/nvme-subsystem/nvme-subsys0/model
2438+
Lines: 1
2439+
Dell PowerStoreEOF
2440+
Mode: 644
2441+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2442+
Directory: sys/class/nvme-subsystem/nvme-subsys0/nvme0
2443+
Mode: 755
2444+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2445+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme0/address
2446+
Lines: 1
2447+
nn-0x200000109b123456:pn-0x100000109b123456EOF
2448+
Mode: 644
2449+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2450+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme0/state
2451+
Lines: 1
2452+
liveEOF
2453+
Mode: 644
2454+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2455+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme0/transport
2456+
Lines: 1
2457+
fcEOF
2458+
Mode: 644
2459+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2460+
Directory: sys/class/nvme-subsystem/nvme-subsys0/nvme1
2461+
Mode: 755
2462+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2463+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme1/address
2464+
Lines: 1
2465+
nn-0x200000109b123457:pn-0x100000109b123457EOF
2466+
Mode: 644
2467+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2468+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme1/state
2469+
Lines: 1
2470+
liveEOF
2471+
Mode: 644
2472+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2473+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme1/transport
2474+
Lines: 1
2475+
fcEOF
2476+
Mode: 644
2477+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2478+
Directory: sys/class/nvme-subsystem/nvme-subsys0/nvme2
2479+
Mode: 755
2480+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2481+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme2/address
2482+
Lines: 1
2483+
nn-0x200000109b123458:pn-0x100000109b123458EOF
2484+
Mode: 644
2485+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2486+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme2/state
2487+
Lines: 1
2488+
liveEOF
2489+
Mode: 644
2490+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2491+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme2/transport
2492+
Lines: 1
2493+
fcEOF
2494+
Mode: 644
2495+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2496+
Directory: sys/class/nvme-subsystem/nvme-subsys0/nvme3
2497+
Mode: 755
2498+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2499+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme3/address
2500+
Lines: 1
2501+
nn-0x200000109b123459:pn-0x100000109b123459EOF
2502+
Mode: 644
2503+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2504+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme3/state
2505+
Lines: 1
2506+
deadEOF
2507+
Mode: 644
2508+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2509+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme3/transport
2510+
Lines: 1
2511+
fcEOF
2512+
Mode: 644
2513+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2514+
Path: sys/class/nvme-subsystem/nvme-subsys0/serial
2515+
Lines: 1
2516+
SN12345678EOF
2517+
Mode: 644
2518+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2519+
Path: sys/class/nvme-subsystem/nvme-subsys0/subsysnqn
2520+
Lines: 1
2521+
nqn.2014-08.org.nvmexpress:uuid:a34c4f3a-0d6f-5cec-dead-beefcafebabeEOF
2522+
Mode: 644
2523+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
24262524
Directory: sys/class/power_supply
24272525
Mode: 755
24282526
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

collector/nvmesubsystem_linux.go

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
// Copyright The Prometheus Authors
2+
// Licensed under the Apache License, Version 2.0 (the "License");
3+
// you may not use this file except in compliance with the License.
4+
// You may obtain a copy of the License at
5+
//
6+
// http://www.apache.org/licenses/LICENSE-2.0
7+
//
8+
// Unless required by applicable law or agreed to in writing, software
9+
// distributed under the License is distributed on an "AS IS" BASIS,
10+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
// See the License for the specific language governing permissions and
12+
// limitations under the License.
13+
14+
//go:build !nonvmesubsystem
15+
16+
package collector
17+
18+
import (
19+
"errors"
20+
"fmt"
21+
"log/slog"
22+
"os"
23+
24+
"github.com/prometheus/client_golang/prometheus"
25+
"github.com/prometheus/procfs/sysfs"
26+
)
27+
28+
// nvmeControllerStates is the complete set of values the "state" label of
// node_nvmesubsystem_path_state can take. One series per entry is emitted for
// every controller, with exactly one of them set to 1.
//
// Every state that normalizeControllerState can return MUST be listed here;
// otherwise a controller in that state would expose only zero-valued series.
// "unknown" is the catch-all for state strings not listed explicitly.
var nvmeControllerStates = []string{
	"live",
	"connecting",
	"resetting",
	"deleting",
	"deleting (no IO)",
	"new",
	"dead",
	"unknown",
}

// normalizeControllerState maps the raw controller state string read from
// sysfs onto one of the entries of nvmeControllerStates.
//
// Recognized states are returned unchanged; anything else (including the
// empty string) is reported as "unknown". The lookup is driven by
// nvmeControllerStates itself so the two can never disagree.
func normalizeControllerState(raw string) string {
	for _, known := range nvmeControllerStates {
		if raw == known {
			return raw
		}
	}
	return "unknown"
}
42+
43+
// nvmeSubsystemCollector is the collector registered as "nvmesubsystem". It
// holds the sysfs handle it reads NVMe subsystems from, a logger, and one
// metric descriptor per exported metric family.
type nvmeSubsystemCollector struct {
44+
fs sysfs.FS // sysfs handle queried on every Update
45+
logger *slog.Logger
46+
47+
subsystemInfo *prometheus.Desc // node_nvmesubsystem_info
48+
namespaceInfo *prometheus.Desc // node_nvmesubsystem_namespace_info
49+
subsystemPaths *prometheus.Desc // node_nvmesubsystem_paths
50+
subsystemPathsLive *prometheus.Desc // node_nvmesubsystem_paths_live
51+
pathState *prometheus.Desc // node_nvmesubsystem_path_state
52+
}
53+
54+
// init registers the collector under the name "nvmesubsystem" with the
// defaultDisabled state, using NewNVMeSubsystemCollector as its factory.
func init() {
55+
registerCollector("nvmesubsystem", defaultDisabled, NewNVMeSubsystemCollector)
56+
}
57+
58+
// NewNVMeSubsystemCollector returns a new Collector exposing NVMe-oF subsystem
59+
// path health from /sys/class/nvme-subsystem/.
60+
func NewNVMeSubsystemCollector(logger *slog.Logger) (Collector, error) {
61+
const subsystem = "nvmesubsystem"
62+
63+
fs, err := sysfs.NewFS(*sysPath)
64+
if err != nil {
65+
return nil, fmt.Errorf("failed to open sysfs: %w", err)
66+
}
67+
68+
return &nvmeSubsystemCollector{
69+
fs: fs,
70+
logger: logger,
71+
subsystemInfo: prometheus.NewDesc(
72+
prometheus.BuildFQName(namespace, subsystem, "info"),
73+
"Non-numeric information about an NVMe subsystem.",
74+
[]string{"subsystem", "nqn", "model", "serial", "iopolicy"}, nil,
75+
),
76+
namespaceInfo: prometheus.NewDesc(
77+
prometheus.BuildFQName(namespace, subsystem, "namespace_info"),
78+
"Maps an NVMe namespace block device to its subsystem.",
79+
[]string{"subsystem", "device"}, nil,
80+
),
81+
subsystemPaths: prometheus.NewDesc(
82+
prometheus.BuildFQName(namespace, subsystem, "paths"),
83+
"Number of controller paths for an NVMe subsystem.",
84+
[]string{"subsystem"}, nil,
85+
),
86+
subsystemPathsLive: prometheus.NewDesc(
87+
prometheus.BuildFQName(namespace, subsystem, "paths_live"),
88+
"Number of controller paths in live state for an NVMe subsystem.",
89+
[]string{"subsystem"}, nil,
90+
),
91+
pathState: prometheus.NewDesc(
92+
prometheus.BuildFQName(namespace, subsystem, "path_state"),
93+
"Current NVMe controller path state (1 for the current state, 0 for all others).",
94+
[]string{"subsystem", "controller", "transport", "state"}, nil,
95+
),
96+
}, nil
97+
}
98+
99+
func (c *nvmeSubsystemCollector) Update(ch chan<- prometheus.Metric) error {
100+
subsystems, err := c.fs.NVMeSubsystemClass()
101+
if err != nil {
102+
if errors.Is(err, os.ErrNotExist) || errors.Is(err, os.ErrPermission) {
103+
c.logger.Debug("Could not read NVMe subsystem info", "err", err)
104+
return ErrNoData
105+
}
106+
return fmt.Errorf("failed to scan NVMe subsystems: %w", err)
107+
}
108+
109+
for _, subsys := range subsystems {
110+
ch <- prometheus.MustNewConstMetric(c.subsystemInfo, prometheus.GaugeValue, 1,
111+
subsys.Name, subsys.NQN, subsys.Model, subsys.Serial, subsys.IOPolicy)
112+
113+
for _, ns := range subsys.Namespaces {
114+
ch <- prometheus.MustNewConstMetric(c.namespaceInfo, prometheus.GaugeValue, 1,
115+
subsys.Name, ns)
116+
}
117+
118+
total := float64(len(subsys.Controllers))
119+
var live float64
120+
for _, ctrl := range subsys.Controllers {
121+
state := normalizeControllerState(ctrl.State)
122+
if state == "live" {
123+
live++
124+
}
125+
126+
for _, s := range nvmeControllerStates {
127+
val := 0.0
128+
if s == state {
129+
val = 1.0
130+
}
131+
ch <- prometheus.MustNewConstMetric(c.pathState, prometheus.GaugeValue, val,
132+
subsys.Name, ctrl.Name, ctrl.Transport, s)
133+
}
134+
}
135+
136+
ch <- prometheus.MustNewConstMetric(c.subsystemPaths, prometheus.GaugeValue, total, subsys.Name)
137+
ch <- prometheus.MustNewConstMetric(c.subsystemPathsLive, prometheus.GaugeValue, live, subsys.Name)
138+
}
139+
140+
return nil
141+
}

0 commit comments

Comments
 (0)