diff --git a/collector/hwmon_linux.go b/collector/hwmon_linux.go index fe4ac817aa..f2b71dfbdf 100644 --- a/collector/hwmon_linux.go +++ b/collector/hwmon_linux.go @@ -161,19 +161,9 @@ func collectSensorData(dir string, data map[string]map[string]string) error { return nil } -func (c *hwMonCollector) updateHwmon(ch chan<- prometheus.Metric, dir string) error { - hwmonName, err := c.hwmonName(dir) - if err != nil { - return err - } - - if c.deviceFilter.ignored(hwmonName) { - c.logger.Debug("ignoring hwmon chip", "chip", hwmonName) - return nil - } - +func (c *hwMonCollector) updateHwmon(ch chan<- prometheus.Metric, dir, hwmonName string) error { data := make(map[string]map[string]string) - err = collectSensorData(dir, data) + err := collectSensorData(dir, data) if err != nil { return err } @@ -452,7 +442,7 @@ func (c *hwMonCollector) hwmonHumanReadableChipName(dir string) (string, error) func (c *hwMonCollector) Update(ch chan<- prometheus.Metric) error { // Step 1: scan /sys/class/hwmon, resolve all symlinks and call - // updatesHwmon for each folder + // updateHwmon for each folder. hwmonPathName := filepath.Join(sysFilePath("class"), "hwmon") @@ -466,7 +456,20 @@ func (c *hwMonCollector) Update(ch chan<- prometheus.Metric) error { return err } - var lastErr error + // Pass 1: enumerate hwmon directories and pre-compute the device-derived + // chip name. Multiple hwmon nodes can share a single parent device (for + // example, asus-nb-wmi exposes one hwmon for fan control and another for + // WMI sensors), which makes hwmonName collide and triggers the "metric + // collected before with the same name and label values" registry error. + type hwmonEntry struct { + dir string + baseName string + nameFile string + } + + entries := make([]hwmonEntry, 0, len(hwmonFiles)) + chipCounts := make(map[string]int) + nameCounts := make(map[string]int) for _, hwDir := range hwmonFiles { hwmonXPathName := filepath.Join(hwmonPathName, hwDir.Name()) fileInfo, err := os.Lstat(hwmonXPathName) @@ -485,7 +488,48 @@ func (c *hwMonCollector) Update(ch chan<- prometheus.Metric) error { continue } - if err = c.updateHwmon(ch, hwmonXPathName); err != nil { + baseName, err := c.hwmonName(hwmonXPathName) + if err != nil { + c.logger.Debug("failed to derive hwmon chip name", "dir", hwmonXPathName, "err", err) + continue + } + + nameFile := "" + if raw, err := os.ReadFile(filepath.Join(hwmonXPathName, "name")); err == nil { + nameFile = strings.TrimSpace(string(raw)) + } + + entries = append(entries, hwmonEntry{ + dir: hwmonXPathName, + baseName: baseName, + nameFile: nameFile, + }) + chipCounts[baseName]++ + nameCounts[baseName+"\x00"+nameFile]++ + } + + // Pass 2: emit metrics. For each entry, resolve a unique chip label + // (disambiguating colliding base names with the chip's `name` file + // content when distinct, else with the hwmonX basename) and only then + // apply the include/exclude filter so user regexes match the label + // that ends up in the metrics. + var lastErr error + for _, e := range entries { + chipName := e.baseName + if chipCounts[e.baseName] > 1 { + suffix := cleanMetricName(e.nameFile) + if suffix == "" || nameCounts[e.baseName+"\x00"+e.nameFile] > 1 { + suffix = cleanMetricName(filepath.Base(e.dir)) + } + chipName = e.baseName + "_" + suffix + } + + if c.deviceFilter.ignored(chipName) { + c.logger.Debug("ignoring hwmon chip", "chip", chipName) + continue + } + + if err := c.updateHwmon(ch, e.dir, chipName); err != nil { lastErr = err } } diff --git a/collector/hwmon_linux_test.go b/collector/hwmon_linux_test.go new file mode 100644 index 0000000000..c44f7d9971 --- /dev/null +++ b/collector/hwmon_linux_test.go @@ -0,0 +1,275 @@ +// Copyright 2026 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !nohwmon + +package collector + +import ( + "io" + "log/slog" + "os" + "path/filepath" + "testing" + + "github.com/prometheus/client_golang/prometheus" +) + +// fakeHwmon describes a single hwmon node to materialize under a temporary +// /sys layout. The collector's hwmonName function reads the parent of the +// `device` symlink to derive a chip label, which is what produces the +// collision we need to test. +type fakeHwmon struct { + hwmonDir string // e.g. "hwmon3" + device string // parent device dir name, e.g. "asus-nb-wmi". Empty disables the device symlink. + name string // contents of the optional `name` file + files map[string]string // sensor file basename -> content +} + +// buildFakeSysfs writes a minimal /sys tree containing the supplied hwmon +// nodes and returns the path that should be passed as *sysPath. Each +// non-empty `device` shares a common /sys/devices/platform/ +// directory so two hwmon entries can collide on chip name. +func buildFakeSysfs(t *testing.T, hwmons []fakeHwmon) string { + t.Helper() + + sysRoot := t.TempDir() + classHwmon := filepath.Join(sysRoot, "class", "hwmon") + if err := os.MkdirAll(classHwmon, 0o755); err != nil { + t.Fatalf("mkdir class/hwmon: %v", err) + } + + for _, h := range hwmons { + var hwmonReal string + if h.device != "" { + hwmonReal = filepath.Join(sysRoot, "devices", "platform", h.device, "hwmon", h.hwmonDir) + } else { + hwmonReal = filepath.Join(sysRoot, "devices", "virtual", "hwmon", h.hwmonDir) + } + if err := os.MkdirAll(hwmonReal, 0o755); err != nil { + t.Fatalf("mkdir %s: %v", hwmonReal, err) + } + + if h.device != "" { + deviceTarget := filepath.Join(sysRoot, "devices", "platform", h.device) + if err := os.Symlink(deviceTarget, filepath.Join(hwmonReal, "device")); err != nil { + t.Fatalf("symlink device: %v", err) + } + } + + if h.name != "" { + if err := os.WriteFile(filepath.Join(hwmonReal, "name"), []byte(h.name+"\n"), 0o644); err != nil { + t.Fatalf("write name: %v", err) + } + } + + for fname, content := range h.files { + if err := os.WriteFile(filepath.Join(hwmonReal, fname), []byte(content+"\n"), 0o644); err != nil { + t.Fatalf("write %s: %v", fname, err) + } + } + + if err := os.Symlink(hwmonReal, filepath.Join(classHwmon, h.hwmonDir)); err != nil { + t.Fatalf("symlink class/hwmon: %v", err) + } + } + + return sysRoot +} + +// gatherChipLabels runs the collector through a registry and returns the +// observed `chip` label values across all metrics. It also surfaces any +// gather error — which is the failure mode #3637 reported. +func gatherChipLabels(t *testing.T, c *hwMonCollector) ([]string, error) { + t.Helper() + reg := prometheus.NewRegistry() + if err := reg.Register(testHwmonCollector{c: c}); err != nil { + t.Fatalf("register: %v", err) + } + families, err := reg.Gather() + if err != nil { + return nil, err + } + var chips []string + for _, fam := range families { + for _, m := range fam.Metric { + for _, l := range m.Label { + if l.GetName() == "chip" { + chips = append(chips, l.GetValue()) + } + } + } + } + return chips, nil +} + +// testHwmonCollector adapts hwMonCollector.Update for prometheus.Registry +// so we can exercise duplicate detection at the gather step. +type testHwmonCollector struct { + c *hwMonCollector +} + +func (t testHwmonCollector) Describe(ch chan<- *prometheus.Desc) { + prometheus.DescribeByCollect(t, ch) +} + +func (t testHwmonCollector) Collect(ch chan<- prometheus.Metric) { + if err := t.c.Update(ch); err != nil { + panic(err) + } +} + +func newTestHwmonCollector() *hwMonCollector { + return &hwMonCollector{logger: slog.New(slog.NewTextHandler(io.Discard, nil))} +} + +func contains(haystack []string, needle string) bool { + for _, h := range haystack { + if h == needle { + return true + } + } + return false +} + +// Two hwmon entries sharing the same parent device — the configuration +// that triggered #3637 on ASUS WMI laptops — must produce distinct chip +// labels and not error during gather. +func TestHwmonDuplicateChipNamesAreDisambiguated(t *testing.T) { + hwmons := []fakeHwmon{ + { + hwmonDir: "hwmon6", + device: "asus-nb-wmi", + name: "asus", + files: map[string]string{ + "pwm1": "128", + "pwm1_enable": "2", + }, + }, + { + hwmonDir: "hwmon7", + device: "asus-nb-wmi", + name: "asus_wmi_sensors", + files: map[string]string{ + "pwm1": "200", + "pwm1_enable": "2", + }, + }, + } + + sysRoot := buildFakeSysfs(t, hwmons) + prev := *sysPath + t.Cleanup(func() { *sysPath = prev }) + *sysPath = sysRoot + + chips, err := gatherChipLabels(t, newTestHwmonCollector()) + if err != nil { + t.Fatalf("gather: %v", err) + } + + if !contains(chips, "platform_asus_nb_wmi_asus") { + t.Errorf("expected disambiguated chip 'platform_asus_nb_wmi_asus', got %v", uniq(chips)) + } + if !contains(chips, "platform_asus_nb_wmi_asus_wmi_sensors") { + t.Errorf("expected disambiguated chip 'platform_asus_nb_wmi_asus_wmi_sensors', got %v", uniq(chips)) + } + for _, chip := range chips { + if chip == "platform_asus_nb_wmi" { + t.Errorf("colliding chip should not be emitted with bare base name, got %q", chip) + } + } +} + +// When chip names are already unique, the collector must leave them alone +// — no surprise suffixes for unaffected users. +func TestHwmonUniqueChipNamesAreUnchanged(t *testing.T) { + hwmons := []fakeHwmon{ + { + hwmonDir: "hwmon0", + device: "coretemp.0", + name: "coretemp", + files: map[string]string{"temp1_input": "42000"}, + }, + { + hwmonDir: "hwmon1", + device: "coretemp.1", + name: "coretemp", + files: map[string]string{"temp1_input": "43000"}, + }, + } + + sysRoot := buildFakeSysfs(t, hwmons) + prev := *sysPath + t.Cleanup(func() { *sysPath = prev }) + *sysPath = sysRoot + + chips, err := gatherChipLabels(t, newTestHwmonCollector()) + if err != nil { + t.Fatalf("gather: %v", err) + } + if !contains(chips, "platform_coretemp_0") { + t.Errorf("expected platform_coretemp_0, got %v", uniq(chips)) + } + if !contains(chips, "platform_coretemp_1") { + t.Errorf("expected platform_coretemp_1, got %v", uniq(chips)) + } +} + +// When colliding entries share the same `name` file content, the `name` +// disambiguator collapses too. We must still emit unique chip labels by +// falling back to the hwmonX basename. +func TestHwmonDuplicateChipNamesWithSameNameFile(t *testing.T) { + hwmons := []fakeHwmon{ + { + hwmonDir: "hwmon3", + device: "asus-nb-wmi", + name: "asus", + files: map[string]string{"pwm1_enable": "2"}, + }, + { + hwmonDir: "hwmon4", + device: "asus-nb-wmi", + name: "asus", + files: map[string]string{"pwm1_enable": "2"}, + }, + } + + sysRoot := buildFakeSysfs(t, hwmons) + prev := *sysPath + t.Cleanup(func() { *sysPath = prev }) + *sysPath = sysRoot + + chips, err := gatherChipLabels(t, newTestHwmonCollector()) + if err != nil { + t.Fatalf("gather: %v", err) + } + if !contains(chips, "platform_asus_nb_wmi_hwmon3") { + t.Errorf("expected platform_asus_nb_wmi_hwmon3, got %v", uniq(chips)) + } + if !contains(chips, "platform_asus_nb_wmi_hwmon4") { + t.Errorf("expected platform_asus_nb_wmi_hwmon4, got %v", uniq(chips)) + } +} + +func uniq(in []string) []string { + seen := map[string]struct{}{} + out := make([]string, 0, len(in)) + for _, s := range in { + if _, ok := seen[s]; ok { + continue + } + seen[s] = struct{}{} + out = append(out, s) + } + return out +}