Skip to content

Commit b341fa2

Browse files
nvidia-gpu: enable gpu software inventory
The patch uses the MCTP VDM command to retrieve the GPU driver version and updates the DBus interface xyz.openbmc_project.Software.Version with this information at DBus object path /xyz/openbmc_project/software/. The patch also associates software inventory to the chassis inventory item. The GPU driver version is made available in Redfish at the URI /redfish/v1/UpdateService/FirmwareInventory/. Tested: Build an image for nvl32-obmc machine with the following patches cherry picked. https://gerrit.openbmc.org/c/openbmc/openbmc/+/85490 The patch cherry-picks the following patches that are currently under review. ``` 1. device tree https://lore.kernel.org/all/aRbLqH8pLWCQryhu@molberding.nvidia.com/ 2. mctpd patches CodeConstruct/mctp#85 3. u-boot changes https://lore.kernel.org/openbmc/20251121-msx4-v1-0-fc0118b666c1@nvidia.com/T/#t 4. kernel changes as specified in the openbmc patch (for espi) 5. entity-manager changes https://gerrit.openbmc.org/c/openbmc/entity-manager/+/85455 6. platform-init changes https://gerrit.openbmc.org/c/openbmc/platform-init/+/85456 7. spi changes https://lore.kernel.org/all/20251121-w25q01jv_fixup-v1-1-3d175050db73@nvidia.com/ ``` The GPU driver version shows up on the DBus. Change-Id: I712fe0952a02f36e386d3f37a5d4a8192ba641de Signed-off-by: Harshit Aghera <haghera@nvidia.com>
1 parent 383c5e3 commit b341fa2

9 files changed

Lines changed: 530 additions & 0 deletions
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
/*
2+
* SPDX-FileCopyrightText: Copyright OpenBMC Authors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
#include "NvidiaDriverInformation.hpp"
7+
8+
#include "Utils.hpp"
9+
10+
#include <MctpRequester.hpp>
11+
#include <NvidiaGpuMctpVdm.hpp>
12+
#include <OcpMctpVdm.hpp>
13+
#include <phosphor-logging/lg2.hpp>
14+
#include <sdbusplus/asio/connection.hpp>
15+
#include <sdbusplus/asio/object_server.hpp>
16+
#include <sdbusplus/message/native_types.hpp>
17+
18+
#include <cstdint>
19+
#include <memory>
20+
#include <span>
21+
#include <string>
22+
#include <system_error>
23+
#include <vector>
24+
25+
// D-Bus object-path prefix for software inventory objects. The constructor
// below appends the escaped device name to this prefix to form the object path.
const std::string softwareInventoryPath = "/xyz/openbmc_project/software/";
26+
27+
/**
 * @brief Create the software-inventory D-Bus object for a GPU driver.
 *
 * Two interfaces are added at softwareInventoryPath + escapeName(name):
 *  - xyz.openbmc_project.Software.Version, with an empty "Version" and a
 *    "Purpose" of VersionPurpose.Other;
 *  - the association interface, with a single "running"/"ran_on" association
 *    that points at the parent of @p path.
 *
 * A failure to initialize either interface is logged and construction
 * continues.
 *
 * @param conn          D-Bus connection; a copy of the shared_ptr is stored.
 * @param mctpRequester Requester later used by update(); stored by reference.
 * @param name          Device name, escaped to form the last path element.
 * @param path          Object path whose parent is the association target.
 * @param eid           MCTP endpoint id of the GPU.
 * @param objectServer  Object server on which the interfaces are created.
 */
NvidiaDriverInformation::NvidiaDriverInformation(
    std::shared_ptr<sdbusplus::asio::connection>& conn,
    mctp::MctpRequester& mctpRequester, const std::string& name,
    const sdbusplus::message::object_path& path, const uint8_t eid,
    sdbusplus::asio::object_server& objectServer) :
    eid(eid), conn(conn), mctpRequester(mctpRequester)
{
    const std::string objectPath = softwareInventoryPath + escapeName(name);

    // Version interface: "Version" starts out empty and is filled in once a
    // response has been decoded by processResponse().
    versionInterface = objectServer.add_interface(
        objectPath, "xyz.openbmc_project.Software.Version");
    versionInterface->register_property<std::string>("Version", "");
    versionInterface->register_property<std::string>(
        "Purpose", "xyz.openbmc_project.Software.Version.VersionPurpose.Other");

    const bool versionReady = versionInterface->initialize();
    if (!versionReady)
    {
        lg2::error(
            "Failed to initialize Version interface for Driver Information for eid {EID}",
            "EID", eid);
    }

    // Association interface on the same object path.
    associationInterface =
        objectServer.add_interface(objectPath, association::interface);

    std::vector<Association> links;
    links.emplace_back("running", "ran_on", path.parent_path());
    associationInterface->register_property("Associations", links);

    const bool associationReady = associationInterface->initialize();
    if (!associationReady)
    {
        lg2::error(
            "Failed to initialize Association interface for Driver Information for eid {EID}",
            "EID", eid);
    }
}
66+
67+
void NvidiaDriverInformation::processResponse(const std::error_code& ec,
68+
std::span<const uint8_t> buffer)
69+
{
70+
if (ec)
71+
{
72+
lg2::error(
73+
"Error updating Driver Information for eid {EID} : sending message over MCTP failed, rc={RC}",
74+
"EID", eid, "RC", ec.message());
75+
return;
76+
}
77+
78+
ocp::accelerator_management::CompletionCode cc{};
79+
uint16_t reasonCode = 0;
80+
gpu::DriverState driverState{};
81+
std::string driverVersion;
82+
83+
const int rc = gpu::decodeGetDriverInformationResponse(
84+
buffer, cc, reasonCode, driverState, driverVersion);
85+
86+
if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
87+
{
88+
lg2::error(
89+
"Error updating Driver Information for eid {EID} : decode failed, rc={RC}, cc={CC}, reasonCode={REASON}",
90+
"EID", eid, "RC", rc, "CC", static_cast<uint8_t>(cc), "REASON",
91+
reasonCode);
92+
return;
93+
}
94+
95+
versionInterface->set_property("Version", driverVersion);
96+
}
97+
98+
void NvidiaDriverInformation::update()
99+
{
100+
const int rc = gpu::encodeGetDriverInformationRequest(0, request);
101+
102+
if (rc != 0)
103+
{
104+
lg2::error(
105+
"Error updating Driver Information for eid {EID} : encode failed, rc={RC}",
106+
"EID", eid, "RC", rc);
107+
return;
108+
}
109+
110+
mctpRequester.sendRecvMsg(
111+
eid, request,
112+
[weak{weak_from_this()}](const std::error_code& ec,
113+
std::span<const uint8_t> buffer) {
114+
std::shared_ptr<NvidiaDriverInformation> self = weak.lock();
115+
if (!self)
116+
{
117+
lg2::error("invalid reference to NvidiaDriverInformation");
118+
return;
119+
}
120+
self->processResponse(ec, buffer);
121+
});
122+
}
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
/*
2+
* SPDX-FileCopyrightText: Copyright OpenBMC Authors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
#pragma once
7+
8+
#include "MctpRequester.hpp"
9+
10+
#include <NvidiaGpuMctpVdm.hpp>
11+
#include <sdbusplus/asio/connection.hpp>
12+
#include <sdbusplus/asio/object_server.hpp>
13+
14+
#include <array>
15+
#include <cstdint>
16+
#include <memory>
17+
#include <string>
18+
19+
/**
 * Publishes the driver version of one GPU as a software-inventory D-Bus
 * object and refreshes it over MCTP.
 *
 * update() captures weak_from_this() for its completion callback, so
 * instances are expected to be owned by a std::shared_ptr (for example
 * created with std::make_shared); otherwise the callback cannot obtain the
 * object and only logs an error.
 */
struct NvidiaDriverInformation :
20+
public std::enable_shared_from_this<NvidiaDriverInformation>
21+
{
22+
public:
23+
// Creates the Software.Version and association interfaces for the device
// called `name`; the association targets the parent of `path`.
NvidiaDriverInformation(
24+
std::shared_ptr<sdbusplus::asio::connection>& conn,
25+
mctp::MctpRequester& mctpRequester, const std::string& name,
26+
const sdbusplus::message::object_path& path, uint8_t eid,
27+
sdbusplus::asio::object_server& objectServer);
28+
29+
// Sends a Get Driver Information request to `eid`; the reply is handled
// asynchronously by processResponse().
void update();
30+
31+
private:
32+
// Completion handler for update(): decodes `buffer` and, on success, sets
// the "Version" property. Errors are logged and otherwise ignored.
void processResponse(const std::error_code& ec,
33+
std::span<const uint8_t> buffer);
34+
35+
// MCTP endpoint id of the GPU.
uint8_t eid{};
36+
37+
// D-Bus connection handed to the constructor (stored; not used by the
// member functions visible in this change).
std::shared_ptr<sdbusplus::asio::connection> conn;
38+
39+
// Requester used to send the request; held by reference, so it must outlive
// this object.
mctp::MctpRequester& mctpRequester;
40+
41+
// Encoded request bytes. NOTE(review): presumably kept as a member so the
// bytes stay valid while the request is in flight -- confirm against
// MctpRequester::sendRecvMsg.
std::array<uint8_t, sizeof(ocp::accelerator_management::CommonRequest)>
42+
request{};
43+
44+
// xyz.openbmc_project.Software.Version interface ("Version", "Purpose").
std::shared_ptr<sdbusplus::asio::dbus_interface> versionInterface;
45+
// Association interface carrying the "running"/"ran_on" association.
std::shared_ptr<sdbusplus::asio::dbus_interface> associationInterface;
46+
};

src/nvidia-gpu/NvidiaGpuDevice.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include <Inventory.hpp>
1212
#include <MctpRequester.hpp>
1313
#include <NvidiaDeviceDiscovery.hpp>
14+
#include <NvidiaDriverInformation.hpp>
1415
#include <NvidiaGpuEnergySensor.hpp>
1516
#include <NvidiaGpuMctpVdm.hpp>
1617
#include <NvidiaGpuPowerPeakReading.hpp>
@@ -93,6 +94,9 @@ void GpuDevice::makeSensors()
9394
conn, mctpRequester, name + "_Voltage_0", path, eid, gpuVoltageSensorId,
9495
objectServer, std::vector<thresholds::Threshold>{});
9596

97+
driverInfo = std::make_shared<NvidiaDriverInformation>(
98+
conn, mctpRequester, name, path, eid, objectServer);
99+
96100
getTLimitThresholds();
97101

98102
lg2::info("Added GPU {NAME} Sensors with chassis path: {PATH}.", "NAME",
@@ -210,6 +214,7 @@ void GpuDevice::read()
210214
peakPower->update();
211215
energySensor->update();
212216
voltageSensor->update();
217+
driverInfo->update();
213218

214219
waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
215220
waitTimer.async_wait(

src/nvidia-gpu/NvidiaGpuDevice.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include "NvidiaGpuPowerSensor.hpp"
1212
#include "NvidiaGpuSensor.hpp"
1313

14+
#include <NvidiaDriverInformation.hpp>
1415
#include <NvidiaGpuEnergySensor.hpp>
1516
#include <NvidiaGpuPowerPeakReading.hpp>
1617
#include <NvidiaGpuVoltageSensor.hpp>
@@ -74,6 +75,7 @@ class GpuDevice : public std::enable_shared_from_this<GpuDevice>
7475
std::shared_ptr<NvidiaGpuPowerPeakReading> peakPower;
7576
std::shared_ptr<NvidiaGpuEnergySensor> energySensor;
7677
std::shared_ptr<NvidiaGpuVoltageSensor> voltageSensor;
78+
std::shared_ptr<NvidiaDriverInformation> driverInfo;
7779

7880
std::array<uint8_t, sizeof(gpu::ReadThermalParametersRequest)>
7981
thermalParamReqMsg{};

src/nvidia-gpu/NvidiaGpuMctpVdm.cpp

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include <cstdint>
1515
#include <cstring>
1616
#include <span>
17+
#include <string>
1718
#include <vector>
1819

1920
namespace gpu
@@ -413,6 +414,74 @@ int decodeGetVoltageResponse(std::span<const uint8_t> buf,
413414
return 0;
414415
}
415416

417+
int encodeGetDriverInformationRequest(uint8_t instanceId,
418+
std::span<uint8_t> buf)
419+
{
420+
if (buf.size() < sizeof(ocp::accelerator_management::CommonRequest))
421+
{
422+
return EINVAL;
423+
}
424+
425+
auto* msg = reinterpret_cast<ocp::accelerator_management::CommonRequest*>(
426+
buf.data());
427+
428+
ocp::accelerator_management::BindingPciVidInfo header{};
429+
header.ocp_accelerator_management_msg_type =
430+
static_cast<uint8_t>(ocp::accelerator_management::MessageType::REQUEST);
431+
header.instance_id = instanceId &
432+
ocp::accelerator_management::instanceIdBitMask;
433+
header.msg_type = static_cast<uint8_t>(MessageType::PLATFORM_ENVIRONMENTAL);
434+
435+
auto rc = packHeader(header, msg->msgHdr.hdr);
436+
437+
if (rc != 0)
438+
{
439+
return rc;
440+
}
441+
442+
msg->command = static_cast<uint8_t>(
443+
PlatformEnvironmentalCommands::GET_DRIVER_INFORMATION);
444+
msg->data_size = 0;
445+
446+
return 0;
447+
}
448+
449+
/**
 * @brief Decode a Get Driver Information response.
 *
 * Layout (see GetDriverInformationResponse): CommonResponse header, one
 * DriverState byte, then the driver version characters. hdr.data_size is
 * little-endian and must cover at least the state byte plus one character.
 *
 * The version length is bounded by BOTH the declared data_size and the bytes
 * actually present in @p buf, and the string ends at the first NUL if there is
 * one. The length was previously derived from buf.size() alone, which dropped
 * the final character of an unterminated string and copied any trailing
 * transport bytes into the version.
 *
 * @param buf           Raw response bytes.
 * @param cc            Out: completion code reported by the device.
 * @param reasonCode    Out: reason code reported by the device.
 * @param driverState   Out: driver state; written only on success.
 * @param driverVersion Out: driver version; written only on success.
 * @return 0 when the message was parsed (the caller must still check @p cc,
 *         since a non-SUCCESS completion code returns the status of
 *         decodeReasonCodeAndCC() unchanged), EINVAL when the buffer or
 *         data_size is too small, otherwise the error from
 *         decodeReasonCodeAndCC().
 */
int decodeGetDriverInformationResponse(
    std::span<const uint8_t> buf,
    ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
    DriverState& driverState, std::string& driverVersion)
{
    auto rc =
        ocp::accelerator_management::decodeReasonCodeAndCC(buf, cc, reasonCode);

    if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
    {
        return rc;
    }

    if (buf.size() < sizeof(GetDriverInformationResponse))
    {
        return EINVAL;
    }

    const auto* response =
        reinterpret_cast<const GetDriverInformationResponse*>(buf.data());

    const uint16_t dataSize = le16toh(response->hdr.data_size);

    if (dataSize < sizeof(DriverState) + sizeof(char))
    {
        return EINVAL;
    }

    // driverVersion is the last member of the packed struct, so it starts
    // sizeof(char) bytes before the end of the fixed part.
    const size_t versionOffset =
        sizeof(GetDriverInformationResponse) - sizeof(char);

    // Both quantities are >= 1 thanks to the two size checks above.
    const size_t bytesInBuffer = buf.size() - versionOffset;
    const size_t bytesDeclared = dataSize - sizeof(DriverState);

    // Never read past the buffer, never read past what the device declared.
    const size_t versionCapacity =
        bytesDeclared < bytesInBuffer ? bytesDeclared : bytesInBuffer;

    const char* versionBegin = &response->driverVersion;

    // Stop at the NUL terminator when present; otherwise keep every byte.
    const void* terminator = std::memchr(versionBegin, '\0', versionCapacity);
    const size_t versionSize =
        terminator != nullptr
            ? static_cast<size_t>(static_cast<const char*>(terminator) -
                                  versionBegin)
            : versionCapacity;

    driverState = response->driverState;
    driverVersion.assign(versionBegin, versionSize);

    return 0;
}
484+
416485
int encodeGetInventoryInformationRequest(uint8_t instanceId, uint8_t propertyId,
417486
std::span<uint8_t> buf)
418487
{

src/nvidia-gpu/NvidiaGpuMctpVdm.hpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ enum class PlatformEnvironmentalCommands : uint8_t
4343
GET_MAX_OBSERVED_POWER = 0x04,
4444
GET_CURRENT_ENERGY_COUNTER = 0x06,
4545
GET_INVENTORY_INFORMATION = 0x0C,
46+
GET_DRIVER_INFORMATION = 0x0E,
4647
GET_VOLTAGE = 0x0F,
4748
};
4849

@@ -106,6 +107,13 @@ enum class PciePortType : uint8_t
106107
DOWNSTREAM = 1,
107108
};
108109

110+
// One-byte driver state carried in GetDriverInformationResponse::driverState.
// NOTE(review): the meaning of each value is taken from its name only --
// confirm against the device's command specification.
enum class DriverState : uint8_t
111+
{
112+
DRIVER_STATE_UNKNOWN = 0,
113+
DRIVER_STATE_NOT_LOADED = 1,
114+
DRIVER_STATE_LOADED = 2,
115+
};
116+
109117
struct QueryDeviceIdentificationRequest
110118
{
111119
ocp::accelerator_management::CommonRequest hdr;
@@ -189,6 +197,13 @@ struct ListPCIePortsDownstreamPortsData
189197
uint8_t count;
190198
} __attribute__((packed));
191199

200+
// Packed wire layout of the fixed part of a Get Driver Information response.
// decodeGetDriverInformationResponse() overlays this struct on the received
// bytes.
struct GetDriverInformationResponse
201+
{
202+
// Common response header; hdr.data_size is read with le16toh() by the decoder.
ocp::accelerator_management::CommonResponse hdr;
203+
// Driver state byte.
DriverState driverState;
204+
// FIRST character of the version string only: the decoder takes the address
// of this member and reads the following bytes of the message from there, so
// sizeof(GetDriverInformationResponse) already includes one version byte.
char driverVersion;
205+
} __attribute__((packed));
206+
192207
struct GetInventoryInformationRequest
193208
{
194209
ocp::accelerator_management::CommonRequest hdr;
@@ -251,6 +266,14 @@ int decodeGetVoltageResponse(std::span<const uint8_t> buf,
251266
ocp::accelerator_management::CompletionCode& cc,
252267
uint16_t& reasonCode, uint32_t& voltage);
253268

269+
// Encodes a Get Driver Information request (GET_DRIVER_INFORMATION command,
// no payload) into buf. Returns 0 on success; non-zero if buf is smaller than
// a CommonRequest or the header cannot be packed.
int encodeGetDriverInformationRequest(uint8_t instanceId,
270+
std::span<uint8_t> buf);
271+
272+
// Decodes a Get Driver Information response from buf. cc and reasonCode
// receive the device-reported completion and reason codes; driverState and
// driverVersion are written only when the message parses and cc is SUCCESS.
// Returns 0 when the message was parsed (callers must still check cc) and a
// non-zero value for malformed input.
int decodeGetDriverInformationResponse(
273+
std::span<const uint8_t> buf,
274+
ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
275+
DriverState& driverState, std::string& driverVersion);
276+
254277
int encodeGetInventoryInformationRequest(uint8_t instanceId, uint8_t propertyId,
255278
std::span<uint8_t> buf);
256279

src/nvidia-gpu/NvidiaGpuSensorMain.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ int main()
5555
sdbusplus::asio::object_server objectServer(systemBus, true);
5656
objectServer.add_manager("/xyz/openbmc_project/sensors");
5757
objectServer.add_manager("/xyz/openbmc_project/inventory");
58+
objectServer.add_manager("/xyz/openbmc_project/software");
5859
systemBus->request_name("xyz.openbmc_project.GpuSensor");
5960

6061
mctp::MctpRequester mctpRequester(io);

src/nvidia-gpu/meson.build

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ gpusensor_sources = files(
22
'Inventory.cpp',
33
'MctpRequester.cpp',
44
'NvidiaDeviceDiscovery.cpp',
5+
'NvidiaDriverInformation.cpp',
56
'NvidiaGpuDevice.cpp',
67
'NvidiaGpuEnergySensor.cpp',
78
'NvidiaGpuMctpVdm.cpp',

0 commit comments

Comments
 (0)