From dd6e3d88d8c484eaba1aeacac83a1ed0b2db65d0 Mon Sep 17 00:00:00 2001 From: "maofeng.huang" Date: Mon, 16 Jun 2025 09:35:31 +0800 Subject: [PATCH] Add the interfaces of EventSet, SN, EccMode and other components. --- .gitignore | 22 + Makefile | 17 +- README.md | 14 +- gen/ixml/api.h | 1148 ++++++++++++++++++++++++++++++++---- gen/ixml/cgo_helpers.go | 76 +-- gen/ixml/ixml.yml | 3 + pkg/ixml/api.h | 1148 ++++++++++++++++++++++++++++++++---- pkg/ixml/cgo_helpers.go | 76 +-- pkg/ixml/const.go | 206 ++++++- pkg/ixml/device.go | 320 +++++++--- pkg/ixml/event_set.go | 66 +++ pkg/ixml/ixml.go | 356 +++++++++-- pkg/ixml/ixml_test.go | 87 +++ pkg/ixml/return.go | 106 ++++ pkg/ixml/system.go | 6 + pkg/ixml/types_gen.go | 42 +- samples/attributes/main.go | 57 +- samples/board/main.go | 113 ++++ samples/events/main.go | 96 +++ samples/metrics/main.go | 60 +- samples/system/main.go | 22 +- 21 files changed, 3465 insertions(+), 576 deletions(-) create mode 100644 .gitignore create mode 100644 pkg/ixml/event_set.go create mode 100644 pkg/ixml/ixml_test.go create mode 100644 pkg/ixml/return.go create mode 100644 samples/board/main.go create mode 100644 samples/events/main.go diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..be94944 --- /dev/null +++ b/.gitignore @@ -0,0 +1,22 @@ +*.swp +*.swo +.cache +coverage.out* + +*.out +*.log + +# MacOS +.DS_Store + +# IDE +.idea/ +.vscode/ +*.code-workspace + +# Files generated by editors +*~ +*.iml +*.swp +*.sublime-project +*.sublime-workspace \ No newline at end of file diff --git a/Makefile b/Makefile index b9ac26c..0940eee 100644 --- a/Makefile +++ b/Makefile @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+MODULE := gitee.com/deep-spark/go-ixml + GEN_DIR = $(PWD)/gen PKG_DIR = $(PWD)/pkg GEN_BINDINGS_DIR = $(GEN_DIR)/ixml @@ -24,8 +26,6 @@ SOURCES = $(shell find $(GEN_BINDINGS_DIR) -type f) .PHONY: bindings test-bindings clean-bindings all: bindings -test: test-bindings -clean: clean-bindings bindings: $(SOURCES) rm -rf $(PKG_BINDINGS_DIR)/{ixml,doc,const,cgo_helpers,types,types_gen}.go @@ -38,11 +38,14 @@ bindings: $(SOURCES) cd -> /dev/null rm -rf $(PKG_BINDINGS_DIR)/types.go $(PKG_BINDINGS_DIR)/_obj -test-bindings: bindings - cd $(PKG_BINDINGS_DIR); \ - go test -v .; \ - cd -> /dev/null +COVERAGE_FILE := coverage.out +test: bindings + go test -v -coverprofile=$(COVERAGE_FILE) $(MODULE)/pkg/... + +coverage: test + cat $(COVERAGE_FILE) | grep -v "_mock.go" > $(COVERAGE_FILE).no-mocks + go tool cover -func=$(COVERAGE_FILE).no-mocks -clean-bindings: +clean: rm -rf $(PKG_BINDINGS_DIR)/{ixml,doc,const,cgo_helpers,types,types_gen}.go rm -rf $(PKG_BINDINGS_DIR)/types.go $(PKG_BINDINGS_DIR)/_obj \ No newline at end of file diff --git a/README.md b/README.md index 312a25b..2a254c6 100644 --- a/README.md +++ b/README.md @@ -97,12 +97,22 @@ To get gpm metrics of device, run the following command: go run samples/gpmmetrics/main.go ``` -To get running process information of device,run the following command: +To get running process information of device, run the following command: ```bash go run samples/processinfo/main.go ``` -To get system information such as driver version and CUDA version, run the following command: +To test the interface of event on device, run the following command: +```bash +go run samples/events/main.go +``` + +To test the board information of device, run the following command: +```bash +go run samples/board/main.go +``` + +To test system information such as driver version, CUDA version and IXML version, run the following command: ```bash go run samples/system/main.go ``` diff --git a/gen/ixml/api.h b/gen/ixml/api.h index b5b2f4c..91bd006 100644 --- 
a/gen/ixml/api.h +++ b/gen/ixml/api.h @@ -93,6 +93,8 @@ extern "C" #define DECLDIR #endif +#define DEVICE_MAX_NUM 32 + /** * Return values for NVML API calls. */ @@ -125,6 +127,7 @@ extern "C" NVML_ERROR_INSUFFICIENT_RESOURCES = 23, //!< Ran out of critical resources, other than memory NVML_ERROR_FREQ_NOT_SUPPORTED = 24, //!< Ran out of critical resources, other than memory NVML_ERROR_ARGUMENT_VERSION_MISMATCH = 25, //!< The provided version is invalid/unsupported + NVML_ERROR_DEPRECATED = 26, //!< The requested functionality has been deprecated NVML_ERROR_UNKNOWN = 999 //!< An internal driver error occurred } nvmlReturn_t; @@ -133,10 +136,48 @@ extern "C" struct nvmlDevice_st *handle; } nvmlDevice_t; - typedef struct + /** + * See \ref nvmlDeviceGetMemoryErrorCounter + */ + typedef enum nvmlMemoryLocation_enum { - struct nvmlEventSet_st *handle; - } nvmlEventSet_t; + NVML_MEMORY_LOCATION_L1_CACHE = 0, //!< GPU L1 Cache + NVML_MEMORY_LOCATION_L2_CACHE = 1, //!< GPU L2 Cache + NVML_MEMORY_LOCATION_DRAM = 2, //!< Turing+ DRAM + NVML_MEMORY_LOCATION_DEVICE_MEMORY = 2, //!< GPU Device Memory + NVML_MEMORY_LOCATION_REGISTER_FILE = 3, //!< GPU Register File + NVML_MEMORY_LOCATION_TEXTURE_MEMORY = 4, //!< GPU Texture Memory + NVML_MEMORY_LOCATION_TEXTURE_SHM = 5, //!< Shared memory + NVML_MEMORY_LOCATION_CBU = 6, //!< CBU + NVML_MEMORY_LOCATION_SRAM = 7, //!< Turing+ SRAM + // Keep this last + NVML_MEMORY_LOCATION_COUNT //!< This counts the number of memory locations the driver knows about + } nvmlMemoryLocation_t; + + /** + * Causes for page retirement + */ + typedef enum nvmlPageRetirementCause_enum + { + NVML_PAGE_RETIREMENT_CAUSE_MULTIPLE_SINGLE_BIT_ECC_ERRORS = 0, //!< Page was retired due to multiple single bit ECC error + NVML_PAGE_RETIREMENT_CAUSE_DOUBLE_BIT_ECC_ERROR = 1, //!< Page was retired due to double bit ECC error + + // Keep this last + NVML_PAGE_RETIREMENT_CAUSE_COUNT + } nvmlPageRetirementCause_t; + + /** + * API types that allow changes to default 
permission restrictions + */ + typedef enum nvmlRestrictedAPI_enum + { + NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS = 0, //!< APIs that change application clocks, see nvmlDeviceSetApplicationsClocks + //!< and see nvmlDeviceResetApplicationsClocks + NVML_RESTRICTED_API_SET_AUTO_BOOSTED_CLOCKS = 1, //!< APIs that enable/disable Auto Boosted clocks + //!< see nvmlDeviceSetAutoBoostedClocksEnabled + // Keep this last + NVML_RESTRICTED_API_COUNT + } nvmlRestrictedAPI_t; /** @} */ /** @@ -200,6 +241,15 @@ extern "C" NVML_TEMPERATURE_COUNT } nvmlTemperatureSensors_t; + /** + * Generic enable/disable enum. + */ + typedef enum nvmlEnableState_enum + { + NVML_FEATURE_DISABLED = 0, //!< Feature disabled + NVML_FEATURE_ENABLED = 1 //!< Feature enabled + } nvmlEnableState_t; + /** * Clock types. * @@ -360,15 +410,6 @@ extern "C" struct nvmlGpmSample_st *handle; } nvmlGpmSample_t; - // typedef struct nvmlGpmSample_st { - // nvmlDevice_t device; - // unsigned long long timeStamp; - // unsigned long long sm_active; - // unsigned long long active_warps; - // unsigned long long total_cycles; - // unsigned long long dram_bandwidth; - // } nvmlGpmSample_t; - /** * GPM metric information. */ @@ -408,6 +449,157 @@ extern "C" unsigned int isSupportedDevice; //!< OUT: Indicates device support } nvmlGpmSupport_t; +/** + * Maximum limit on Physical Bridges per Board + */ +#define NVML_MAX_PHYSICAL_BRIDGE (128) + + /** + * Enum to represent type of bridge chip + */ + typedef enum nvmlBridgeChipType_enum + { + NVML_BRIDGE_CHIP_PLX = 0, + NVML_BRIDGE_CHIP_BRO4 = 1 + } nvmlBridgeChipType_t; + + /** + * Information about the Bridge Chip Firmware + */ + typedef struct nvmlBridgeChipInfo_st + { + nvmlBridgeChipType_t type; //!< Type of Bridge Chip + unsigned int fwVersion; //!< Firmware Version. 0=Version is unavailable + } nvmlBridgeChipInfo_t; + + /** + * This structure stores the complete Hierarchy of the Bridge Chip within the board. 
The immediate + * bridge is stored at index 0 of bridgeInfoList, parent to immediate bridge is at index 1 and so forth. + */ + typedef struct nvmlBridgeChipHierarchy_st + { + unsigned char bridgeCount; //!< Number of Bridge Chips on the Board + nvmlBridgeChipInfo_t bridgeChipInfo[NVML_MAX_PHYSICAL_BRIDGE]; //!< Hierarchy of Bridge Chips on the board + } nvmlBridgeChipHierarchy_t; + + /** + * Represents Type of Sampling Event + */ + typedef enum nvmlSamplingType_enum + { + NVML_TOTAL_POWER_SAMPLES = 0, //!< To represent total power drawn by GPU + NVML_GPU_UTILIZATION_SAMPLES = + 1, //!< To represent percent of time during which one or more kernels was executing on the GPU + NVML_MEMORY_UTILIZATION_SAMPLES = + 2, //!< To represent percent of time during which global (device) memory was being read or written + NVML_ENC_UTILIZATION_SAMPLES = 3, //!< To represent percent of time during which NVENC remains busy + NVML_DEC_UTILIZATION_SAMPLES = 4, //!< To represent percent of time during which NVDEC remains busy + NVML_PROCESSOR_CLK_SAMPLES = 5, //!< To represent processor clock samples + NVML_MEMORY_CLK_SAMPLES = 6, //!< To represent memory clock samples + + // Keep this last + NVML_SAMPLINGTYPE_COUNT + } nvmlSamplingType_t; + + /** + * Represents the queryable PCIe utilization counters + */ + typedef enum nvmlPcieUtilCounter_enum + { + NVML_PCIE_UTIL_TX_BYTES = 0, // 1KB granularity + NVML_PCIE_UTIL_RX_BYTES = 1, // 1KB granularity + + // Keep this last + NVML_PCIE_UTIL_COUNT + } nvmlPcieUtilCounter_t; + + /** + * Represents the type for sample value returned + */ + typedef enum nvmlValueType_enum + { + NVML_VALUE_TYPE_DOUBLE = 0, + NVML_VALUE_TYPE_UNSIGNED_INT = 1, + NVML_VALUE_TYPE_UNSIGNED_LONG = 2, + NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3, + NVML_VALUE_TYPE_SIGNED_LONG_LONG = 4, + + // Keep this last + NVML_VALUE_TYPE_COUNT + } nvmlValueType_t; + + /** + * Union to represent different types of Value + */ + typedef union nvmlValue_st + { + double dVal; //!< If the 
value is double + unsigned int uiVal; //!< If the value is unsigned int + unsigned long ulVal; //!< If the value is unsigned long + unsigned long long ullVal; //!< If the value is unsigned long long + signed long long sllVal; //!< If the value is signed long long + } nvmlValue_t; + + /** + * Information for Sample + */ + typedef struct nvmlSample_st + { + unsigned long long timeStamp; //!< CPU Timestamp in microseconds + nvmlValue_t sampleValue; //!< Sample Value + } nvmlSample_t; + + /** + * Represents type of perf policy for which violation times can be queried + */ + typedef enum nvmlPerfPolicyType_enum + { + NVML_PERF_POLICY_POWER = 0, //!< How long did power violations cause the GPU to be below application clocks + NVML_PERF_POLICY_THERMAL = 1, //!< How long did thermal violations cause the GPU to be below application clocks + NVML_PERF_POLICY_SYNC_BOOST = 2, //!< How long did sync boost cause the GPU to be below application clocks + NVML_PERF_POLICY_BOARD_LIMIT = + 3, //!< How long did the board limit cause the GPU to be below application clocks + NVML_PERF_POLICY_LOW_UTILIZATION = + 4, //!< How long did low utilization cause the GPU to be below application clocks + NVML_PERF_POLICY_RELIABILITY = + 5, //!< How long did the board reliability limit cause the GPU to be below application clocks + + NVML_PERF_POLICY_TOTAL_APP_CLOCKS = + 10, //!< Total time the GPU was held below application clocks by any limiter (0 - 5 above) + NVML_PERF_POLICY_TOTAL_BASE_CLOCKS = 11, //!< Total time the GPU was held below base clocks + + // Keep this last + NVML_PERF_POLICY_COUNT + } nvmlPerfPolicyType_t; + + /** + * Compute mode. + * + * NVML_COMPUTEMODE_EXCLUSIVE_PROCESS was added in CUDA 4.0. + * Earlier CUDA versions supported a single exclusive mode, + * which is equivalent to NVML_COMPUTEMODE_EXCLUSIVE_THREAD in CUDA 4.0 and beyond. 
+ */ + typedef enum nvmlComputeMode_enum + { + NVML_COMPUTEMODE_DEFAULT = 0, //!< Default compute mode -- multiple contexts per device + NVML_COMPUTEMODE_EXCLUSIVE_THREAD = 1, //!< Support Removed + NVML_COMPUTEMODE_PROHIBITED = 2, //!< Compute-prohibited mode -- no contexts per device + NVML_COMPUTEMODE_EXCLUSIVE_PROCESS = 3, //!< Compute-exclusive-process mode -- only one context per device, + //!< usable from multiple threads at a time + + // Keep this last + NVML_COMPUTEMODE_COUNT + } nvmlComputeMode_t; + + /** + * Struct to hold perf policy violation status data + */ + typedef struct nvmlViolationTime_st + { + unsigned long long referenceTime; //!< referenceTime represents CPU timestamp in microseconds + unsigned long long violationTime; //!< violationTime in Nanoseconds + } nvmlViolationTime_t; + #define NVML_GPM_SUPPORT_VERSION 1 /** * Buffer size guaranteed to be large enough for storing GPU identifiers. @@ -419,6 +611,11 @@ extern "C" */ #define NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE 80 +/** + * Buffer size guaranteed to be large enough for \ref nvmlSystemGetNVMLVersion + */ +#define NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE 80 + /** * Buffer size guaranteed to be large enough for storing GPU device names. */ @@ -429,6 +626,21 @@ extern "C" */ #define NVML_DEVICE_NAME_V2_BUFFER_SIZE 96 +/** + * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetBoardPartNumber + */ +#define NVML_DEVICE_PART_NUMBER_BUFFER_SIZE 80 + +/** + * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetSerial + */ +#define NVML_DEVICE_SERIAL_BUFFER_SIZE 30 + +/** + * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetVbiosVersion + */ +#define NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE 32 + /** * Buffer size guaranteed to be large enough for pci bus id */ @@ -728,6 +940,24 @@ extern "C" */ nvmlReturn_t DECLDIR nvmlSystemGetDriverVersion(char *version, unsigned int length); + /** + * Retrieves the version of the NVML library. + * + * For all products. 
+ * + * The version identifier is an alphanumeric string. It will not exceed 80 characters in length + * (including the NULL terminator). See \ref nvmlConstants::NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE. + * + * @param version Reference in which to return the version identifier + * @param length The maximum allowed length of the string returned in \a version + * + * @return + * - \ref NVML_SUCCESS if \a version has been set + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + */ + nvmlReturn_t DECLDIR nvmlSystemGetNVMLVersion(char *version, unsigned int length); + /** * Retrieves the version of the CUDA driver. * @@ -935,205 +1165,639 @@ extern "C" nvmlReturn_t DECLDIR nvmlDeviceGetUtilizationRates(nvmlDevice_t device, nvmlUtilization_t *utilization); /** - * Retrieves the PCI attributes of this device. + * Retrieves the current compute mode for the device. * * For all products. * - * See \ref nvmlPciInfo_t for details on the available PCI info. + * See \ref nvmlComputeMode_t for details on allowed compute modes. 
* * @param device The identifier of the target device - * @param pci Reference in which to return the PCI info + * @param mode Reference in which to return the current compute mode * * @return - * - \ref NVML_SUCCESS if \a pci has been populated + * - \ref NVML_SUCCESS if \a mode has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pci is NULL + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceSetComputeMode() */ - nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo_v3(nvmlDevice_t device, nvmlPciInfo_t *pci); + nvmlReturn_t DECLDIR nvmlDeviceGetComputeMode(nvmlDevice_t device, nvmlComputeMode_t *mode); /** - * Retrieves the NVML index of this device. + * Retrieves the CUDA compute capability of the device. * * For all products. * - * Valid indices are derived from the \a accessibleDevices count returned by - * \ref nvmlDeviceGetCount_v2(). For example, if \a accessibleDevices is 2 the valid indices - * are 0 and 1, corresponding to GPU 0 and GPU 1. - * - * The order in which NVML enumerates devices has no guarantees of consistency between reboots. For that reason it - * is recommended that devices be looked up by their PCI ids or GPU UUID. See - * \ref nvmlDeviceGetHandleByPciBusId_v2() and \ref nvmlDeviceGetHandleByUUID(). - * - * When used with MIG device handles this API returns indices that can be - * passed to \ref nvmlDeviceGetMigDeviceHandleByIndex to retrieve an identical handle. - * MIG device indices are unique within a device. - * - * Note: The NVML index may not correlate with other APIs, such as the CUDA device index. 
+ * Returns the major and minor compute capability version numbers of the + * device. The major and minor versions are equivalent to the + * CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR and + * CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR attributes that would be + * returned by CUDA's cuDeviceGetAttribute(). * * @param device The identifier of the target device - * @param index Reference in which to return the NVML index of the device + * @param major Reference in which to return the major CUDA compute capability + * @param minor Reference in which to return the minor CUDA compute capability * * @return - * - \ref NVML_SUCCESS if \a index has been set + * - \ref NVML_SUCCESS if \a major and \a minor have been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a index is NULL + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a major or \a minor are NULL * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetHandleByIndex() - * @see nvmlDeviceGetCount() */ - nvmlReturn_t DECLDIR nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int *index); + nvmlReturn_t DECLDIR nvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int *major, int *minor); /** - * Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory) + * Retrieves the current and pending ECC modes for the device. * * For Fermi &tm; or newer fully supported devices. + * Only applicable to devices with ECC. + * Requires \a NVML_INFOROM_ECC version 1.0 or higher. * - * On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw. + * Changing ECC modes requires a reboot. The "pending" ECC mode refers to the target mode following + * the next reboot. 
* - * It is only available if power management mode is supported. See \ref nvmlDeviceGetPowerManagementMode. + * See \ref nvmlEnableState_t for details on allowed modes. * * @param device The identifier of the target device - * @param power Reference in which to return the power usage information + * @param current Reference in which to return the current ECC mode + * @param pending Reference in which to return the pending ECC mode * * @return - * - \ref NVML_SUCCESS if \a power has been populated + * - \ref NVML_SUCCESS if \a current and \a pending have been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a power is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support power readings + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or either \a current or \a pending is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceSetEccMode() */ - nvmlReturn_t DECLDIR nvmlDeviceGetPowerUsage(nvmlDevice_t device, unsigned int *power); + nvmlReturn_t DECLDIR nvmlDeviceGetEccMode(nvmlDevice_t device, + nvmlEnableState_t *current, + nvmlEnableState_t *pending); /** - * Check if the GPU devices are on the same physical board. + * Retrieves the default ECC modes for the device. * - * For all fully supported products. + * For Fermi &tm; or newer fully supported devices. + * Only applicable to devices with ECC. + * Requires \a NVML_INFOROM_ECC version 1.0 or higher. * - * @param device1 The first GPU device - * @param device2 The second GPU device - * @param onSameBoard Reference in which to return the status. - * Non-zero indicates that the GPUs are on the same board. + * See \ref nvmlEnableState_t for details on allowed modes. 
+ * + * @param device The identifier of the target device + * @param defaultMode Reference in which to return the default ECC mode * * @return - * - \ref NVML_SUCCESS if \a onSameBoard has been set + * - \ref NVML_SUCCESS if \a defaultMode has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a dev1 or \a dev2 are invalid or \a onSameBoard is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this check is not supported by the device - * - \ref NVML_ERROR_GPU_IS_LOST if the either GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a defaultMode is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceSetEccMode() */ - nvmlReturn_t DECLDIR nvmlDeviceOnSameBoard(nvmlDevice_t device1, nvmlDevice_t device2, int *onSameBoard); + nvmlReturn_t DECLDIR nvmlDeviceGetDefaultEccMode(nvmlDevice_t device, nvmlEnableState_t *defaultMode); /** - * Get information about processes with a compute context on a device - * - * For Fermi &tm; or newer fully supported devices. - * - * This function returns information only about compute running processes (e.g. CUDA application which have - * active context). Any graphics applications (e.g. using OpenGL, DirectX) won't be listed by this function. - * - * To query the current number of running compute processes, call this function with *infoCount = 0. The - * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call - * \a infos is allowed to be NULL. - * - * The usedGpuMemory field returned is all of the memory used by the application. + * Retrieves the device boardId from 0-N. 
+ * Devices with the same boardId indicate GPUs connected to the same PLX. Use in conjunction with + * \ref nvmlDeviceGetMultiGpuBoard() to decide if they are on the same board as well. + * The boardId returned is a unique ID for the current configuration. Uniqueness and ordering across + * reboots and system configurations is not guaranteed (i.e. if a Tesla K40c returns 0x100 and + * the two GPUs on a Tesla K10 in the same system returns 0x200 it is not guaranteed they will + * always return those values but they will always be different from each other). * - * Keep in mind that information returned by this call is dynamic and the number of elements might change in - * time. Allocate more space for \a infos table in case new compute processes are spawned. * - * @note In MIG mode, if device handle is provided, the API returns aggregate information, only if - * the caller has appropriate privileges. Per-instance information can be queried by using - * specific MIG device handles. - * Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode. + * For Fermi &tm; or newer fully supported devices. 
* - * @param device The device handle or MIG device handle - * @param infoCount Reference in which to provide the \a infos array size, and - * to return the number of returned elements - * @param infos Reference in which to return the process information + * @param device The identifier of the target device + * @param boardId Reference in which to return the device's board ID * * @return - * - \ref NVML_SUCCESS if \a infoCount and \a infos have been populated + * - \ref NVML_SUCCESS if \a boardId has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small - * \a infoCount will contain minimal amount of space necessary for - * the call to complete - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, either of \a infoCount or \a infos is NULL + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a boardId is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by \a device * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see \ref nvmlSystemGetProcessName - */ - nvmlReturn_t DECLDIR nvmlDeviceGetComputeRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_v1_t *infos); - - /** - * @deprecated Use \ref nvmlDeviceGetCurrentClocksEventReasons instead */ + nvmlReturn_t DECLDIR nvmlDeviceGetBoardId(nvmlDevice_t device, unsigned int *boardId); /** - * Retrieve the PCIe replay counter. + * Retrieves whether the device is on a Multi-GPU Board + * Devices that are on multi-GPU boards will set \a multiGpuBool to a non-zero value. * - * For Kepler &tm; or newer fully supported devices. 
+ * For Fermi &tm; or newer fully supported devices. * * @param device The identifier of the target device - * @param value Reference in which to return the counter's value + * @param multiGpuBool Reference in which to return a zero or non-zero value + * to indicate whether the device is on a multi GPU board * * @return - * - \ref NVML_SUCCESS if \a value has been set + * - \ref NVML_SUCCESS if \a multiGpuBool has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a value is NULL + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a multiGpuBool is NULL * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ - nvmlReturn_t DECLDIR nvmlDeviceGetPcieReplayCounter(nvmlDevice_t device, unsigned int *value); - - /** @} */ // @defgroup nvmlGPMStructs - - /***************************************************************************************************/ - /** @defgroup nvmlGpmFunctions GPM Functions - * @{ - */ - /***************************************************************************************************/ + nvmlReturn_t DECLDIR nvmlDeviceGetMultiGpuBoard(nvmlDevice_t device, unsigned int *multiGpuBool); /** - * Calculate GPM metrics from two samples. + * Retrieves the PCI attributes of this device. * - * For Hopper &tm; or newer fully supported devices. + * For all products. * - * @param metricsGet IN/OUT: populated \a nvmlGpmMetricsGet_t struct + * See \ref nvmlPciInfo_t for details on the available PCI info. + * + * @param device The identifier of the target device + * @param pci Reference in which to return the PCI info * * @return - * - \ref NVML_SUCCESS on success - * - Nonzero NVML_ERROR_? 
enum on error + * - \ref NVML_SUCCESS if \a pci has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pci is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ - nvmlReturn_t DECLDIR nvmlGpmMetricsGet(nvmlGpmMetricsGet_t *metricsGet); + nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo_v3(nvmlDevice_t device, nvmlPciInfo_t *pci); /** - * Indicate whether the supplied device supports GPM + * Retrieves the maximum PCIe link generation possible with this device and system * - * @param device NVML device to query for - * @param gpmSupport Structure to indicate GPM support \a nvmlGpmSupport_t. Indicates - * GPM support per system for the supplied device + * I.E. for a generation 2 PCIe device attached to a generation 1 PCIe bus the max link generation this function + * will report is generation 1. + * + * For Fermi &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param maxLinkGen Reference in which to return the max PCIe link generation * * @return - * - NVML_SUCCESS on success - * - Nonzero NVML_ERROR_? 
enum if there is an error in processing the query + * - \ref NVML_SUCCESS if \a maxLinkGen has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a maxLinkGen is null + * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ - nvmlReturn_t DECLDIR nvmlGpmQueryDeviceSupport(nvmlDevice_t device, nvmlGpmSupport_t *gpmSupport); + nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkGeneration(nvmlDevice_t device, unsigned int *maxLinkGen); /** - * Free an allocated sample buffer that was allocated with \ref nvmlGpmSampleAlloc() + * Retrieves the maximum PCIe link width possible with this device and system * - * For Hopper &tm; or newer fully supported devices. + * I.E. for a device with a 16x PCIe bus width attached to a 8x PCIe system bus this function will report + * a max link width of 8. * - * @param gpmSample Sample to free + * For Fermi &tm; or newer fully supported devices. 
+ * + * @param device The identifier of the target device + * @param maxLinkWidth Reference in which to return the max PCIe link width + * + * @return + * - \ref NVML_SUCCESS if \a maxLinkWidth has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a maxLinkWidth is null + * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ + nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkWidth(nvmlDevice_t device, unsigned int *maxLinkWidth); + + /** + * Retrieves the current PCIe link generation + * + * For Fermi &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param currLinkGen Reference in which to return the current PCIe link generation + * + * @return + * - \ref NVML_SUCCESS if \a currLinkGen has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a currLinkGen is null + * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ + nvmlReturn_t DECLDIR nvmlDeviceGetCurrPcieLinkGeneration(nvmlDevice_t device, unsigned int *currLinkGen); + + /** + * Retrieves the current PCIe link width + * + * For Fermi &tm; or newer fully supported devices.
+ * + * @param device The identifier of the target device + * @param currLinkWidth Reference in which to return the current PCIe link width + * + * @return + * - \ref NVML_SUCCESS if \a currLinkWidth has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a currLinkWidth is null + * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ + nvmlReturn_t DECLDIR nvmlDeviceGetCurrPcieLinkWidth(nvmlDevice_t device, unsigned int *currLinkWidth); + + /** + * Retrieve PCIe utilization information. + * This function is querying a byte counter over a 20ms interval and thus is the + * PCIe throughput over that interval. + * + * For Maxwell &tm; or newer fully supported devices. + * + * This method is not supported in virtual machines running virtual GPU (vGPU).
+ * + * @param device The identifier of the target device + * @param counter The specific counter that should be queried \ref + * nvmlPcieUtilCounter_t + * @param value Reference in which to return throughput in KB/s + * + * @return + * - \ref NVML_SUCCESS if \a value has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a counter is invalid, or \a value is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ + nvmlReturn_t DECLDIR nvmlDeviceGetPcieThroughput(nvmlDevice_t device, + nvmlPcieUtilCounter_t counter, + unsigned int *value); + + /** + * @deprecated Use \ref nvmlDeviceGetCurrentClocksEventReasons instead + */ + + /** + * Retrieve the PCIe replay counter. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param value Reference in which to return the counter's value + * + * @return + * - \ref NVML_SUCCESS if \a value has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a value is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ + nvmlReturn_t DECLDIR nvmlDeviceGetPcieReplayCounter(nvmlDevice_t device, unsigned int *value); + + /** + * Retrieves the NVML index of this device. + * + * For all products. + * + * Valid indices are derived from the \a accessibleDevices count returned by + * \ref nvmlDeviceGetCount_v2(). 
For example, if \a accessibleDevices is 2 the valid indices + * are 0 and 1, corresponding to GPU 0 and GPU 1. + * + * The order in which NVML enumerates devices has no guarantees of consistency between reboots. For that reason it + * is recommended that devices be looked up by their PCI ids or GPU UUID. See + * \ref nvmlDeviceGetHandleByPciBusId_v2() and \ref nvmlDeviceGetHandleByUUID(). + * + * When used with MIG device handles this API returns indices that can be + * passed to \ref nvmlDeviceGetMigDeviceHandleByIndex to retrieve an identical handle. + * MIG device indices are unique within a device. + * + * Note: The NVML index may not correlate with other APIs, such as the CUDA device index. + * + * @param device The identifier of the target device + * @param index Reference in which to return the NVML index of the device + * + * @return + * - \ref NVML_SUCCESS if \a index has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a index is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetHandleByIndex() + * @see nvmlDeviceGetCount() + */ + nvmlReturn_t DECLDIR nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int *index); + + /** + * Retrieves the globally unique board serial number associated with this device's board. + * + * For all products with an inforom. + * + * The serial number is an alphanumeric string that will not exceed 30 characters (including the NULL terminator). + * This number matches the serial number tag that is physically attached to the board. See \ref + * nvmlConstants::NVML_DEVICE_SERIAL_BUFFER_SIZE. 
+ * + * @param device The identifier of the target device + * @param serial Reference in which to return the board/module serial number + * @param length The maximum allowed length of the string returned in \a serial + * + * @return + * - \ref NVML_SUCCESS if \a serial has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a serial is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ + nvmlReturn_t DECLDIR nvmlDeviceGetSerial(nvmlDevice_t device, char *serial, unsigned int length); + + /** + * Acquire the handle for a particular device, based on its board serial number. + * + * For Fermi &tm; or newer fully supported devices. + * + * This number corresponds to the value printed directly on the board, and to the value returned by + * \ref nvmlDeviceGetSerial(). + * + * @deprecated Since more than one GPU can exist on a single board this function is deprecated in favor + * of \ref nvmlDeviceGetHandleByUUID. + * For dual GPU boards this function will return NVML_ERROR_INVALID_ARGUMENT. 
+ * + * Starting from NVML 5, this API causes NVML to initialize the target GPU + * NVML may initialize additional GPUs as it searches for the target GPU + * + * @param serial The board serial number of the target GPU + * @param device Reference in which to return the device handle + * + * @return + * - \ref NVML_SUCCESS if \a device has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a serial is invalid, \a device is NULL or more than one + * device has the same serial (dual GPU boards) + * - \ref NVML_ERROR_NOT_FOUND if \a serial does not match a valid device on the system + * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power + * cables + * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs + * - \ref NVML_ERROR_GPU_IS_LOST if any GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetSerial + * @see nvmlDeviceGetHandleByUUID + */ + nvmlReturn_t DECLDIR nvmlDeviceGetHandleBySerial(const char *serial, nvmlDevice_t *device); + + /** + * Get VBIOS version of the device. + * + * For all products. + * + * The VBIOS version may change from time to time. It will not exceed 32 characters in length + * (including the NULL terminator). See \ref nvmlConstants::NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE. 
+ * + * @param device The identifier of the target device + * @param version Reference to which to return the VBIOS version + * @param length The maximum allowed length of the string returned in \a version + * + * @return + * - \ref NVML_SUCCESS if \a version has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a version is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ + nvmlReturn_t DECLDIR nvmlDeviceGetVbiosVersion(nvmlDevice_t device, char *version, unsigned int length); + + /** + * Retrieves the device board part number which is programmed into the board's InfoROM + * + * For all products. + * + * @param device Identifier of the target device + * @param partNumber Reference to the buffer to return + * @param length Length of the buffer reference + * + * @return + * - \ref NVML_SUCCESS if \a partNumber has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_NOT_SUPPORTED if the needed VBIOS fields have not been filled + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a partNumber is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise + * inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ + nvmlReturn_t DECLDIR nvmlDeviceGetBoardPartNumber(nvmlDevice_t device, char *partNumber, unsigned int length); + + /** + * Set the ECC mode for the device. + * + * For Kepler &tm; or newer fully supported devices. + * Only applicable to devices with ECC. + * Requires \a NVML_INFOROM_ECC version 1.0 or higher. + * Requires root/admin permissions. + * + * The ECC mode determines whether the GPU enables its ECC support.
+ * + * This operation takes effect after the next reboot. + * + * See \ref nvmlEnableState_t for details on available modes. + * + * @param device The identifier of the target device + * @param ecc The target ECC mode + * + * @return + * - \ref NVML_SUCCESS if the ECC mode was set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a ecc is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetEccMode() + */ + nvmlReturn_t DECLDIR nvmlDeviceSetEccMode(nvmlDevice_t device, nvmlEnableState_t ecc); + + /** + * Retrieves the current and pending ECC modes for the device. + * + * For Fermi &tm; or newer fully supported devices. + * Only applicable to devices with ECC. + * Requires \a NVML_INFOROM_ECC version 1.0 or higher. + * + * Changing ECC modes requires a reboot. The "pending" ECC mode refers to the target mode following + * the next reboot. + * + * See \ref nvmlEnableState_t for details on allowed modes. 
+ * + * @param device The identifier of the target device + * @param current Reference in which to return the current ECC mode + * @param pending Reference in which to return the pending ECC mode + * + * @return + * - \ref NVML_SUCCESS if \a current and \a pending have been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or either \a current or \a pending is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceSetEccMode() + */ + nvmlReturn_t DECLDIR nvmlDeviceGetEccMode(nvmlDevice_t device, + nvmlEnableState_t *current, + nvmlEnableState_t *pending); + + /** + * Retrieves the default ECC modes for the device. + * + * For Fermi &tm; or newer fully supported devices. + * Only applicable to devices with ECC. + * Requires \a NVML_INFOROM_ECC version 1.0 or higher. + * + * See \ref nvmlEnableState_t for details on allowed modes. 
+ * + * @param device The identifier of the target device + * @param defaultMode Reference in which to return the default ECC mode + * + * @return + * - \ref NVML_SUCCESS if \a defaultMode has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a defaultMode is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceSetEccMode() + */ + nvmlReturn_t DECLDIR nvmlDeviceGetDefaultEccMode(nvmlDevice_t device, nvmlEnableState_t *defaultMode); + + /** + * Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory) + * + * For Fermi &tm; or newer fully supported devices. + * + * On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw. + * + * It is only available if power management mode is supported. See \ref nvmlDeviceGetPowerManagementMode. + * + * @param device The identifier of the target device + * @param power Reference in which to return the power usage information + * + * @return + * - \ref NVML_SUCCESS if \a power has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a power is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support power readings + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ + nvmlReturn_t DECLDIR nvmlDeviceGetPowerUsage(nvmlDevice_t device, unsigned int *power); + + /** + * Check if the GPU devices are on the same physical board. + * + * For all fully supported products.
+ * + * @param device1 The first GPU device + * @param device2 The second GPU device + * @param onSameBoard Reference in which to return the status. + * Non-zero indicates that the GPUs are on the same board. + * + * @return + * - \ref NVML_SUCCESS if \a onSameBoard has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device1 or \a device2 are invalid or \a onSameBoard is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this check is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if either GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ + nvmlReturn_t DECLDIR nvmlDeviceOnSameBoard(nvmlDevice_t device1, nvmlDevice_t device2, int *onSameBoard); + + /** + * Get information about processes with a compute context on a device + * + * For Fermi &tm; or newer fully supported devices. + * + * This function returns information only about compute running processes (e.g. CUDA application which have + * active context). Any graphics applications (e.g. using OpenGL, DirectX) won't be listed by this function. + * + * To query the current number of running compute processes, call this function with *infoCount = 0. The + * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call + * \a infos is allowed to be NULL. + * + * The usedGpuMemory field returned is all of the memory used by the application. + * + * Keep in mind that information returned by this call is dynamic and the number of elements might change in + * time. Allocate more space for \a infos table in case new compute processes are spawned. + * + * @note In MIG mode, if device handle is provided, the API returns aggregate information, only if + * the caller has appropriate privileges. Per-instance information can be queried by using + * specific MIG device handles.
+ * Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode. + * + * @param device The device handle or MIG device handle + * @param infoCount Reference in which to provide the \a infos array size, and + * to return the number of returned elements + * @param infos Reference in which to return the process information + * + * @return + * - \ref NVML_SUCCESS if \a infoCount and \a infos have been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small + * \a infoCount will contain minimal amount of space necessary for + * the call to complete + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, either of \a infoCount or \a infos is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by \a device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see \ref nvmlSystemGetProcessName + */ + nvmlReturn_t DECLDIR nvmlDeviceGetComputeRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_v1_t *infos); + + /** @} */ // @defgroup nvmlGPMStructs + + /***************************************************************************************************/ + /** @defgroup nvmlGpmFunctions GPM Functions + * @{ + */ + /***************************************************************************************************/ + + /** + * Calculate GPM metrics from two samples. + * + * For Hopper &tm; or newer fully supported devices. + * + * @param metricsGet IN/OUT: populated \a nvmlGpmMetricsGet_t struct + * + * @return + * - \ref NVML_SUCCESS on success + * - Nonzero NVML_ERROR_? 
enum on error + */ + nvmlReturn_t DECLDIR nvmlGpmMetricsGet(nvmlGpmMetricsGet_t *metricsGet); + + /** + * Indicate whether the supplied device supports GPM + * + * @param device NVML device to query for + * @param gpmSupport Structure to indicate GPM support \a nvmlGpmSupport_t. Indicates + * GPM support per system for the supplied device + * + * @return + * - NVML_SUCCESS on success + * - Nonzero NVML_ERROR_? enum if there is an error in processing the query + */ + nvmlReturn_t DECLDIR nvmlGpmQueryDeviceSupport(nvmlDevice_t device, nvmlGpmSupport_t *gpmSupport); + + /** + * Free an allocated sample buffer that was allocated with \ref nvmlGpmSampleAlloc() + * + * For Hopper &tm; or newer fully supported devices. + * + * @param gpmSample Sample to free * * @return * - \ref NVML_SUCCESS on success @@ -1311,6 +1975,242 @@ extern "C" /** @} */ nvmlReturn_t DECLDIR nvmlDeviceGetTopologyCommonAncestor(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuTopologyLevel_t *pathInfo); + /***************************************************************************************************/ + /** @addtogroup nvmlEvents + * @{ + */ + /***************************************************************************************************/ + + /** + * Handle to an event set + */ + typedef struct + { + struct nvmlEventSet_st *handle; + } nvmlEventSet_t; + +/** @defgroup nvmlEventType Event Types + * @{ + * Event Types which user can be notified about. + * See description of particular functions for details. + * + * See \ref nvmlDeviceRegisterEvents and \ref nvmlDeviceGetSupportedEventTypes to check which devices + * support each event. + * + * Types can be combined with bitwise or operator '|' when passed to \ref nvmlDeviceRegisterEvents + */ +//! Event about single bit ECC errors +/** + * \note A corrected texture memory error is not an ECC error, so it does not generate a single bit event + */ +#define nvmlEventTypeSingleBitEccError 0x0000000000000001LL + +//! 
Event about double bit ECC errors +/** + * \note An uncorrected texture memory error is not an ECC error, so it does not generate a double bit event + */ +#define nvmlEventTypeDoubleBitEccError 0x0000000000000002LL + +//! Event about PState changes +/** + * \note On Fermi architecture PState changes are also an indicator that GPU is throttling down due to + * no work being executed on the GPU, power capping or thermal capping. In a typical situation, + * Fermi-based GPU should stay in P0 for the duration of the execution of the compute process. + */ +#define nvmlEventTypePState 0x0000000000000004LL + +//! Event that Xid critical error occurred +#define nvmlEventTypeXidCriticalError 0x0000000000000008LL + +//! Event about clock changes +/** + * Kepler only + */ +#define nvmlEventTypeClock 0x0000000000000010LL + +//! Event about AC/Battery power source changes +#define nvmlEventTypePowerSourceChange 0x0000000000000080LL + +//! Event about MIG configuration changes +#define nvmlEventMigConfigChange 0x0000000000000100LL + +//! Mask with no events +#define nvmlEventTypeNone 0x0000000000000000LL + +//! Mask of all events +#define nvmlEventTypeAll \ + (nvmlEventTypeNone | nvmlEventTypeSingleBitEccError | nvmlEventTypeDoubleBitEccError | nvmlEventTypePState | \ + nvmlEventTypeClock | nvmlEventTypeXidCriticalError | nvmlEventTypePowerSourceChange | nvmlEventMigConfigChange) + /** @} */ + + /** + * Information about occurred event + */ + typedef struct nvmlEventData_st + { + nvmlDevice_t device; //!< Specific device where the event occurred + unsigned long long eventType; //!< Information about what specific event occurred + unsigned long long + eventData; //!< Stores XID error for the device in the event of nvmlEventTypeXidCriticalError, + // eventData is 0 for any other event. eventData is set as 999 for unknown xid error. 
+ unsigned int + gpuInstanceId; //!< If MIG is enabled and nvmlEventTypeXidCriticalError event is attributable to a GPU + // instance, stores a valid GPU instance ID. gpuInstanceId is set to 0xFFFFFFFF + // otherwise. + unsigned int + computeInstanceId; //!< If MIG is enabled and nvmlEventTypeXidCriticalError event is attributable to a + // compute instance, stores a valid compute instance ID. computeInstanceId is set to + // 0xFFFFFFFF otherwise. + } nvmlEventData_t; + + /** @} */ + + /***************************************************************************************************/ + /** @defgroup nvmlEvents Event Handling Methods + * This chapter describes methods that NVML can perform against each device to register and wait for + * some event to occur. + * @{ + */ + /***************************************************************************************************/ + + /** + * Create an empty set of events. + * Event set should be freed by \ref nvmlEventSetFree + * + * For Fermi &tm; or newer fully supported devices. + * @param set Reference in which to return the event handle + * + * @return + * - \ref NVML_SUCCESS if the event has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a set is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlEventSetFree + */ + nvmlReturn_t DECLDIR nvmlEventSetCreate(nvmlEventSet_t *set); + + /** + * Starts recording of events on a specified devices and add the events to specified \ref nvmlEventSet_t + * + * For Fermi &tm; or newer fully supported devices. + * Ecc events are available only on ECC enabled devices (see \ref nvmlDeviceGetTotalEccErrors) + * Power capping events are available only on Power Management enabled devices (see \ref + * nvmlDeviceGetPowerManagementMode) + * + * For Linux only. 
+ * + * \b IMPORTANT: Operations on \a set are not thread safe + * + * This call starts recording of events on specific device. + * All events that occurred before this call are not recorded. + * Checking if some event occurred can be done with \ref nvmlEventSetWait_v2 + * + * If function reports NVML_ERROR_UNKNOWN, event set is in undefined state and should be freed. + * If function reports NVML_ERROR_NOT_SUPPORTED, event set can still be used. None of the requested eventTypes + * are registered in that case. + * + * @param device The identifier of the target device + * @param eventTypes Bitmask of \ref nvmlEventType to record + * @param set Set to which add new event types + * + * @return + * - \ref NVML_SUCCESS if the event has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a eventTypes is invalid or \a set is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the platform does not support this feature or some of requested + * event types + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlEventType + * @see nvmlDeviceGetSupportedEventTypes + * @see nvmlEventSetWait + * @see nvmlEventSetFree + */ + nvmlReturn_t DECLDIR nvmlDeviceRegisterEvents(nvmlDevice_t device, + unsigned long long eventTypes, + nvmlEventSet_t set); + + /** + * Returns information about events supported on device + * + * For Fermi &tm; or newer fully supported devices. + * + * Events are not supported on Windows. So this function returns an empty mask in \a eventTypes on Windows. 
+ * + * @param device The identifier of the target device + * @param eventTypes Reference in which to return bitmask of supported events + * + * @return + * - \ref NVML_SUCCESS if \a eventTypes has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a eventTypes is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlEventType + * @see nvmlDeviceRegisterEvents + */ + nvmlReturn_t DECLDIR nvmlDeviceGetSupportedEventTypes(nvmlDevice_t device, unsigned long long *eventTypes); + + /** + * Waits on events and delivers events + * + * For Fermi &tm; or newer fully supported devices. + * + * If some events are ready to be delivered at the time of the call, function returns immediately. + * If there are no events ready to be delivered, function sleeps till event arrives + * but not longer than specified timeout. This function in certain conditions can return before + * specified timeout passes (e.g. when interrupt arrives) + * + * On Windows, in case of xid error, the function returns the most recent xid error type seen by the system. + * If there are multiple xid errors generated before nvmlEventSetWait is invoked then the last seen xid error + * type is returned for all xid error events. + * + * On Linux, every xid error event would return the associated event data and other information if applicable. + * + * In MIG mode, if device handle is provided, the API reports all the events for the available instances, + * only if the caller has appropriate privileges. In absence of required privileges, only the events which + * affect all the instances (i.e. whole device) are reported. + * + * This API does not currently support per-instance event reporting using MIG device handles.
+ * + * @param set Reference to set of events to wait on + * @param data Reference in which to return event data + * @param timeoutms Maximum amount of wait time in milliseconds for registered event + * + * @return + * - \ref NVML_SUCCESS if the data has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a data is NULL + * - \ref NVML_ERROR_TIMEOUT if no event arrived in specified timeout or interrupt arrived + * - \ref NVML_ERROR_GPU_IS_LOST if a GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlEventType + * @see nvmlDeviceRegisterEvents + */ + nvmlReturn_t DECLDIR nvmlEventSetWait_v2(nvmlEventSet_t set, nvmlEventData_t *data, unsigned int timeoutms); + + /** + * Releases events in the set + * + * For Fermi &tm; or newer fully supported devices. + * + * @param set Reference to events to be released + * + * @return + * - \ref NVML_SUCCESS if the event has been successfully released + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceRegisterEvents + */ + nvmlReturn_t DECLDIR nvmlEventSetFree(nvmlEventSet_t set); + + /** @} */ + nvmlReturn_t DECLDIR ixmlDeviceGetBoardPosition(nvmlDevice_t device, unsigned int *position); nvmlReturn_t DECLDIR ixmlDeviceGetGPUVoltage(nvmlDevice_t device, unsigned int *integer, unsigned int *decimal); diff --git a/gen/ixml/cgo_helpers.go b/gen/ixml/cgo_helpers.go index 92c7059..20f06a3 100644 --- a/gen/ixml/cgo_helpers.go +++ b/gen/ixml/cgo_helpers.go @@ -113,52 +113,6 @@ func (x *Device) PassRef() *C.nvmlDevice_t { return (*C.nvmlDevice_t)(unsafe.Pointer(x)) } -// Ref returns a reference to C object as it is. 
-func (x *EventSet) Ref() *C.nvmlEventSet_t { - if x == nil { - return nil - } - return (*C.nvmlEventSet_t)(unsafe.Pointer(x)) -} - -// Free cleanups the referenced memory using C free. -func (x *EventSet) Free() { - if x != nil { - C.free(unsafe.Pointer(x)) - } -} - -// NewEventSetRef converts the C object reference into a raw struct reference without wrapping. -func NewEventSetRef(ref unsafe.Pointer) *EventSet { - return (*EventSet)(ref) -} - -// NewEventSet allocates a new C object of this type and converts the reference into -// a raw struct reference without wrapping. -func NewEventSet() *EventSet { - return (*EventSet)(allocEventSetMemory(1)) -} - -// allocEventSetMemory allocates memory for type C.nvmlEventSet_t in C. -// The caller is responsible for freeing the this memory via C.free. -func allocEventSetMemory(n int) unsafe.Pointer { - mem, err := C.calloc(C.size_t(n), (C.size_t)(sizeOfEventSetValue)) - if mem == nil { - panic(fmt.Sprintln("memory alloc error: ", err)) - } - return mem -} - -const sizeOfEventSetValue = unsafe.Sizeof([1]C.nvmlEventSet_t{}) - -// PassRef returns a reference to C object as it is or allocates a new C object of this type. -func (x *EventSet) PassRef() *C.nvmlEventSet_t { - if x == nil { - x = (*EventSet)(allocEventSetMemory(1)) - } - return (*C.nvmlEventSet_t)(unsafe.Pointer(x)) -} - // Ref returns a reference to C object as it is. func (x *Memory) Ref() *C.nvmlMemory_t { if x == nil { @@ -167,7 +121,6 @@ func (x *Memory) Ref() *C.nvmlMemory_t { return (*C.nvmlMemory_t)(unsafe.Pointer(x)) } - // NewMemoryRef converts the C object reference into a raw struct reference without wrapping. 
func NewMemoryRef(ref unsafe.Pointer) *Memory { return (*Memory)(ref) @@ -346,3 +299,32 @@ type stringHeader struct { Data unsafe.Pointer Len int } + +func clen(n []byte) int { + for i := 0; i < len(n); i++ { + if n[i] == 0 { + return i + } + } + return len(n) +} + + +func uint32SliceToIntSlice(s []uint32) []int { + ret := make([]int, len(s)) + for i := range s { + ret[i] = int(s[i]) + } + return ret +} + +func convertSlice[T any, I any](input []T) []I { + output := make([]I, len(input)) + for i, obj := range input { + switch v := any(obj).(type) { + case I: + output[i] = v + } + } + return output +} \ No newline at end of file diff --git a/gen/ixml/ixml.yml b/gen/ixml/ixml.yml index 133bff7..c5374e7 100644 --- a/gen/ixml/ixml.yml +++ b/gen/ixml/ixml.yml @@ -37,6 +37,8 @@ TRANSLATOR: - {action: replace, from: "^nvml"} - {action: replace, from: "_t$"} - {transform: export} + - {action: replace, from: "^EventSet$", to: "nvmlEventSet"} + - {action: replace, from: "^EventData$", to: "nvmlEventData"} - {action: replace, from: "^GpmMetricsGet", to: "nvmlGpmMetricsGetType"} function: - {action: accept, from: "^ixml"} @@ -46,4 +48,5 @@ TRANSLATOR: - {action: replace, from: "^nvmlDeviceGetCount_v2", to: "nvmlDeviceGetCount"} - {action: replace, from: "^nvmlDeviceGetHandleByIndex_v2", to: "nvmlDeviceGetHandleByIndex"} - {action: replace, from: "^nvmlDeviceGetPciInfo_v3", to: "nvmlDeviceGetPciInfo"} + - {action: replace, from: "^nvmlEventSetWait_v2", to: "nvmlEventSetWait"} - {transform: unexport} diff --git a/pkg/ixml/api.h b/pkg/ixml/api.h index b5b2f4c..91bd006 100644 --- a/pkg/ixml/api.h +++ b/pkg/ixml/api.h @@ -93,6 +93,8 @@ extern "C" #define DECLDIR #endif +#define DEVICE_MAX_NUM 32 + /** * Return values for NVML API calls. 
*/ @@ -125,6 +127,7 @@ extern "C" NVML_ERROR_INSUFFICIENT_RESOURCES = 23, //!< Ran out of critical resources, other than memory NVML_ERROR_FREQ_NOT_SUPPORTED = 24, //!< Ran out of critical resources, other than memory NVML_ERROR_ARGUMENT_VERSION_MISMATCH = 25, //!< The provided version is invalid/unsupported + NVML_ERROR_DEPRECATED = 26, //!< The requested functionality has been deprecated NVML_ERROR_UNKNOWN = 999 //!< An internal driver error occurred } nvmlReturn_t; @@ -133,10 +136,48 @@ extern "C" struct nvmlDevice_st *handle; } nvmlDevice_t; - typedef struct + /** + * See \ref nvmlDeviceGetMemoryErrorCounter + */ + typedef enum nvmlMemoryLocation_enum { - struct nvmlEventSet_st *handle; - } nvmlEventSet_t; + NVML_MEMORY_LOCATION_L1_CACHE = 0, //!< GPU L1 Cache + NVML_MEMORY_LOCATION_L2_CACHE = 1, //!< GPU L2 Cache + NVML_MEMORY_LOCATION_DRAM = 2, //!< Turing+ DRAM + NVML_MEMORY_LOCATION_DEVICE_MEMORY = 2, //!< GPU Device Memory + NVML_MEMORY_LOCATION_REGISTER_FILE = 3, //!< GPU Register File + NVML_MEMORY_LOCATION_TEXTURE_MEMORY = 4, //!< GPU Texture Memory + NVML_MEMORY_LOCATION_TEXTURE_SHM = 5, //!< Shared memory + NVML_MEMORY_LOCATION_CBU = 6, //!< CBU + NVML_MEMORY_LOCATION_SRAM = 7, //!< Turing+ SRAM + // Keep this last + NVML_MEMORY_LOCATION_COUNT //!< This counts the number of memory locations the driver knows about + } nvmlMemoryLocation_t; + + /** + * Causes for page retirement + */ + typedef enum nvmlPageRetirementCause_enum + { + NVML_PAGE_RETIREMENT_CAUSE_MULTIPLE_SINGLE_BIT_ECC_ERRORS = 0, //!< Page was retired due to multiple single bit ECC error + NVML_PAGE_RETIREMENT_CAUSE_DOUBLE_BIT_ECC_ERROR = 1, //!< Page was retired due to double bit ECC error + + // Keep this last + NVML_PAGE_RETIREMENT_CAUSE_COUNT + } nvmlPageRetirementCause_t; + + /** + * API types that allow changes to default permission restrictions + */ + typedef enum nvmlRestrictedAPI_enum + { + NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS = 0, //!< APIs that change application clocks, 
see nvmlDeviceSetApplicationsClocks + //!< and see nvmlDeviceResetApplicationsClocks + NVML_RESTRICTED_API_SET_AUTO_BOOSTED_CLOCKS = 1, //!< APIs that enable/disable Auto Boosted clocks + //!< see nvmlDeviceSetAutoBoostedClocksEnabled + // Keep this last + NVML_RESTRICTED_API_COUNT + } nvmlRestrictedAPI_t; /** @} */ /** @@ -200,6 +241,15 @@ extern "C" NVML_TEMPERATURE_COUNT } nvmlTemperatureSensors_t; + /** + * Generic enable/disable enum. + */ + typedef enum nvmlEnableState_enum + { + NVML_FEATURE_DISABLED = 0, //!< Feature disabled + NVML_FEATURE_ENABLED = 1 //!< Feature enabled + } nvmlEnableState_t; + /** * Clock types. * @@ -360,15 +410,6 @@ extern "C" struct nvmlGpmSample_st *handle; } nvmlGpmSample_t; - // typedef struct nvmlGpmSample_st { - // nvmlDevice_t device; - // unsigned long long timeStamp; - // unsigned long long sm_active; - // unsigned long long active_warps; - // unsigned long long total_cycles; - // unsigned long long dram_bandwidth; - // } nvmlGpmSample_t; - /** * GPM metric information. */ @@ -408,6 +449,157 @@ extern "C" unsigned int isSupportedDevice; //!< OUT: Indicates device support } nvmlGpmSupport_t; +/** + * Maximum limit on Physical Bridges per Board + */ +#define NVML_MAX_PHYSICAL_BRIDGE (128) + + /** + * Enum to represent type of bridge chip + */ + typedef enum nvmlBridgeChipType_enum + { + NVML_BRIDGE_CHIP_PLX = 0, + NVML_BRIDGE_CHIP_BRO4 = 1 + } nvmlBridgeChipType_t; + + /** + * Information about the Bridge Chip Firmware + */ + typedef struct nvmlBridgeChipInfo_st + { + nvmlBridgeChipType_t type; //!< Type of Bridge Chip + unsigned int fwVersion; //!< Firmware Version. 0=Version is unavailable + } nvmlBridgeChipInfo_t; + + /** + * This structure stores the complete Hierarchy of the Bridge Chip within the board. The immediate + * bridge is stored at index 0 of bridgeInfoList, parent to immediate bridge is at index 1 and so forth. 
+ */ + typedef struct nvmlBridgeChipHierarchy_st + { + unsigned char bridgeCount; //!< Number of Bridge Chips on the Board + nvmlBridgeChipInfo_t bridgeChipInfo[NVML_MAX_PHYSICAL_BRIDGE]; //!< Hierarchy of Bridge Chips on the board + } nvmlBridgeChipHierarchy_t; + + /** + * Represents Type of Sampling Event + */ + typedef enum nvmlSamplingType_enum + { + NVML_TOTAL_POWER_SAMPLES = 0, //!< To represent total power drawn by GPU + NVML_GPU_UTILIZATION_SAMPLES = + 1, //!< To represent percent of time during which one or more kernels was executing on the GPU + NVML_MEMORY_UTILIZATION_SAMPLES = + 2, //!< To represent percent of time during which global (device) memory was being read or written + NVML_ENC_UTILIZATION_SAMPLES = 3, //!< To represent percent of time during which NVENC remains busy + NVML_DEC_UTILIZATION_SAMPLES = 4, //!< To represent percent of time during which NVDEC remains busy + NVML_PROCESSOR_CLK_SAMPLES = 5, //!< To represent processor clock samples + NVML_MEMORY_CLK_SAMPLES = 6, //!< To represent memory clock samples + + // Keep this last + NVML_SAMPLINGTYPE_COUNT + } nvmlSamplingType_t; + + /** + * Represents the queryable PCIe utilization counters + */ + typedef enum nvmlPcieUtilCounter_enum + { + NVML_PCIE_UTIL_TX_BYTES = 0, // 1KB granularity + NVML_PCIE_UTIL_RX_BYTES = 1, // 1KB granularity + + // Keep this last + NVML_PCIE_UTIL_COUNT + } nvmlPcieUtilCounter_t; + + /** + * Represents the type for sample value returned + */ + typedef enum nvmlValueType_enum + { + NVML_VALUE_TYPE_DOUBLE = 0, + NVML_VALUE_TYPE_UNSIGNED_INT = 1, + NVML_VALUE_TYPE_UNSIGNED_LONG = 2, + NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3, + NVML_VALUE_TYPE_SIGNED_LONG_LONG = 4, + + // Keep this last + NVML_VALUE_TYPE_COUNT + } nvmlValueType_t; + + /** + * Union to represent different types of Value + */ + typedef union nvmlValue_st + { + double dVal; //!< If the value is double + unsigned int uiVal; //!< If the value is unsigned int + unsigned long ulVal; //!< If the value is 
unsigned long + unsigned long long ullVal; //!< If the value is unsigned long long + signed long long sllVal; //!< If the value is signed long long + } nvmlValue_t; + + /** + * Information for Sample + */ + typedef struct nvmlSample_st + { + unsigned long long timeStamp; //!< CPU Timestamp in microseconds + nvmlValue_t sampleValue; //!< Sample Value + } nvmlSample_t; + + /** + * Represents type of perf policy for which violation times can be queried + */ + typedef enum nvmlPerfPolicyType_enum + { + NVML_PERF_POLICY_POWER = 0, //!< How long did power violations cause the GPU to be below application clocks + NVML_PERF_POLICY_THERMAL = 1, //!< How long did thermal violations cause the GPU to be below application clocks + NVML_PERF_POLICY_SYNC_BOOST = 2, //!< How long did sync boost cause the GPU to be below application clocks + NVML_PERF_POLICY_BOARD_LIMIT = + 3, //!< How long did the board limit cause the GPU to be below application clocks + NVML_PERF_POLICY_LOW_UTILIZATION = + 4, //!< How long did low utilization cause the GPU to be below application clocks + NVML_PERF_POLICY_RELIABILITY = + 5, //!< How long did the board reliability limit cause the GPU to be below application clocks + + NVML_PERF_POLICY_TOTAL_APP_CLOCKS = + 10, //!< Total time the GPU was held below application clocks by any limiter (0 - 5 above) + NVML_PERF_POLICY_TOTAL_BASE_CLOCKS = 11, //!< Total time the GPU was held below base clocks + + // Keep this last + NVML_PERF_POLICY_COUNT + } nvmlPerfPolicyType_t; + + /** + * Compute mode. + * + * NVML_COMPUTEMODE_EXCLUSIVE_PROCESS was added in CUDA 4.0. + * Earlier CUDA versions supported a single exclusive mode, + * which is equivalent to NVML_COMPUTEMODE_EXCLUSIVE_THREAD in CUDA 4.0 and beyond. 
+ */ + typedef enum nvmlComputeMode_enum + { + NVML_COMPUTEMODE_DEFAULT = 0, //!< Default compute mode -- multiple contexts per device + NVML_COMPUTEMODE_EXCLUSIVE_THREAD = 1, //!< Support Removed + NVML_COMPUTEMODE_PROHIBITED = 2, //!< Compute-prohibited mode -- no contexts per device + NVML_COMPUTEMODE_EXCLUSIVE_PROCESS = 3, //!< Compute-exclusive-process mode -- only one context per device, + //!< usable from multiple threads at a time + + // Keep this last + NVML_COMPUTEMODE_COUNT + } nvmlComputeMode_t; + + /** + * Struct to hold perf policy violation status data + */ + typedef struct nvmlViolationTime_st + { + unsigned long long referenceTime; //!< referenceTime represents CPU timestamp in microseconds + unsigned long long violationTime; //!< violationTime in Nanoseconds + } nvmlViolationTime_t; + #define NVML_GPM_SUPPORT_VERSION 1 /** * Buffer size guaranteed to be large enough for storing GPU identifiers. @@ -419,6 +611,11 @@ extern "C" */ #define NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE 80 +/** + * Buffer size guaranteed to be large enough for \ref nvmlSystemGetNVMLVersion + */ +#define NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE 80 + /** * Buffer size guaranteed to be large enough for storing GPU device names. */ @@ -429,6 +626,21 @@ extern "C" */ #define NVML_DEVICE_NAME_V2_BUFFER_SIZE 96 +/** + * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetBoardPartNumber + */ +#define NVML_DEVICE_PART_NUMBER_BUFFER_SIZE 80 + +/** + * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetSerial + */ +#define NVML_DEVICE_SERIAL_BUFFER_SIZE 30 + +/** + * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetVbiosVersion + */ +#define NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE 32 + /** * Buffer size guaranteed to be large enough for pci bus id */ @@ -728,6 +940,24 @@ extern "C" */ nvmlReturn_t DECLDIR nvmlSystemGetDriverVersion(char *version, unsigned int length); + /** + * Retrieves the version of the NVML library. + * + * For all products. 
+ * + * The version identifier is an alphanumeric string. It will not exceed 80 characters in length + * (including the NULL terminator). See \ref nvmlConstants::NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE. + * + * @param version Reference in which to return the version identifier + * @param length The maximum allowed length of the string returned in \a version + * + * @return + * - \ref NVML_SUCCESS if \a version has been set + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + */ + nvmlReturn_t DECLDIR nvmlSystemGetNVMLVersion(char *version, unsigned int length); + /** * Retrieves the version of the CUDA driver. * @@ -935,205 +1165,639 @@ extern "C" nvmlReturn_t DECLDIR nvmlDeviceGetUtilizationRates(nvmlDevice_t device, nvmlUtilization_t *utilization); /** - * Retrieves the PCI attributes of this device. + * Retrieves the current compute mode for the device. * * For all products. * - * See \ref nvmlPciInfo_t for details on the available PCI info. + * See \ref nvmlComputeMode_t for details on allowed compute modes. 
* * @param device The identifier of the target device - * @param pci Reference in which to return the PCI info + * @param mode Reference in which to return the current compute mode * * @return - * - \ref NVML_SUCCESS if \a pci has been populated + * - \ref NVML_SUCCESS if \a mode has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pci is NULL + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceSetComputeMode() */ - nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo_v3(nvmlDevice_t device, nvmlPciInfo_t *pci); + nvmlReturn_t DECLDIR nvmlDeviceGetComputeMode(nvmlDevice_t device, nvmlComputeMode_t *mode); /** - * Retrieves the NVML index of this device. + * Retrieves the CUDA compute capability of the device. * * For all products. * - * Valid indices are derived from the \a accessibleDevices count returned by - * \ref nvmlDeviceGetCount_v2(). For example, if \a accessibleDevices is 2 the valid indices - * are 0 and 1, corresponding to GPU 0 and GPU 1. - * - * The order in which NVML enumerates devices has no guarantees of consistency between reboots. For that reason it - * is recommended that devices be looked up by their PCI ids or GPU UUID. See - * \ref nvmlDeviceGetHandleByPciBusId_v2() and \ref nvmlDeviceGetHandleByUUID(). - * - * When used with MIG device handles this API returns indices that can be - * passed to \ref nvmlDeviceGetMigDeviceHandleByIndex to retrieve an identical handle. - * MIG device indices are unique within a device. - * - * Note: The NVML index may not correlate with other APIs, such as the CUDA device index. 
+ * Returns the major and minor compute capability version numbers of the + * device. The major and minor versions are equivalent to the + * CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR and + * CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR attributes that would be + * returned by CUDA's cuDeviceGetAttribute(). * * @param device The identifier of the target device - * @param index Reference in which to return the NVML index of the device + * @param major Reference in which to return the major CUDA compute capability + * @param minor Reference in which to return the minor CUDA compute capability * * @return - * - \ref NVML_SUCCESS if \a index has been set + * - \ref NVML_SUCCESS if \a major and \a minor have been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a index is NULL + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a major or \a minor are NULL * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetHandleByIndex() - * @see nvmlDeviceGetCount() */ - nvmlReturn_t DECLDIR nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int *index); + nvmlReturn_t DECLDIR nvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int *major, int *minor); /** - * Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory) + * Retrieves the current and pending ECC modes for the device. * * For Fermi &tm; or newer fully supported devices. + * Only applicable to devices with ECC. + * Requires \a NVML_INFOROM_ECC version 1.0 or higher. * - * On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw. + * Changing ECC modes requires a reboot. The "pending" ECC mode refers to the target mode following + * the next reboot. 
* - * It is only available if power management mode is supported. See \ref nvmlDeviceGetPowerManagementMode. + * See \ref nvmlEnableState_t for details on allowed modes. * * @param device The identifier of the target device - * @param power Reference in which to return the power usage information + * @param current Reference in which to return the current ECC mode + * @param pending Reference in which to return the pending ECC mode * * @return - * - \ref NVML_SUCCESS if \a power has been populated + * - \ref NVML_SUCCESS if \a current and \a pending have been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a power is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support power readings + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or either \a current or \a pending is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceSetEccMode() */ - nvmlReturn_t DECLDIR nvmlDeviceGetPowerUsage(nvmlDevice_t device, unsigned int *power); + nvmlReturn_t DECLDIR nvmlDeviceGetEccMode(nvmlDevice_t device, + nvmlEnableState_t *current, + nvmlEnableState_t *pending); /** - * Check if the GPU devices are on the same physical board. + * Retrieves the default ECC modes for the device. * - * For all fully supported products. + * For Fermi &tm; or newer fully supported devices. + * Only applicable to devices with ECC. + * Requires \a NVML_INFOROM_ECC version 1.0 or higher. * - * @param device1 The first GPU device - * @param device2 The second GPU device - * @param onSameBoard Reference in which to return the status. - * Non-zero indicates that the GPUs are on the same board. + * See \ref nvmlEnableState_t for details on allowed modes. 
+ * + * @param device The identifier of the target device + * @param defaultMode Reference in which to return the default ECC mode * * @return - * - \ref NVML_SUCCESS if \a onSameBoard has been set + * - \ref NVML_SUCCESS if \a defaultMode has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a dev1 or \a dev2 are invalid or \a onSameBoard is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this check is not supported by the device - * - \ref NVML_ERROR_GPU_IS_LOST if the either GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a defaultMode is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceSetEccMode() */ - nvmlReturn_t DECLDIR nvmlDeviceOnSameBoard(nvmlDevice_t device1, nvmlDevice_t device2, int *onSameBoard); + nvmlReturn_t DECLDIR nvmlDeviceGetDefaultEccMode(nvmlDevice_t device, nvmlEnableState_t *defaultMode); /** - * Get information about processes with a compute context on a device - * - * For Fermi &tm; or newer fully supported devices. - * - * This function returns information only about compute running processes (e.g. CUDA application which have - * active context). Any graphics applications (e.g. using OpenGL, DirectX) won't be listed by this function. - * - * To query the current number of running compute processes, call this function with *infoCount = 0. The - * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call - * \a infos is allowed to be NULL. - * - * The usedGpuMemory field returned is all of the memory used by the application. + * Retrieves the device boardId from 0-N.
+ * Devices with the same boardId indicate GPUs connected to the same PLX. Use in conjunction with + * \ref nvmlDeviceGetMultiGpuBoard() to decide if they are on the same board as well. + * The boardId returned is a unique ID for the current configuration. Uniqueness and ordering across + * reboots and system configurations is not guaranteed (i.e. if a Tesla K40c returns 0x100 and + * the two GPUs on a Tesla K10 in the same system returns 0x200 it is not guaranteed they will + * always return those values but they will always be different from each other). * - * Keep in mind that information returned by this call is dynamic and the number of elements might change in - * time. Allocate more space for \a infos table in case new compute processes are spawned. * - * @note In MIG mode, if device handle is provided, the API returns aggregate information, only if - * the caller has appropriate privileges. Per-instance information can be queried by using - * specific MIG device handles. - * Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode. + * For Fermi &tm; or newer fully supported devices. 
* - * @param device The device handle or MIG device handle - * @param infoCount Reference in which to provide the \a infos array size, and - * to return the number of returned elements - * @param infos Reference in which to return the process information + * @param device The identifier of the target device + * @param boardId Reference in which to return the device's board ID * * @return - * - \ref NVML_SUCCESS if \a infoCount and \a infos have been populated + * - \ref NVML_SUCCESS if \a boardId has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small - * \a infoCount will contain minimal amount of space necessary for - * the call to complete - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, either of \a infoCount or \a infos is NULL + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a boardId is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by \a device * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see \ref nvmlSystemGetProcessName - */ - nvmlReturn_t DECLDIR nvmlDeviceGetComputeRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_v1_t *infos); - - /** - * @deprecated Use \ref nvmlDeviceGetCurrentClocksEventReasons instead */ + nvmlReturn_t DECLDIR nvmlDeviceGetBoardId(nvmlDevice_t device, unsigned int *boardId); /** - * Retrieve the PCIe replay counter. + * Retrieves whether the device is on a Multi-GPU Board + * Devices that are on multi-GPU boards will set \a multiGpuBool to a non-zero value. * - * For Kepler &tm; or newer fully supported devices. 
+ * For Fermi &tm; or newer fully supported devices. * * @param device The identifier of the target device - * @param value Reference in which to return the counter's value + * @param multiGpuBool Reference in which to return a zero or non-zero value + * to indicate whether the device is on a multi GPU board * * @return - * - \ref NVML_SUCCESS if \a value has been set + * - \ref NVML_SUCCESS if \a multiGpuBool has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a value is NULL + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a multiGpuBool is NULL * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ - nvmlReturn_t DECLDIR nvmlDeviceGetPcieReplayCounter(nvmlDevice_t device, unsigned int *value); - - /** @} */ // @defgroup nvmlGPMStructs - - /***************************************************************************************************/ - /** @defgroup nvmlGpmFunctions GPM Functions - * @{ - */ - /***************************************************************************************************/ + nvmlReturn_t DECLDIR nvmlDeviceGetMultiGpuBoard(nvmlDevice_t device, unsigned int *multiGpuBool); /** - * Calculate GPM metrics from two samples. + * Retrieves the PCI attributes of this device. * - * For Hopper &tm; or newer fully supported devices. + * For all products. * - * @param metricsGet IN/OUT: populated \a nvmlGpmMetricsGet_t struct + * See \ref nvmlPciInfo_t for details on the available PCI info. + * + * @param device The identifier of the target device + * @param pci Reference in which to return the PCI info * * @return - * - \ref NVML_SUCCESS on success - * - Nonzero NVML_ERROR_? 
enum on error + * - \ref NVML_SUCCESS if \a pci has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pci is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ - nvmlReturn_t DECLDIR nvmlGpmMetricsGet(nvmlGpmMetricsGet_t *metricsGet); + nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo_v3(nvmlDevice_t device, nvmlPciInfo_t *pci); /** - * Indicate whether the supplied device supports GPM + * Retrieves the maximum PCIe link generation possible with this device and system * - * @param device NVML device to query for - * @param gpmSupport Structure to indicate GPM support \a nvmlGpmSupport_t. Indicates - * GPM support per system for the supplied device + * I.E. for a generation 2 PCIe device attached to a generation 1 PCIe bus the max link generation this function + * will report is generation 1. + * + * For Fermi &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param maxLinkGen Reference in which to return the max PCIe link generation * * @return - * - NVML_SUCCESS on success - * - Nonzero NVML_ERROR_? 
enum if there is an error in processing the query + * - \ref NVML_SUCCESS if \a maxLinkGen has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a maxLinkGen is null + * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ - nvmlReturn_t DECLDIR nvmlGpmQueryDeviceSupport(nvmlDevice_t device, nvmlGpmSupport_t *gpmSupport); + nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkGeneration(nvmlDevice_t device, unsigned int *maxLinkGen); /** - * Free an allocated sample buffer that was allocated with \ref nvmlGpmSampleAlloc() + * Retrieves the maximum PCIe link width possible with this device and system * - * For Hopper &tm; or newer fully supported devices. + * I.E. for a device with a 16x PCIe bus width attached to a 8x PCIe system bus this function will report + * a max link width of 8. * - * @param gpmSample Sample to free + * For Fermi &tm; or newer fully supported devices. 
+ * + * @param device The identifier of the target device + * @param maxLinkWidth Reference in which to return the max PCIe link width + * + * @return + * - \ref NVML_SUCCESS if \a maxLinkWidth has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a maxLinkWidth is null + * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ + nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkWidth(nvmlDevice_t device, unsigned int *maxLinkWidth); + + /** + * Retrieves the current PCIe link generation + * + * For Fermi &tm; or newer fully supported devices.
+ * + * @param device The identifier of the target device + * @param currLinkWidth Reference in which to return the current PCIe link width + * + * @return + * - \ref NVML_SUCCESS if \a currLinkWidth has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a currLinkWidth is null + * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ + nvmlReturn_t DECLDIR nvmlDeviceGetCurrPcieLinkWidth(nvmlDevice_t device, unsigned int *currLinkWidth); + + /** + * Retrieve PCIe utilization information. + * This function is querying a byte counter over a 20ms interval and thus is the + * PCIe throughput over that interval. + * + * For Maxwell &tm; or newer fully supported devices. + * + * This method is not supported in virtual machines running virtual GPU (vGPU).
+ * + * @param device The identifier of the target device + * @param counter The specific counter that should be queried \ref + * nvmlPcieUtilCounter_t + * @param value Reference in which to return throughput in KB/s + * + * @return + * - \ref NVML_SUCCESS if \a value has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a counter is invalid, or \a value is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ + nvmlReturn_t DECLDIR nvmlDeviceGetPcieThroughput(nvmlDevice_t device, + nvmlPcieUtilCounter_t counter, + unsigned int *value); + + /** + * @deprecated Use \ref nvmlDeviceGetCurrentClocksEventReasons instead + */ + + /** + * Retrieve the PCIe replay counter. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param value Reference in which to return the counter's value + * + * @return + * - \ref NVML_SUCCESS if \a value has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a value is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ + nvmlReturn_t DECLDIR nvmlDeviceGetPcieReplayCounter(nvmlDevice_t device, unsigned int *value); + + /** + * Retrieves the NVML index of this device. + * + * For all products. + * + * Valid indices are derived from the \a accessibleDevices count returned by + * \ref nvmlDeviceGetCount_v2(). 
For example, if \a accessibleDevices is 2 the valid indices + * are 0 and 1, corresponding to GPU 0 and GPU 1. + * + * The order in which NVML enumerates devices has no guarantees of consistency between reboots. For that reason it + * is recommended that devices be looked up by their PCI ids or GPU UUID. See + * \ref nvmlDeviceGetHandleByPciBusId_v2() and \ref nvmlDeviceGetHandleByUUID(). + * + * When used with MIG device handles this API returns indices that can be + * passed to \ref nvmlDeviceGetMigDeviceHandleByIndex to retrieve an identical handle. + * MIG device indices are unique within a device. + * + * Note: The NVML index may not correlate with other APIs, such as the CUDA device index. + * + * @param device The identifier of the target device + * @param index Reference in which to return the NVML index of the device + * + * @return + * - \ref NVML_SUCCESS if \a index has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a index is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetHandleByIndex() + * @see nvmlDeviceGetCount() + */ + nvmlReturn_t DECLDIR nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int *index); + + /** + * Retrieves the globally unique board serial number associated with this device's board. + * + * For all products with an inforom. + * + * The serial number is an alphanumeric string that will not exceed 30 characters (including the NULL terminator). + * This number matches the serial number tag that is physically attached to the board. See \ref + * nvmlConstants::NVML_DEVICE_SERIAL_BUFFER_SIZE. 
+ * + * @param device The identifier of the target device + * @param serial Reference in which to return the board/module serial number + * @param length The maximum allowed length of the string returned in \a serial + * + * @return + * - \ref NVML_SUCCESS if \a serial has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a serial is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ + nvmlReturn_t DECLDIR nvmlDeviceGetSerial(nvmlDevice_t device, char *serial, unsigned int length); + + /** + * Acquire the handle for a particular device, based on its board serial number. + * + * For Fermi &tm; or newer fully supported devices. + * + * This number corresponds to the value printed directly on the board, and to the value returned by + * \ref nvmlDeviceGetSerial(). + * + * @deprecated Since more than one GPU can exist on a single board this function is deprecated in favor + * of \ref nvmlDeviceGetHandleByUUID. + * For dual GPU boards this function will return NVML_ERROR_INVALID_ARGUMENT. 
+ * + * Starting from NVML 5, this API causes NVML to initialize the target GPU + * NVML may initialize additional GPUs as it searches for the target GPU + * + * @param serial The board serial number of the target GPU + * @param device Reference in which to return the device handle + * + * @return + * - \ref NVML_SUCCESS if \a device has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a serial is invalid, \a device is NULL or more than one + * device has the same serial (dual GPU boards) + * - \ref NVML_ERROR_NOT_FOUND if \a serial does not match a valid device on the system + * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power + * cables + * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs + * - \ref NVML_ERROR_GPU_IS_LOST if any GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetSerial + * @see nvmlDeviceGetHandleByUUID + */ + nvmlReturn_t DECLDIR nvmlDeviceGetHandleBySerial(const char *serial, nvmlDevice_t *device); + + /** + * Get VBIOS version of the device. + * + * For all products. + * + * The VBIOS version may change from time to time. It will not exceed 32 characters in length + * (including the NULL terminator). See \ref nvmlConstants::NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE. 
+ * + * @param device The identifier of the target device + * @param version Reference to which to return the VBIOS version + * @param length The maximum allowed length of the string returned in \a version + * + * @return + * - \ref NVML_SUCCESS if \a version has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a version is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ + nvmlReturn_t DECLDIR nvmlDeviceGetVbiosVersion(nvmlDevice_t device, char *version, unsigned int length); + + /** + * Retrieves the device board part number which is programmed into the board's InfoROM + * + * For all products. + * + * @param device Identifier of the target device + * @param partNumber Reference to the buffer to return + * @param length Length of the buffer reference + * + * @return + * - \ref NVML_SUCCESS if \a partNumber has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_NOT_SUPPORTED if the needed VBIOS fields have not been filled + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a partNumber is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise + * inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ + nvmlReturn_t DECLDIR nvmlDeviceGetBoardPartNumber(nvmlDevice_t device, char *partNumber, unsigned int length); + + /** + * Set the ECC mode for the device. + * + * For Kepler &tm; or newer fully supported devices. + * Only applicable to devices with ECC. + * Requires \a NVML_INFOROM_ECC version 1.0 or higher. + * Requires root/admin permissions. + * + * The ECC mode determines whether the GPU enables its ECC support.
+ * + * This operation takes effect after the next reboot. + * + * See \ref nvmlEnableState_t for details on available modes. + * + * @param device The identifier of the target device + * @param ecc The target ECC mode + * + * @return + * - \ref NVML_SUCCESS if the ECC mode was set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a ecc is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetEccMode() + */ + nvmlReturn_t DECLDIR nvmlDeviceSetEccMode(nvmlDevice_t device, nvmlEnableState_t ecc); + + /** + * Retrieves the current and pending ECC modes for the device. + * + * For Fermi &tm; or newer fully supported devices. + * Only applicable to devices with ECC. + * Requires \a NVML_INFOROM_ECC version 1.0 or higher. + * + * Changing ECC modes requires a reboot. The "pending" ECC mode refers to the target mode following + * the next reboot. + * + * See \ref nvmlEnableState_t for details on allowed modes. 
+ * + * @param device The identifier of the target device + * @param current Reference in which to return the current ECC mode + * @param pending Reference in which to return the pending ECC mode + * + * @return + * - \ref NVML_SUCCESS if \a current and \a pending have been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or either \a current or \a pending is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceSetEccMode() + */ + nvmlReturn_t DECLDIR nvmlDeviceGetEccMode(nvmlDevice_t device, + nvmlEnableState_t *current, + nvmlEnableState_t *pending); + + /** + * Retrieves the default ECC modes for the device. + * + * For Fermi &tm; or newer fully supported devices. + * Only applicable to devices with ECC. + * Requires \a NVML_INFOROM_ECC version 1.0 or higher. + * + * See \ref nvmlEnableState_t for details on allowed modes. 
+ * + * @param device The identifier of the target device + * @param defaultMode Reference in which to return the default ECC mode + * + * @return + * - \ref NVML_SUCCESS if \a defaultMode has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a defaultMode is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceSetEccMode() + */ + nvmlReturn_t DECLDIR nvmlDeviceGetDefaultEccMode(nvmlDevice_t device, nvmlEnableState_t *defaultMode); + + /** + * Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory) + * + * For Fermi &tm; or newer fully supported devices. + * + * On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw. + * + * It is only available if power management mode is supported. See \ref nvmlDeviceGetPowerManagementMode. + * + * @param device The identifier of the target device + * @param power Reference in which to return the power usage information + * + * @return + * - \ref NVML_SUCCESS if \a power has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a power is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support power readings + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ + nvmlReturn_t DECLDIR nvmlDeviceGetPowerUsage(nvmlDevice_t device, unsigned int *power); + + /** + * Check if the GPU devices are on the same physical board. + * + * For all fully supported products.
+ * + * @param device1 The first GPU device + * @param device2 The second GPU device + * @param onSameBoard Reference in which to return the status. + * Non-zero indicates that the GPUs are on the same board. + * + * @return + * - \ref NVML_SUCCESS if \a onSameBoard has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device1 or \a device2 is invalid or \a onSameBoard is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this check is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if either GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ + nvmlReturn_t DECLDIR nvmlDeviceOnSameBoard(nvmlDevice_t device1, nvmlDevice_t device2, int *onSameBoard); + + /** + * Get information about processes with a compute context on a device + * + * For Fermi &tm; or newer fully supported devices. + * + * This function returns information only about compute running processes (e.g. CUDA application which have + * active context). Any graphics applications (e.g. using OpenGL, DirectX) won't be listed by this function. + * + * To query the current number of running compute processes, call this function with *infoCount = 0. The + * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call + * \a infos is allowed to be NULL. + * + * The usedGpuMemory field returned is all of the memory used by the application. + * + * Keep in mind that information returned by this call is dynamic and the number of elements might change in + * time. Allocate more space for \a infos table in case new compute processes are spawned. + * + * @note In MIG mode, if device handle is provided, the API returns aggregate information, only if + * the caller has appropriate privileges. Per-instance information can be queried by using + * specific MIG device handles.
+ * Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode. + * + * @param device The device handle or MIG device handle + * @param infoCount Reference in which to provide the \a infos array size, and + * to return the number of returned elements + * @param infos Reference in which to return the process information + * + * @return + * - \ref NVML_SUCCESS if \a infoCount and \a infos have been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small + * \a infoCount will contain minimal amount of space necessary for + * the call to complete + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, either of \a infoCount or \a infos is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by \a device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see \ref nvmlSystemGetProcessName + */ + nvmlReturn_t DECLDIR nvmlDeviceGetComputeRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_v1_t *infos); + + /** @} */ // @defgroup nvmlGPMStructs + + /***************************************************************************************************/ + /** @defgroup nvmlGpmFunctions GPM Functions + * @{ + */ + /***************************************************************************************************/ + + /** + * Calculate GPM metrics from two samples. + * + * For Hopper &tm; or newer fully supported devices. + * + * @param metricsGet IN/OUT: populated \a nvmlGpmMetricsGet_t struct + * + * @return + * - \ref NVML_SUCCESS on success + * - Nonzero NVML_ERROR_? 
enum on error + */ + nvmlReturn_t DECLDIR nvmlGpmMetricsGet(nvmlGpmMetricsGet_t *metricsGet); + + /** + * Indicate whether the supplied device supports GPM + * + * @param device NVML device to query for + * @param gpmSupport Structure to indicate GPM support \a nvmlGpmSupport_t. Indicates + * GPM support per system for the supplied device + * + * @return + * - NVML_SUCCESS on success + * - Nonzero NVML_ERROR_? enum if there is an error in processing the query + */ + nvmlReturn_t DECLDIR nvmlGpmQueryDeviceSupport(nvmlDevice_t device, nvmlGpmSupport_t *gpmSupport); + + /** + * Free an allocated sample buffer that was allocated with \ref nvmlGpmSampleAlloc() + * + * For Hopper &tm; or newer fully supported devices. + * + * @param gpmSample Sample to free * * @return * - \ref NVML_SUCCESS on success @@ -1311,6 +1975,242 @@ extern "C" /** @} */ nvmlReturn_t DECLDIR nvmlDeviceGetTopologyCommonAncestor(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuTopologyLevel_t *pathInfo); + /***************************************************************************************************/ + /** @addtogroup nvmlEvents + * @{ + */ + /***************************************************************************************************/ + + /** + * Handle to an event set + */ + typedef struct + { + struct nvmlEventSet_st *handle; + } nvmlEventSet_t; + +/** @defgroup nvmlEventType Event Types + * @{ + * Event Types which user can be notified about. + * See description of particular functions for details. + * + * See \ref nvmlDeviceRegisterEvents and \ref nvmlDeviceGetSupportedEventTypes to check which devices + * support each event. + * + * Types can be combined with bitwise or operator '|' when passed to \ref nvmlDeviceRegisterEvents + */ +//! Event about single bit ECC errors +/** + * \note A corrected texture memory error is not an ECC error, so it does not generate a single bit event + */ +#define nvmlEventTypeSingleBitEccError 0x0000000000000001LL + +//! 
Event about double bit ECC errors +/** + * \note An uncorrected texture memory error is not an ECC error, so it does not generate a double bit event + */ +#define nvmlEventTypeDoubleBitEccError 0x0000000000000002LL + +//! Event about PState changes +/** + * \note On Fermi architecture PState changes are also an indicator that GPU is throttling down due to + * no work being executed on the GPU, power capping or thermal capping. In a typical situation, + * Fermi-based GPU should stay in P0 for the duration of the execution of the compute process. + */ +#define nvmlEventTypePState 0x0000000000000004LL + +//! Event that Xid critical error occurred +#define nvmlEventTypeXidCriticalError 0x0000000000000008LL + +//! Event about clock changes +/** + * Kepler only + */ +#define nvmlEventTypeClock 0x0000000000000010LL + +//! Event about AC/Battery power source changes +#define nvmlEventTypePowerSourceChange 0x0000000000000080LL + +//! Event about MIG configuration changes +#define nvmlEventMigConfigChange 0x0000000000000100LL + +//! Mask with no events +#define nvmlEventTypeNone 0x0000000000000000LL + +//! Mask of all events +#define nvmlEventTypeAll \ + (nvmlEventTypeNone | nvmlEventTypeSingleBitEccError | nvmlEventTypeDoubleBitEccError | nvmlEventTypePState | \ + nvmlEventTypeClock | nvmlEventTypeXidCriticalError | nvmlEventTypePowerSourceChange | nvmlEventMigConfigChange) + /** @} */ + + /** + * Information about occurred event + */ + typedef struct nvmlEventData_st + { + nvmlDevice_t device; //!< Specific device where the event occurred + unsigned long long eventType; //!< Information about what specific event occurred + unsigned long long + eventData; //!< Stores XID error for the device in the event of nvmlEventTypeXidCriticalError, + // eventData is 0 for any other event. eventData is set as 999 for unknown xid error. 
+ unsigned int + gpuInstanceId; //!< If MIG is enabled and nvmlEventTypeXidCriticalError event is attributable to a GPU + // instance, stores a valid GPU instance ID. gpuInstanceId is set to 0xFFFFFFFF + // otherwise. + unsigned int + computeInstanceId; //!< If MIG is enabled and nvmlEventTypeXidCriticalError event is attributable to a + // compute instance, stores a valid compute instance ID. computeInstanceId is set to + // 0xFFFFFFFF otherwise. + } nvmlEventData_t; + + /** @} */ + + /***************************************************************************************************/ + /** @defgroup nvmlEvents Event Handling Methods + * This chapter describes methods that NVML can perform against each device to register and wait for + * some event to occur. + * @{ + */ + /***************************************************************************************************/ + + /** + * Create an empty set of events. + * Event set should be freed by \ref nvmlEventSetFree + * + * For Fermi &tm; or newer fully supported devices. + * @param set Reference in which to return the event handle + * + * @return + * - \ref NVML_SUCCESS if the event has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a set is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlEventSetFree + */ + nvmlReturn_t DECLDIR nvmlEventSetCreate(nvmlEventSet_t *set); + + /** + * Starts recording of events on a specified devices and add the events to specified \ref nvmlEventSet_t + * + * For Fermi &tm; or newer fully supported devices. + * Ecc events are available only on ECC enabled devices (see \ref nvmlDeviceGetTotalEccErrors) + * Power capping events are available only on Power Management enabled devices (see \ref + * nvmlDeviceGetPowerManagementMode) + * + * For Linux only. 
+ * + * \b IMPORTANT: Operations on \a set are not thread safe + * + * This call starts recording of events on specific device. + * All events that occurred before this call are not recorded. + * Checking if some event occurred can be done with \ref nvmlEventSetWait_v2 + * + * If function reports NVML_ERROR_UNKNOWN, event set is in undefined state and should be freed. + * If function reports NVML_ERROR_NOT_SUPPORTED, event set can still be used. None of the requested eventTypes + * are registered in that case. + * + * @param device The identifier of the target device + * @param eventTypes Bitmask of \ref nvmlEventType to record + * @param set Set to which add new event types + * + * @return + * - \ref NVML_SUCCESS if the event has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a eventTypes is invalid or \a set is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the platform does not support this feature or some of requested + * event types + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlEventType + * @see nvmlDeviceGetSupportedEventTypes + * @see nvmlEventSetWait + * @see nvmlEventSetFree + */ + nvmlReturn_t DECLDIR nvmlDeviceRegisterEvents(nvmlDevice_t device, + unsigned long long eventTypes, + nvmlEventSet_t set); + + /** + * Returns information about events supported on device + * + * For Fermi &tm; or newer fully supported devices. + * + * Events are not supported on Windows. So this function returns an empty mask in \a eventTypes on Windows. 
+ * + * @param device The identifier of the target device + * @param eventTypes Reference in which to return bitmask of supported events + * + * @return + * - \ref NVML_SUCCESS if the eventTypes has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a eventTypes is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlEventType + * @see nvmlDeviceRegisterEvents + */ + nvmlReturn_t DECLDIR nvmlDeviceGetSupportedEventTypes(nvmlDevice_t device, unsigned long long *eventTypes); + + /** + * Waits on events and delivers events + * + * For Fermi &tm; or newer fully supported devices. + * + * If some events are ready to be delivered at the time of the call, function returns immediately. + * If there are no events ready to be delivered, function sleeps till event arrives + * but not longer than specified timeout. This function in certain conditions can return before + * specified timeout passes (e.g. when interrupt arrives) + * + * On Windows, in case of xid error, the function returns the most recent xid error type seen by the system. + * If there are multiple xid errors generated before nvmlEventSetWait is invoked then the last seen xid error + * type is returned for all xid error events. + * + * On Linux, every xid error event would return the associated event data and other information if applicable. + * + * In MIG mode, if device handle is provided, the API reports all the events for the available instances, + * only if the caller has appropriate privileges. In absence of required privileges, only the events which + * affect all the instances (i.e. whole device) are reported. + * + * This API does not currently support per-instance event reporting using MIG device handles.
+ * + * @param set Reference to set of events to wait on + * @param data Reference in which to return event data + * @param timeoutms Maximum amount of wait time in milliseconds for registered event + * + * @return + * - \ref NVML_SUCCESS if the data has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a data is NULL + * - \ref NVML_ERROR_TIMEOUT if no event arrived in specified timeout or interrupt arrived + * - \ref NVML_ERROR_GPU_IS_LOST if a GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlEventType + * @see nvmlDeviceRegisterEvents + */ + nvmlReturn_t DECLDIR nvmlEventSetWait_v2(nvmlEventSet_t set, nvmlEventData_t *data, unsigned int timeoutms); + + /** + * Releases events in the set + * + * For Fermi &tm; or newer fully supported devices. + * + * @param set Reference to events to be released + * + * @return + * - \ref NVML_SUCCESS if the event has been successfully released + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceRegisterEvents + */ + nvmlReturn_t DECLDIR nvmlEventSetFree(nvmlEventSet_t set); + + /** @} */ + nvmlReturn_t DECLDIR ixmlDeviceGetBoardPosition(nvmlDevice_t device, unsigned int *position); nvmlReturn_t DECLDIR ixmlDeviceGetGPUVoltage(nvmlDevice_t device, unsigned int *integer, unsigned int *decimal); diff --git a/pkg/ixml/cgo_helpers.go b/pkg/ixml/cgo_helpers.go index 92c7059..20f06a3 100644 --- a/pkg/ixml/cgo_helpers.go +++ b/pkg/ixml/cgo_helpers.go @@ -113,52 +113,6 @@ func (x *Device) PassRef() *C.nvmlDevice_t { return (*C.nvmlDevice_t)(unsafe.Pointer(x)) } -// Ref returns a reference to C object as it is. 
-func (x *EventSet) Ref() *C.nvmlEventSet_t { - if x == nil { - return nil - } - return (*C.nvmlEventSet_t)(unsafe.Pointer(x)) -} - -// Free cleanups the referenced memory using C free. -func (x *EventSet) Free() { - if x != nil { - C.free(unsafe.Pointer(x)) - } -} - -// NewEventSetRef converts the C object reference into a raw struct reference without wrapping. -func NewEventSetRef(ref unsafe.Pointer) *EventSet { - return (*EventSet)(ref) -} - -// NewEventSet allocates a new C object of this type and converts the reference into -// a raw struct reference without wrapping. -func NewEventSet() *EventSet { - return (*EventSet)(allocEventSetMemory(1)) -} - -// allocEventSetMemory allocates memory for type C.nvmlEventSet_t in C. -// The caller is responsible for freeing the this memory via C.free. -func allocEventSetMemory(n int) unsafe.Pointer { - mem, err := C.calloc(C.size_t(n), (C.size_t)(sizeOfEventSetValue)) - if mem == nil { - panic(fmt.Sprintln("memory alloc error: ", err)) - } - return mem -} - -const sizeOfEventSetValue = unsafe.Sizeof([1]C.nvmlEventSet_t{}) - -// PassRef returns a reference to C object as it is or allocates a new C object of this type. -func (x *EventSet) PassRef() *C.nvmlEventSet_t { - if x == nil { - x = (*EventSet)(allocEventSetMemory(1)) - } - return (*C.nvmlEventSet_t)(unsafe.Pointer(x)) -} - // Ref returns a reference to C object as it is. func (x *Memory) Ref() *C.nvmlMemory_t { if x == nil { @@ -167,7 +121,6 @@ func (x *Memory) Ref() *C.nvmlMemory_t { return (*C.nvmlMemory_t)(unsafe.Pointer(x)) } - // NewMemoryRef converts the C object reference into a raw struct reference without wrapping. 
func NewMemoryRef(ref unsafe.Pointer) *Memory { return (*Memory)(ref) @@ -346,3 +299,32 @@ type stringHeader struct { Data unsafe.Pointer Len int } + +func clen(n []byte) int { + for i := 0; i < len(n); i++ { + if n[i] == 0 { + return i + } + } + return len(n) +} + + +func uint32SliceToIntSlice(s []uint32) []int { + ret := make([]int, len(s)) + for i := range s { + ret[i] = int(s[i]) + } + return ret +} + +func convertSlice[T any, I any](input []T) []I { + output := make([]I, len(input)) + for i, obj := range input { + switch v := any(obj).(type) { + case I: + output[i] = v + } + } + return output +} \ No newline at end of file diff --git a/pkg/ixml/const.go b/pkg/ixml/const.go index f1bc1c1..119c273 100644 --- a/pkg/ixml/const.go +++ b/pkg/ixml/const.go @@ -13,46 +13,74 @@ package ixml import "C" const ( - // GPM_METRICS_GET_VERSION as defined in ixml/api.h:400 + // GPM_METRICS_GET_VERSION as defined in ixml/api.h:441 GPM_METRICS_GET_VERSION = 1 - // GPM_SUPPORT_VERSION as defined in ixml/api.h:411 + // MAX_PHYSICAL_BRIDGE as defined in ixml/api.h:455 + MAX_PHYSICAL_BRIDGE = 128 + // GPM_SUPPORT_VERSION as defined in ixml/api.h:603 GPM_SUPPORT_VERSION = 1 - // DEVICE_UUID_BUFFER_SIZE as defined in ixml/api.h:415 + // DEVICE_UUID_BUFFER_SIZE as defined in ixml/api.h:607 DEVICE_UUID_BUFFER_SIZE = 80 - // SYSTEM_DRIVER_VERSION_BUFFER_SIZE as defined in ixml/api.h:420 + // SYSTEM_DRIVER_VERSION_BUFFER_SIZE as defined in ixml/api.h:612 SYSTEM_DRIVER_VERSION_BUFFER_SIZE = 80 - // DEVICE_NAME_BUFFER_SIZE as defined in ixml/api.h:425 + // SYSTEM_NVML_VERSION_BUFFER_SIZE as defined in ixml/api.h:617 + SYSTEM_NVML_VERSION_BUFFER_SIZE = 80 + // DEVICE_NAME_BUFFER_SIZE as defined in ixml/api.h:622 DEVICE_NAME_BUFFER_SIZE = 64 - // DEVICE_NAME_V2_BUFFER_SIZE as defined in ixml/api.h:430 + // DEVICE_NAME_V2_BUFFER_SIZE as defined in ixml/api.h:627 DEVICE_NAME_V2_BUFFER_SIZE = 96 - // DEVICE_PCI_BUS_ID_BUFFER_SIZE as defined in ixml/api.h:435 + // DEVICE_PART_NUMBER_BUFFER_SIZE 
as defined in ixml/api.h:632 + DEVICE_PART_NUMBER_BUFFER_SIZE = 80 + // DEVICE_SERIAL_BUFFER_SIZE as defined in ixml/api.h:637 + DEVICE_SERIAL_BUFFER_SIZE = 30 + // DEVICE_VBIOS_VERSION_BUFFER_SIZE as defined in ixml/api.h:642 + DEVICE_VBIOS_VERSION_BUFFER_SIZE = 32 + // DEVICE_PCI_BUS_ID_BUFFER_SIZE as defined in ixml/api.h:647 DEVICE_PCI_BUS_ID_BUFFER_SIZE = 32 - // DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE as defined in ixml/api.h:440 + // DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE as defined in ixml/api.h:652 DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE = 16 - // HealthSYSHUBError as defined in ixml/api.h:442 + // HealthSYSHUBError as defined in ixml/api.h:654 HealthSYSHUBError = int64(0x0000000000000001) - // HealthMCError as defined in ixml/api.h:443 + // HealthMCError as defined in ixml/api.h:655 HealthMCError = int64(0x0000000000000002) - // HealthOverTempError as defined in ixml/api.h:444 + // HealthOverTempError as defined in ixml/api.h:656 HealthOverTempError = int64(0x0000000000000004) - // HealthOverVoltageError as defined in ixml/api.h:445 + // HealthOverVoltageError as defined in ixml/api.h:657 HealthOverVoltageError = int64(0x0000000000000008) - // HealthECCError as defined in ixml/api.h:446 + // HealthECCError as defined in ixml/api.h:658 HealthECCError = int64(0x0000000000000010) - // HealthMemoryError as defined in ixml/api.h:447 + // HealthMemoryError as defined in ixml/api.h:659 HealthMemoryError = int64(0x0000000000000020) - // HealthPCIEError as defined in ixml/api.h:448 + // HealthPCIEError as defined in ixml/api.h:660 HealthPCIEError = int64(0x0000000000000040) - // HealthOK as defined in ixml/api.h:449 + // HealthOK as defined in ixml/api.h:661 HealthOK = int64(0x0000000000000000) + // EventTypeSingleBitEccError as defined in ixml/api.h:2006 + EventTypeSingleBitEccError = int64(0x0000000000000001) + // EventTypeDoubleBitEccError as defined in ixml/api.h:2012 + EventTypeDoubleBitEccError = int64(0x0000000000000002) + // EventTypePState as defined in ixml/api.h:2020 + 
EventTypePState = int64(0x0000000000000004) + // EventTypeXidCriticalError as defined in ixml/api.h:2023 + EventTypeXidCriticalError = int64(0x0000000000000008) + // EventTypeClock as defined in ixml/api.h:2029 + EventTypeClock = int64(0x0000000000000010) + // EventTypePowerSourceChange as defined in ixml/api.h:2032 + EventTypePowerSourceChange = int64(0x0000000000000080) + // EventMigConfigChange as defined in ixml/api.h:2035 + EventMigConfigChange = int64(0x0000000000000100) + // EventTypeNone as defined in ixml/api.h:2038 + EventTypeNone = int64(0x0000000000000000) + // EventTypeAll as defined in ixml/api.h:2041 + EventTypeAll = (EventTypeNone | EventTypeSingleBitEccError | EventTypeDoubleBitEccError | EventTypePState | EventTypeClock | EventTypeXidCriticalError | EventTypePowerSourceChange | EventMigConfigChange) // NO_UNVERSIONED_FUNC_DEFS as defined in go-ixml/:348 NO_UNVERSIONED_FUNC_DEFS = 1 ) -// Return as declared in ixml/api.h:129 +// Return as declared in ixml/api.h:132 type Return int32 -// Return enumeration from ixml/api.h:129 +// Return enumeration from ixml/api.h:132 const ( SUCCESS Return = iota ERROR_UNINITIALIZED Return = 1 @@ -80,13 +108,51 @@ const ( ERROR_INSUFFICIENT_RESOURCES Return = 23 ERROR_FREQ_NOT_SUPPORTED Return = 24 ERROR_ARGUMENT_VERSION_MISMATCH Return = 25 + ERROR_DEPRECATED Return = 26 ERROR_UNKNOWN Return = 999 ) -// TemperatureThresholds as declared in ixml/api.h:190 +// MemoryLocation as declared in ixml/api.h:155 +type MemoryLocation int32 + +// MemoryLocation enumeration from ixml/api.h:155 +const ( + MEMORY_LOCATION_L1_CACHE MemoryLocation = iota + MEMORY_LOCATION_L2_CACHE MemoryLocation = 1 + MEMORY_LOCATION_DRAM MemoryLocation = 2 + MEMORY_LOCATION_DEVICE_MEMORY MemoryLocation = 2 + MEMORY_LOCATION_REGISTER_FILE MemoryLocation = 3 + MEMORY_LOCATION_TEXTURE_MEMORY MemoryLocation = 4 + MEMORY_LOCATION_TEXTURE_SHM MemoryLocation = 5 + MEMORY_LOCATION_CBU MemoryLocation = 6 + MEMORY_LOCATION_SRAM MemoryLocation = 7 + 
MEMORY_LOCATION_COUNT MemoryLocation = 8 +) + +// PageRetirementCause as declared in ixml/api.h:167 +type PageRetirementCause int32 + +// PageRetirementCause enumeration from ixml/api.h:167 +const ( + PAGE_RETIREMENT_CAUSE_MULTIPLE_SINGLE_BIT_ECC_ERRORS PageRetirementCause = iota + PAGE_RETIREMENT_CAUSE_DOUBLE_BIT_ECC_ERROR PageRetirementCause = 1 + PAGE_RETIREMENT_CAUSE_COUNT PageRetirementCause = 2 +) + +// RestrictedAPI as declared in ixml/api.h:180 +type RestrictedAPI int32 + +// RestrictedAPI enumeration from ixml/api.h:180 +const ( + RESTRICTED_API_SET_APPLICATION_CLOCKS RestrictedAPI = iota + RESTRICTED_API_SET_AUTO_BOOSTED_CLOCKS RestrictedAPI = 1 + RESTRICTED_API_COUNT RestrictedAPI = 2 +) + +// TemperatureThresholds as declared in ixml/api.h:231 type TemperatureThresholds int32 -// TemperatureThresholds enumeration from ixml/api.h:190 +// TemperatureThresholds enumeration from ixml/api.h:231 const ( TEMPERATURE_THRESHOLD_SHUTDOWN TemperatureThresholds = iota TEMPERATURE_THRESHOLD_SLOWDOWN TemperatureThresholds = 1 @@ -98,19 +164,28 @@ const ( TEMPERATURE_THRESHOLD_COUNT TemperatureThresholds = 7 ) -// TemperatureSensors as declared in ixml/api.h:201 +// TemperatureSensors as declared in ixml/api.h:242 type TemperatureSensors int32 -// TemperatureSensors enumeration from ixml/api.h:201 +// TemperatureSensors enumeration from ixml/api.h:242 const ( TEMPERATURE_GPU TemperatureSensors = iota TEMPERATURE_COUNT TemperatureSensors = 1 ) -// ClockType as declared in ixml/api.h:217 +// EnableState as declared in ixml/api.h:251 +type EnableState int32 + +// EnableState enumeration from ixml/api.h:251 +const ( + FEATURE_DISABLED EnableState = iota + FEATURE_ENABLED EnableState = 1 +) + +// ClockType as declared in ixml/api.h:267 type ClockType int32 -// ClockType enumeration from ixml/api.h:217 +// ClockType enumeration from ixml/api.h:267 const ( CLOCK_GRAPHICS ClockType = iota CLOCK_SM ClockType = 1 @@ -119,10 +194,10 @@ const ( CLOCK_COUNT ClockType = 4 ) -// 
GpuTopologyLevel as declared in ixml/api.h:256 +// GpuTopologyLevel as declared in ixml/api.h:306 type GpuTopologyLevel int32 -// GpuTopologyLevel enumeration from ixml/api.h:256 +// GpuTopologyLevel enumeration from ixml/api.h:306 const ( TOPOLOGY_INTERNAL GpuTopologyLevel = iota TOPOLOGY_SINGLE GpuTopologyLevel = 10 @@ -132,10 +207,10 @@ const ( TOPOLOGY_SYSTEM GpuTopologyLevel = 50 ) -// GpmMetricId as declared in ixml/api.h:345 +// GpmMetricId as declared in ixml/api.h:395 type GpmMetricId int32 -// GpmMetricId enumeration from ixml/api.h:345 +// GpmMetricId enumeration from ixml/api.h:395 const ( GPM_METRIC_GRAPHICS_UTIL GpmMetricId = 1 GPM_METRIC_SM_UTIL GpmMetricId = 2 @@ -208,3 +283,78 @@ const ( GPM_METRIC_NVLINK_L17_TX_PER_SEC GpmMetricId = 97 GPM_METRIC_MAX GpmMetricId = 98 ) + +// BridgeChipType as declared in ixml/api.h:464 +type BridgeChipType int32 + +// BridgeChipType enumeration from ixml/api.h:464 +const ( + BRIDGE_CHIP_PLX BridgeChipType = iota + BRIDGE_CHIP_BRO4 BridgeChipType = 1 +) + +// SamplingType as declared in ixml/api.h:502 +type SamplingType int32 + +// SamplingType enumeration from ixml/api.h:502 +const ( + TOTAL_POWER_SAMPLES SamplingType = iota + GPU_UTILIZATION_SAMPLES SamplingType = 1 + MEMORY_UTILIZATION_SAMPLES SamplingType = 2 + ENC_UTILIZATION_SAMPLES SamplingType = 3 + DEC_UTILIZATION_SAMPLES SamplingType = 4 + PROCESSOR_CLK_SAMPLES SamplingType = 5 + MEMORY_CLK_SAMPLES SamplingType = 6 + SAMPLINGTYPE_COUNT SamplingType = 7 +) + +// PcieUtilCounter as declared in ixml/api.h:514 +type PcieUtilCounter int32 + +// PcieUtilCounter enumeration from ixml/api.h:514 +const ( + PCIE_UTIL_TX_BYTES PcieUtilCounter = iota + PCIE_UTIL_RX_BYTES PcieUtilCounter = 1 + PCIE_UTIL_COUNT PcieUtilCounter = 2 +) + +// ValueType as declared in ixml/api.h:529 +type ValueType int32 + +// ValueType enumeration from ixml/api.h:529 +const ( + VALUE_TYPE_DOUBLE ValueType = iota + VALUE_TYPE_UNSIGNED_INT ValueType = 1 + VALUE_TYPE_UNSIGNED_LONG ValueType = 
2 + VALUE_TYPE_UNSIGNED_LONG_LONG ValueType = 3 + VALUE_TYPE_SIGNED_LONG_LONG ValueType = 4 + VALUE_TYPE_COUNT ValueType = 5 +) + +// PerfPolicyType as declared in ixml/api.h:573 +type PerfPolicyType int32 + +// PerfPolicyType enumeration from ixml/api.h:573 +const ( + PERF_POLICY_POWER PerfPolicyType = iota + PERF_POLICY_THERMAL PerfPolicyType = 1 + PERF_POLICY_SYNC_BOOST PerfPolicyType = 2 + PERF_POLICY_BOARD_LIMIT PerfPolicyType = 3 + PERF_POLICY_LOW_UTILIZATION PerfPolicyType = 4 + PERF_POLICY_RELIABILITY PerfPolicyType = 5 + PERF_POLICY_TOTAL_APP_CLOCKS PerfPolicyType = 10 + PERF_POLICY_TOTAL_BASE_CLOCKS PerfPolicyType = 11 + PERF_POLICY_COUNT PerfPolicyType = 12 +) + +// ComputeMode as declared in ixml/api.h:592 +type ComputeMode int32 + +// ComputeMode enumeration from ixml/api.h:592 +const ( + COMPUTEMODE_DEFAULT ComputeMode = iota + COMPUTEMODE_EXCLUSIVE_THREAD ComputeMode = 1 + COMPUTEMODE_PROHIBITED ComputeMode = 2 + COMPUTEMODE_EXCLUSIVE_PROCESS ComputeMode = 3 + COMPUTEMODE_COUNT ComputeMode = 4 +) diff --git a/pkg/ixml/device.go b/pkg/ixml/device.go index 57242c5..ad1cfc3 100644 --- a/pkg/ixml/device.go +++ b/pkg/ixml/device.go @@ -17,71 +17,85 @@ limitations under the License. 
package ixml +// ixml.DeviceGetCount() func DeviceGetCount() (uint, Return) { var DeviceCount uint32 ret := nvmlDeviceGetCount(&DeviceCount) return uint(DeviceCount), ret } -func DeviceGetHandleByIndex(Index uint, device *Device) Return { - ret := nvmlDeviceGetHandleByIndex(uint32(Index), device) +// ixml.DeviceGetHandleByIndex() +func DeviceGetHandleByIndex(index uint, device *Device) Return { + ret := nvmlDeviceGetHandleByIndex(uint32(index), device) return ret } +// ixml.DeviceGetHandleBySerial() +func DeviceGetHandleBySerial(serial string) (Device, Return) { + var device Device + ret := nvmlDeviceGetHandleBySerial(serial+string(rune(0)), &device) + return device, ret +} + +// ixml.DeviceGetHandleByUUID() func GetHandleByUUID(Uuid string) (Device, Return) { var Device Device ret := nvmlDeviceGetHandleByUUID(Uuid, &Device) return Device, ret } -func (device Device) GetUUID() (string, Return) { - return deviceGetUUID(device) +// ixml.DeviceGetUUID() +func DeviceGetUUID(device Device) (string, Return) { + return device.GetUUID() } -func deviceGetUUID(Device Device) (string, Return) { - Uuid := make([]byte, DEVICE_UUID_BUFFER_SIZE) - ret := nvmlDeviceGetUUID(Device, &Uuid[0], DEVICE_UUID_BUFFER_SIZE) - return removeBytesSpaces(Uuid), ret +func (device Device) GetUUID() (string, Return) { + uuid := make([]byte, DEVICE_UUID_BUFFER_SIZE) + ret := nvmlDeviceGetUUID(device, &uuid[0], DEVICE_UUID_BUFFER_SIZE) + return removeBytesSpaces(uuid), ret } -// DeviceGetHandleByPciBusId returns a handle to the device with the specified PCI bus ID. -// The PCI bus ID is a string in the format "domain:bus:device.function", e.g., "00000000:1F:00.0". +// ixml.DeviceGetHandleByPciBusId() +// The format of pciBusId is "domain:bus:device.function", e.g., "00000000:1F:00.0". 
func DeviceGetHandleByPciBusId(pciBusId string) (Device, Return) { var device Device ret := nvmlDeviceGetHandleByPciBusId_v2(pciBusId+string(rune(0)), &device) return device, ret } -func (device Device) GetMinorNumber() (int, Return) { - return deviceGetMinorNumber(device) +// ixml.DeviceGetMinorNumber() +func DeviceGetMinorNumber(device Device) (int, Return) { + return device.GetMinorNumber() } -func deviceGetMinorNumber(Device Device) (int, Return) { +func (device Device) GetMinorNumber() (int, Return) { var minorNumber uint32 - ret := nvmlDeviceGetMinorNumber(Device, &minorNumber) + ret := nvmlDeviceGetMinorNumber(device, &minorNumber) return int(minorNumber), ret } -func (device Device) GetName() (string, Return) { - return deviceGetName(device) +// ixml.DeviceGetName() +func DeviceGetName(device Device) (string, Return) { + return device.GetName() } -func deviceGetName(device Device) (string, Return) { - Name := make([]byte, DEVICE_NAME_BUFFER_SIZE) - ret := nvmlDeviceGetName(device, &Name[0], DEVICE_NAME_BUFFER_SIZE) - removeBytesSpaces(Name) - return removeBytesSpaces(Name), ret +func (device Device) GetName() (string, Return) { + name := make([]byte, DEVICE_NAME_BUFFER_SIZE) + ret := nvmlDeviceGetName(device, &name[0], DEVICE_NAME_BUFFER_SIZE) + removeBytesSpaces(name) + return removeBytesSpaces(name), ret } -func (device Device) GetTemperature() (uint32, Return) { - var SensorType TemperatureSensors - return deviceGetTemperature(device, SensorType) +// ixml.DeviceGetTemperature() +func DeviceGetTemperature(device Device) (uint32, Return) { + return device.GetTemperature() } -func deviceGetTemperature(Device Device, SensorType TemperatureSensors) (uint32, Return) { - var Temperature uint32 - ret := nvmlDeviceGetTemperature(Device, SensorType, &Temperature) - return Temperature, ret +func (device Device) GetTemperature() (uint32, Return) { + var sensorType TemperatureSensors + var temp uint32 + ret := nvmlDeviceGetTemperature(device, sensorType, &temp) + 
return temp, ret } func (device Device) GetFanSpeed() (uint32, Return) { @@ -115,8 +129,8 @@ func (device Device) GetClockInfo() (ClockInfo, Return) { func deviceGetClockInfo(Device Device) (ClockInfo, Return) { var sm, mem uint32 - ret := nvmlDeviceGetClockInfo(Device, CLOCK_SM, &sm) - ret = nvmlDeviceGetClockInfo(Device, CLOCK_MEM, &mem) + _ = nvmlDeviceGetClockInfo(Device, CLOCK_SM, &sm) + ret := nvmlDeviceGetClockInfo(Device, CLOCK_MEM, &mem) return ClockInfo{ Sm: sm, Mem: mem, @@ -158,60 +172,121 @@ func deviceGetUtilizationRates(Device Device) (Utilization, Return) { return Utilization, ret } -func (device Device) GetPciInfo() (PciInfo, Return) { - return deviceGetPciInfo(device) +// ixml.DeviceGetComputeMode() +func DeviceGetComputeMode(device Device) (ComputeMode, Return) { + return device.GetComputeMode() +} + +func (device Device) GetComputeMode() (ComputeMode, Return) { + var mode ComputeMode + ret := nvmlDeviceGetComputeMode(device, &mode) + return mode, ret +} + +// ixml.DeviceGetCudaComputeCapability() +func DeviceGetCudaComputeCapability(device Device) (int, int, Return) { + return device.GetCudaComputeCapability() +} + +func (device Device) GetCudaComputeCapability() (int, int, Return) { + var major, minor int32 + ret := nvmlDeviceGetCudaComputeCapability(device, &major, &minor) + return int(major), int(minor), ret +} + +// ixml.DeviceGetEccMode() +func DeviceGetEccMode(device Device) (EnableState, EnableState, Return) { + return device.GetEccMode() +} + +func (device Device) GetEccMode() (EnableState, EnableState, Return) { + var current, pending EnableState + ret := nvmlDeviceGetEccMode(device, &current, &pending) + return current, pending, ret +} + +// ixml.DeviceGetBoardId() +func DeviceGetBoardId(device Device) (uint32, Return) { + return device.GetBoardId() } -func deviceGetPciInfo(Device Device) (PciInfo, Return) { +func (device Device) GetBoardId() (uint32, Return) { + var boardId uint32 + ret := nvmlDeviceGetBoardId(device, &boardId) + return 
boardId, ret +} + +// ixml.DeviceGetPciInfo() +func DeviceGetPciInfo(device Device) (PciInfo, Return) { + return device.GetPciInfo() +} + +func (device Device) GetPciInfo() (PciInfo, Return) { var PciInfo PciInfo - ret := nvmlDeviceGetPciInfo(Device, &PciInfo) + ret := nvmlDeviceGetPciInfo(device, &PciInfo) return PciInfo, ret } -func (device Device) GetIndex() (int, Return) { - return deviceGetIndex(device) +// ixml.DeviceGetIndex() +func DeviceGetIndex(device Device) (int, Return) { + return device.GetIndex() } -func deviceGetIndex(device Device) (int, Return) { +func (device Device) GetIndex() (int, Return) { var Index uint32 ret := nvmlDeviceGetIndex(device, &Index) return int(Index), ret } -func (device Device) GetPowerUsage() (uint32, Return) { - return deviceGetPowerUsage(device) +// ixml.DeviceGetSerial() +func DeviceGetSerial(device Device) (string, Return) { + return device.GetSerial() +} + +func (device Device) GetSerial() (string, Return) { + serial := make([]byte, DEVICE_SERIAL_BUFFER_SIZE) + ret := nvmlDeviceGetSerial(device, &serial[0], DEVICE_SERIAL_BUFFER_SIZE) + return string(serial[:clen(serial)]), ret } -func deviceGetPowerUsage(Device Device) (uint32, Return) { +// ixml.DeviceGetPowerUsage() +func DeviceGetPowerUsage(device Device) (uint32, Return) { + return device.GetPowerUsage() +} + +func (device Device) GetPowerUsage() (uint32, Return) { var Power uint32 - ret := nvmlDeviceGetPowerUsage(Device, &Power) + ret := nvmlDeviceGetPowerUsage(device, &Power) return Power, ret } +// ixml.DeviceGetOnSameBoard() func GetOnSameBoard(device1, device2 Device) (int, Return) { var OnSameBoard int32 ret := nvmlDeviceOnSameBoard(device1, device2, &OnSameBoard) return int(OnSameBoard), ret } -func (device Device) GetBoardPosition() (uint32, Return) { - return deviceGetBoardPosition(device) +// ixml.DeviceGetBoardPosition() +func DeviceGetBoardPosition(device Device) (uint32, Return) { + return device.GetBoardPosition() } -func deviceGetBoardPosition(device 
Device) (uint32, Return) { +func (device Device) GetBoardPosition() (uint32, Return) { var pos uint32 ret := ixmlDeviceGetBoardPosition(device, &pos) return pos, ret } -func (device Device) GetGPUVoltage() (uint32, uint32, Return) { - return deviceGetGPUVoltage(device) +// ixml.DeviceGetGPUVoltage() +func DeviceGetGPUVoltage(device Device) (uint32, uint32, Return) { + return device.GetGPUVoltage() } -func deviceGetGPUVoltage(device Device) (uint32, uint32, Return) { - var Integer, Decimal uint32 - ret := ixmlDeviceGetGPUVoltage(device, &Integer, &Decimal) - return Integer, Decimal, ret +func (device Device) GetGPUVoltage() (uint32, uint32, Return) { + var integer, decimal uint32 + ret := ixmlDeviceGetGPUVoltage(device, &integer, &decimal) + return integer, decimal, ret } type Info struct { @@ -250,78 +325,141 @@ func deviceGetComputeRunningProcesses(device Device) ([]ProcessInfo_v1, Return) } } -func (device Device) GetCurrentClocksThrottleReasons() (uint64, Return) { - return deviceGetCurrentClocksThrottleReasons(device) +// ixml.DeviceGetCurrentClocksThrottleReasons() +func DeviceGetCurrentClocksThrottleReasons(device Device) (uint64, Return) { + return device.GetCurrentClocksThrottleReasons() } -func deviceGetCurrentClocksThrottleReasons(device Device) (uint64, Return) { +func (device Device) GetCurrentClocksThrottleReasons() (uint64, Return) { var clocksThrottleReasons uint64 ret := nvmlDeviceGetCurrentClocksThrottleReasons(device, &clocksThrottleReasons) return clocksThrottleReasons, ret } -func (device Device) GetPcieReplayCounter() (uint32, Return) { - return deviceGetPcieReplayCounter(device) +// ixml.DeviceGetMaxPcieLinkGeneration() +func DeviceGetMaxPcieLinkGeneration(device Device) (int, Return) { + return device.GetMaxPcieLinkGeneration() +} + +func (device Device) GetMaxPcieLinkGeneration() (int, Return) { + var maxLinkGen uint32 + ret := nvmlDeviceGetMaxPcieLinkGeneration(device, &maxLinkGen) + return int(maxLinkGen), ret +} + +// 
ixml.DeviceGetMaxPcieLinkWidth() +func DeviceGetMaxPcieLinkWidth(device Device) (int, Return) { + return device.GetMaxPcieLinkWidth() +} + +func (device Device) GetMaxPcieLinkWidth() (int, Return) { + var maxLinkWidth uint32 + ret := nvmlDeviceGetMaxPcieLinkWidth(device, &maxLinkWidth) + return int(maxLinkWidth), ret } -func deviceGetPcieReplayCounter(device Device) (uint32, Return) { +// ixml.DeviceGetCurrPcieLinkGeneration() +func DeviceGetCurrPcieLinkGeneration(device Device) (int, Return) { + return device.GetCurrPcieLinkGeneration() +} + +func (device Device) GetCurrPcieLinkGeneration() (int, Return) { + var currLinkGen uint32 + ret := nvmlDeviceGetCurrPcieLinkGeneration(device, &currLinkGen) + return int(currLinkGen), ret +} + +// ixml.DeviceGetCurrPcieLinkWidth() +func DeviceGetCurrPcieLinkWidth(device Device) (int, Return) { + return device.GetCurrPcieLinkWidth() +} + +func (device Device) GetCurrPcieLinkWidth() (int, Return) { + var currLinkWidth uint32 + ret := nvmlDeviceGetCurrPcieLinkWidth(device, &currLinkWidth) + return int(currLinkWidth), ret +} + +// ixml.DeviceGetPcieThroughput() +func DeviceGetPcieThroughput(device Device, counter PcieUtilCounter) (uint32, Return) { + return device.GetPcieThroughput(counter) +} + +func (device Device) GetPcieThroughput(counter PcieUtilCounter) (uint32, Return) { + var value uint32 + ret := nvmlDeviceGetPcieThroughput(device, counter, &value) + return value, ret +} + +// ixml.DeviceGetPcieReplayCounter() +func DeviceGetPcieReplayCounter(device Device) (uint32, Return) { + return device.GetPcieReplayCounter() +} + +func (device Device) GetPcieReplayCounter() (uint32, Return) { var value uint32 ret := nvmlDeviceGetPcieReplayCounter(device, &value) return value, ret } -func (device Device) GetEccErros() (uint32, uint32, Return) { - return deviceGetEccErros(device) +// ixml.DeviceGetEccErros() +func DeviceGetEccErros(device Device) (uint32, uint32, Return) { + return device.GetEccErros() } -func 
deviceGetEccErros(device Device) (uint32, uint32, Return) { +func (device Device) GetEccErros() (uint32, uint32, Return) { var singleErr, doubleErr uint32 ret := ixmlDeviceGetEccErros(device, &singleErr, &doubleErr) return singleErr, doubleErr, ret } -func (device Device) GetHealth() (uint64, Return) { - return deviceGetHealth(device) +// ixml.DeviceGetHealth() +func DeviceGetHealth(device Device) (uint64, Return) { + return device.GetHealth() } -func deviceGetHealth(device Device) (uint64, Return) { +func (device Device) GetHealth() (uint64, Return) { var health uint64 ret := ixmlDeviceGetHealth(device, &health) return health, ret } -func (device Device) GetTopology(device2 Device) (GpuTopologyLevel, Return) { - return deviceGetTopology(device, device2) +// ixml.DeviceGetTopology() +func DeviceGetTopology(device1, device2 Device) (GpuTopologyLevel, Return) { + return device1.GetTopology(device2) } -func deviceGetTopology(device, device2 Device) (GpuTopologyLevel, Return) { +func (device Device) GetTopology(device2 Device) (GpuTopologyLevel, Return) { var pathInfo GpuTopologyLevel ret := nvmlDeviceGetTopologyCommonAncestor(device, device2, &pathInfo) return pathInfo, ret } +// ixml.DeviceGetPowerManagementLimit() +func DeviceGetPowerManagementLimit(device Device) (uint32, Return) { + return device.GetPowerManagementLimit() +} + func (device Device) GetPowerManagementLimit() (uint32, Return) { var limit uint32 ret := nvmlDeviceGetPowerManagementLimit(device, &limit) return limit, ret } -func DeviceGetPowerManagementLimit(device Device) (uint32, Return) { - return device.GetPowerManagementLimit() +// ixml.DeviceGetPowerManagementLimitConstraints() +func DeviceGetPowerManagementLimitConstraints(device Device) (uint32, uint32, Return) { + return device.GetPowerManagementLimitConstraints() } func (device Device) GetPowerManagementLimitConstraints() (uint32, uint32, Return) { - return deviceGetPowerManagementLimitConstraints(device) -} - -func 
deviceGetPowerManagementLimitConstraints(device Device) (uint32, uint32, Return) { var minLimit, maxLimit uint32 ret := nvmlDeviceGetPowerManagementLimitConstraints(device, &minLimit, &maxLimit) return minLimit, maxLimit, ret } -func DeviceGetPowerManagementLimitConstraints(device Device) (uint32, uint32, Return) { - return device.GetPowerManagementLimitConstraints() +// ixml.DeviceGetPowerManagementDefaultLimit() +func DeviceGetPowerManagementDefaultLimit(device Device) (uint32, Return) { + return device.GetPowerManagementDefaultLimit() } func (device Device) GetPowerManagementDefaultLimit() (uint32, Return) { @@ -330,8 +468,9 @@ func (device Device) GetPowerManagementDefaultLimit() (uint32, Return) { return defaultLimit, ret } -func DeviceGetPowerManagementDefaultLimit(device Device) (uint32, Return) { - return device.GetPowerManagementDefaultLimit() +// ixml.DeviceGetTemperatureThreshold() +func DeviceGetTemperatureThreshold(device Device, thresholdType TemperatureThresholds) (uint32, Return) { + return device.GetTemperatureThreshold(thresholdType) } func (device Device) GetTemperatureThreshold(thresholdType TemperatureThresholds) (uint32, Return) { @@ -340,6 +479,33 @@ func (device Device) GetTemperatureThreshold(thresholdType TemperatureThresholds return temp, ret } -func DeviceGetTemperatureThreshold(device Device, thresholdType TemperatureThresholds) (uint32, Return) { - return device.GetTemperatureThreshold(thresholdType) +// ixml.DeviceRegisterEvents() +func DeviceRegisterEvents(device Device, eventTypes uint64, set EventSet) Return { + return device.RegisterEvents(eventTypes, set) +} + +func (device Device) RegisterEvents(eventTypes uint64, set EventSet) Return { + return nvmlDeviceRegisterEvents(device, eventTypes, set.(nvmlEventSet)) +} + +// ixml.DeviceGetSupportedEventTypes() +func DeviceGetSupportedEventTypes(device Device) (uint64, Return) { + return device.GetSupportedEventTypes() +} + +func (device Device) GetSupportedEventTypes() (uint64, Return) 
{ + var eventTypes uint64 + ret := nvmlDeviceGetSupportedEventTypes(device, &eventTypes) + return eventTypes, ret +} + +// ixml.DeviceGetBoardPartNumber() +func DeviceGetBoardPartNumber(device Device) (string, Return) { + return device.GetBoardPartNumber() +} + +func (device Device) GetBoardPartNumber() (string, Return) { + partNumber := make([]byte, DEVICE_PART_NUMBER_BUFFER_SIZE) + ret := nvmlDeviceGetBoardPartNumber(device, &partNumber[0], DEVICE_PART_NUMBER_BUFFER_SIZE) + return string(partNumber[:clen(partNumber)]), ret } diff --git a/pkg/ixml/event_set.go b/pkg/ixml/event_set.go new file mode 100644 index 0000000..5fefcf7 --- /dev/null +++ b/pkg/ixml/event_set.go @@ -0,0 +1,66 @@ +/* +Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package ixml + +// EventSet represents the interface for the nvmlEventSet type. 
+type EventSet interface { + Free() Return + Wait(uint32) (EventData, Return) +} + +// EventData includes an interface type for Device instead of nvmlDevice +type EventData struct { + Device Device + EventType uint64 + EventData uint64 + GpuInstanceId uint32 + ComputeInstanceId uint32 +} + +func (e nvmlEventData) convert() EventData { + return EventData(e) +} + +// ixml.EventSetCreate() +func EventSetCreate() (EventSet, Return) { + var Set nvmlEventSet + ret := nvmlEventSetCreate(&Set) + return Set, ret +} + +// ixml.EventSetWait() +func EventSetWait(set EventSet, timeoutms uint32) (EventData, Return) { + return set.Wait(timeoutms) +} + +// Wait waits for a registered event in the EventSet for up to timeoutms milliseconds. +func (set nvmlEventSet) Wait(timeoutms uint32) (EventData, Return) { + var data nvmlEventData + ret := nvmlEventSetWait(set, &data, timeoutms) + return data.convert(), ret +} + +// ixml.EventSetFree() +func EventSetFree(set EventSet) Return { + return set.Free() +} + +func (set nvmlEventSet) Free() Return { + return nvmlEventSetFree(set) +} diff --git a/pkg/ixml/ixml.go b/pkg/ixml/ixml.go index 1335cb0..0752243 100644 --- a/pkg/ixml/ixml.go +++ b/pkg/ixml/ixml.go @@ -16,21 +16,21 @@ import ( "unsafe" ) -// nvmlInit function as declared in ixml/api.h:495 +// nvmlInit function as declared in ixml/api.h:707 func nvmlInit() Return { __ret := C.nvmlInit_v2() __v := (Return)(__ret) return __v } -// nvmlShutdown function as declared in ixml/api.h:512 +// nvmlShutdown function as declared in ixml/api.h:724 func nvmlShutdown() Return { __ret := C.nvmlShutdown() __v := (Return)(__ret) return __v } -// nvmlDeviceGetCount function as declared in ixml/api.h:534 +// nvmlDeviceGetCount function as declared in ixml/api.h:746 func nvmlDeviceGetCount(DeviceCount *uint32) Return { cDeviceCount, cDeviceCountAllocMap := (*C.uint)(unsafe.Pointer(DeviceCount)), cgoAllocsUnknown __ret := C.nvmlDeviceGetCount_v2(cDeviceCount) @@ -39,7 +39,7 @@ func 
nvmlDeviceGetCount(DeviceCount *uint32) Return { return __v } -// nvmlDeviceGetHandleByIndex function as declared in ixml/api.h:582 +// nvmlDeviceGetHandleByIndex function as declared in ixml/api.h:794 func nvmlDeviceGetHandleByIndex(Index uint32, Device *Device) Return { cIndex, cIndexAllocMap := (C.uint)(Index), cgoAllocsUnknown cDevice, cDeviceAllocMap := (*C.nvmlDevice_t)(unsafe.Pointer(Device)), cgoAllocsUnknown @@ -50,7 +50,7 @@ func nvmlDeviceGetHandleByIndex(Index uint32, Device *Device) Return { return __v } -// nvmlDeviceGetHandleByUUID function as declared in ixml/api.h:607 +// nvmlDeviceGetHandleByUUID function as declared in ixml/api.h:819 func nvmlDeviceGetHandleByUUID(Uuid string, Device *Device) Return { cUuid, cUuidAllocMap := unpackPCharString(Uuid) cDevice, cDeviceAllocMap := (*C.nvmlDevice_t)(unsafe.Pointer(Device)), cgoAllocsUnknown @@ -61,7 +61,7 @@ func nvmlDeviceGetHandleByUUID(Uuid string, Device *Device) Return { return __v } -// nvmlDeviceGetHandleByPciBusId_v2 function as declared in ixml/api.h:637 +// nvmlDeviceGetHandleByPciBusId_v2 function as declared in ixml/api.h:849 func nvmlDeviceGetHandleByPciBusId_v2(PciBusId string, Device *Device) Return { cPciBusId, cPciBusIdAllocMap := unpackPCharString(PciBusId) cDevice, cDeviceAllocMap := (*C.nvmlDevice_t)(unsafe.Pointer(Device)), cgoAllocsUnknown @@ -72,7 +72,7 @@ func nvmlDeviceGetHandleByPciBusId_v2(PciBusId string, Device *Device) Return { return __v } -// nvmlDeviceGetMinorNumber function as declared in ixml/api.h:656 +// nvmlDeviceGetMinorNumber function as declared in ixml/api.h:868 func nvmlDeviceGetMinorNumber(Device Device, MinorNumber *uint32) Return { cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown cMinorNumber, cMinorNumberAllocMap := (*C.uint)(unsafe.Pointer(MinorNumber)), cgoAllocsUnknown @@ -83,7 +83,7 @@ func nvmlDeviceGetMinorNumber(Device Device, MinorNumber *uint32) Return { return __v } -// nvmlDeviceGetUUID function as 
declared in ixml/api.h:684 +// nvmlDeviceGetUUID function as declared in ixml/api.h:896 func nvmlDeviceGetUUID(Device Device, Uuid *byte, Length uint32) Return { cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown cUuid, cUuidAllocMap := (*C.char)(unsafe.Pointer(Uuid)), cgoAllocsUnknown @@ -96,7 +96,7 @@ func nvmlDeviceGetUUID(Device Device, Uuid *byte, Length uint32) Return { return __v } -// nvmlDeviceGetName function as declared in ixml/api.h:710 +// nvmlDeviceGetName function as declared in ixml/api.h:922 func nvmlDeviceGetName(Device Device, Name *byte, Length uint32) Return { cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown cName, cNameAllocMap := (*C.char)(unsafe.Pointer(Name)), cgoAllocsUnknown @@ -109,7 +109,7 @@ func nvmlDeviceGetName(Device Device, Name *byte, Length uint32) Return { return __v } -// nvmlSystemGetDriverVersion function as declared in ixml/api.h:729 +// nvmlSystemGetDriverVersion function as declared in ixml/api.h:941 func nvmlSystemGetDriverVersion(Version *byte, Length uint32) Return { cVersion, cVersionAllocMap := (*C.char)(unsafe.Pointer(Version)), cgoAllocsUnknown cLength, cLengthAllocMap := (C.uint)(Length), cgoAllocsUnknown @@ -120,7 +120,18 @@ func nvmlSystemGetDriverVersion(Version *byte, Length uint32) Return { return __v } -// nvmlSystemGetCudaDriverVersion function as declared in ixml/api.h:745 +// nvmlSystemGetNVMLVersion function as declared in ixml/api.h:959 +func nvmlSystemGetNVMLVersion(Version *byte, Length uint32) Return { + cVersion, cVersionAllocMap := (*C.char)(unsafe.Pointer(Version)), cgoAllocsUnknown + cLength, cLengthAllocMap := (C.uint)(Length), cgoAllocsUnknown + __ret := C.nvmlSystemGetNVMLVersion(cVersion, cLength) + runtime.KeepAlive(cLengthAllocMap) + runtime.KeepAlive(cVersionAllocMap) + __v := (Return)(__ret) + return __v +} + +// nvmlSystemGetCudaDriverVersion function as declared in ixml/api.h:975 func 
nvmlSystemGetCudaDriverVersion(CudaDriverVersion *int32) Return { cCudaDriverVersion, cCudaDriverVersionAllocMap := (*C.int)(unsafe.Pointer(CudaDriverVersion)), cgoAllocsUnknown __ret := C.nvmlSystemGetCudaDriverVersion(cCudaDriverVersion) @@ -129,7 +140,7 @@ func nvmlSystemGetCudaDriverVersion(CudaDriverVersion *int32) Return { return __v } -// nvmlSystemGetCudaDriverVersion_v2 function as declared in ixml/api.h:762 +// nvmlSystemGetCudaDriverVersion_v2 function as declared in ixml/api.h:992 func nvmlSystemGetCudaDriverVersion_v2(CudaDriverVersion *int32) Return { cCudaDriverVersion, cCudaDriverVersionAllocMap := (*C.int)(unsafe.Pointer(CudaDriverVersion)), cgoAllocsUnknown __ret := C.nvmlSystemGetCudaDriverVersion_v2(cCudaDriverVersion) @@ -138,7 +149,7 @@ func nvmlSystemGetCudaDriverVersion_v2(CudaDriverVersion *int32) Return { return __v } -// nvmlDeviceGetTemperature function as declared in ixml/api.h:783 +// nvmlDeviceGetTemperature function as declared in ixml/api.h:1013 func nvmlDeviceGetTemperature(Device Device, SensorType TemperatureSensors, Temp *uint32) Return { cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown cSensorType, cSensorTypeAllocMap := (C.nvmlTemperatureSensors_t)(SensorType), cgoAllocsUnknown @@ -151,7 +162,7 @@ func nvmlDeviceGetTemperature(Device Device, SensorType TemperatureSensors, Temp return __v } -// nvmlDeviceGetTemperatureThreshold function as declared in ixml/api.h:804 +// nvmlDeviceGetTemperatureThreshold function as declared in ixml/api.h:1034 func nvmlDeviceGetTemperatureThreshold(Device Device, ThresholdType TemperatureThresholds, Temp *uint32) Return { cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown cThresholdType, cThresholdTypeAllocMap := (C.nvmlTemperatureThresholds_t)(ThresholdType), cgoAllocsUnknown @@ -164,7 +175,7 @@ func nvmlDeviceGetTemperatureThreshold(Device Device, ThresholdType TemperatureT return __v } -// nvmlDeviceGetFanSpeed 
function as declared in ixml/api.h:830 +// nvmlDeviceGetFanSpeed function as declared in ixml/api.h:1060 func nvmlDeviceGetFanSpeed(Device Device, Speed *uint32) Return { cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown cSpeed, cSpeedAllocMap := (*C.uint)(unsafe.Pointer(Speed)), cgoAllocsUnknown @@ -175,7 +186,7 @@ func nvmlDeviceGetFanSpeed(Device Device, Speed *uint32) Return { return __v } -// nvmlDeviceGetClockInfo function as declared in ixml/api.h:851 +// nvmlDeviceGetClockInfo function as declared in ixml/api.h:1081 func nvmlDeviceGetClockInfo(Device Device, _type ClockType, Clock *uint32) Return { cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown c_type, c_typeAllocMap := (C.nvmlClockType_t)(_type), cgoAllocsUnknown @@ -188,7 +199,7 @@ func nvmlDeviceGetClockInfo(Device Device, _type ClockType, Clock *uint32) Retur return __v } -// nvmlDeviceGetMemoryInfo function as declared in ixml/api.h:884 +// nvmlDeviceGetMemoryInfo function as declared in ixml/api.h:1114 func nvmlDeviceGetMemoryInfo(Device Device, Memory *Memory) Return { cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown cMemory, cMemoryAllocMap := (*C.nvmlMemory_t)(unsafe.Pointer(Memory)), cgoAllocsUnknown @@ -199,7 +210,7 @@ func nvmlDeviceGetMemoryInfo(Device Device, Memory *Memory) Return { return __v } -// nvmlDeviceGetMemoryInfo_v2 function as declared in ixml/api.h:885 +// nvmlDeviceGetMemoryInfo_v2 function as declared in ixml/api.h:1115 func nvmlDeviceGetMemoryInfo_v2(Device Device, Memory *Memory_v2) Return { cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown cMemory, cMemoryAllocMap := (*C.nvmlMemory_v2_t)(unsafe.Pointer(Memory)), cgoAllocsUnknown @@ -210,7 +221,7 @@ func nvmlDeviceGetMemoryInfo_v2(Device Device, Memory *Memory_v2) Return { return __v } -// nvmlDeviceGetFanSpeed_v2 function as declared in ixml/api.h:910 +// 
nvmlDeviceGetFanSpeed_v2 function as declared in ixml/api.h:1140 func nvmlDeviceGetFanSpeed_v2(Device Device, Fan uint32, Speed *uint32) Return { cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown cFan, cFanAllocMap := (C.uint)(Fan), cgoAllocsUnknown @@ -223,7 +234,7 @@ func nvmlDeviceGetFanSpeed_v2(Device Device, Fan uint32, Speed *uint32) Return { return __v } -// nvmlDeviceGetUtilizationRates function as declared in ixml/api.h:935 +// nvmlDeviceGetUtilizationRates function as declared in ixml/api.h:1165 func nvmlDeviceGetUtilizationRates(Device Device, Utilization *Utilization) Return { cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown cUtilization, cUtilizationAllocMap := (*C.nvmlUtilization_t)(unsafe.Pointer(Utilization)), cgoAllocsUnknown @@ -234,7 +245,77 @@ func nvmlDeviceGetUtilizationRates(Device Device, Utilization *Utilization) Retu return __v } -// nvmlDeviceGetPciInfo function as declared in ixml/api.h:954 +// nvmlDeviceGetComputeMode function as declared in ixml/api.h:1187 +func nvmlDeviceGetComputeMode(Device Device, Mode *ComputeMode) Return { + cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cMode, cModeAllocMap := (*C.nvmlComputeMode_t)(unsafe.Pointer(Mode)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetComputeMode(cDevice, cMode) + runtime.KeepAlive(cModeAllocMap) + runtime.KeepAlive(cDeviceAllocMap) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetCudaComputeCapability function as declared in ixml/api.h:1211 +func nvmlDeviceGetCudaComputeCapability(Device Device, Major *int32, Minor *int32) Return { + cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cMajor, cMajorAllocMap := (*C.int)(unsafe.Pointer(Major)), cgoAllocsUnknown + cMinor, cMinorAllocMap := (*C.int)(unsafe.Pointer(Minor)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetCudaComputeCapability(cDevice, cMajor, cMinor) 
+ runtime.KeepAlive(cMinorAllocMap) + runtime.KeepAlive(cMajorAllocMap) + runtime.KeepAlive(cDeviceAllocMap) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetEccMode function as declared in ixml/api.h:1239 +func nvmlDeviceGetEccMode(Device Device, Current *EnableState, Pending *EnableState) Return { + cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cCurrent, cCurrentAllocMap := (*C.nvmlEnableState_t)(unsafe.Pointer(Current)), cgoAllocsUnknown + cPending, cPendingAllocMap := (*C.nvmlEnableState_t)(unsafe.Pointer(Pending)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetEccMode(cDevice, cCurrent, cPending) + runtime.KeepAlive(cPendingAllocMap) + runtime.KeepAlive(cCurrentAllocMap) + runtime.KeepAlive(cDeviceAllocMap) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetDefaultEccMode function as declared in ixml/api.h:1265 +func nvmlDeviceGetDefaultEccMode(Device Device, DefaultMode *EnableState) Return { + cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cDefaultMode, cDefaultModeAllocMap := (*C.nvmlEnableState_t)(unsafe.Pointer(DefaultMode)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetDefaultEccMode(cDevice, cDefaultMode) + runtime.KeepAlive(cDefaultModeAllocMap) + runtime.KeepAlive(cDeviceAllocMap) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetBoardId function as declared in ixml/api.h:1290 +func nvmlDeviceGetBoardId(Device Device, BoardId *uint32) Return { + cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cBoardId, cBoardIdAllocMap := (*C.uint)(unsafe.Pointer(BoardId)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetBoardId(cDevice, cBoardId) + runtime.KeepAlive(cBoardIdAllocMap) + runtime.KeepAlive(cDeviceAllocMap) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetMultiGpuBoard function as declared in ixml/api.h:1310 +func nvmlDeviceGetMultiGpuBoard(Device Device, MultiGpuBool *uint32) Return { + 
cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cMultiGpuBool, cMultiGpuBoolAllocMap := (*C.uint)(unsafe.Pointer(MultiGpuBool)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetMultiGpuBoard(cDevice, cMultiGpuBool) + runtime.KeepAlive(cMultiGpuBoolAllocMap) + runtime.KeepAlive(cDeviceAllocMap) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetPciInfo function as declared in ixml/api.h:1329 func nvmlDeviceGetPciInfo(Device Device, Pci *PciInfo) Return { cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown cPci, cPciAllocMap := (*C.nvmlPciInfo_t)(unsafe.Pointer(Pci)), cgoAllocsUnknown @@ -245,7 +326,75 @@ func nvmlDeviceGetPciInfo(Device Device, Pci *PciInfo) Return { return __v } -// nvmlDeviceGetIndex function as declared in ixml/api.h:988 +// nvmlDeviceGetMaxPcieLinkGeneration function as declared in ixml/api.h:1350 +func nvmlDeviceGetMaxPcieLinkGeneration(Device Device, MaxLinkGen *uint32) Return { + cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cMaxLinkGen, cMaxLinkGenAllocMap := (*C.uint)(unsafe.Pointer(MaxLinkGen)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetMaxPcieLinkGeneration(cDevice, cMaxLinkGen) + runtime.KeepAlive(cMaxLinkGenAllocMap) + runtime.KeepAlive(cDeviceAllocMap) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetMaxPcieLinkWidth function as declared in ixml/api.h:1371 +func nvmlDeviceGetMaxPcieLinkWidth(Device Device, MaxLinkWidth *uint32) Return { + cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cMaxLinkWidth, cMaxLinkWidthAllocMap := (*C.uint)(unsafe.Pointer(MaxLinkWidth)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetMaxPcieLinkWidth(cDevice, cMaxLinkWidth) + runtime.KeepAlive(cMaxLinkWidthAllocMap) + runtime.KeepAlive(cDeviceAllocMap) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetCurrPcieLinkGeneration function as declared in ixml/api.h:1389 
+func nvmlDeviceGetCurrPcieLinkGeneration(Device Device, CurrLinkGen *uint32) Return { + cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cCurrLinkGen, cCurrLinkGenAllocMap := (*C.uint)(unsafe.Pointer(CurrLinkGen)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetCurrPcieLinkGeneration(cDevice, cCurrLinkGen) + runtime.KeepAlive(cCurrLinkGenAllocMap) + runtime.KeepAlive(cDeviceAllocMap) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetCurrPcieLinkWidth function as declared in ixml/api.h:1407 +func nvmlDeviceGetCurrPcieLinkWidth(Device Device, CurrLinkWidth *uint32) Return { + cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cCurrLinkWidth, cCurrLinkWidthAllocMap := (*C.uint)(unsafe.Pointer(CurrLinkWidth)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetCurrPcieLinkWidth(cDevice, cCurrLinkWidth) + runtime.KeepAlive(cCurrLinkWidthAllocMap) + runtime.KeepAlive(cDeviceAllocMap) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetPcieThroughput function as declared in ixml/api.h:1431 +func nvmlDeviceGetPcieThroughput(Device Device, Counter PcieUtilCounter, Value *uint32) Return { + cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cCounter, cCounterAllocMap := (C.nvmlPcieUtilCounter_t)(Counter), cgoAllocsUnknown + cValue, cValueAllocMap := (*C.uint)(unsafe.Pointer(Value)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetPcieThroughput(cDevice, cCounter, cValue) + runtime.KeepAlive(cValueAllocMap) + runtime.KeepAlive(cCounterAllocMap) + runtime.KeepAlive(cDeviceAllocMap) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetPcieReplayCounter function as declared in ixml/api.h:1455 +func nvmlDeviceGetPcieReplayCounter(Device Device, Value *uint32) Return { + cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cValue, cValueAllocMap := (*C.uint)(unsafe.Pointer(Value)), cgoAllocsUnknown + __ret := 
C.nvmlDeviceGetPcieReplayCounter(cDevice, cValue) + runtime.KeepAlive(cValueAllocMap) + runtime.KeepAlive(cDeviceAllocMap) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetIndex function as declared in ixml/api.h:1489 func nvmlDeviceGetIndex(Device Device, Index *uint32) Return { cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown cIndex, cIndexAllocMap := (*C.uint)(unsafe.Pointer(Index)), cgoAllocsUnknown @@ -256,7 +405,68 @@ func nvmlDeviceGetIndex(Device Device, Index *uint32) Return { return __v } -// nvmlDeviceGetPowerUsage function as declared in ixml/api.h:1010 +// nvmlDeviceGetSerial function as declared in ixml/api.h:1513 +func nvmlDeviceGetSerial(Device Device, Serial *byte, Length uint32) Return { + cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cSerial, cSerialAllocMap := (*C.char)(unsafe.Pointer(Serial)), cgoAllocsUnknown + cLength, cLengthAllocMap := (C.uint)(Length), cgoAllocsUnknown + __ret := C.nvmlDeviceGetSerial(cDevice, cSerial, cLength) + runtime.KeepAlive(cLengthAllocMap) + runtime.KeepAlive(cSerialAllocMap) + runtime.KeepAlive(cDeviceAllocMap) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetHandleBySerial function as declared in ixml/api.h:1548 +func nvmlDeviceGetHandleBySerial(Serial string, Device *Device) Return { + cSerial, cSerialAllocMap := unpackPCharString(Serial) + cDevice, cDeviceAllocMap := (*C.nvmlDevice_t)(unsafe.Pointer(Device)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetHandleBySerial(cSerial, cDevice) + runtime.KeepAlive(cDeviceAllocMap) + runtime.KeepAlive(cSerialAllocMap) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetVbiosVersion function as declared in ixml/api.h:1570 +func nvmlDeviceGetVbiosVersion(Device Device, Version *byte, Length uint32) Return { + cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cVersion, cVersionAllocMap := 
(*C.char)(unsafe.Pointer(Version)), cgoAllocsUnknown + cLength, cLengthAllocMap := (C.uint)(Length), cgoAllocsUnknown + __ret := C.nvmlDeviceGetVbiosVersion(cDevice, cVersion, cLength) + runtime.KeepAlive(cLengthAllocMap) + runtime.KeepAlive(cVersionAllocMap) + runtime.KeepAlive(cDeviceAllocMap) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetBoardPartNumber function as declared in ixml/api.h:1590 +func nvmlDeviceGetBoardPartNumber(Device Device, PartNumber *byte, Length uint32) Return { + cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cPartNumber, cPartNumberAllocMap := (*C.char)(unsafe.Pointer(PartNumber)), cgoAllocsUnknown + cLength, cLengthAllocMap := (C.uint)(Length), cgoAllocsUnknown + __ret := C.nvmlDeviceGetBoardPartNumber(cDevice, cPartNumber, cLength) + runtime.KeepAlive(cLengthAllocMap) + runtime.KeepAlive(cPartNumberAllocMap) + runtime.KeepAlive(cDeviceAllocMap) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceSetEccMode function as declared in ixml/api.h:1620 +func nvmlDeviceSetEccMode(Device Device, Ecc EnableState) Return { + cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cEcc, cEccAllocMap := (C.nvmlEnableState_t)(Ecc), cgoAllocsUnknown + __ret := C.nvmlDeviceSetEccMode(cDevice, cEcc) + runtime.KeepAlive(cEccAllocMap) + runtime.KeepAlive(cDeviceAllocMap) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetPowerUsage function as declared in ixml/api.h:1696 func nvmlDeviceGetPowerUsage(Device Device, Power *uint32) Return { cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown cPower, cPowerAllocMap := (*C.uint)(unsafe.Pointer(Power)), cgoAllocsUnknown @@ -267,7 +477,7 @@ func nvmlDeviceGetPowerUsage(Device Device, Power *uint32) Return { return __v } -// nvmlDeviceOnSameBoard function as declared in ixml/api.h:1030 +// nvmlDeviceOnSameBoard function as declared in ixml/api.h:1716 func 
nvmlDeviceOnSameBoard(Device1 Device, Device2 Device, OnSameBoard *int32) Return { cDevice1, cDevice1AllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device1)), cgoAllocsUnknown cDevice2, cDevice2AllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device2)), cgoAllocsUnknown @@ -280,7 +490,7 @@ func nvmlDeviceOnSameBoard(Device1 Device, Device2 Device, OnSameBoard *int32) R return __v } -// nvmlDeviceGetComputeRunningProcesses function as declared in ixml/api.h:1073 +// nvmlDeviceGetComputeRunningProcesses function as declared in ixml/api.h:1759 func nvmlDeviceGetComputeRunningProcesses(Device Device, InfoCount *uint32, Infos *ProcessInfo_v1) Return { cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown cInfoCount, cInfoCountAllocMap := (*C.uint)(unsafe.Pointer(InfoCount)), cgoAllocsUnknown @@ -293,18 +503,7 @@ func nvmlDeviceGetComputeRunningProcesses(Device Device, InfoCount *uint32, Info return __v } -// nvmlDeviceGetPcieReplayCounter function as declared in ixml/api.h:1095 -func nvmlDeviceGetPcieReplayCounter(Device Device, Value *uint32) Return { - cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown - cValue, cValueAllocMap := (*C.uint)(unsafe.Pointer(Value)), cgoAllocsUnknown - __ret := C.nvmlDeviceGetPcieReplayCounter(cDevice, cValue) - runtime.KeepAlive(cValueAllocMap) - runtime.KeepAlive(cDeviceAllocMap) - __v := (Return)(__ret) - return __v -} - -// nvmlGpmMetricsGet function as declared in ixml/api.h:1116 +// nvmlGpmMetricsGet function as declared in ixml/api.h:1780 func nvmlGpmMetricsGet(MetricsGet *nvmlGpmMetricsGetType) Return { cMetricsGet, cMetricsGetAllocMap := (*C.nvmlGpmMetricsGet_t)(unsafe.Pointer(MetricsGet)), cgoAllocsUnknown __ret := C.nvmlGpmMetricsGet(cMetricsGet) @@ -313,7 +512,7 @@ func nvmlGpmMetricsGet(MetricsGet *nvmlGpmMetricsGetType) Return { return __v } -// nvmlGpmQueryDeviceSupport function as declared in ixml/api.h:1129 +// nvmlGpmQueryDeviceSupport function as 
declared in ixml/api.h:1793 func nvmlGpmQueryDeviceSupport(Device Device, GpmSupport *GpmSupport) Return { cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown cGpmSupport, cGpmSupportAllocMap := (*C.nvmlGpmSupport_t)(unsafe.Pointer(GpmSupport)), cgoAllocsUnknown @@ -324,7 +523,7 @@ func nvmlGpmQueryDeviceSupport(Device Device, GpmSupport *GpmSupport) Return { return __v } -// nvmlGpmSampleFree function as declared in ixml/api.h:1142 +// nvmlGpmSampleFree function as declared in ixml/api.h:1806 func nvmlGpmSampleFree(GpmSample GpmSample) Return { cGpmSample, cGpmSampleAllocMap := *(*C.nvmlGpmSample_t)(unsafe.Pointer(&GpmSample)), cgoAllocsUnknown __ret := C.nvmlGpmSampleFree(cGpmSample) @@ -333,7 +532,7 @@ func nvmlGpmSampleFree(GpmSample GpmSample) Return { return __v } -// nvmlGpmSampleAlloc function as declared in ixml/api.h:1157 +// nvmlGpmSampleAlloc function as declared in ixml/api.h:1821 func nvmlGpmSampleAlloc(GpmSample *GpmSample) Return { cGpmSample, cGpmSampleAllocMap := (*C.nvmlGpmSample_t)(unsafe.Pointer(GpmSample)), cgoAllocsUnknown __ret := C.nvmlGpmSampleAlloc(cGpmSample) @@ -342,7 +541,7 @@ func nvmlGpmSampleAlloc(GpmSample *GpmSample) Return { return __v } -// nvmlGpmSampleGet function as declared in ixml/api.h:1173 +// nvmlGpmSampleGet function as declared in ixml/api.h:1837 func nvmlGpmSampleGet(Device Device, GpmSample GpmSample) Return { cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown cGpmSample, cGpmSampleAllocMap := *(*C.nvmlGpmSample_t)(unsafe.Pointer(&GpmSample)), cgoAllocsUnknown @@ -353,7 +552,7 @@ func nvmlGpmSampleGet(Device Device, GpmSample GpmSample) Return { return __v } -// nvmlDeviceGetPowerManagementLimit function as declared in ixml/api.h:1197 +// nvmlDeviceGetPowerManagementLimit function as declared in ixml/api.h:1861 func nvmlDeviceGetPowerManagementLimit(Device Device, Limit *uint32) Return { cDevice, cDeviceAllocMap := 
*(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown cLimit, cLimitAllocMap := (*C.uint)(unsafe.Pointer(Limit)), cgoAllocsUnknown @@ -364,7 +563,7 @@ func nvmlDeviceGetPowerManagementLimit(Device Device, Limit *uint32) Return { return __v } -// nvmlDeviceGetPowerManagementLimitConstraints function as declared in ixml/api.h:1220 +// nvmlDeviceGetPowerManagementLimitConstraints function as declared in ixml/api.h:1884 func nvmlDeviceGetPowerManagementLimitConstraints(Device Device, MinLimit *uint32, MaxLimit *uint32) Return { cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown cMinLimit, cMinLimitAllocMap := (*C.uint)(unsafe.Pointer(MinLimit)), cgoAllocsUnknown @@ -377,7 +576,7 @@ func nvmlDeviceGetPowerManagementLimitConstraints(Device Device, MinLimit *uint3 return __v } -// nvmlDeviceGetPowerManagementDefaultLimit function as declared in ixml/api.h:1242 +// nvmlDeviceGetPowerManagementDefaultLimit function as declared in ixml/api.h:1906 func nvmlDeviceGetPowerManagementDefaultLimit(Device Device, DefaultLimit *uint32) Return { cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown cDefaultLimit, cDefaultLimitAllocMap := (*C.uint)(unsafe.Pointer(DefaultLimit)), cgoAllocsUnknown @@ -388,7 +587,7 @@ func nvmlDeviceGetPowerManagementDefaultLimit(Device Device, DefaultLimit *uint3 return __v } -// nvmlDeviceGetCurrentClocksThrottleReasons function as declared in ixml/api.h:1266 +// nvmlDeviceGetCurrentClocksThrottleReasons function as declared in ixml/api.h:1930 func nvmlDeviceGetCurrentClocksThrottleReasons(Device Device, ClocksThrottleReasons *uint64) Return { cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown cClocksThrottleReasons, cClocksThrottleReasonsAllocMap := (*C.ulonglong)(unsafe.Pointer(ClocksThrottleReasons)), cgoAllocsUnknown @@ -399,7 +598,7 @@ func nvmlDeviceGetCurrentClocksThrottleReasons(Device Device, ClocksThrottleReas return __v } 
-// nvmlDeviceGetSupportedClocksThrottleReasons function as declared in ixml/api.h:1292 +// nvmlDeviceGetSupportedClocksThrottleReasons function as declared in ixml/api.h:1956 func nvmlDeviceGetSupportedClocksThrottleReasons(Device Device, SupportedClocksThrottleReasons *uint64) Return { cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown cSupportedClocksThrottleReasons, cSupportedClocksThrottleReasonsAllocMap := (*C.ulonglong)(unsafe.Pointer(SupportedClocksThrottleReasons)), cgoAllocsUnknown @@ -410,7 +609,7 @@ func nvmlDeviceGetSupportedClocksThrottleReasons(Device Device, SupportedClocksT return __v } -// nvmlDeviceGetTopologyCommonAncestor function as declared in ixml/api.h:1312 +// nvmlDeviceGetTopologyCommonAncestor function as declared in ixml/api.h:1976 func nvmlDeviceGetTopologyCommonAncestor(Device1 Device, Device2 Device, PathInfo *GpuTopologyLevel) Return { cDevice1, cDevice1AllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device1)), cgoAllocsUnknown cDevice2, cDevice2AllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device2)), cgoAllocsUnknown @@ -423,7 +622,62 @@ func nvmlDeviceGetTopologyCommonAncestor(Device1 Device, Device2 Device, PathInf return __v } -// ixmlDeviceGetBoardPosition function as declared in ixml/api.h:1314 +// nvmlEventSetCreate function as declared in ixml/api.h:2091 +func nvmlEventSetCreate(Set *nvmlEventSet) Return { + cSet, cSetAllocMap := (*C.nvmlEventSet_t)(unsafe.Pointer(Set)), cgoAllocsUnknown + __ret := C.nvmlEventSetCreate(cSet) + runtime.KeepAlive(cSetAllocMap) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceRegisterEvents function as declared in ixml/api.h:2131 +func nvmlDeviceRegisterEvents(Device Device, EventTypes uint64, Set nvmlEventSet) Return { + cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cEventTypes, cEventTypesAllocMap := (C.ulonglong)(EventTypes), cgoAllocsUnknown + cSet, cSetAllocMap := 
*(*C.nvmlEventSet_t)(unsafe.Pointer(&Set)), cgoAllocsUnknown + __ret := C.nvmlDeviceRegisterEvents(cDevice, cEventTypes, cSet) + runtime.KeepAlive(cSetAllocMap) + runtime.KeepAlive(cEventTypesAllocMap) + runtime.KeepAlive(cDeviceAllocMap) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetSupportedEventTypes function as declared in ixml/api.h:2155 +func nvmlDeviceGetSupportedEventTypes(Device Device, EventTypes *uint64) Return { + cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cEventTypes, cEventTypesAllocMap := (*C.ulonglong)(unsafe.Pointer(EventTypes)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetSupportedEventTypes(cDevice, cEventTypes) + runtime.KeepAlive(cEventTypesAllocMap) + runtime.KeepAlive(cDeviceAllocMap) + __v := (Return)(__ret) + return __v +} + +// nvmlEventSetWait function as declared in ixml/api.h:2194 +func nvmlEventSetWait(Set nvmlEventSet, Data *nvmlEventData, Timeoutms uint32) Return { + cSet, cSetAllocMap := *(*C.nvmlEventSet_t)(unsafe.Pointer(&Set)), cgoAllocsUnknown + cData, cDataAllocMap := (*C.nvmlEventData_t)(unsafe.Pointer(Data)), cgoAllocsUnknown + cTimeoutms, cTimeoutmsAllocMap := (C.uint)(Timeoutms), cgoAllocsUnknown + __ret := C.nvmlEventSetWait_v2(cSet, cData, cTimeoutms) + runtime.KeepAlive(cTimeoutmsAllocMap) + runtime.KeepAlive(cDataAllocMap) + runtime.KeepAlive(cSetAllocMap) + __v := (Return)(__ret) + return __v +} + +// nvmlEventSetFree function as declared in ixml/api.h:2210 +func nvmlEventSetFree(Set nvmlEventSet) Return { + cSet, cSetAllocMap := *(*C.nvmlEventSet_t)(unsafe.Pointer(&Set)), cgoAllocsUnknown + __ret := C.nvmlEventSetFree(cSet) + runtime.KeepAlive(cSetAllocMap) + __v := (Return)(__ret) + return __v +} + +// ixmlDeviceGetBoardPosition function as declared in ixml/api.h:2214 func ixmlDeviceGetBoardPosition(Device Device, Position *uint32) Return { cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown cPosition, 
cPositionAllocMap := (*C.uint)(unsafe.Pointer(Position)), cgoAllocsUnknown @@ -434,7 +688,7 @@ func ixmlDeviceGetBoardPosition(Device Device, Position *uint32) Return { return __v } -// ixmlDeviceGetGPUVoltage function as declared in ixml/api.h:1316 +// ixmlDeviceGetGPUVoltage function as declared in ixml/api.h:2216 func ixmlDeviceGetGPUVoltage(Device Device, Integer *uint32, Decimal *uint32) Return { cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown cInteger, cIntegerAllocMap := (*C.uint)(unsafe.Pointer(Integer)), cgoAllocsUnknown @@ -447,7 +701,7 @@ func ixmlDeviceGetGPUVoltage(Device Device, Integer *uint32, Decimal *uint32) Re return __v } -// ixmlDeviceGetEccErros function as declared in ixml/api.h:1318 +// ixmlDeviceGetEccErros function as declared in ixml/api.h:2218 func ixmlDeviceGetEccErros(Device Device, Single_error *uint32, Double_error *uint32) Return { cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown cSingle_error, cSingle_errorAllocMap := (*C.uint)(unsafe.Pointer(Single_error)), cgoAllocsUnknown @@ -460,7 +714,7 @@ func ixmlDeviceGetEccErros(Device Device, Single_error *uint32, Double_error *ui return __v } -// ixmlDeviceGetHealth function as declared in ixml/api.h:1320 +// ixmlDeviceGetHealth function as declared in ixml/api.h:2220 func ixmlDeviceGetHealth(Device Device, Health *uint64) Return { cDevice, cDeviceAllocMap := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown cHealth, cHealthAllocMap := (*C.ulonglong)(unsafe.Pointer(Health)), cgoAllocsUnknown diff --git a/pkg/ixml/ixml_test.go b/pkg/ixml/ixml_test.go new file mode 100644 index 0000000..4427695 --- /dev/null +++ b/pkg/ixml/ixml_test.go @@ -0,0 +1,87 @@ +/* +Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); you may +not use this file except in compliance with the License. 
You may obtain +a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package ixml + +import ( + "testing" + + "gitee.com/deep-spark/go-ixml/pkg/dl" +) + +func requireLibIXML(t *testing.T) { + lib := dl.New(ixmlLibraryName, ixmlLibraryLoadFlags) + if err := lib.Open(); err != nil { + t.Skipf("This test requires %v", ixmlLibraryName) + } + lib.Close() +} + +func TestInit(t *testing.T) { + requireLibIXML(t) + + ret := Init() + if ret != SUCCESS { + t.Errorf("Init: %v", ret) + } else { + t.Logf("Init: %v", ret) + } + + ret = Shutdown() + if ret != SUCCESS { + t.Errorf("Shutdown: %v", ret) + } else { + t.Logf("Shutdown: %v", ret) + } +} + +func TestSystem(t *testing.T) { + requireLibIXML(t) + + Init() + defer Shutdown() + + driverVersion, ret := SystemGetDriverVersion() + if ret != SUCCESS { + t.Errorf("SystemGetDriverVersion: %v", ret) + } else { + t.Logf("SystemGetDriverVersion: %v", ret) + t.Logf("Driver version: %v", driverVersion) + } + + ixmlVersion, ret := SystemGetNVMLVersion() + if ret != SUCCESS { + t.Errorf("SystemGetNVMLVersion: %v", ret) + } else { + t.Logf("IXML version: %v", ixmlVersion) + } + + cudaDriverVersion, ret := SystemGetCudaDriverVersion() + if ret != SUCCESS { + t.Errorf("SystemGetCudaDriverVersion: %v", ret) + } else { + t.Logf("Cuda driver version: %v", cudaDriverVersion) + } + + cudaDriverVersionV2, ret := SystemGetCudaDriverVersion_v2() + if ret != SUCCESS { + t.Errorf("SystemGetCudaDriverVersion_v2: %v", ret) + } else { + t.Logf("SystemGetCudaDriverVersion_v2: %v", ret) + t.Logf("Cuda driver version_v2: %v", cudaDriverVersionV2) + } +} diff --git a/pkg/ixml/return.go b/pkg/ixml/return.go new file mode 100644 
index 0000000..597a95e --- /dev/null +++ b/pkg/ixml/return.go @@ -0,0 +1,106 @@ +/* +Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package ixml + +import ( + "fmt" +) + +func ErrorString(r Return) string { + return r.Error() +} + +// String returns the string representation of a Return. +func (r Return) String() string { + return r.Error() +} + +// Error returns the string representation of a Return. +func (r Return) Error() string { + return errorStringFunc(r) +} + +// Assigned to ixml.ErrorString if the system ixml library is in use. +var errorStringFunc = defaultErrorStringFunc + +// defaultErrorStringFunc provides a basic ixmlErrorString implementation. +// This allows the ixml.ErrorString function to be used even if the IXML library +// is not loaded. 
+var defaultErrorStringFunc = func(r Return) string { + switch r { + case SUCCESS: + return "SUCCESS" + case ERROR_UNINITIALIZED: + return "ERROR_UNINITIALIZED" + case ERROR_INVALID_ARGUMENT: + return "ERROR_INVALID_ARGUMENT" + case ERROR_NOT_SUPPORTED: + return "ERROR_NOT_SUPPORTED" + case ERROR_NO_PERMISSION: + return "ERROR_NO_PERMISSION" + case ERROR_ALREADY_INITIALIZED: + return "ERROR_ALREADY_INITIALIZED" + case ERROR_NOT_FOUND: + return "ERROR_NOT_FOUND" + case ERROR_INSUFFICIENT_SIZE: + return "ERROR_INSUFFICIENT_SIZE" + case ERROR_INSUFFICIENT_POWER: + return "ERROR_INSUFFICIENT_POWER" + case ERROR_DRIVER_NOT_LOADED: + return "ERROR_DRIVER_NOT_LOADED" + case ERROR_TIMEOUT: + return "ERROR_TIMEOUT" + case ERROR_IRQ_ISSUE: + return "ERROR_IRQ_ISSUE" + case ERROR_LIBRARY_NOT_FOUND: + return "ERROR_LIBRARY_NOT_FOUND" + case ERROR_FUNCTION_NOT_FOUND: + return "ERROR_FUNCTION_NOT_FOUND" + case ERROR_CORRUPTED_INFOROM: + return "ERROR_CORRUPTED_INFOROM" + case ERROR_GPU_IS_LOST: + return "ERROR_GPU_IS_LOST" + case ERROR_RESET_REQUIRED: + return "ERROR_RESET_REQUIRED" + case ERROR_OPERATING_SYSTEM: + return "ERROR_OPERATING_SYSTEM" + case ERROR_LIB_RM_VERSION_MISMATCH: + return "ERROR_LIB_RM_VERSION_MISMATCH" + case ERROR_IN_USE: + return "ERROR_IN_USE" + case ERROR_MEMORY: + return "ERROR_MEMORY" + case ERROR_NO_DATA: + return "ERROR_NO_DATA" + case ERROR_VGPU_ECC_NOT_SUPPORTED: + return "ERROR_VGPU_ECC_NOT_SUPPORTED" + case ERROR_INSUFFICIENT_RESOURCES: + return "ERROR_INSUFFICIENT_RESOURCES" + case ERROR_FREQ_NOT_SUPPORTED: + return "ERROR_FREQ_NOT_SUPPORTED" + case ERROR_ARGUMENT_VERSION_MISMATCH: + return "ERROR_ARGUMENT_VERSION_MISMATCH" + case ERROR_DEPRECATED: + return "ERROR_DEPRECATED" + case ERROR_UNKNOWN: + return "ERROR_UNKNOWN" + default: + return fmt.Sprintf("unknown return value: %d", r) + } +} diff --git a/pkg/ixml/system.go b/pkg/ixml/system.go index 9b74f75..7aed2dd 100644 --- a/pkg/ixml/system.go +++ b/pkg/ixml/system.go @@ -27,6 +27,12 @@ func 
SystemGetDriverVersion() (string, Return) { return removeBytesSpaces(version), ret } +func SystemGetNVMLVersion() (string, Return) { + Version := make([]byte, SYSTEM_NVML_VERSION_BUFFER_SIZE) + ret := nvmlSystemGetNVMLVersion(&Version[0], SYSTEM_NVML_VERSION_BUFFER_SIZE) + return string(Version[:clen(Version)]), ret +} + func SystemGetCudaDriverVersion() (string, Return) { var CudaDriverVersion int32 ret := nvmlSystemGetCudaDriverVersion(&CudaDriverVersion) diff --git a/pkg/ixml/types_gen.go b/pkg/ixml/types_gen.go index 658cd1d..f188f79 100644 --- a/pkg/ixml/types_gen.go +++ b/pkg/ixml/types_gen.go @@ -3,14 +3,12 @@ package ixml +import "unsafe" + type Device struct { Handle *_Ctype_struct_nvmlDevice_st } -type EventSet struct { - Handle *_Ctype_struct_nvmlEventSet_st -} - type Memory struct { Total uint64 Free uint64 @@ -67,6 +65,30 @@ type GpmSupport struct { IsSupportedDevice uint32 } +type BridgeChipInfo struct { + Type uint32 + FwVersion uint32 +} + +type BridgeChipHierarchy struct { + BridgeCount uint8 + BridgeChipInfo [128]BridgeChipInfo +} + +const sizeofValue = unsafe.Sizeof([8]byte{}) + +type Value [sizeofValue]byte + +type Sample struct { + TimeStamp uint64 + SampleValue [8]byte +} + +type ViolationTime struct { + ReferenceTime uint64 + ViolationTime uint64 +} + type PciInfo struct { BusIdLegacy [16]int8 Domain uint32 @@ -76,3 +98,15 @@ type PciInfo struct { PciSubSystemId uint32 BusId [32]int8 } + +type nvmlEventSet struct { + Handle *_Ctype_struct_nvmlEventSet_st +} + +type nvmlEventData struct { + Device Device + EventType uint64 + EventData uint64 + GpuInstanceId uint32 + ComputeInstanceId uint32 +} diff --git a/samples/attributes/main.go b/samples/attributes/main.go index fb34163..1a69e6c 100644 --- a/samples/attributes/main.go +++ b/samples/attributes/main.go @@ -24,29 +24,9 @@ import ( "gitee.com/deep-spark/go-ixml/pkg/ixml" ) +// Replace with your actual GPU UUID const defalutGpu = "GPU-6d2ec5fa-f293-57a3-9f2c-335f78120578" -const gpu2 = 
"GPU-7edb0dc9-9291-5e13-9e1c-ad92672bdfec" -func checkOnSameBoard(uuid1, uuid2 string) error { - device1, ret := ixml.GetHandleByUUID(uuid1) - if ret != ixml.SUCCESS { - return fmt.Errorf("failed to get handle by uuid, ret: %v", ret) - } - device2, ret := ixml.GetHandleByUUID(uuid2) - if ret != ixml.SUCCESS { - return fmt.Errorf("failed to get Handle by uuid, ret: %v", ret) - } - - OnSameBoard, ret := ixml.GetOnSameBoard(device1, device2) - if ret == ixml.ERROR_NOT_SUPPORTED { - return fmt.Errorf("nvmlDeviceOnSameBoard: ERROR_NOT_SUPPORTED") - } else if ret != ixml.SUCCESS { - return fmt.Errorf("%s and %s are NOT on same board: %v", uuid1, uuid2, ret) - } else { - fmt.Printf("%s and %s on same board: %d\n", uuid1, uuid2, OnSameBoard) - } - return nil -} func main() { var device ixml.Device @@ -71,38 +51,41 @@ func main() { if ret != ixml.SUCCESS { log.Fatalf("Unable to get name, ret: %v", ret) } - fmt.Printf("name:%s, len(name): %d\n", name, len(name)) + fmt.Printf("Device Name: %s\n", name) index, ret := device.GetIndex() if ret != ixml.SUCCESS { log.Fatalf("Unable to get index, ret: %v", ret) } - fmt.Printf("index: %d\n", index) + fmt.Printf("Device Index: %d\n", index) - Integer, Decimal, ret := device.GetGPUVoltage() + uuid, ret := device.GetUUID() if ret != ixml.SUCCESS { - log.Fatalf("Unable to get GPU Voltage, ret: %v", ret) + fmt.Printf("Unable to get GPU Uuid , ret: %v\n", ret) + } else { + fmt.Printf("Device Uuid: %s\n", uuid) } - fmt.Printf("GPU Voltage: %v.%v\n", Integer, Decimal) - pos, ret := device.GetBoardPosition() - if ret == ixml.ERROR_NOT_SUPPORTED { - fmt.Printf("GetBoardPosition interface is not supported\n") - } else if ret != ixml.SUCCESS { - log.Fatalf("Unable to get BoardPosition, ret: %v", ret) + serialNumber, ret := device.GetSerial() + if ret != ixml.SUCCESS { + fmt.Printf("Unable to get GPU Serial Number , ret: %v\n", ret) } else { - fmt.Printf("position: %d\n", pos) + fmt.Printf("Device Serial Number: %s\n", serialNumber) } - usage, 
ret := device.GetPowerUsage() + minorNumber, ret := device.GetMinorNumber() if ret != ixml.SUCCESS { - log.Fatalf("Unable to get usage, ret: %v", ret) + fmt.Printf("Unable to get GPU MinorNumber, ret: %v\n", ret) + } else { + fmt.Printf("Device MinorNumber: %d\n", minorNumber) } - fmt.Printf("usage: %d\n", usage) - if err := checkOnSameBoard(defalutGpu, gpu2); err != nil { - fmt.Println(err) + currentEccMode, pendingEccMode, ret := device.GetEccMode() + if ret != ixml.SUCCESS { + log.Fatalf("Unable to get ECC Mode, ret: %v", ret) } + fmt.Printf("Current ECC Mode: %d\n", currentEccMode) + fmt.Printf("Pending ECC Mode: %d\n", pendingEccMode) fmt.Println("========================================") } diff --git a/samples/board/main.go b/samples/board/main.go new file mode 100644 index 0000000..ed27e38 --- /dev/null +++ b/samples/board/main.go @@ -0,0 +1,113 @@ +/* +Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); you may +not use this file except in compliance with the License. You may obtain +a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package main + +import ( + "fmt" + "log" + + "gitee.com/deep-spark/go-ixml/pkg/ixml" +) + +func main() { + ret := ixml.Init() + if ret != ixml.SUCCESS { + log.Fatalf("Unable to initialize IXML, ret: %v", ret) + } + defer func() { + ret := ixml.Shutdown() + if ret != ixml.SUCCESS { + log.Fatalf("Unable to shutdown IXML, ret: %v", ret) + } + }() + + count, ret := ixml.DeviceGetCount() + if ret != ixml.SUCCESS { + log.Fatalf("Unable to get device count, ret: %v", ret) + } else if count == 0 { + log.Fatalf("No GPUs found.") + } + fmt.Printf("GPU Count: %v\n", count) + + if err := CheckOnSameBoard(count); err != nil { + fmt.Println(err) + } + + for i := uint(0); i < count; i++ { + var device ixml.Device + ret = ixml.DeviceGetHandleByIndex(i, &device) // Get the first GPU device + if ret != ixml.SUCCESS { + log.Fatalf("Unable to get Handle by index, ret: %v", ret) + } + fmt.Printf("Device Index: %d\n", i) + + pos, ret := device.GetBoardPosition() + if ret == ixml.ERROR_NOT_SUPPORTED { + fmt.Printf("Unable to get Board Position, ret: %v\n", ret) + } else if ret != ixml.SUCCESS { + log.Fatalf("Unable to get Board Position, ret: %v", ret) + } else { + fmt.Printf("Board Position: %d\n", pos) + } + + boardId, ret := device.GetBoardId() + if ret != ixml.SUCCESS { + log.Fatalf("Unable to get Board ID, ret: %v", ret) + } + fmt.Printf("Board ID: %d\n", boardId) + + boardPartNumber, ret := device.GetBoardPartNumber() + if ret != ixml.SUCCESS { + log.Fatalf("Unable to get Board Part Number, ret: %v", ret) + } + fmt.Printf("Board Part Number: %s\n", boardPartNumber) + + fmt.Println("----------------------------------------") + } + +} + +func CheckOnSameBoard(gpuCount uint) error { + fmt.Println("Check if the first two GPUs are on the same board...") + if gpuCount < 2 { + return fmt.Errorf("not enough GPUs to check on same board, gpu count: %d", gpuCount) + } + + devIdx1, devIdx2 := uint(0), uint(1) + var device1, device2 ixml.Device + + ret := 
ixml.DeviceGetHandleByIndex(devIdx1, &device1) // Get the first GPU device + if ret != ixml.SUCCESS { + return fmt.Errorf("failed to get handle by index, ret: %v", ret) + } + ret = ixml.DeviceGetHandleByIndex(devIdx2, &device2) // Get the second GPU device + if ret != ixml.SUCCESS { + return fmt.Errorf("failed to get Handle by index, ret: %v", ret) + } + + onSameBoard, ret := ixml.GetOnSameBoard(device1, device2) + if ret == ixml.ERROR_NOT_SUPPORTED { + return fmt.Errorf("nvmlDeviceOnSameBoard: ERROR_NOT_SUPPORTED") + } else if ret != ixml.SUCCESS { + return fmt.Errorf("GPU %d and %d are NOT on same board: %v", devIdx1, devIdx2, ret) + } else { + fmt.Printf("GPU %d and %d on same board: %d\n", devIdx1, devIdx2, onSameBoard) + } + fmt.Println("----------------------------------------") + return nil +} diff --git a/samples/events/main.go b/samples/events/main.go new file mode 100644 index 0000000..b869ad6 --- /dev/null +++ b/samples/events/main.go @@ -0,0 +1,96 @@ +/* +Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); you may +not use this file except in compliance with the License. You may obtain +a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package main + +import ( + "fmt" + "log" + + "gitee.com/deep-spark/go-ixml/pkg/ixml" +) + +func main() { + ret := ixml.Init() + if ret != ixml.SUCCESS { + log.Fatalf("Unable to initialize IXML, ret : %v", ret) + } + defer func() { + ret := ixml.Shutdown() + if ret != ixml.SUCCESS { + log.Fatalf("Unable to shutdown IXML, ret: %v", ret) + } + }() + + count, ret := ixml.DeviceGetCount() + if ret != ixml.SUCCESS { + log.Fatalf("Unable to get device count, ret: %v", ret) + } + fmt.Printf("GPU Count: %v\n", count) + + for i := uint(0); i < count; i++ { + var device ixml.Device + ret = ixml.DeviceGetHandleByIndex(i, &device) + if ret != ixml.SUCCESS { + log.Fatalf("Unable to get device at index %d, ret: %v", i, ret) + } + + // Supported Event Types: + // ixml.EventTypeSingleBitEccError (1) + // ixml.EventTypeDoubleBitEccError (2) + // ixml.EventTypeXidCriticalError (8) + // ixml.EventTypeClock (16) + supportTypes, ret := device.GetSupportedEventTypes() + if ret != ixml.SUCCESS { + log.Fatalf("Unable to get supported event types, ret: %v", ret) + } else { + log.Printf("Successfully retrieved supported event types: %v\n", supportTypes) + } + + set, ret := ixml.EventSetCreate() + if ret != ixml.SUCCESS { + log.Fatalf("Unable to create event set, ret: %v", ret) + } else { + log.Printf("Successfully created event set: %v\n", set) + } + + // Register event types to the event set, supported types can be found above. 
+ eventTypes := uint64(ixml.EventTypeXidCriticalError | ixml.EventTypeClock) + ret = device.RegisterEvents(eventTypes, set) + if ret != ixml.SUCCESS { + log.Fatalf("Unable to register events, ret: %v", ret) + } else { + log.Printf("Successfully registered events: %v\n", ret) + } + + timeoutms := uint32(10) + eventData, ret := ixml.EventSetWait(set, timeoutms) // or set.Wait(timeoutms) + if ret != ixml.SUCCESS && ret != ixml.ERROR_TIMEOUT { + log.Fatalf("EventSetWait failed, ret: %v", ret) + } else if ret == ixml.ERROR_TIMEOUT { + log.Printf("EventSetWait timed out after %d ms.\n", timeoutms) + } else { + log.Printf("Successfully received the event data: %+v \n", eventData) + } + + ret = ixml.EventSetFree(set) // or set.Free() + log.Printf("EventSetFree: %v\n", ret) + + fmt.Println("------------------------------------") + } + +} diff --git a/samples/metrics/main.go b/samples/metrics/main.go index 7d69922..3391b07 100644 --- a/samples/metrics/main.go +++ b/samples/metrics/main.go @@ -51,48 +51,40 @@ func main() { fmt.Printf("Get device at index %d\n", i) } - Uuid, ret := device.GetUUID() + integer, decimal, ret := device.GetGPUVoltage() if ret != ixml.SUCCESS { - fmt.Printf("Unable to get GPU Uuid of device %d, ret: %v\n", i, ret) - } else { - fmt.Printf("Uuid of device %d: %s\n", i, Uuid) - } - - MinorNumber, ret := device.GetMinorNumber() - if ret != ixml.SUCCESS { - fmt.Printf("Unable to get GPU MinorNumber of device %d, ret: %v\n", i, ret) - } else { - fmt.Printf("MinorNumber of device %d: %d\n", i, MinorNumber) + log.Fatalf("Unable to get GPU Voltage, ret: %v", ret) } + fmt.Printf("GPU Voltage of device %d: %v.%v\n", i, integer, decimal) temperature, ret := device.GetTemperature() if ret != ixml.SUCCESS { fmt.Printf("Unable to get GPU temperature of device %d, ret: %v\n", i, ret) } else { - fmt.Printf("temperature of device %d: %d\n", i, temperature) + fmt.Printf("Temperature of device %d: %d\n", i, temperature) } - FanSpeed, ret := device.GetFanSpeed() + 
fanSpeed, ret := device.GetFanSpeed() if ret != ixml.SUCCESS { fmt.Printf("Unable to get GPU FanSpeed of device %d, ret: %v\n", i, ret) } else { - fmt.Printf("FanSpeed of device %d: %d\n", i, FanSpeed) + fmt.Printf("FanSpeed of device %d: %d\n", i, fanSpeed) } - ClockInfo, ret := device.GetClockInfo() + clockInfo, ret := device.GetClockInfo() if ret != ixml.SUCCESS { fmt.Printf("Unable to get GPU MemClock of device %d, ret: %v\n", i, ret) } else { - fmt.Printf("MemClock of device %d: %d\n", i, ClockInfo.Mem) + fmt.Printf("MemClock of device %d: %d\n", i, clockInfo.Mem) } - MemoryInfo, ret := device.GetMemoryInfo() + memoryInfo, ret := device.GetMemoryInfo() if ret != ixml.SUCCESS { fmt.Printf("Unable to get MemoryInfo of device %d, ret: %v\n", i, ret) } else { - fmt.Printf("MemoryInfo totalMem of device %d: %d (MiB)\n", i, MemoryInfo.Total) - fmt.Printf("MemoryInfo usedMem of device %d: %d (MiB)\n", i, MemoryInfo.Used) - fmt.Printf("MemoryInfo freeMem of device %d: %v (MiB)\n", i, MemoryInfo.Free) + fmt.Printf("MemoryInfo totalMem of device %d: %d (MiB)\n", i, memoryInfo.Total) + fmt.Printf("MemoryInfo usedMem of device %d: %d (MiB)\n", i, memoryInfo.Used) + fmt.Printf("MemoryInfo freeMem of device %d: %v (MiB)\n", i, memoryInfo.Free) } utilizationRates, ret := device.GetUtilizationRates() @@ -103,11 +95,25 @@ func main() { fmt.Printf("GPU utilizationRates of device %d: %d\n", i, utilizationRates.Gpu) } - PciInfo, ret := device.GetPciInfo() + pciInfo, ret := device.GetPciInfo() if ret != ixml.SUCCESS { fmt.Printf("Unable to get PciInfo of device %d, ret: %v\n", i, ret) } else { - fmt.Printf("PciInfo of device %d: %v\n", i, PciInfo.BusId) + fmt.Printf("PciInfo of device %d: %v\n", i, pciInfo.BusId) + } + + pcieGeneration, ret := device.GetCurrPcieLinkGeneration() + if ret != ixml.SUCCESS { + fmt.Printf("Unable to get PcieGeneration of device %d, ret: %v\n", i, ret) + } else { + fmt.Printf("PcieGeneration of device %d: %d\n", i, pcieGeneration) + } + + pcieWidth, ret 
:= device.GetCurrPcieLinkWidth() + if ret != ixml.SUCCESS { + fmt.Printf("Unable to get PcieWidth of device %d, ret: %v\n", i, ret) + } else { + fmt.Printf("PcieWidth of device %d: %d\n", i, pcieWidth) } pcieReplyCnt, ret := device.GetPcieReplayCounter() @@ -128,8 +134,14 @@ func main() { if ret != ixml.SUCCESS { fmt.Printf("Unable to get ecc errors %d, ret: %v\n", i, ret) } else { - fmt.Printf("singleErr: %d, doubleErr: %d\n", singleErr, doubleErr) + fmt.Printf("SingleEccErr: %d, DoubleEccErr: %d\n", singleErr, doubleErr) + } + + usage, ret := device.GetPowerUsage() + if ret != ixml.SUCCESS { + log.Fatalf("Unable to get Power Usage, ret: %v", ret) } + fmt.Printf("Power Usage: %d\n", usage) limit, ret := device.GetPowerManagementLimit() if ret != ixml.SUCCESS { @@ -149,7 +161,7 @@ func main() { if ret != ixml.SUCCESS { fmt.Printf("Unable to get PowerManagementLimitConstraints of device %d, ret: %v\n", i, ret) } else { - fmt.Printf("minLimit: %d, maxLimit: %d\n", minLimit, maxLimit) + fmt.Printf("MinPowerMgmtLimit: %d, MaxPowerMgmtLimit: %d\n", minLimit, maxLimit) } threshType := ixml.TEMPERATURE_THRESHOLD_SLOWDOWN diff --git a/samples/system/main.go b/samples/system/main.go index e43998c..8afbd0c 100644 --- a/samples/system/main.go +++ b/samples/system/main.go @@ -20,6 +20,7 @@ package main import ( "fmt" "log" + "strconv" "gitee.com/deep-spark/go-ixml/pkg/ixml" ) @@ -41,14 +42,27 @@ func main() { if ret != ixml.SUCCESS { log.Fatalf("Unable to get driver version: %v", ret) } - fmt.Printf("Driver Version: len(version)=%v, version=%v\n", len(version), version) + fmt.Printf("Driver Version: %v\n", version) - // Get the cuda driver version + // Get the cuda version cudaVersion, ret := ixml.SystemGetCudaDriverVersion() if ret != ixml.SUCCESS { - log.Fatalf("Unable to get cuda driver version: %v", ret) + log.Fatalf("Unable to get cuda version: %v", ret) + } + cudaVersionInt, err := strconv.Atoi(cudaVersion) + if err != nil { + log.Fatalf("Failed to convert cuda version: 
%v", err) + } + major, minor := uint(cudaVersionInt)/1000, uint(cudaVersionInt)%1000/10 + fmt.Printf("Cuda Version: %d.%d\n", major, minor) + + // Get the IXML version + ixmlVersion, ret := ixml.SystemGetNVMLVersion() + if ret != ixml.SUCCESS { + log.Fatalf("Unable to get IXML version: %v", ret) + } else { + fmt.Printf("IXML Version: %v\n", ixmlVersion) } - fmt.Printf("Cuda Driver Version: %v\n", cudaVersion) fmt.Println("========================================") } -- Gitee