[INIT] Use rocm-smi API instead of CLI for querying FW version (#1920)
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
0b03bb718a
Коммит
da06c69cb8
@@ -95,5 +95,5 @@ void rcclSetP2pNetChunkSize(struct ncclComm* comm, int& rcclP2pNetChunkSize);
|
||||
ncclResult_t rcclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count, size_t& maxCount);
|
||||
ncclResult_t commSetUnrollFactor(struct ncclComm* comm);
|
||||
bool validHsaScratchEnvSetting(const char*hsaScratchEnv, int hipRuntimeVersion, int firmwareVersion, const char* archName);
|
||||
int parseFirmwareVersion(const char* command);
|
||||
int parseFirmwareVersion();
|
||||
#endif
|
||||
|
||||
+8
-1
@@ -131,14 +131,21 @@ static ncclResult_t initResult = ncclSuccess;
|
||||
static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT;
|
||||
|
||||
ncclResult_t checkHsaEnvSetting() {
|
||||
// get user-specified value for `HSA_NO_SCRATCH_RECLAIM`
|
||||
const char* hsaScratchEnv = getenv("HSA_NO_SCRATCH_RECLAIM");
|
||||
|
||||
int hipRuntimeVersion = 0;
|
||||
// hipVer is an integer e.g., 6.2.41133 -> 60241133
|
||||
CUDACHECK(hipRuntimeGetVersion(&hipRuntimeVersion));
|
||||
const int firmwareVersion = parseFirmwareVersion("amd-smi firmware");
|
||||
|
||||
// using rocm-smi API to query FW version, instead of parsing CLI output
|
||||
// will switch to amd-smi API soon
|
||||
const int firmwareVersion = parseFirmwareVersion();
|
||||
|
||||
hipDeviceProp_t devProp;
|
||||
// use GPU0 should be good enough
|
||||
CUDACHECK(hipGetDeviceProperties(&devProp, 0));
|
||||
|
||||
INFO(NCCL_INIT, "Hipruntime version: %d, firmware version: %d", hipRuntimeVersion, firmwareVersion);
|
||||
if (!validHsaScratchEnvSetting(hsaScratchEnv, hipRuntimeVersion, firmwareVersion, devProp.gcnArchName)) {
|
||||
WARN("HSA_NO_SCRATCH_RECLAIM=1 must be set to avoid RCCL perf hit, rocm ver:%d", hipRuntimeVersion);
|
||||
|
||||
+14
-24
@@ -24,6 +24,7 @@ THE SOFTWARE.
|
||||
#include "comm.h"
|
||||
#include "graph/topo.h"
|
||||
#include "enqueue.h"
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
|
||||
// Use this param to experiment pipelining new data types besides bfloat16
|
||||
// Make sure you generate the device code with the new data type (i.e. in generate.py)
|
||||
@@ -342,37 +343,26 @@ std::vector<std::string> splitString(const std::string& s, char delimiter) {
|
||||
return tokens;
|
||||
}
|
||||
|
||||
int parseFirmwareVersionImpl(FILE* file) {
|
||||
constexpr std::size_t MAX_LINE_SZ = 1024;
|
||||
char line[MAX_LINE_SZ];
|
||||
bool found_pattern = false;
|
||||
while (fgets(line, MAX_LINE_SZ, file)) {
|
||||
auto parts = splitString(line, ':');
|
||||
if (parts == std::vector<std::string>{"FW_ID", "CP_MEC1"}) {
|
||||
if (!found_pattern) {
|
||||
found_pattern = true;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
int parseFirmwareVersionImpl() {
|
||||
uint64_t fw_version = -1;
|
||||
|
||||
if (found_pattern && (parts[0] == "FW_VERSION")) {
|
||||
return stoi(parts[1]) & 0x7ff;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
// using rocm-smi APIs for now to query MEC FW version
|
||||
// will switch to amd-smi APIs soon
|
||||
rsmi_status_t ret;
|
||||
ret = rsmi_init(0);
|
||||
if (ret != RSMI_STATUS_SUCCESS) return -1;
|
||||
ret = rsmi_dev_firmware_version_get(0, RSMI_FW_BLOCK_MEC, &fw_version);
|
||||
if (ret != RSMI_STATUS_SUCCESS) return -1;
|
||||
|
||||
return fw_version;
|
||||
}
|
||||
|
||||
int parseFirmwareVersion(const char* command) {
|
||||
auto file = popen(command, "r");
|
||||
if (file == nullptr) {
|
||||
return -1;
|
||||
}
|
||||
int parseFirmwareVersion() {
|
||||
int version = -1;
|
||||
try {
|
||||
version = parseFirmwareVersionImpl(file);
|
||||
version = parseFirmwareVersionImpl();
|
||||
} catch (const std::exception& ex) {
|
||||
}
|
||||
pclose(file);
|
||||
return version;
|
||||
}
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user