/* * Copyright © 2014 Advanced Micro Devices, Inc. * Copyright 2016-2018 Raptor Engineering, LLC. All Rights Reserved. * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, copy, * modify, merge, publish, distribute, sublicense, and/or sell copies * of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including * the next paragraph) shall be included in all copies or substantial * portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include "impl/wddm/types.h" #include "impl/wddm/device.h" #include "util/utils.h" /* Number of memory banks added by thunk on top of topology * This only includes static heaps like LDS, scratch and SVM, * not for MMIO_REMAP heap. MMIO_REMAP memory bank is reported * dynamically based on whether mmio aperture was mapped * successfully on this node. */ #define NUM_OF_IGPU_HEAPS 3 #define NUM_OF_DGPU_HEAPS 3 typedef struct { HsaNodeProperties node; std::vector mem; /* node->NumBanks elements */ std::vector cache; std::vector link; } node_props_t; struct _topology_props { HsaSystemProperties *g_system = nullptr; std::vector g_props; std::vector wdevices_; uint32_t wdevice_num_ = 0; uint32_t num_sysfs_nodes = 0; int processor_vendor = -1; double freq_max_ = 0.0; }; static _topology_props* dxg_topology = new _topology_props(); /* Supported System Vendors */ enum SUPPORTED_PROCESSOR_VENDORS { GENUINE_INTEL = 0, AUTHENTIC_AMD, IBM_POWER }; /* Adding newline to make the search easier */ static const char *supported_processor_vendor_name[] = { "GenuineIntel", "AuthenticAMD", "" // POWER requires a different search method }; static HSAKMT_STATUS topology_take_snapshot(void); static void topology_drop_snapshot(void); /* information from /proc/cpuinfo */ struct proc_cpuinfo { uint32_t proc_num; /* processor */ uint32_t apicid; /* apicid */ char model_name[HSA_PUBLIC_NAME_SIZE]; /* model name */ }; /* CPU cache table for all CPUs on the system. Each entry has the relative CPU * info and caches connected to that CPU. */ typedef struct cpu_cacheinfo { int32_t proc_num; /* this cpu's processor number */ uint32_t num_caches; /* number of caches reported by this cpu */ } cpu_cacheinfo_t; /* num_subdirs - find the number of sub-directories in the specified path * @dirpath - directory path to find sub-directories underneath * @prefix - only count sub-directory names starting with prefix. * Use blank string, "", to count all. * Return - number of sub-directories */ static int num_subdirs(char *dirpath, const char *prefix) { int count = 0; DIR *dirp; struct dirent *dir; int prefix_len = strlen(prefix); dirp = opendir(dirpath); if (dirp) { while ((dir = readdir(dirp)) != 0) { if ((strcmp(dir->d_name, ".") == 0) || (strcmp(dir->d_name, "..") == 0)) continue; if (prefix_len && strncmp(dir->d_name, prefix, prefix_len)) continue; count++; } closedir(dirp); } return count; } /* fscanf_dec - read a file whose content is a decimal number * @file [IN ] file to read * @num [OUT] number in the file */ static HSAKMT_STATUS fscanf_dec(char *file, uint32_t *num) { FILE *fd; HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; fd = fopen(file, "r"); if (!fd) { pr_err("Failed to open %s\n", file); return HSAKMT_STATUS_INVALID_PARAMETER; } if (fscanf(fd, "%u", num) != 1) { pr_err("Failed to parse %s as a decimal.\n", file); ret = HSAKMT_STATUS_ERROR; } fclose(fd); return ret; } /* fscanf_str - read a file whose content is a string * @file [IN ] file to read * @str [OUT] string in the file */ static HSAKMT_STATUS fscanf_str(char *file, char *str) { FILE *fd; HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; fd = fopen(file, "r"); if (!fd) { pr_err("Failed to open %s\n", file); return HSAKMT_STATUS_INVALID_PARAMETER; } if (fscanf(fd, "%s", str) != 1) { pr_err("Failed to parse %s as a string.\n", file); ret = HSAKMT_STATUS_ERROR; } fclose(fd); return ret; } /* fscanf_size - read a file whose content represents size as a string * @file [IN ] file to read * @bytes [OUT] sizes in bytes */ static HSAKMT_STATUS fscanf_size(char *file, uint32_t *bytes) { FILE *fd; HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; char unit; int n; fd = fopen(file, "r"); if (!fd) { pr_err("Failed to open %s\n", file); return HSAKMT_STATUS_INVALID_PARAMETER; } n = fscanf(fd, "%u%c", bytes, &unit); if (n < 1) { pr_err("Failed to parse %s\n", file); ret = HSAKMT_STATUS_ERROR; } if (n == 2) { switch (unit) { case 'K': *bytes <<= 10; break; case 'M': *bytes <<= 20; break; case 'G': *bytes <<= 30; break; default: ret = HSAKMT_STATUS_ERROR; break; } } fclose(fd); return ret; } /* cpumap_to_cpu_ci - translate shared_cpu_map string + cpuinfo->apicid into * SiblingMap in cache * @shared_cpu_map [IN ] shared_cpu_map string * @cpuinfo [IN ] cpuinfo to get apicid * @this_cache [OUT] CPU cache to fill in SiblingMap */ static void cpumap_to_cpu_ci(char *shared_cpu_map, const std::vector& cpuinfo, HsaCacheProperties *this_cache) { int num_hexs, bit; uint32_t proc, apicid, mask; char *ch_ptr; /* shared_cpu_map is shown as ...X3,X2,X1 Each X is a hex without 0x * and it's up to 8 characters(32 bits). For the first 32 CPUs(actually * procs), it's presented in X1. The next 32 is in X2, and so on. */ num_hexs = (strlen(shared_cpu_map) + 8) / 9; /* 8 characters + "," */ ch_ptr = strtok(shared_cpu_map, ","); while (num_hexs-- > 0) { mask = strtol(ch_ptr, NULL, 16); /* each X */ for (bit = 0; bit < 32; bit++) { if (!((1 << bit) & mask)) continue; proc = num_hexs * 32 + bit; apicid = cpuinfo[proc].apicid; if (apicid >= HSA_CPU_SIBLINGS) { pr_warn("SiblingMap buffer %d is too small\n", HSA_CPU_SIBLINGS); continue; } this_cache->SiblingMap[apicid] = 1; } ch_ptr = strtok(NULL, ","); } } /* get_cpu_cache_info - get specified CPU's cache information from sysfs * @prefix [IN] sysfs path for target cpu cache, * /sys/devices/system/node/nodeX/cpuY/cache * @cpuinfo [IN] /proc/cpuinfo data to get apicid * @cpu_ci: CPU specified. This parameter is an input and also an output. * [IN] cpu_ci->num_caches: number of index dirs * [OUT] cpu_ci->cache_info: to store cache info collected * [OUT] cpu_ci->num_caches: reduces when shared with other cpu(s) * Return: number of cache reported from this cpu */ static int get_cpu_cache_info(const char *prefix, const std::vector& cpuinfo, std::vector& cache, cpu_cacheinfo_t& cpu_ci) { int n; char path[256], str[256]; bool is_power9 = false; if (dxg_topology->processor_vendor == IBM_POWER) { if (strcmp(cpuinfo[0].model_name, "POWER9") == 0) { is_power9 = true; } } HsaCacheProperties this_cache; int num_idx = cpu_ci.num_caches; for (int idx = 0; idx < num_idx; idx++) { memset(&this_cache, 0, sizeof(this_cache)); /* If this cache is shared by multiple CPUs, we only need * to list it in the first CPU. */ if (is_power9) { // POWER9 has SMT4 if (cpu_ci.proc_num & 0x3) { /* proc is not 0,4,8,etc. Skip and reduce the cache count. */ --cpu_ci.num_caches; continue; } } else { snprintf(path, 256, "%s/index%d/shared_cpu_list", prefix, idx); /* shared_cpu_list is shown as n1,n2... or n1-n2,n3-n4... * For both cases, this cache is listed to proc n1 only. */ fscanf_dec(path, (uint32_t *)&n); if (cpu_ci.proc_num != n) { /* proc is not n1. Skip and reduce the cache count. */ --cpu_ci.num_caches; continue; } this_cache.ProcessorIdLow = cpuinfo[cpu_ci.proc_num].apicid; } /* CacheLevel */ snprintf(path, 256, "%s/index%d/level", prefix, idx); fscanf_dec(path, &this_cache.CacheLevel); /* CacheType */ snprintf(path, 256, "%s/index%d/type", prefix, idx); memset(str, 0, sizeof(str)); fscanf_str(path, str); if (!strcmp(str, "Data")) this_cache.CacheType.ui32.Data = 1; if (!strcmp(str, "Instruction")) this_cache.CacheType.ui32.Instruction = 1; if (!strcmp(str, "Unified")) { this_cache.CacheType.ui32.Data = 1; this_cache.CacheType.ui32.Instruction = 1; } this_cache.CacheType.ui32.CPU = 1; /* CacheSize */ snprintf(path, 256, "%s/index%d/size", prefix, idx); fscanf_size(path, &this_cache.CacheSize); /* CacheLineSize */ snprintf(path, 256, "%s/index%d/coherency_line_size", prefix, idx); fscanf_dec(path, &this_cache.CacheLineSize); /* CacheAssociativity */ snprintf(path, 256, "%s/index%d/ways_of_associativity", prefix, idx); fscanf_dec(path, &this_cache.CacheAssociativity); /* CacheLinesPerTag */ snprintf(path, 256, "%s/index%d/physical_line_partition", prefix, idx); fscanf_dec(path, &this_cache.CacheLinesPerTag); /* CacheSiblings */ snprintf(path, 256, "%s/index%d/shared_cpu_map", prefix, idx); fscanf_str(path, str); cpumap_to_cpu_ci(str, cpuinfo, &this_cache); cache.push_back(this_cache); } return cpu_ci.num_caches; } static HSAKMT_STATUS topology_map_node_id(uint32_t node_id, wsl::thunk::WDDMDevice *&device) { uint32_t idx = node_id; if ((!dxg_topology->wdevices_.size()) || (!node_id) || (node_id >= dxg_topology->num_sysfs_nodes)) { device = nullptr; return HSAKMT_STATUS_ERROR; } device = dxg_topology->wdevices_[node_id - 1]; return HSAKMT_STATUS_SUCCESS; } HSAKMT_STATUS topology_sysfs_get_system_props(HsaSystemProperties& props) { HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; bool is_node_supported = true; uint32_t num_supported_nodes = 0; std::memset(&props, 0, sizeof(props)); dxg_runtime->HeapFini(); for (auto device : dxg_topology->wdevices_) delete device; dxg_topology->wdevices_.clear(); WDDMCreateDevices(dxg_topology->wdevices_); int num_adapters = dxg_topology->wdevices_.size(); if (num_adapters == 0) { pr_err("No WDDM adapters found.\n"); return HSAKMT_STATUS_ERROR; } dxg_topology->num_sysfs_nodes = num_adapters + 1; dxg_runtime->HeapInit(); props.NumNodes = dxg_topology->num_sysfs_nodes; if (dxg_runtime->default_node > num_adapters) dxg_runtime->default_node = num_adapters; return ret; } void topology_setup_is_dgpu_param(HsaNodeProperties *props) { /* if we found a dGPU node, then treat the whole system as dGPU */ /* noted that some APUs are also treated as dGPU in runtime */ if (!props->NumCPUCores && props->NumFComputeCores) dxg_runtime->hsakmt_is_dgpu = true; } static HSAKMT_STATUS topology_get_cpu_model_name(HsaNodeProperties& props, const std::vector& cpuinfo) { for (int i = 0; i < cpuinfo.size(); i++) { if (props.CComputeIdLo == cpuinfo[i].apicid) { if (!props.DeviceId) /* CPU-only node */ strncpy((char *)props.AMDName, cpuinfo[i].model_name, sizeof(props.AMDName)); /* Convert from UTF8 to UTF16 */ int j; for (j = 0; cpuinfo[i].model_name[j] != '\0' && j < HSA_PUBLIC_NAME_SIZE - 1; j++) props.MarketingName[j] = cpuinfo[i].model_name[j]; props.MarketingName[j] = '\0'; return HSAKMT_STATUS_SUCCESS; } } return HSAKMT_STATUS_ERROR; } static int topology_search_processor_vendor(const std::string& processor_name) { for (unsigned int i = 0; i < ARRAY_LEN(supported_processor_vendor_name); i++) { if (processor_name == supported_processor_vendor_name[i]) return i; if (processor_name == "POWER9, altivec supported") return IBM_POWER; } return -1; } /* topology_parse_cpuinfo - Parse /proc/cpuinfo and fill up required * topology information * cpuinfo [OUT]: output buffer to hold cpu information * num_procs: number of processors the output buffer can hold */ static HSAKMT_STATUS topology_parse_cpuinfo(std::vector& cpuinfo) { HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; uint32_t num_procs = cpuinfo.size(); std::ifstream cpuinfo_max_freq( "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq"); if (cpuinfo_max_freq) { std::string line; std::getline(cpuinfo_max_freq, line); dxg_topology->freq_max_ = static_cast(std::stod(line) / 1000); } std::ifstream cpuinfo_file("/proc/cpuinfo"); if (!cpuinfo_file) { pr_err("Failed to open /proc/cpuinfo. Unable to get CPU information"); return HSAKMT_STATUS_ERROR; } std::string line; uint32_t proc = 0; while (std::getline(cpuinfo_file, line)) { if (line.substr(0, 9) == "processor") { proc = std::stoi(line.substr(line.find(':') + 2)); if (proc >= num_procs) { pr_err("cpuinfo contains processor %d larger than %u\n", proc, num_procs); return HSAKMT_STATUS_NO_MEMORY; } continue; } if (line.substr(0, 9) == "vendor_id" && dxg_topology->processor_vendor == -1) { std::string vendor = line.substr(line.find(':') + 2); dxg_topology->processor_vendor = topology_search_processor_vendor(vendor.c_str()); continue; } if (line.substr(0, 10) == "model name") { std::string model_name = line.substr(line.find(':') + 2); if (model_name.size() > HSA_PUBLIC_NAME_SIZE) model_name.resize(HSA_PUBLIC_NAME_SIZE); std::strncpy(cpuinfo[proc].model_name, model_name.c_str(), HSA_PUBLIC_NAME_SIZE); continue; } if (line.substr(0, 6) == "apicid") { cpuinfo[proc].apicid = std::stoi(line.substr(line.find(':') + 2)); continue; } if (!cpuinfo_max_freq) { if (line.substr(0, 7) == "cpu MHz") { double freq = std::stod(line.substr(line.find(':') + 2)); if (freq > dxg_topology->freq_max_) { dxg_topology->freq_max_ = freq; } continue; } } } if (dxg_topology->processor_vendor < 0) { pr_err("Failed to get Processor Vendor. Setting to %s", supported_processor_vendor_name[GENUINE_INTEL]); dxg_topology->processor_vendor = GENUINE_INTEL; } return ret; } static HSAKMT_STATUS topology_sysfs_get_node_props(uint32_t node_id, HsaNodeProperties& props, bool& p2p_links, uint32_t& num_p2pLinks) { HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; memset(&props, 0, sizeof(props)); p2p_links = false; num_p2pLinks = 0; props.MaxEngineClockMhzCCompute = dxg_topology->freq_max_; if (node_id == 0) { /* CPU node */ props.NumCPUCores = sysconf(_SC_NPROCESSORS_ONLN); props.NumMemoryBanks = 1; props.KFDGpuID = 0; return HSAKMT_STATUS_SUCCESS; } /* gpu node */ wsl::thunk::WDDMDevice *device; ret = topology_map_node_id(node_id, device); if (ret != HSAKMT_STATUS_SUCCESS) return ret; props.NumCPUCores = 0; props.NumFComputeCores = device->SimdPerCu() * device->ComputeUnitCount(); props.NumMemoryBanks = 1; props.NumCaches = 3; props.NumIOLinks = 1; props.CComputeIdLo = 0; props.FComputeIdLo = 0; props.Capability.ui32.ASICRevision = device->AsicRevision(); props.Capability.ui32.WatchPointsTotalBits = std::log2(device->WatchPointsNum()); props.MaxWavesPerSIMD = device->WavePerCu() / device->SimdPerCu(); props.LDSSizeInKB = device->LdsSize() / 1024; props.GDSSizeInKB = 0; props.WaveFrontSize = device->WavefrontSize(); props.NumShaderBanks = device->NumShaderEngine(); props.NumArrays = device->ShaderArrayPerShaderEngine(); props.NumCUPerArray = device->ComputeUnitCount() / props.NumArrays; props.NumSIMDPerCU = device->SimdPerCu(); props.MaxSlotsScratchCU = device->MaxScratchSlotsPerCu(); props.VendorId = 0x1002; props.DeviceId = device->DeviceId(); props.LocationId = device->PciBusAddr(); props.LocalMemSize = 0; props.MaxEngineClockMhzFCompute = device->MaxEngineClockMhz(); props.DrmRenderMinor = node_id; { int i; const char *name = device->ProductName(); for (i = 0; name[i] != 0 && i < HSA_PUBLIC_NAME_SIZE - 1; i++) props.MarketingName[i] = name[i]; props.MarketingName[i] = '\0'; } props.uCodeEngineVersions.uCodeSDMA = device->GetSdmaFwVersion(); props.DebugProperties.Value = 0; props.HiveID = 0; props.NumSdmaEngines = device->NumSdmaEngine(); props.NumSdmaXgmiEngines = 0; props.NumSdmaQueuesPerEngine = 6; // TODO props.NumCpQueues = device->GetNumCpQueues(); props.NumGws = 0; /* * In Native Linux, if the asic is APU, this value will be set to 1, * if the asic is dGPU, this value will be set to 0. clr use this info * to set hostUnifiedMemory_, but for now wsl does not support this feature. * Therefore, fore vaule to 0 temporarily. */ props.Integrated = 0; props.Domain = device->Domain(); props.UniqueID = device->Uuid(); props.NumXcc = 1; props.KFDGpuID = device->DeviceId(); // TODO props.FamilyID = device->GfxFamily(); props.EngineId.ui32.uCode = device->GetMecFwVersion(); char *envvar = getenv("HSA_OVERRIDE_GFX_VERSION"); if (envvar) { char dummy = '\0'; uint32_t major = 0, minor = 0, step = 0; /* HSA_OVERRIDE_GFX_VERSION=major.minor.stepping */ if ((sscanf(envvar, "%u.%u.%u%c", &major, &minor, &step, &dummy) != 3) || (major > 63 || minor > 255 || step > 255)) { pr_err("HSA_OVERRIDE_GFX_VERSION %s is invalid\n", envvar); return HSAKMT_STATUS_ERROR; } props.OverrideEngineId.ui32.Major = major & 0x3f; props.OverrideEngineId.ui32.Minor = minor & 0xff; props.OverrideEngineId.ui32.Stepping = step & 0xff; } props.EngineId.ui32.Major = device->Major(); props.EngineId.ui32.Minor = device->Minor(); props.EngineId.ui32.Stepping = device->Stepping(); snprintf((char *)props.AMDName, sizeof(props.AMDName) - 1, "GFX%06x", HSA_GET_GFX_VERSION_FULL(props.EngineId.ui32)); if (!dxg_runtime->is_svm_api_supported) props.Capability.ui32.SVMAPISupported = 0; props.Capability.ui32.DoorbellType = 2; /* Get VGPR/SGPR size in byte per CU */ props.SGPRSizePerCU = SGPR_SIZE_PER_CU; props.VGPRSizePerCU = get_vgpr_size_per_cu(props.EngineId); if (props.NumFComputeCores) assert(props.EngineId.ui32.Major && "HSA_OVERRIDE_GFX_VERSION may be needed"); return ret; } static HSAKMT_STATUS topology_sysfs_get_mem_props(uint32_t node_id, uint32_t mem_id, HsaMemoryProperties& props) { HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; std::memset(&props, 0, sizeof(props)); if (node_id == 0) { /* CPU node */ props.HeapType = HSA_HEAPTYPE_SYSTEM; struct sysinfo info; sysinfo(&info); props.SizeInBytes = info.totalram; /* props.SizeInBytes is the actual physical system * memory size. Reserve 1/16th for WSL system usage. */ dxg_runtime->max_single_alloc_size = info.totalram - (info.totalram >> 4); props.Flags.MemoryProperty = 0; /* TODO: sudo dmidecode --type memory doesn't work on wsl */ props.Width = 64; props.MemoryClockMax = 2133; return HSAKMT_STATUS_SUCCESS; } wsl::thunk::WDDMDevice *device; ret = topology_map_node_id(node_id, device); if (ret != HSAKMT_STATUS_SUCCESS) return ret; props.HeapType = HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE; if (device->IsDgpu()) props.SizeInBytes = device->LocalHeapSize(); else props.SizeInBytes = device->NonLocalHeapSize(); props.Width = device->MemoryBusWidth(); props.MemoryClockMax = device->MaxMemoryClockMhz(); return ret; } /* topology_get_cpu_cache_props - Read CPU cache information from sysfs * @node [IN] CPU node number * @cpuinfo [IN] /proc/cpuinfo data * @tbl [OUT] the node table to fill up * Return: HSAKMT_STATUS_SUCCESS in success or error number in failure */ static HSAKMT_STATUS topology_get_cpu_cache_props(int node, const std::vector& cpuinfo, node_props_t& tbl) { HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; /* Get max path size from /sys/devices/system/node/node%d/%s/cache * below, which will max out according to the largest filename, * which can be present twice in the string above. 29 is for the prefix * and the +6 is for the cache suffix */ #ifndef MAXNAMLEN /* MAXNAMLEN is the BSD name for NAME_MAX. glibc aliases this as NAME_MAX, but * not musl */ #define MAXNAMLEN NAME_MAX #endif constexpr uint32_t MAXPATHSIZE = 29 + MAXNAMLEN + (MAXNAMLEN + 6); char path[MAXPATHSIZE], node_dir[MAXPATHSIZE]; int max_cpus; int cache_cnt = 0; DIR *dirp = NULL; struct dirent *dir; char *p; /* Get info from /sys/devices/system/node/nodeX/cpuY/cache */ int node_real = node; if (dxg_topology->processor_vendor == IBM_POWER) { if (!strcmp(cpuinfo[0].model_name, "POWER9")) { node_real = node * 8; } } snprintf(node_dir, MAXPATHSIZE, "/sys/devices/system/node/node%d", node_real); /* Other than cpuY folders, this dir also has cpulist and cpumap */ max_cpus = num_subdirs(node_dir, "cpu"); if (max_cpus <= 0) { /* If CONFIG_NUMA is not enabled in the kernel, * /sys/devices/system/node doesn't exist. */ if (node) { /* CPU node must be 0 or something is wrong */ pr_err("Fail to get cpu* dirs under %s.", node_dir); ret = HSAKMT_STATUS_ERROR; goto exit; } /* Fall back to use /sys/devices/system/cpu */ snprintf(node_dir, MAXPATHSIZE, "/sys/devices/system/cpu"); max_cpus = num_subdirs(node_dir, "cpu"); if (max_cpus <= 0) { pr_err("Fail to get cpu* dirs under %s\n", node_dir); ret = HSAKMT_STATUS_ERROR; goto exit; } } dirp = opendir(node_dir); while ((dir = readdir(dirp)) != 0) { if (strncmp(dir->d_name, "cpu", 3)) continue; if (!isdigit(dir->d_name[3])) /* ignore files like cpulist */ continue; if (strlen(node_dir) + strlen(dir->d_name) + strlen("/cache") + 2 < MAXPATHSIZE) { std::string path_str = std::string(node_dir) + "/" + dir->d_name + "/cache"; strncpy(path, path_str.c_str(), MAXPATHSIZE); path[MAXPATHSIZE - 1] = '\0'; } else { pr_err("Path is too long and was truncated.\n"); goto exit; } cpu_cacheinfo_t cpu_ci; cpu_ci.num_caches = num_subdirs(path, "index"); cpu_ci.proc_num= atoi(dir->d_name+3); cache_cnt += get_cpu_cache_info(path, cpuinfo, tbl.cache, cpu_ci); } assert(cache_cnt == tbl.cache.size()); tbl.node.NumCaches = cache_cnt; exit: if (dirp) closedir(dirp); return ret; } /* For a give Node @node_id the function gets @iolink_id information i.e. parses * sysfs the following sysfs entry * ./nodes/@node_id/io_links/@iolink_id/properties. @node_id has to be valid * accessible node. * * If node_to specified by the @iolink_id is not accessible the function returns * HSAKMT_STATUS_NOT_SUPPORTED. If node_to is accessible, then node_to is mapped * from sysfs_node to user_node and returns HSAKMT_STATUS_SUCCESS. */ static HSAKMT_STATUS topology_sysfs_get_iolink_props(uint32_t node_id, uint32_t iolink_id, HsaIoLinkProperties& props, bool p2pLink) { wsl::thunk::WDDMDevice *device; topology_map_node_id(node_id, device); std::memset(&props, 0, sizeof(props)); props.IoLinkType = HSA_IOLINKTYPE_PCIEXPRESS; props.VersionMajor = props.VersionMinor = 0; props.NodeFrom = node_id; props.NodeTo = 0; props.Weight = 20; props.Flags.ui32.Override = 1; props.Flags.ui32.NonCoherent = 1; props.Flags.ui32.NoAtomics32bit = !(device->SupportPlatformAtomic()); props.Flags.ui32.NoAtomics64bit = !(device->SupportPlatformAtomic()); props.RecSdmaEngIdMask = 0; return HSAKMT_STATUS_SUCCESS; } /* topology_get_free_io_link_slot_for_node - For the given node_id, find the * next available free slot to add an io_link */ static HsaIoLinkProperties * topology_get_free_io_link_slot_for_node(uint32_t node_id, const HsaSystemProperties& sys_props, std::vector& node_props) { std::vector& props = node_props[node_id].link; if (node_id >= sys_props.NumNodes) { pr_err("Invalid node [%d]\n", node_id); return NULL; } if (!props.size()) { pr_err("No io_link reported for Node [%d]\n", node_id); return NULL; } if (node_props[node_id].node.NumIOLinks >= sys_props.NumNodes - 1) { pr_err("No more space for io_link for Node [%d]\n", node_id); return NULL; } return &props[node_props[node_id].node.NumIOLinks]; } /* topology_add_io_link_for_node - If a free slot is available, * add io_link for the given Node. * TODO: Add other members of HsaIoLinkProperties */ static HSAKMT_STATUS topology_add_io_link_for_node( uint32_t node_from, const HsaSystemProperties& sys_props, std::vector& node_props, HSA_IOLINKTYPE IoLinkType, uint32_t node_to, uint32_t Weight) { HsaIoLinkProperties *props; props = topology_get_free_io_link_slot_for_node(node_from, sys_props, node_props); if (!props) return HSAKMT_STATUS_NO_MEMORY; props->IoLinkType = IoLinkType; props->NodeFrom = node_from; props->NodeTo = node_to; props->Weight = Weight; node_props[node_from].node.NumIOLinks++; return HSAKMT_STATUS_SUCCESS; } /* Find the CPU that this GPU (gpu_node) directly connects to */ static int32_t gpu_get_direct_link_cpu(uint32_t gpu_node, const std::vector& node_props) { const std::vector& props = node_props[gpu_node].link; uint32_t i; if (!node_props[gpu_node].node.KFDGpuID || props.empty() || node_props[gpu_node].node.NumIOLinks == 0) return -1; for (i = 0; i < node_props[gpu_node].node.NumIOLinks; i++) if (props[i].IoLinkType == HSA_IOLINKTYPE_PCIEXPRESS && props[i].Weight <= 20) /* >20 is GPU->CPU->GPU */ return props[i].NodeTo; return -1; } /* Get node1->node2 IO link information. This should be a direct link that has * been created in the kernel. */ static HSAKMT_STATUS get_direct_iolink_info(uint32_t node1, uint32_t node2, const std::vector& node_props, HSAuint32 *weight, HSA_IOLINKTYPE *type) { const std::vector& props = node_props[node1].link; uint32_t i; if (!props.size()) return HSAKMT_STATUS_INVALID_NODE_UNIT; for (i = 0; i < node_props[node1].node.NumIOLinks; i++) if (props[i].NodeTo == node2) { if (weight) *weight = props[i].Weight; if (type) *type = props[i].IoLinkType; return HSAKMT_STATUS_SUCCESS; } return HSAKMT_STATUS_INVALID_PARAMETER; } static HSAKMT_STATUS get_indirect_iolink_info(uint32_t node1, uint32_t node2, const std::vector& node_props, HSAuint32 *weight, HSA_IOLINKTYPE *type) { int32_t dir_cpu1 = -1, dir_cpu2 = -1; HSAKMT_STATUS ret; uint32_t i; *weight = 0; *type = HSA_IOLINKTYPE_UNDEFINED; if (node1 == node2) return HSAKMT_STATUS_INVALID_PARAMETER; /* CPU->CPU is not an indirect link */ if (!node_props[node1].node.KFDGpuID && !node_props[node2].node.KFDGpuID) return HSAKMT_STATUS_INVALID_NODE_UNIT; if (node_props[node1].node.HiveID && node_props[node2].node.HiveID && node_props[node1].node.HiveID == node_props[node2].node.HiveID) return HSAKMT_STATUS_INVALID_PARAMETER; if (node_props[node1].node.KFDGpuID) dir_cpu1 = gpu_get_direct_link_cpu(node1, node_props); if (node_props[node2].node.KFDGpuID) dir_cpu2 = gpu_get_direct_link_cpu(node2, node_props); if (dir_cpu1 < 0 && dir_cpu2 < 0) return HSAKMT_STATUS_ERROR; /* if the node2(dst) is GPU , it need to be large bar for host access*/ if (node_props[node2].node.KFDGpuID) { for (i = 0; i < node_props[node2].node.NumMemoryBanks; ++i) if (node_props[node2].mem[i].HeapType == HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC) break; if (i >= node_props[node2].node.NumMemoryBanks) return HSAKMT_STATUS_ERROR; } /* Possible topology: * GPU --(weight1) -- CPU -- (weight2) -- GPU * GPU --(weight1) -- CPU -- (weight2) -- CPU -- (weight3) -- GPU * GPU --(weight1) -- CPU -- (weight2) -- CPU * CPU -- (weight2) -- CPU -- (weight3) -- GPU */ HSAuint32 weight1 = 0, weight2 = 0, weight3 = 0; if (dir_cpu1 >= 0) { /* GPU->CPU ... */ if (dir_cpu2 >= 0) { if (dir_cpu1 == dir_cpu2) /* GPU->CPU->GPU*/ { ret = get_direct_iolink_info(node1, dir_cpu1, node_props, &weight1, NULL); if (ret != HSAKMT_STATUS_SUCCESS) return ret; ret = get_direct_iolink_info(dir_cpu1, node2, node_props, &weight2, type); } else /* GPU->CPU->CPU->GPU*/ { ret = get_direct_iolink_info(node1, dir_cpu1, node_props, &weight1, NULL); if (ret != HSAKMT_STATUS_SUCCESS) return ret; ret = get_direct_iolink_info(dir_cpu1, dir_cpu2, node_props, &weight2, type); if (ret != HSAKMT_STATUS_SUCCESS) return ret; /* On QPI interconnection, GPUs can't access * each other if they are attached to different * CPU sockets. CPU<->CPU weight larger than 20 * means the two CPUs are in different sockets. */ if (*type == HSA_IOLINK_TYPE_QPI_1_1 && weight2 > 20) return HSAKMT_STATUS_NOT_SUPPORTED; ret = get_direct_iolink_info(dir_cpu2, node2, node_props, &weight3, NULL); } } else /* GPU->CPU->CPU */ { ret = get_direct_iolink_info(node1, dir_cpu1, node_props, &weight1, NULL); if (ret != HSAKMT_STATUS_SUCCESS) return ret; ret = get_direct_iolink_info(dir_cpu1, node2, node_props, &weight2, type); } } else { /* CPU->CPU->GPU */ ret = get_direct_iolink_info(node1, dir_cpu2, node_props, &weight2, type); if (ret != HSAKMT_STATUS_SUCCESS) return ret; ret = get_direct_iolink_info(dir_cpu2, node2, node_props, &weight3, NULL); } if (ret != HSAKMT_STATUS_SUCCESS) return ret; *weight = weight1 + weight2 + weight3; return HSAKMT_STATUS_SUCCESS; } static void topology_create_indirect_gpu_links(const HsaSystemProperties& sys_props, std::vector& node_props) { uint32_t i, j; HSAuint32 weight; HSA_IOLINKTYPE type; for (i = 0; i < sys_props.NumNodes - 1; i++) { for (j = i + 1; j < sys_props.NumNodes; j++) { get_indirect_iolink_info(i, j, node_props, &weight, &type); if (!weight) goto try_alt_dir; if (topology_add_io_link_for_node(i, sys_props, node_props, type, j, weight) != HSAKMT_STATUS_SUCCESS) pr_err("Fail to add IO link %d->%d\n", i, j); try_alt_dir: get_indirect_iolink_info(j, i, node_props, &weight, &type); if (!weight) continue; if (topology_add_io_link_for_node(j, sys_props, node_props, type, i, weight) != HSAKMT_STATUS_SUCCESS) pr_err("Fail to add IO link %d->%d\n", j, i); } } } HSAKMT_STATUS topology_take_snapshot(void) { uint32_t i, mem_id, cache_id; HsaSystemProperties sys_props; std::vector& temp_props = dxg_topology->g_props; HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; const uint32_t num_procs = sysconf(_SC_NPROCESSORS_ONLN); std::vector cpuinfo(num_procs); uint32_t num_ioLinks; bool p2p_links = false; uint32_t num_p2pLinks = 0; topology_parse_cpuinfo(cpuinfo); ret = topology_sysfs_get_system_props(sys_props); if (ret != HSAKMT_STATUS_SUCCESS) goto err; if (sys_props.NumNodes > 0) { temp_props.resize(sys_props.NumNodes); for (i = 0; i < sys_props.NumNodes; i++) { wsl::thunk::WDDMDevice *device_; topology_map_node_id(i, device_); ret = topology_sysfs_get_node_props(i, temp_props[i].node, p2p_links, num_p2pLinks); if (ret != HSAKMT_STATUS_SUCCESS) { goto err; } topology_setup_is_dgpu_param(&temp_props[i].node); if (temp_props[i].node.NumCPUCores) topology_get_cpu_model_name(temp_props[i].node, cpuinfo); if (temp_props[i].node.NumMemoryBanks) { temp_props[i].mem.resize(temp_props[i].node.NumMemoryBanks); for (mem_id = 0; mem_id < temp_props[i].node.NumMemoryBanks; mem_id++) { ret = topology_sysfs_get_mem_props(i, mem_id, temp_props[i].mem[mem_id]); if (ret != HSAKMT_STATUS_SUCCESS) { goto err; } } } if (temp_props[i].node.NumCaches) { temp_props[i].cache.resize(temp_props[i].node.NumCaches); for (int j = 0; j < 3; j++) { temp_props[i].cache[j].CacheType.ui32.Data = 1; temp_props[i].cache[j].CacheType.ui32.HSACU = 1; temp_props[i].cache[j].CacheLevel = j + 1; } temp_props[i].cache[0].CacheSize = device_->GetL1CacheSize() / 1024; temp_props[i].cache[1].CacheSize = device_->GetL2CacheSize() / 1024; temp_props[i].cache[2].CacheSize = device_->GetL3CacheSize() / 1024; } else if (!temp_props[i].node.KFDGpuID) { /* a CPU node */ ret = topology_get_cpu_cache_props(i, cpuinfo, temp_props[i]); if (ret != HSAKMT_STATUS_SUCCESS) { goto err; } } /* To simplify, allocate maximum needed memory for io_links for each node. * This removes the need for realloc when indirect and QPI links are added * later */ temp_props[i].link.resize(sys_props.NumNodes - 1); num_ioLinks = temp_props[i].node.NumIOLinks - num_p2pLinks; uint32_t link_id = 0; if (num_ioLinks) { uint32_t sys_link_id = 0; /* Parse all the sysfs specified io links. Skip the ones where the * remote node (node_to) is not accessible */ while (sys_link_id < num_ioLinks && link_id < sys_props.NumNodes - 1) { ret = topology_sysfs_get_iolink_props( i, sys_link_id++, temp_props[i].link[link_id], false); if (ret == HSAKMT_STATUS_NOT_SUPPORTED) { ret = HSAKMT_STATUS_SUCCESS; continue; } else if (ret != HSAKMT_STATUS_SUCCESS) { goto err; } link_id++; } /* sysfs specifies all the io links. Limit the number to valid ones */ temp_props[i].node.NumIOLinks = link_id; } if (num_p2pLinks) { uint32_t sys_link_id = 0; /* Parse all the sysfs specified p2p links. */ while (sys_link_id < num_p2pLinks && link_id < sys_props.NumNodes - 1) { ret = topology_sysfs_get_iolink_props( i, sys_link_id++, temp_props[i].link[link_id], true); if (ret == HSAKMT_STATUS_NOT_SUPPORTED) { ret = HSAKMT_STATUS_SUCCESS; continue; } else if (ret != HSAKMT_STATUS_SUCCESS) { goto err; } link_id++; } temp_props[i].node.NumIOLinks = link_id; } } } if (!p2p_links) { /* All direct IO links are created in the kernel. Here we need to * connect GPU<->GPU or GPU<->CPU indirect IO links. */ topology_create_indirect_gpu_links(sys_props, temp_props); } if (!dxg_topology->g_system) { dxg_topology->g_system = (HsaSystemProperties *)malloc(sizeof(HsaSystemProperties)); if (!dxg_topology->g_system) { ret = HSAKMT_STATUS_NO_MEMORY; goto err; } } *dxg_topology->g_system = sys_props; err: return ret; } /* Drop the Snashot of the HSA topology information. Assume lock is held. */ void topology_drop_snapshot(void) { if (!!dxg_topology->g_system != !!dxg_topology->g_props.size()) pr_warn("Probably inconsistency?\n"); dxg_topology->g_props.clear(); free(dxg_topology->g_system); dxg_topology->g_system = NULL; trim_suballocator(); for (auto device : dxg_topology->wdevices_) delete device; dxg_topology->wdevices_.clear(); } HSAKMT_STATUS validate_nodeid(uint32_t nodeid, uint32_t *gpu_id) { if (dxg_topology->g_props.empty() || !dxg_topology->g_system || dxg_topology->g_system->NumNodes <= nodeid) return HSAKMT_STATUS_INVALID_NODE_UNIT; if (gpu_id) *gpu_id = dxg_topology->g_props[nodeid].node.KFDGpuID; return HSAKMT_STATUS_SUCCESS; } HSAKMT_STATUS gpuid_to_nodeid(uint32_t gpu_id, uint32_t *node_id) { uint64_t node_idx; for (node_idx = 0; node_idx < dxg_topology->g_system->NumNodes; node_idx++) { if (dxg_topology->g_props[node_idx].node.KFDGpuID == gpu_id) { *node_id = node_idx; return HSAKMT_STATUS_SUCCESS; } } return HSAKMT_STATUS_INVALID_NODE_UNIT; } HSAKMT_STATUS HSAKMTAPI hsaKmtAcquireSystemProperties(HsaSystemProperties *SystemProperties) { HSAKMT_STATUS err = HSAKMT_STATUS_SUCCESS; CHECK_DXG_OPEN(); if (!SystemProperties) return HSAKMT_STATUS_INVALID_PARAMETER; pthread_mutex_lock(&dxg_runtime->hsakmt_mutex); /* We already have a valid snapshot. Avoid double initialization that * would leak memory. */ if (dxg_topology->g_system) { *SystemProperties = *dxg_topology->g_system; goto out; } err = topology_take_snapshot(); if (err != HSAKMT_STATUS_SUCCESS) goto out; assert(dxg_topology->g_system); // err = fmm_init_process_apertures(dxg_topology->g_system->NumNodes); if (err != HSAKMT_STATUS_SUCCESS) goto init_process_apertures_failed; // err = init_process_doorbells(dxg_topology->g_system->NumNodes); if (err != HSAKMT_STATUS_SUCCESS) goto init_doorbells_failed; *SystemProperties = *dxg_topology->g_system; goto out; init_doorbells_failed: // fmm_destroy_process_apertures(); init_process_apertures_failed: topology_drop_snapshot(); out: pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex); return err; } HSAKMT_STATUS HSAKMTAPI hsaKmtReleaseSystemProperties(void) { pthread_mutex_lock(&dxg_runtime->hsakmt_mutex); topology_drop_snapshot(); pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex); return HSAKMT_STATUS_SUCCESS; } HSAKMT_STATUS topology_get_node_props(HSAuint32 NodeId, HsaNodeProperties *NodeProperties) { if (!dxg_topology->g_system || dxg_topology->g_props.empty() || NodeId >= dxg_topology->g_system->NumNodes) return HSAKMT_STATUS_ERROR; *NodeProperties = dxg_topology->g_props[NodeId].node; return HSAKMT_STATUS_SUCCESS; } HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeProperties(HSAuint32 NodeId, HsaNodeProperties *NodeProperties) { HSAKMT_STATUS err; uint32_t gpu_id; if (!NodeProperties) return HSAKMT_STATUS_INVALID_PARAMETER; CHECK_DXG_OPEN(); pthread_mutex_lock(&dxg_runtime->hsakmt_mutex); err = validate_nodeid(NodeId, &gpu_id); if (err != HSAKMT_STATUS_SUCCESS) goto out; err = topology_get_node_props(NodeId, NodeProperties); if (err != HSAKMT_STATUS_SUCCESS) goto out; /* For CPU only node don't add any additional GPU memory banks. */ if (gpu_id) { uint64_t base, limit; if (!(NodeProperties->Integrated)) NodeProperties->NumMemoryBanks += NUM_OF_DGPU_HEAPS; else NodeProperties->NumMemoryBanks += NUM_OF_IGPU_HEAPS; // TODO: for apu /*if (fmm_get_aperture_base_and_limit(FMM_MMIO, gpu_id, &base, &limit) == HSAKMT_STATUS_SUCCESS) NodeProperties->NumMemoryBanks += 1;*/ } out: pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex); return err; } HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeMemoryProperties(HSAuint32 NodeId, HSAuint32 NumBanks, HsaMemoryProperties *MemoryProperties) { HSAKMT_STATUS err = HSAKMT_STATUS_SUCCESS; uint32_t i; if (!MemoryProperties) return HSAKMT_STATUS_INVALID_PARAMETER; CHECK_DXG_OPEN(); pthread_mutex_lock(&dxg_runtime->hsakmt_mutex); memset(MemoryProperties, 0, NumBanks * sizeof(HsaMemoryProperties)); for (i = 0; i < wsl::Min(dxg_topology->g_props[NodeId].node.NumMemoryBanks, NumBanks); i++) { assert(dxg_topology->g_props[NodeId].mem.size()); MemoryProperties[i] = dxg_topology->g_props[NodeId].mem[i]; } /* The following memory banks does not apply to CPU only node */ wsl::thunk::WDDMDevice *device_ = get_wddmdev(NodeId); if (device_ == nullptr) goto out; /*Add LDS*/ if (i < NumBanks) { MemoryProperties[i].HeapType = HSA_HEAPTYPE_GPU_LDS; MemoryProperties[i].VirtualBaseAddress = device_->SharedApertureBase(); MemoryProperties[i].SizeInBytes = dxg_topology->g_props[NodeId].node.LDSSizeInKB * 1024; i++; } /* Add SCRATCH */ if (i < NumBanks) { MemoryProperties[i].HeapType = HSA_HEAPTYPE_GPU_SCRATCH; MemoryProperties[i].VirtualBaseAddress = device_->PrivateApertureBase(); MemoryProperties[i].SizeInBytes = device_->PrivateApertureSize(); i++; } out: pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex); return err; } HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeCacheProperties( HSAuint32 NodeId, HSAuint32 ProcessorId, HSAuint32 NumCaches, HsaCacheProperties *CacheProperties) { HSAKMT_STATUS err; uint32_t i; if (!CacheProperties) return HSAKMT_STATUS_INVALID_PARAMETER; CHECK_DXG_OPEN(); pthread_mutex_lock(&dxg_runtime->hsakmt_mutex); /* KFD ADD page 18, snapshot protocol violation */ if (!dxg_topology->g_system || NodeId >= dxg_topology->g_system->NumNodes) { err = HSAKMT_STATUS_INVALID_NODE_UNIT; goto out; } if (NumCaches > dxg_topology->g_props[NodeId].node.NumCaches) { err = HSAKMT_STATUS_INVALID_PARAMETER; goto out; } for (i = 0; i < wsl::Min(dxg_topology->g_props[NodeId].node.NumCaches, NumCaches); i++) { assert(dxg_topology->g_props[NodeId].cache.size()); CacheProperties[i] = dxg_topology->g_props[NodeId].cache[i]; } err = HSAKMT_STATUS_SUCCESS; out: pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex); return err; } HSAKMT_STATUS topology_get_iolink_props(HSAuint32 NodeId, HSAuint32 NumIoLinks, HsaIoLinkProperties *IoLinkProperties) { if (!dxg_topology->g_system || dxg_topology->g_props.empty() || NodeId >= dxg_topology->g_system->NumNodes) return HSAKMT_STATUS_ERROR; memcpy(IoLinkProperties, dxg_topology->g_props[NodeId].link.data(), NumIoLinks * sizeof(*IoLinkProperties)); return HSAKMT_STATUS_SUCCESS; } HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeIoLinkProperties(HSAuint32 NodeId, HSAuint32 NumIoLinks, HsaIoLinkProperties *IoLinkProperties) { HSAKMT_STATUS err; if (!IoLinkProperties) return HSAKMT_STATUS_INVALID_PARAMETER; CHECK_DXG_OPEN(); pthread_mutex_lock(&dxg_runtime->hsakmt_mutex); /* KFD ADD page 18, snapshot protocol violation */ if (!dxg_topology->g_system || NodeId >= dxg_topology->g_system->NumNodes) { err = HSAKMT_STATUS_INVALID_NODE_UNIT; goto out; } if (NumIoLinks > dxg_topology->g_props[NodeId].node.NumIOLinks) { err = HSAKMT_STATUS_INVALID_PARAMETER; goto out; } assert(dxg_topology->g_props[NodeId].link.size()); err = topology_get_iolink_props(NodeId, NumIoLinks, IoLinkProperties); out: pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex); return err; } uint16_t get_device_id_by_node_id(HSAuint32 node_id) { if (dxg_topology->g_props.empty() || !dxg_topology->g_system || dxg_topology->g_system->NumNodes <= node_id) return 0; return dxg_topology->g_props[node_id].node.DeviceId; } bool prefer_ats(HSAuint32 node_id) { return dxg_topology->g_props[node_id].node.Capability.ui32.HSAMMUPresent && dxg_topology->g_props[node_id].node.NumCPUCores && dxg_topology->g_props[node_id].node.NumFComputeCores; } uint16_t get_device_id_by_gpu_id(HSAuint32 gpu_id) { unsigned int i; if (dxg_topology->g_props.empty() || !dxg_topology->g_system) return 0; for (i = 0; i < dxg_topology->g_system->NumNodes; i++) { if (dxg_topology->g_props[i].node.KFDGpuID == gpu_id) return dxg_topology->g_props[i].node.DeviceId; } return 0; } uint32_t get_direct_link_cpu(uint32_t gpu_node) { HSAuint64 size = 0; int32_t cpu_id; HSAuint32 i; cpu_id = gpu_get_direct_link_cpu(gpu_node, dxg_topology->g_props); if (cpu_id == -1) return INVALID_NODEID; assert(dxg_topology->g_props[cpu_id].mem.size()); for (i = 0; i < dxg_topology->g_props[cpu_id].node.NumMemoryBanks; i++) size += dxg_topology->g_props[cpu_id].mem[i].SizeInBytes; return size ? (uint32_t)cpu_id : INVALID_NODEID; } HSAKMT_STATUS validate_nodeid_array(uint32_t **gpu_id_array, uint32_t NumberOfNodes, uint32_t *NodeArray) { HSAKMT_STATUS ret; unsigned int i; if (NumberOfNodes == 0 || !NodeArray || !gpu_id_array) return HSAKMT_STATUS_INVALID_PARAMETER; /* Translate Node IDs to gpu_ids */ *gpu_id_array = (uint32_t *)malloc(NumberOfNodes * sizeof(uint32_t)); if (!(*gpu_id_array)) return HSAKMT_STATUS_NO_MEMORY; for (i = 0; i < NumberOfNodes; i++) { ret = validate_nodeid(NodeArray[i], *gpu_id_array + i); if (ret != HSAKMT_STATUS_SUCCESS) { free(*gpu_id_array); break; } } return ret; } uint32_t get_num_sysfs_nodes(void) { return dxg_topology->num_sysfs_nodes; } wsl::thunk::WDDMDevice *get_wddmdev(uint32_t node_id) { if ((!dxg_topology->wdevices_.size()) || (!node_id) || (node_id >= dxg_topology->num_sysfs_nodes)) return nullptr; return dxg_topology->wdevices_[node_id - 1]; } uint32_t get_num_wddmdev() { return dxg_topology->wdevices_.size(); }