Fix maxSharedMemoryPerMultiProcessor attribute (#1927)

The maxSharedMemoryPerMultiProcessor attribute is meant to describe the number of bytes of shared memory (LDS space in AMD terminology) in each SM (CU in AMD terminology). For instance, on AMD GPUs this is often 64KB per CU, and some Nvidia GPUs it's 96KB per SM. This shared memory is a different address space from the normal global memory. However, the current HIP-HCC properties fill this in with a size that matches the totalGlboalMem property. This gives a drastically too-high calculation for the amount of LDS space that each CU has -- tens of GBs vs. 10s of KBs. This patch fixes this by pulling the maxSharedMemoryPerMultiProcessor property from the HSA pool that describes how much workgroup-local space is available on each CU. The HSA runtime eventually pulls this from the topology information about LDSSizeInKB, defined as "Size of Local Data Store in Kilobytes per SIMD". Previously, this HSA query was used to fill in the value of the sharedMemPerBlock property. On today's AMD GPUs, we know that the amount of LDS avaialble to the workgroup is identical to the amount of LDS space in the CU. However, in the future this may differ. As such, this patch changes around the order and fills in the "PerMultiProcessor" property from the HSA query (since what's what the query is defined to return), and then separately fills in the "PerBlock" property as we know it.
2020-03-17 03:30:51 -05:00
commit 55e55e78bb
@@ -56,6 +56,7 @@ void printCompilerInfo() {
 #endif
 }

+double bytesToKB(size_t s) { return (double)s / (1024.0); }
 double bytesToGB(size_t s) { return (double)s / (1024.0 * 1024.0 * 1024.0); }

 #define printLimit(w1, limit, units)                                                               \
@@ -97,7 +98,7 @@ void printDeviceProp(int deviceId) {
    cout << setw(w1) << "totalGlobalMem: " << fixed << setprecision(2)
         << bytesToGB(props.totalGlobalMem) << " GB" << endl;
    cout << setw(w1) << "maxSharedMemoryPerMultiProcessor: " << fixed << setprecision(2)
-         << bytesToGB(props.maxSharedMemoryPerMultiProcessor) << " GB" << endl;
+         << bytesToKB(props.maxSharedMemoryPerMultiProcessor) << " KB" << endl;
    cout << setw(w1) << "totalConstMem: " << props.totalConstMem << endl;
    cout << setw(w1) << "sharedMemPerBlock: " << (float)props.sharedMemPerBlock / 1024.0 << " KB"
         << endl;
@@ -677,7 +677,7 @@ hsa_status_t get_pool_info(hsa_amd_memory_pool_t pool, void* data) {
            break;
        case HSA_REGION_SEGMENT_GROUP:
            err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SIZE,
-                                               &(p_prop->sharedMemPerBlock));
+                                               &(p_prop->maxSharedMemoryPerMultiProcessor));
            break;
        default:
            break;
@@ -835,10 +835,8 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop) {
    hsa_region_t* am_region = static_cast<hsa_region_t*>(_acc.get_hsa_am_region());
    err = hsa_region_get_info(*am_region, HSA_REGION_INFO_SIZE, &prop->totalGlobalMem);
    DeviceErrorCheck(err);
-    // maxSharedMemoryPerMultiProcessor should be as the same as group memory size.
-    // Group memory will not be paged out, so, the physical memory size is the total shared memory
-    // size, and also equal to the group pool size.
-    prop->maxSharedMemoryPerMultiProcessor = prop->totalGlobalMem;
+    // Current GPUs allow a workgroup to use all of LDS in a CU, so these two are equal.
+    prop->sharedMemPerBlock = prop->maxSharedMemoryPerMultiProcessor;

    // Get Max memory clock frequency
    err =