Fix typo: maxThreadsPerMultiProcessor -> MaxSharedMemoryPerMultiprocessor
Device property MaxSharedMemoryPerMultiprocessor set equal to totalGlobalMem (HIP path).
Reason: MaxSharedMemoryPerMultiprocessor should be as the same as group memory size. Group memory will not be paged out, so, the physical memory size = total shared memory size = group region size. NVCC path remains untouched: CUDA's device property MaxSharedMemoryPerMultiprocessor is reported.
hipify is updated as well.
[ROCm/clr commit: 460b501cbb]
This commit is contained in:
@@ -364,6 +364,9 @@ while (@ARGV) {
|
||||
$ft{'err'} += s/\bcudaDevAttrMaxThreadsPerMultiProcessor\b/hipDeviceAttributeMaxThreadsPerMultiProcessor/g;
|
||||
$ft{'err'} += s/\bcudaDevAttrComputeCapabilityMajor\b/hipDeviceAttributeComputeCapabilityMajor/g;
|
||||
$ft{'err'} += s/\bcudaDevAttrComputeCapabilityMinor\b/hipDeviceAttributeComputeCapabilityMinor/g;
|
||||
$ft{'err'} += s/\bcudaDevAttrPciBusId\b/hipDeviceAttributePciBusId/g;
|
||||
$ft{'err'} += s/\bcudaDevAttrPciDeviceId\b/hipDeviceAttributePciDeviceId/g;
|
||||
$ft{'err'} += s/\bcudaDevAttrMaxSharedMemoryPerMultiprocessor\b/hipDeviceAttributeMaxSharedMemoryPerMultiprocessor/g;
|
||||
$ft{'dev'} += s/\bcudaDeviceAttr\b/hipDeviceAttribute_t/g;
|
||||
$ft{'dev'} += s/\bcudaDeviceGetAttribute\b/hipDeviceGetAttribute/g;
|
||||
|
||||
|
||||
@@ -72,27 +72,28 @@ typedef struct {
|
||||
*
|
||||
*/
|
||||
typedef struct hipDeviceProp_t {
|
||||
char name[256]; ///< Device name.
|
||||
size_t totalGlobalMem; ///< Size of global memory region (in bytes).
|
||||
size_t sharedMemPerBlock; ///< Size of shared memory region (in bytes).
|
||||
int regsPerBlock; ///< Registers per block.
|
||||
int warpSize; ///< Warp size.
|
||||
int maxThreadsPerBlock; ///< Max work items per work group or workgroup max size.
|
||||
int maxThreadsDim[3]; ///< Max number of threads in each dimension (XYZ) of a block.
|
||||
int maxGridSize[3]; ///< Max grid dimensions (XYZ).
|
||||
int clockRate; ///< Max clock frequency of the multiProcessors, in khz.
|
||||
size_t totalConstMem; ///< Size of shared memory region (in bytes).
|
||||
int major; ///< Major compute capability. On HCC, this is an approximation and features may differ from CUDA CC. See the arch feature flags for portable ways to query feature caps.
|
||||
int minor; ///< Minor compute capability. On HCC, this is an approximation and features may differ from CUDA CC. See the arch feature flags for portable ways to query feature caps.
|
||||
int multiProcessorCount; ///< Number of multi-processors (compute units).
|
||||
int l2CacheSize; ///< L2 cache size.
|
||||
int maxThreadsPerMultiProcessor; ///< Maximum resident threads per multi-processor.
|
||||
int computeMode; ///< Compute mode.
|
||||
int clockInstructionRate; ///< Frequency in khz of the timer used by the device-side "clock*" instructions. New for HIP.
|
||||
hipDeviceArch_t arch; ///< Architectural feature flags. New for HIP.
|
||||
int concurrentKernels; ///< Device can possibly execute multiple kernels concurrently.
|
||||
int pciBusID; ///< PCI Bus ID.
|
||||
int pciDeviceID; ///< PCI Device ID.
|
||||
char name[256]; ///< Device name.
|
||||
size_t totalGlobalMem; ///< Size of global memory region (in bytes).
|
||||
size_t sharedMemPerBlock; ///< Size of shared memory region (in bytes).
|
||||
int regsPerBlock; ///< Registers per block.
|
||||
int warpSize; ///< Warp size.
|
||||
int maxThreadsPerBlock; ///< Max work items per work group or workgroup max size.
|
||||
int maxThreadsDim[3]; ///< Max number of threads in each dimension (XYZ) of a block.
|
||||
int maxGridSize[3]; ///< Max grid dimensions (XYZ).
|
||||
int clockRate; ///< Max clock frequency of the multiProcessors, in khz.
|
||||
size_t totalConstMem; ///< Size of shared memory region (in bytes).
|
||||
int major; ///< Major compute capability. On HCC, this is an approximation and features may differ from CUDA CC. See the arch feature flags for portable ways to query feature caps.
|
||||
int minor; ///< Minor compute capability. On HCC, this is an approximation and features may differ from CUDA CC. See the arch feature flags for portable ways to query feature caps.
|
||||
int multiProcessorCount; ///< Number of multi-processors (compute units).
|
||||
int l2CacheSize; ///< L2 cache size.
|
||||
int maxThreadsPerMultiProcessor; ///< Maximum resident threads per multi-processor.
|
||||
int computeMode; ///< Compute mode.
|
||||
int clockInstructionRate; ///< Frequency in khz of the timer used by the device-side "clock*" instructions. New for HIP.
|
||||
hipDeviceArch_t arch; ///< Architectural feature flags. New for HIP.
|
||||
int concurrentKernels; ///< Device can possibly execute multiple kernels concurrently.
|
||||
int pciBusID; ///< PCI Bus ID.
|
||||
int pciDeviceID; ///< PCI Device ID.
|
||||
size_t maxSharedMemoryPerMultiProcessor; ///< Maximum Shared Memory Per Multiprocessor.
|
||||
} hipDeviceProp_t;
|
||||
|
||||
|
||||
@@ -130,26 +131,27 @@ typedef enum hipError_t {
|
||||
* @ingroup Enumerations
|
||||
*/
|
||||
typedef enum hipDeviceAttribute_t {
|
||||
hipDeviceAttributeMaxThreadsPerBlock, ///< Maximum number of threads per block.
|
||||
hipDeviceAttributeMaxBlockDimX, ///< Maximum x-dimension of a block.
|
||||
hipDeviceAttributeMaxBlockDimY, ///< Maximum y-dimension of a block.
|
||||
hipDeviceAttributeMaxBlockDimZ, ///< Maximum z-dimension of a block.
|
||||
hipDeviceAttributeMaxGridDimX, ///< Maximum x-dimension of a grid.
|
||||
hipDeviceAttributeMaxGridDimY, ///< Maximum y-dimension of a grid.
|
||||
hipDeviceAttributeMaxGridDimZ, ///< Maximum z-dimension of a grid.
|
||||
hipDeviceAttributeMaxSharedMemoryPerBlock, ///< Maximum shared memory available per block in bytes.
|
||||
hipDeviceAttributeTotalConstantMemory, ///< Constant memory size in bytes.
|
||||
hipDeviceAttributeWarpSize, ///< Warp size in threads.
|
||||
hipDeviceAttributeMaxRegistersPerBlock, ///< Maximum number of 32-bit registers available to a thread block. This number is shared by all thread blocks simultaneously resident on a multiprocessor.
|
||||
hipDeviceAttributeClockRate, ///< Peak clock frequency in kilohertz.
|
||||
hipDeviceAttributeMultiprocessorCount, ///< Number of multiprocessors on the device.
|
||||
hipDeviceAttributeComputeMode, ///< Compute mode that device is currently in.
|
||||
hipDeviceAttributeL2CacheSize, ///< Size of L2 cache in bytes. 0 if the device doesn't have L2 cache.
|
||||
hipDeviceAttributeMaxThreadsPerMultiProcessor, ///< Maximum resident threads per multiprocessor.
|
||||
hipDeviceAttributeComputeCapabilityMajor, ///< Major compute capability version number.
|
||||
hipDeviceAttributeComputeCapabilityMinor, ///< Minor compute capability version number.
|
||||
hipDeviceAttributePciBusId, ///< PCI Bus ID.
|
||||
hipDeviceAttributePciDeviceId, ///< PCI Device ID.
|
||||
hipDeviceAttributeMaxThreadsPerBlock, ///< Maximum number of threads per block.
|
||||
hipDeviceAttributeMaxBlockDimX, ///< Maximum x-dimension of a block.
|
||||
hipDeviceAttributeMaxBlockDimY, ///< Maximum y-dimension of a block.
|
||||
hipDeviceAttributeMaxBlockDimZ, ///< Maximum z-dimension of a block.
|
||||
hipDeviceAttributeMaxGridDimX, ///< Maximum x-dimension of a grid.
|
||||
hipDeviceAttributeMaxGridDimY, ///< Maximum y-dimension of a grid.
|
||||
hipDeviceAttributeMaxGridDimZ, ///< Maximum z-dimension of a grid.
|
||||
hipDeviceAttributeMaxSharedMemoryPerBlock, ///< Maximum shared memory available per block in bytes.
|
||||
hipDeviceAttributeTotalConstantMemory, ///< Constant memory size in bytes.
|
||||
hipDeviceAttributeWarpSize, ///< Warp size in threads.
|
||||
hipDeviceAttributeMaxRegistersPerBlock, ///< Maximum number of 32-bit registers available to a thread block. This number is shared by all thread blocks simultaneously resident on a multiprocessor.
|
||||
hipDeviceAttributeClockRate, ///< Peak clock frequency in kilohertz.
|
||||
hipDeviceAttributeMultiprocessorCount, ///< Number of multiprocessors on the device.
|
||||
hipDeviceAttributeComputeMode, ///< Compute mode that device is currently in.
|
||||
hipDeviceAttributeL2CacheSize, ///< Size of L2 cache in bytes. 0 if the device doesn't have L2 cache.
|
||||
hipDeviceAttributeMaxThreadsPerMultiProcessor, ///< Maximum resident threads per multiprocessor.
|
||||
hipDeviceAttributeComputeCapabilityMajor, ///< Major compute capability version number.
|
||||
hipDeviceAttributeComputeCapabilityMinor, ///< Minor compute capability version number.
|
||||
hipDeviceAttributePciBusId, ///< PCI Bus ID.
|
||||
hipDeviceAttributePciDeviceId, ///< PCI Device ID.
|
||||
hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, ///< Maximum Shared Memory Per Multiprocessor.
|
||||
} hipDeviceAttribute_t;
|
||||
|
||||
/**
|
||||
|
||||
@@ -256,6 +256,8 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att
|
||||
cdattr = cudaDevAttrPciBusId; break;
|
||||
case hipDeviceAttributePciDeviceId:
|
||||
cdattr = cudaDevAttrPciDeviceId; break;
|
||||
case hipDeviceAttributeMaxSharedMemoryPerMultiprocessor:
|
||||
cdattr = cudaDevAttrMaxSharedMemoryPerMultiprocessor; break;
|
||||
default:
|
||||
cerror = cudaErrorInvalidValue; break;
|
||||
}
|
||||
|
||||
@@ -342,6 +342,19 @@ hipError_t ihipDevice_t::getProperties(hipDeviceProp_t* prop)
|
||||
Default compute mode (Multiple threads can use cudaSetDevice() with this device) */
|
||||
prop->computeMode = 0;
|
||||
|
||||
// Get Max Threads Per Multiprocessor
|
||||
/*
|
||||
HsaSystemProperties props;
|
||||
hsaKmtReleaseSystemProperties();
|
||||
if(HSAKMT_STATUS_SUCCESS == hsaKmtAcquireSystemProperties(&props)) {
|
||||
HsaNodeProperties node_prop = {0};
|
||||
if(HSAKMT_STATUS_SUCCESS == hsaKmtGetNodeProperties(node, &node_prop)) {
|
||||
uint32_t waves_per_cu = node_prop.MaxWavesPerSIMD;
|
||||
prop-> maxThreadsPerMultiProcessor = prop->warpsize*waves_per_cu;
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
// Get memory properties
|
||||
|
||||
err = hsa_agent_iterate_regions(_hsa_agent,get_region_info,prop);
|
||||
@@ -351,9 +364,9 @@ hipError_t ihipDevice_t::getProperties(hipDeviceProp_t* prop)
|
||||
hsa_region_t *am_region = static_cast<hsa_region_t*> (_acc.get_hsa_am_region());
|
||||
err = hsa_region_get_info(*am_region, HSA_REGION_INFO_SIZE, &(prop->totalGlobalMem));
|
||||
DeviceErrorCheck(err);
|
||||
// maxThreadsPerMultiProcessor should be as the same as group memory size.
|
||||
// maxSharedMemoryPerMultiProcessor should be as the same as group memory size.
|
||||
// Group memory will not be paged out, so, the physical memory size is the total shared memory size, and also equal to the group region size.
|
||||
prop->maxThreadsPerMultiProcessor = prop->totalGlobalMem;
|
||||
prop->maxSharedMemoryPerMultiProcessor = prop->totalGlobalMem;
|
||||
|
||||
// Set feature flags - these are all mandatory for HIP on HCC path:
|
||||
// Some features are under-development and future revs may support flags that are currently 0.
|
||||
@@ -841,6 +854,8 @@ hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device)
|
||||
*pi = prop->pciBusID; break;
|
||||
case hipDeviceAttributePciDeviceId:
|
||||
*pi = prop->pciDeviceID; break;
|
||||
case hipDeviceAttributeMaxSharedMemoryPerMultiprocessor:
|
||||
*pi = prop->maxSharedMemoryPerMultiProcessor; break;
|
||||
default:
|
||||
e = hipErrorInvalidValue; break;
|
||||
}
|
||||
|
||||
@@ -75,6 +75,7 @@ int main(int argc, char *argv[])
|
||||
CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeComputeCapabilityMinor, props.minor));
|
||||
CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributePciBusId, props.pciBusID));
|
||||
CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributePciDeviceId, props.pciDeviceID));
|
||||
CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, props.maxSharedMemoryPerMultiProcessor));
|
||||
passed();
|
||||
|
||||
};
|
||||
|
||||
Viittaa uudesa ongelmassa
Block a user