Match Occupancy APIs syntax with CUDA (#1625)
* Match Occupancy APIs syntax with CUDA and fix tests using these APIs
[ROCm/hip commit: fa98798b63]
Этот коммит содержится в:
@@ -154,20 +154,6 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block
|
||||
dynSharedMemPerBlk, blockSizeLimit);
|
||||
}
|
||||
|
||||
template <typename F>
|
||||
inline
|
||||
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(uint32_t* numBlocks, F kernel,
|
||||
uint32_t blockSize, size_t dynSharedMemPerBlk) {
|
||||
|
||||
using namespace hip_impl;
|
||||
|
||||
hip_impl::hip_init();
|
||||
auto f = get_program_state().kernel_descriptor(reinterpret_cast<std::uintptr_t>(kernel),
|
||||
target_agent(0));
|
||||
|
||||
return hipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, f, blockSize, dynSharedMemPerBlk);
|
||||
}
|
||||
|
||||
template <typename... Args, typename F = void (*)(Args...)>
|
||||
inline
|
||||
void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
|
||||
|
||||
@@ -2948,7 +2948,7 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block
|
||||
* @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
|
||||
*/
|
||||
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk);
|
||||
int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk);
|
||||
|
||||
/**
|
||||
* @brief Returns occupancy for a device function.
|
||||
@@ -2960,7 +2960,7 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
* @param [in] flags Extra flags for occupancy calculation (currently ignored)
|
||||
*/
|
||||
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
|
||||
uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk, unsigned int flags);
|
||||
int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags);
|
||||
|
||||
#if __HIP_VDI__ && !defined(__HCC__)
|
||||
/**
|
||||
@@ -3230,21 +3230,6 @@ hipError_t hipLaunchKernel(const void* function_address,
|
||||
} /* extern "c" */
|
||||
#endif
|
||||
|
||||
#if defined(__cplusplus) && !defined(__HCC__) && defined(__clang__) && defined(__HIP__)
|
||||
template <typename F>
|
||||
static hipError_t __host__ inline hipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
uint32_t* numBlocks, F func, uint32_t blockSize, size_t dynSharedMemPerBlk) {
|
||||
return ::hipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, (hipFunction_t)func, blockSize,
|
||||
dynSharedMemPerBlk);
|
||||
}
|
||||
template <typename F>
|
||||
static hipError_t __host__ inline hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
|
||||
uint32_t* numBlocks, F func, uint32_t blockSize, size_t dynSharedMemPerBlk, unsigned int flags) {
|
||||
return ::hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
|
||||
numBlocks, (hipFunction_t)func, blockSize, dynSharedMemPerBlk, flags);
|
||||
}
|
||||
#endif // defined(__cplusplus) && !defined(__HCC__) && defined(__clang__) && defined(__HIP__)
|
||||
|
||||
#if USE_PROF_API
|
||||
#include <hip/hcc_detail/hip_prof_str.h>
|
||||
#endif
|
||||
@@ -3385,6 +3370,20 @@ hipError_t hipBindTextureToMipmappedArray(const texture<T, dim, readMode>& tex,
|
||||
return hipSuccess;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
int* numBlocks, T f, int blockSize, size_t dynSharedMemPerBlk) {
|
||||
return hipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
numBlocks, reinterpret_cast<const void*>(f), blockSize, dynSharedMemPerBlk);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
|
||||
int* numBlocks, T f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags) {
|
||||
return hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
|
||||
numBlocks, reinterpret_cast<const void*>(f), blockSize, dynSharedMemPerBlk, flags);
|
||||
}
|
||||
|
||||
#if __HIP_VDI__ && !defined(__HCC__)
|
||||
template <class T>
|
||||
inline hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim,
|
||||
|
||||
@@ -86,8 +86,8 @@ void launchKernel(float* C, float* A, float* B, bool manual){
|
||||
printf("kernel Execution time = %6.3fms\n", eventMs);
|
||||
|
||||
//Calculate Occupancy
|
||||
uint32_t numBlock = 0;
|
||||
HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, multiply, blockSize, 0));
|
||||
int numBlock = 0;
|
||||
HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, multiply,(int)blockSize, 0));
|
||||
|
||||
if(devProp.maxThreadsPerMultiProcessor){
|
||||
std::cout << "Theoretical Occupancy is " << (double)numBlock* blockSize/devProp.maxThreadsPerMultiProcessor * 100 << "%" << std::endl;
|
||||
|
||||
@@ -1368,7 +1368,7 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block
|
||||
}
|
||||
|
||||
hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
TlsData *tls, uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
|
||||
TlsData *tls, int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk)
|
||||
{
|
||||
using namespace hip_impl;
|
||||
|
||||
@@ -1408,35 +1408,41 @@ hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
: std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS));
|
||||
|
||||
// Calculate blocks occupancy per CU based on SGPR usage
|
||||
*numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts));
|
||||
*numBlocks = std::min(*numBlocks, (int) (sgprs_alu_occupancy / numWavefronts));
|
||||
|
||||
size_t total_used_lds = usedLDS + dynSharedMemPerBlk;
|
||||
if (total_used_lds != 0) {
|
||||
// Calculate LDS occupacy per CU. lds_per_cu / (static_lsd + dynamic_lds)
|
||||
size_t lds_occupancy = prop.maxSharedMemoryPerMultiProcessor / total_used_lds;
|
||||
*numBlocks = std::min(*numBlocks, (uint32_t) lds_occupancy);
|
||||
*numBlocks = std::min(*numBlocks, (int) lds_occupancy);
|
||||
}
|
||||
|
||||
return hipSuccess;
|
||||
}
|
||||
|
||||
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
|
||||
int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk)
|
||||
{
|
||||
HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessor, numBlocks, f, blockSize, dynSharedMemPerBlk);
|
||||
|
||||
auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f),
|
||||
hip_impl::target_agent(0));
|
||||
|
||||
return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
tls, numBlocks, f, blockSize, dynSharedMemPerBlk));
|
||||
tls, numBlocks, F, blockSize, dynSharedMemPerBlk));
|
||||
}
|
||||
|
||||
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
|
||||
uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk,
|
||||
int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk,
|
||||
unsigned int flags)
|
||||
{
|
||||
HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, numBlocks, f, blockSize, dynSharedMemPerBlk, flags);
|
||||
|
||||
auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f),
|
||||
hip_impl::target_agent(0));
|
||||
|
||||
return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
tls, numBlocks, f, blockSize, dynSharedMemPerBlk));
|
||||
tls, numBlocks, F, blockSize, dynSharedMemPerBlk));
|
||||
}
|
||||
|
||||
hipError_t hipLaunchKernel(
|
||||
|
||||
@@ -116,7 +116,7 @@ int main() {
|
||||
|
||||
dimBlock.x = workgroups[i];
|
||||
// Calculate the device occupancy to know how many blocks can be run concurrently
|
||||
hipOccupancyMaxActiveBlocksPerMultiprocessor(reinterpret_cast<uint32_t*>(&numBlocks),
|
||||
hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks,
|
||||
test_gws, dimBlock.x * dimBlock.y * dimBlock.z, dimBlock.x * sizeof(long));
|
||||
|
||||
dimGrid.x = deviceProp.multiProcessorCount * std::min(numBlocks, 32);
|
||||
|
||||
+3
-18
@@ -30,10 +30,6 @@ THE SOFTWARE.
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "test_common.h"
|
||||
|
||||
#define fileName "vcpy_kernel.code"
|
||||
#define kernel_name "hello_world"
|
||||
|
||||
|
||||
__global__ void f1(float *a) { *a = 1.0; }
|
||||
|
||||
template <typename T>
|
||||
@@ -49,11 +45,10 @@ int main(int argc, char* argv[]) {
|
||||
hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, f1, 0, 0);
|
||||
assert(gridSize != 0 && blockSize != 0);
|
||||
|
||||
uint32_t numBlock = 0;
|
||||
hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, f1, blockSize, 0);
|
||||
int numBlock = 0;
|
||||
hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, f1, (int)blockSize, 0);
|
||||
assert(numBlock != 0);
|
||||
|
||||
|
||||
// test case for using kernel function pointer with template
|
||||
gridSize = 0;
|
||||
blockSize = 0;
|
||||
@@ -61,17 +56,7 @@ int main(int argc, char* argv[]) {
|
||||
assert(gridSize != 0 && blockSize != 0);
|
||||
|
||||
numBlock = 0;
|
||||
hipOccupancyMaxActiveBlocksPerMultiprocessor<void(*)(int *)>(&numBlock, f2, blockSize, 0);
|
||||
assert(numBlock != 0);
|
||||
|
||||
|
||||
// test case for using kernel with hipFunction_t type
|
||||
numBlock = 0;
|
||||
hipModule_t Module;
|
||||
hipFunction_t Function;
|
||||
HIPCHECK(hipModuleLoad(&Module, fileName));
|
||||
HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name));
|
||||
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, Function, blockSize, 0));
|
||||
hipOccupancyMaxActiveBlocksPerMultiprocessor<void(*)(int *)>(&numBlock, f2, (int)blockSize, 0);
|
||||
assert(numBlock != 0);
|
||||
|
||||
passed();
|
||||
@@ -33,7 +33,6 @@ THE SOFTWARE.
|
||||
#define fileName "vcpy_kernel.code"
|
||||
#define kernel_name "hello_world"
|
||||
|
||||
|
||||
__global__ void f1(float *a) { *a = 1.0; }
|
||||
|
||||
template <typename T>
|
||||
Ссылка в новой задаче
Block a user