Match Occupancy APIs syntax with CUDA (#1625)

* Match Occupancy APIs syntax with CUDA and fix tests using these APIs


[ROCm/hip commit: fa98798b63]
Этот коммит содержится в:
satyanveshd
2020-01-30 02:35:53 +05:30
коммит произвёл GitHub
родитель 1af35a6044
Коммит e38db9fb6f
7 изменённых файлов: 35 добавлений и 60 удалений
-14
Просмотреть файл
@@ -154,20 +154,6 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block
dynSharedMemPerBlk, blockSizeLimit);
}
template <typename F>
inline
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(uint32_t* numBlocks, F kernel,
uint32_t blockSize, size_t dynSharedMemPerBlk) {
using namespace hip_impl;
hip_impl::hip_init();
auto f = get_program_state().kernel_descriptor(reinterpret_cast<std::uintptr_t>(kernel),
target_agent(0));
return hipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, f, blockSize, dynSharedMemPerBlk);
}
template <typename... Args, typename F = void (*)(Args...)>
inline
void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
+16 -17
Просмотреть файл
@@ -2948,7 +2948,7 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block
* @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
*/
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk);
int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk);
/**
* @brief Returns occupancy for a device function.
@@ -2960,7 +2960,7 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
* @param [in] flags Extra flags for occupancy calculation (currently ignored)
*/
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk, unsigned int flags);
int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags);
#if __HIP_VDI__ && !defined(__HCC__)
/**
@@ -3230,21 +3230,6 @@ hipError_t hipLaunchKernel(const void* function_address,
} /* extern "c" */
#endif
#if defined(__cplusplus) && !defined(__HCC__) && defined(__clang__) && defined(__HIP__)
template <typename F>
static hipError_t __host__ inline hipOccupancyMaxActiveBlocksPerMultiprocessor(
uint32_t* numBlocks, F func, uint32_t blockSize, size_t dynSharedMemPerBlk) {
return ::hipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, (hipFunction_t)func, blockSize,
dynSharedMemPerBlk);
}
template <typename F>
static hipError_t __host__ inline hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
uint32_t* numBlocks, F func, uint32_t blockSize, size_t dynSharedMemPerBlk, unsigned int flags) {
return ::hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
numBlocks, (hipFunction_t)func, blockSize, dynSharedMemPerBlk, flags);
}
#endif // defined(__cplusplus) && !defined(__HCC__) && defined(__clang__) && defined(__HIP__)
#if USE_PROF_API
#include <hip/hcc_detail/hip_prof_str.h>
#endif
@@ -3385,6 +3370,20 @@ hipError_t hipBindTextureToMipmappedArray(const texture<T, dim, readMode>& tex,
return hipSuccess;
}
template <class T>
inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
int* numBlocks, T f, int blockSize, size_t dynSharedMemPerBlk) {
return hipOccupancyMaxActiveBlocksPerMultiprocessor(
numBlocks, reinterpret_cast<const void*>(f), blockSize, dynSharedMemPerBlk);
}
template <class T>
inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
int* numBlocks, T f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags) {
return hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
numBlocks, reinterpret_cast<const void*>(f), blockSize, dynSharedMemPerBlk, flags);
}
#if __HIP_VDI__ && !defined(__HCC__)
template <class T>
inline hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim,
+2 -2
Просмотреть файл
@@ -86,8 +86,8 @@ void launchKernel(float* C, float* A, float* B, bool manual){
printf("kernel Execution time = %6.3fms\n", eventMs);
//Calculate Occupancy
uint32_t numBlock = 0;
HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, multiply, blockSize, 0));
int numBlock = 0;
HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, multiply,(int)blockSize, 0));
if(devProp.maxThreadsPerMultiProcessor){
std::cout << "Theoretical Occupancy is " << (double)numBlock* blockSize/devProp.maxThreadsPerMultiProcessor * 100 << "%" << std::endl;
+13 -7
Просмотреть файл
@@ -1368,7 +1368,7 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block
}
hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
TlsData *tls, uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
TlsData *tls, int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk)
{
using namespace hip_impl;
@@ -1408,35 +1408,41 @@ hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
: std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS));
// Calculate blocks occupancy per CU based on SGPR usage
*numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts));
*numBlocks = std::min(*numBlocks, (int) (sgprs_alu_occupancy / numWavefronts));
size_t total_used_lds = usedLDS + dynSharedMemPerBlk;
if (total_used_lds != 0) {
// Calculate LDS occupacy per CU. lds_per_cu / (static_lsd + dynamic_lds)
size_t lds_occupancy = prop.maxSharedMemoryPerMultiProcessor / total_used_lds;
*numBlocks = std::min(*numBlocks, (uint32_t) lds_occupancy);
*numBlocks = std::min(*numBlocks, (int) lds_occupancy);
}
return hipSuccess;
}
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk)
int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk)
{
HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessor, numBlocks, f, blockSize, dynSharedMemPerBlk);
auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f),
hip_impl::target_agent(0));
return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor(
tls, numBlocks, f, blockSize, dynSharedMemPerBlk));
tls, numBlocks, F, blockSize, dynSharedMemPerBlk));
}
hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk,
int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk,
unsigned int flags)
{
HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, numBlocks, f, blockSize, dynSharedMemPerBlk, flags);
auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f),
hip_impl::target_agent(0));
return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor(
tls, numBlocks, f, blockSize, dynSharedMemPerBlk));
tls, numBlocks, F, blockSize, dynSharedMemPerBlk));
}
hipError_t hipLaunchKernel(
+1 -1
Просмотреть файл
@@ -116,7 +116,7 @@ int main() {
dimBlock.x = workgroups[i];
// Calculate the device occupancy to know how many blocks can be run concurrently
hipOccupancyMaxActiveBlocksPerMultiprocessor(reinterpret_cast<uint32_t*>(&numBlocks),
hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks,
test_gws, dimBlock.x * dimBlock.y * dimBlock.z, dimBlock.x * sizeof(long));
dimGrid.x = deviceProp.multiProcessorCount * std::min(numBlocks, 32);
@@ -30,10 +30,6 @@ THE SOFTWARE.
#include "hip/hip_runtime.h"
#include "test_common.h"
#define fileName "vcpy_kernel.code"
#define kernel_name "hello_world"
__global__ void f1(float *a) { *a = 1.0; }
template <typename T>
@@ -49,11 +45,10 @@ int main(int argc, char* argv[]) {
hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, f1, 0, 0);
assert(gridSize != 0 && blockSize != 0);
uint32_t numBlock = 0;
hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, f1, blockSize, 0);
int numBlock = 0;
hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, f1, (int)blockSize, 0);
assert(numBlock != 0);
// test case for using kernel function pointer with template
gridSize = 0;
blockSize = 0;
@@ -61,17 +56,7 @@ int main(int argc, char* argv[]) {
assert(gridSize != 0 && blockSize != 0);
numBlock = 0;
hipOccupancyMaxActiveBlocksPerMultiprocessor<void(*)(int *)>(&numBlock, f2, blockSize, 0);
assert(numBlock != 0);
// test case for using kernel with hipFunction_t type
numBlock = 0;
hipModule_t Module;
hipFunction_t Function;
HIPCHECK(hipModuleLoad(&Module, fileName));
HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name));
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, Function, blockSize, 0));
hipOccupancyMaxActiveBlocksPerMultiprocessor<void(*)(int *)>(&numBlock, f2, (int)blockSize, 0);
assert(numBlock != 0);
passed();
@@ -33,7 +33,6 @@ THE SOFTWARE.
#define fileName "vcpy_kernel.code"
#define kernel_name "hello_world"
__global__ void f1(float *a) { *a = 1.0; }
template <typename T>