From ec4f4a643d32df694ecb7ea85a5a536fb995da8c Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 22 Dec 2016 12:23:58 -0600 Subject: [PATCH] Support size_t in memset kernel. Add disable for HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU Remove one copy of completion_future in memset. [ROCm/hip commit: c325c988b11e65c031734b4ae8206ed666abb8bc] --- projects/hip/src/hip_hcc.cpp | 8 ++++++++ projects/hip/src/hip_memory.cpp | 20 ++++++++++---------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 14cfd61982..a4ef2b392b 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -52,6 +52,10 @@ THE SOFTWARE. #define USE_COPY_EXT_V2 1 #endif +#ifndef USE_ROCR_1_4 +#define USE_ROCR_1_4 1 +#endif + //================================================================================================= //Global variables: //================================================================================================= @@ -733,7 +737,11 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop) // Get Max Threads Per Multiprocessor uint32_t max_waves_per_cu; +#if USE_ROCR_1_4 err = hsa_agent_get_info(_hsaAgent,(hsa_agent_info_t) HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, &max_waves_per_cu); +#else + max_waves_per_cu = 10; +#endif DeviceErrorCheck(err); prop-> maxThreadsPerMultiProcessor = prop->warpSize*max_waves_per_cu; diff --git a/projects/hip/src/hip_memory.cpp b/projects/hip/src/hip_memory.cpp index 7aa6ef4942..7e1a1738a6 100644 --- a/projects/hip/src/hip_memory.cpp +++ b/projects/hip/src/hip_memory.cpp @@ -765,10 +765,11 @@ hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset, // TODO - make member function of stream? template -hc::completion_future +void ihipMemsetKernel(hipStream_t stream, LockedAccessor_StreamCrit_t &crit, - T * ptr, T val, size_t sizeBytes) + T * ptr, T val, size_t sizeBytes, + hc::completion_future *cf) { int wg = std::min((unsigned)8, stream->getDevice()->_computeUnits); const int threads_per_wg = 256; @@ -782,7 +783,7 @@ ihipMemsetKernel(hipStream_t stream, hc::extent<1> ext(threads); auto ext_tile = ext.tile(threads_per_wg); - hc::completion_future cf = + *cf = hc::parallel_for_each( crit->_av, ext_tile, @@ -798,7 +799,6 @@ ihipMemsetKernel(hipStream_t stream, } }); - return cf; } // TODO-sync: function is async unless target is pinned host memory - then these are fully sync. @@ -819,8 +819,8 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s // use a faster dword-per-workitem copy: try { value = value & 0xff; - unsigned value32 = (value << 24) | (value << 16) | (value << 8) | (value) ; - cf = ihipMemsetKernel (stream, crit, static_cast (dst), value32, sizeBytes/sizeof(unsigned)); + uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ; + ihipMemsetKernel (stream, crit, static_cast (dst), value32, sizeBytes/sizeof(uint32_t), &cf); } catch (std::exception &ex) { e = hipErrorInvalidValue; @@ -828,7 +828,7 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s } else { // use a slow byte-per-workitem copy: try { - cf = ihipMemsetKernel (stream, crit, static_cast (dst), value, sizeBytes); + ihipMemsetKernel (stream, crit, static_cast (dst), value, sizeBytes, &cf); } catch (std::exception &ex) { e = hipErrorInvalidValue; @@ -870,8 +870,8 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes ) // use a faster dword-per-workitem copy: try { value = value & 0xff; - unsigned value32 = (value << 24) | (value << 16) | (value << 8) | (value) ; - cf = ihipMemsetKernel (stream, crit, static_cast (dst), value32, sizeBytes/sizeof(unsigned)); + uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ; + ihipMemsetKernel (stream, crit, static_cast (dst), value32, sizeBytes/sizeof(uint32_t), &cf); } catch (std::exception &ex) { e = hipErrorInvalidValue; @@ -879,7 +879,7 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes ) } else { // use a slow byte-per-workitem copy: try { - cf = ihipMemsetKernel (stream, crit, static_cast (dst), value, sizeBytes); + ihipMemsetKernel (stream, crit, static_cast (dst), value, sizeBytes, &cf); } catch (std::exception &ex) { e = hipErrorInvalidValue;