From 797d2761d4672f3a3a79952c180e81afadcba123 Mon Sep 17 00:00:00 2001 From: jujiang Date: Mon, 13 Jul 2020 15:57:15 -0400 Subject: [PATCH 01/41] SWDEV-242337-Fix hipHostRegister test failure for MGPU Change-Id: I748dac0c5a47fa00184f065d9ef716573aa70b12 --- hipamd/rocclr/hip_memory.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/hipamd/rocclr/hip_memory.cpp b/hipamd/rocclr/hip_memory.cpp index 553b59a77c..a7b23bcc4c 100755 --- a/hipamd/rocclr/hip_memory.cpp +++ b/hipamd/rocclr/hip_memory.cpp @@ -725,9 +725,14 @@ hipError_t hipHostUnregister(void* hostPtr) { amd::Memory* mem = getMemoryObject(hostPtr, offset); if(mem) { - for (const auto& device: hip::getCurrentDevice()->devices()) { - const device::Memory* devMem = mem->getDeviceMemory(*device); - amd::MemObjMap::RemoveMemObj(reinterpret_cast(devMem->virtualAddress())); + for (const auto& device: g_devices) { + const device::Memory* devMem = mem->getDeviceMemory(*device->devices()[0]); + if (devMem != nullptr) { + void* vAddr = reinterpret_cast(devMem->virtualAddress()); + if (amd::MemObjMap::FindMemObj(vAddr)) { + amd::MemObjMap::RemoveMemObj(vAddr); + } + } } amd::MemObjMap::RemoveMemObj(hostPtr); mem->release(); From e27d53043cd76734ca570a642a8f7fb841c0541f Mon Sep 17 00:00:00 2001 From: Aryan Salmanpour Date: Fri, 10 Jul 2020 12:07:50 -0400 Subject: [PATCH 02/41] Add missing stream null check for some hipStreamCreate APIs Change-Id: I716d71e4ec59b0bd7922869bfa0ed908c22c289e --- hipamd/rocclr/hip_stream.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/hipamd/rocclr/hip_stream.cpp b/hipamd/rocclr/hip_stream.cpp index 8d3d3e4340..9da673f023 100755 --- a/hipamd/rocclr/hip_stream.cpp +++ b/hipamd/rocclr/hip_stream.cpp @@ -202,6 +202,10 @@ static hipError_t ihipStreamCreate(hipStream_t* stream, hipError_t hipStreamCreateWithFlags(hipStream_t *stream, unsigned int flags) { HIP_INIT_API(hipStreamCreateWithFlags, stream, flags); + if (stream == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + HIP_RETURN(ihipStreamCreate(stream, flags, hip::Stream::Priority::Normal), *stream); } @@ -209,6 +213,10 @@ hipError_t hipStreamCreateWithFlags(hipStream_t *stream, unsigned int flags) { hipError_t hipStreamCreate(hipStream_t *stream) { HIP_INIT_API(hipStreamCreate, stream); + if (stream == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + HIP_RETURN(ihipStreamCreate(stream, hipStreamDefault, hip::Stream::Priority::Normal), *stream); } @@ -216,6 +224,10 @@ hipError_t hipStreamCreate(hipStream_t *stream) { hipError_t hipStreamCreateWithPriority(hipStream_t* stream, unsigned int flags, int priority) { HIP_INIT_API(hipStreamCreateWithPriority, stream, flags, priority); + if (stream == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + hip::Stream::Priority streamPriority; if (priority <= hip::Stream::Priority::High) { streamPriority = hip::Stream::Priority::High; From 19ede4fd23af10e2e79611637b38aca0aa6d6fc1 Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Fri, 17 Jul 2020 19:55:58 +0000 Subject: [PATCH 03/41] Handle size 0 symbols Change-Id: Ie282b37cf6bbccae4611dfc17c5c7f31977feea7 --- hipamd/rocclr/hip_global.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/hipamd/rocclr/hip_global.cpp b/hipamd/rocclr/hip_global.cpp index 2a395fee96..680d16b03f 100755 --- a/hipamd/rocclr/hip_global.cpp +++ b/hipamd/rocclr/hip_global.cpp @@ -23,16 +23,18 @@ DeviceVar::DeviceVar(std::string name, hipModule_t hmod) : shadowVptr(nullptr), guarantee(false); } - if (amd_mem_obj_ == nullptr || device_ptr_ == nullptr) { - DevLogPrintfError("Cannot get memory for creating device Var: %s", name.c_str()); - guarantee(false); + // Handle size 0 symbols + if (size_ != 0) { + if (amd_mem_obj_ == nullptr || device_ptr_ == nullptr) { + DevLogPrintfError("Cannot get memory for creating device Var: %s", name.c_str()); + guarantee(false); + } + amd::MemObjMap::AddMemObj(device_ptr_, amd_mem_obj_); } - - amd::MemObjMap::AddMemObj(device_ptr_, amd_mem_obj_); } DeviceVar::~DeviceVar() { - if (device_ptr_ != nullptr) { + if (amd_mem_obj_ != nullptr) { amd::MemObjMap::RemoveMemObj(device_ptr_); amd_mem_obj_->release(); } From 603546cd45f0e34ac43994f19a9bcd79beace71e Mon Sep 17 00:00:00 2001 From: Saleel Kudchadker Date: Mon, 20 Jul 2020 00:22:27 -0700 Subject: [PATCH 04/41] Print devPtr and size for hip*Symbol* api Change-Id: I07be6949cca005700ee1d1708d86220c793c9728 --- hipamd/rocclr/hip_platform.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hipamd/rocclr/hip_platform.cpp b/hipamd/rocclr/hip_platform.cpp index 4d8e28dce7..e41a8af26a 100755 --- a/hipamd/rocclr/hip_platform.cpp +++ b/hipamd/rocclr/hip_platform.cpp @@ -271,7 +271,7 @@ hipError_t hipGetSymbolAddress(void** devPtr, const void* symbol) { HIP_RETURN_ONFAIL(PlatformState::instance().getStatGlobalVar(symbol, ihipGetDevice(), devPtr, &sym_size)); - HIP_RETURN(hipSuccess); + HIP_RETURN(hipSuccess, *devPtr); } hipError_t hipGetSymbolSize(size_t* sizePtr, const void* symbol) { @@ -280,7 +280,7 @@ hipError_t hipGetSymbolSize(size_t* sizePtr, const void* symbol) { hipDeviceptr_t device_ptr = nullptr; HIP_RETURN_ONFAIL(PlatformState::instance().getStatGlobalVar(symbol, ihipGetDevice(), &device_ptr, sizePtr)); - HIP_RETURN(hipSuccess); + HIP_RETURN(hipSuccess, *sizePtr); } hipError_t ihipCreateGlobalVarObj(const char* name, hipModule_t hmod, amd::Memory** amd_mem_obj, From 6f5e63ebacfd14e0b8565d85eba60de02e973dee Mon Sep 17 00:00:00 2001 From: Satyanvesh Dittakavi Date: Tue, 21 Jul 2020 09:49:31 -0400 Subject: [PATCH 05/41] SWDEV-244772 - Fix dtest hipMemcpyWithStreamMultiThread.cpp. hipSetDevice is not used correctly to allocate on multiple devices in mGPU setup. Due to which hipMalloc was called on the same device on multiple threads leading to out of memory issues on some devices with lesser memory. Change-Id: I0e5b1bc028b9ecb11bd40c3a5edf715f8bd721ff --- .../src/runtimeApi/memory/hipMemcpyWithStreamMultiThread.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hipamd/tests/src/runtimeApi/memory/hipMemcpyWithStreamMultiThread.cpp b/hipamd/tests/src/runtimeApi/memory/hipMemcpyWithStreamMultiThread.cpp index aaaf8332bc..b3d613c45f 100644 --- a/hipamd/tests/src/runtimeApi/memory/hipMemcpyWithStreamMultiThread.cpp +++ b/hipamd/tests/src/runtimeApi/memory/hipMemcpyWithStreamMultiThread.cpp @@ -467,6 +467,7 @@ void HipMemcpyWithStreamMultiThreadtests::TestkindDefaultForDtoD(void) { &A_h[0], &B_h[0], &C_h[0], N, false); for (int i=1; i < numDevices; ++i) { + HIPCHECK(hipSetDevice(i)); HIPCHECK(hipMalloc(&A_d[i], Nbytes)); HIPCHECK(hipMalloc(&B_d[i], Nbytes)); HIPCHECK(hipMalloc(&C_d[i], Nbytes)); @@ -476,6 +477,7 @@ void HipMemcpyWithStreamMultiThreadtests::TestkindDefaultForDtoD(void) { hipStream_t stream[numDevices]; for (int i=0; i < numDevices; ++i) { + HIPCHECK(hipSetDevice(i)); HIPCHECK(hipStreamCreate(&stream[i])); } From eba78710dcf009f95a4abf01ea78a72913a4a9b2 Mon Sep 17 00:00:00 2001 From: Tao Sang Date: Sun, 19 Jul 2020 14:55:10 -0400 Subject: [PATCH 06/41] Support numa if libnume-dev is installed Let rocclr decide numa dependence Change-Id: I65bdfba7ec0d06b550f86632318bcfd1f765cfa9 --- hipamd/bin/hipcc | 2 +- hipamd/rocclr/CMakeLists.txt | 4 ++-- hipamd/tests/performance/memory/hipHostNumaAlloc.cpp | 9 +++++---- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/hipamd/bin/hipcc b/hipamd/bin/hipcc index b3cae544a4..ddf67242cf 100755 --- a/hipamd/bin/hipcc +++ b/hipamd/bin/hipcc @@ -839,7 +839,7 @@ if ($HIP_PLATFORM eq "hcc" and $HIP_COMPILER eq "clang") { if ($linkType eq 0) { $toolArgs .= " -L$HIP_LIB_PATH -lamdhip64 -L$ROCM_PATH/lib -lhsa-runtime64 -ldl -lnuma "; } else { - $toolArgs .= " -Wl,--enable-new-dtags -Wl,--rpath=$HIP_LIB_PATH:$ROCM_PATH/lib -lhip_hcc -lnuma "; + $toolArgs .= " -Wl,--enable-new-dtags -Wl,--rpath=$HIP_LIB_PATH:$ROCM_PATH/lib -lhip_hcc "; } # To support __fp16 and _Float16, explicitly link with compiler-rt $toolArgs .= " -L$HIP_CLANG_PATH/../lib/clang/$HIP_CLANG_VERSION/lib/linux -lclang_rt.builtins-x86_64 " diff --git a/hipamd/rocclr/CMakeLists.txt b/hipamd/rocclr/CMakeLists.txt index 8cb79a5496..ab62108c3f 100755 --- a/hipamd/rocclr/CMakeLists.txt +++ b/hipamd/rocclr/CMakeLists.txt @@ -208,7 +208,7 @@ target_link_libraries(device INTERFACE host) if(${BUILD_SHARED_LIBS}) - target_link_libraries(amdhip64 PRIVATE amdrocclr_static Threads::Threads dl numa hsa-runtime64::hsa-runtime64) + target_link_libraries(amdhip64 PRIVATE amdrocclr_static Threads::Threads dl hsa-runtime64::hsa-runtime64) INSTALL(PROGRAMS $ DESTINATION lib COMPONENT MAIN) INSTALL(CODE "execute_process( COMMAND ${CMAKE_COMMAND} -E create_symlink libamdhip64.so lib/libhip_hcc.so )" DESTINATION lib COMPONENT MAIN) @@ -219,7 +219,7 @@ if(${BUILD_SHARED_LIBS}) else() - target_link_libraries(amdhip64 PRIVATE Threads::Threads dl numa hsa-runtime64::hsa-runtime64 amd_comgr) + target_link_libraries(amdhip64 PRIVATE Threads::Threads dl hsa-runtime64::hsa-runtime64 amd_comgr) # combine objects of vid and hip into amdhip64_static add_custom_target( amdhip64_static_combiner diff --git a/hipamd/tests/performance/memory/hipHostNumaAlloc.cpp b/hipamd/tests/performance/memory/hipHostNumaAlloc.cpp index 75d6edf0cf..38401c8046 100644 --- a/hipamd/tests/performance/memory/hipHostNumaAlloc.cpp +++ b/hipamd/tests/performance/memory/hipHostNumaAlloc.cpp @@ -34,12 +34,13 @@ THE SOFTWARE. #include #include "hip/hip_runtime.h" /* HIT_START - * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc + * BUILD_CMD: hipHostNumaAlloc %hc -I%S/../../src %S/%s %S/../../src/test_common.cpp -lnuma -o %T/%t EXCLUDE_HIP_PLATFORM nvcc * TEST: %t * HIT_END */ -// To run it correctly, we must not export HIP_VISIBLE_DEVICES +// To run it correctly, we must not export HIP_VISIBLE_DEVICES. +// And we must explicitly link libnuma because of numa api move_pages(). #define NUM_PAGES 4 char *h = nullptr; char *d_h = nullptr; @@ -127,6 +128,7 @@ bool test(int cpuId, int gpuId, int numaMode, unsigned int hostMallocflags) { printf("\n"); HIPCHECK(hipHostFree((void* )h)); + hipHostUnregister(m); free(m); if (cpuId >= 0 && (numaMode == MPOL_BIND || numaMode == MPOL_PREFERRED)) { @@ -149,8 +151,7 @@ bool runTest(const int &cpuCount, const int &gpuCount, for (int i = 0; i < cpuCount; i++) { for (int j = 0; j < gpuCount; j++) { - if (!test(i, j, mode[m], - hipHostMallocDefault | hipHostMallocNumaUser)) { + if (!test(i, j, mode[m], hostMallocflags)) { return false; } } From 274fded47dc55a19ec91bf7420239c7fb5a78639 Mon Sep 17 00:00:00 2001 From: Payam Date: Mon, 20 Jul 2020 22:05:04 -0400 Subject: [PATCH 07/41] modify cmake to strip debug symbols from rel blds Change-Id: Ia67261fc3c4a20f3f1bbe439a70384a3f6aefa91 --- hipamd/rocclr/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/hipamd/rocclr/CMakeLists.txt b/hipamd/rocclr/CMakeLists.txt index ab62108c3f..2687043b42 100755 --- a/hipamd/rocclr/CMakeLists.txt +++ b/hipamd/rocclr/CMakeLists.txt @@ -194,6 +194,7 @@ else() endif() +set_target_properties(amdhip64 PROPERTIES LINK_FLAGS_RELEASE -s) set_target_properties(amdhip64 PROPERTIES PUBLIC_HEADER ${PROF_API_STR}) add_library(host INTERFACE) target_link_libraries(host INTERFACE hip::amdhip64) From 311fc8c513f0274a7e6f93bfd94a7382cb177956 Mon Sep 17 00:00:00 2001 From: kjayapra-amd Date: Fri, 17 Jul 2020 17:29:32 -0400 Subject: [PATCH 08/41] SWDEV-240800 - Adding initial support for hipDeviceGetP2PAttribute. Some attr support still pending. Change-Id: I0611aed136270db497dfa374144f6f5e35352a8f --- .../include/hip/hcc_detail/hip_runtime_api.h | 20 +++++++++ .../include/hip/nvcc_detail/hip_runtime_api.h | 6 +++ hipamd/rocclr/hip_device_runtime.cpp | 7 --- hipamd/rocclr/hip_hcc.def.in | 1 + hipamd/rocclr/hip_hcc.map.in | 1 + hipamd/rocclr/hip_peer.cpp | 45 +++++++++++++++++-- 6 files changed, 69 insertions(+), 11 deletions(-) mode change 100644 => 100755 hipamd/include/hip/hcc_detail/hip_runtime_api.h mode change 100644 => 100755 hipamd/include/hip/nvcc_detail/hip_runtime_api.h mode change 100644 => 100755 hipamd/rocclr/hip_device_runtime.cpp diff --git a/hipamd/include/hip/hcc_detail/hip_runtime_api.h b/hipamd/include/hip/hcc_detail/hip_runtime_api.h old mode 100644 new mode 100755 index e8a7a9e623..3c8a775289 --- a/hipamd/include/hip/hcc_detail/hip_runtime_api.h +++ b/hipamd/include/hip/hcc_detail/hip_runtime_api.h @@ -95,6 +95,13 @@ typedef struct ihipCtx_t* hipCtx_t; // Note many APIs also use integer deviceIds as an alternative to the device pointer: typedef int hipDevice_t; +typedef enum hipDeviceP2PAttr { + hipDevP2PAttrPerformanceRank = 0, + hipDevP2PAttrAccessSupported, + hipDevP2PAttrNativeAtomicSupported, + hipDevP2PAttrHipArrayAccessSupported +} hipDeviceP2PAttr; + typedef struct ihipStream_t* hipStream_t; #define hipIpcMemLazyEnablePeerAccess 0 @@ -2799,6 +2806,19 @@ hipError_t hipDeviceComputeCapability(int* major, int* minor, hipDevice_t device */ hipError_t hipDeviceGetName(char* name, int len, hipDevice_t device); + +/** + * @brief Returns a value for attr of link between two devices + * @param [out] value + * @param [in] attr + * @param [in] srcDevice + * @param [in] dstDevice + * + * @returns #hipSuccess, #hipErrorInavlidDevice + */ +hipError_t hipDeviceGetP2PAttribute(int* value, hipDeviceP2PAttr attr, + int srcDevice, int dstDevice); + /** * @brief Returns a PCI Bus Id string for the device, overloaded to take int device ID. * @param [out] pciBusId diff --git a/hipamd/include/hip/nvcc_detail/hip_runtime_api.h b/hipamd/include/hip/nvcc_detail/hip_runtime_api.h old mode 100644 new mode 100755 index fe72f33d65..3744c6740c --- a/hipamd/include/hip/nvcc_detail/hip_runtime_api.h +++ b/hipamd/include/hip/nvcc_detail/hip_runtime_api.h @@ -179,6 +179,7 @@ typedef enum cudaSharedMemConfig hipSharedMemConfig; typedef CUfunc_cache hipFuncCache; typedef CUjit_option hipJitOption; typedef CUdevice hipDevice_t; +typedef enum cudaDeviceP2PAttr hipDeviceP2PAttr; typedef CUmodule hipModule_t; typedef CUfunction hipFunction_t; typedef CUdeviceptr hipDeviceptr_t; @@ -1606,6 +1607,11 @@ inline static hipError_t hipDeviceGetName(char* name, int len, hipDevice_t devic return hipCUResultTohipError(cuDeviceGetName(name, len, device)); } +inline static hipError_t hipDeviceGetP2PAttribute(int* value, hipDeviceP2PAttr attr, + int srcDevice, int dstDevice) { + return hipCUDAErrorTohipError(cudaDeviceGetP2PAttribute(value, attr, srcDevice, dstDevice)); +} + inline static hipError_t hipDeviceGetPCIBusId(char* pciBusId, int len, hipDevice_t device) { return hipCUDAErrorTohipError(cudaDeviceGetPCIBusId(pciBusId, len, device)); } diff --git a/hipamd/rocclr/hip_device_runtime.cpp b/hipamd/rocclr/hip_device_runtime.cpp old mode 100644 new mode 100755 index be979dab9e..470b088f02 --- a/hipamd/rocclr/hip_device_runtime.cpp +++ b/hipamd/rocclr/hip_device_runtime.cpp @@ -367,13 +367,6 @@ hipError_t hipDeviceGetLimit ( size_t* pValue, hipLimit_t limit ) { } } -/** -hipError_t hipDeviceGetP2PAttribute ( int* value, hipDeviceP2PAttr attr, int srcDevice, int dstDevice ) { - assert(0); - HIP_RETURN(hipSuccess); -} -**/ - hipError_t hipDeviceGetPCIBusId ( char* pciBusId, int len, int device ) { HIP_INIT_API(hipDeviceGetPCIBusId, (void*)pciBusId, len, device); diff --git a/hipamd/rocclr/hip_hcc.def.in b/hipamd/rocclr/hip_hcc.def.in index da43f5f51d..5e45689c1a 100755 --- a/hipamd/rocclr/hip_hcc.def.in +++ b/hipamd/rocclr/hip_hcc.def.in @@ -29,6 +29,7 @@ hipDeviceGetLimit hipDeviceGetName hipDeviceGetPCIBusId hipDeviceGetSharedMemConfig +hipDeviceGetP2PAttribute hipDevicePrimaryCtxGetState hipDevicePrimaryCtxRelease hipDevicePrimaryCtxReset diff --git a/hipamd/rocclr/hip_hcc.map.in b/hipamd/rocclr/hip_hcc.map.in index e66d4be92d..f355a4e14a 100755 --- a/hipamd/rocclr/hip_hcc.map.in +++ b/hipamd/rocclr/hip_hcc.map.in @@ -30,6 +30,7 @@ global: hipDeviceGetName; hipDeviceGetPCIBusId; hipDeviceGetSharedMemConfig; + hipDeviceGetP2PAttribute; hipDevicePrimaryCtxGetState; hipDevicePrimaryCtxRelease; hipDevicePrimaryCtxReset; diff --git a/hipamd/rocclr/hip_peer.cpp b/hipamd/rocclr/hip_peer.cpp index 225361d525..24207b52c6 100755 --- a/hipamd/rocclr/hip_peer.cpp +++ b/hipamd/rocclr/hip_peer.cpp @@ -52,24 +52,61 @@ hipError_t canAccessPeer(int* canAccessPeer, int deviceId, int peerDeviceId){ amd::Device* device = nullptr; amd::Device* peer_device = nullptr; if (canAccessPeer == nullptr) { - HIP_RETURN(hipErrorInvalidValue); + return hipErrorInvalidValue; } /* Peer cannot be self */ if (deviceId == peerDeviceId) { *canAccessPeer = 0; - HIP_RETURN(hipSuccess); + return hipSuccess; } /* Cannot exceed the max number of devices */ if (static_cast(deviceId) >= g_devices.size() || static_cast(peerDeviceId) >= g_devices.size()) { - HIP_RETURN(hipErrorInvalidDevice); + return hipErrorInvalidDevice; } device = g_devices[deviceId]->devices()[0]; peer_device = g_devices[peerDeviceId]->devices()[0]; *canAccessPeer = static_cast(std::find(device->p2pDevices_.begin(), device->p2pDevices_.end(), as_cl(peer_device)) != device->p2pDevices_.end()); - HIP_RETURN(hipSuccess); + return hipSuccess; +} + +hipError_t hipDeviceGetP2PAttribute(int* value, hipDeviceP2PAttr attr, + int srcDevice, int dstDevice) { + HIP_INIT_API(hipDeviceGetP2PAttribute, value, attr, srcDevice, dstDevice); + + hipError_t hip_error = hipSuccess; + + if (value == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + + if (srcDevice >= static_cast(g_devices.size()) + || dstDevice >= static_cast(g_devices.size())) { + HIP_RETURN(hipErrorInvalidDevice); + } + + switch (attr) { + case hipDevP2PAttrPerformanceRank : + assert(0 && "Unimplemented"); + break; + case hipDevP2PAttrAccessSupported : + hip_error = canAccessPeer(value, srcDevice, dstDevice); + break; + case hipDevP2PAttrNativeAtomicSupported : + assert(0 && "Unimplemented"); + break; + case hipDevP2PAttrHipArrayAccessSupported : + assert(0 && "Unimplemented"); + break; + default : + DevLogPrintfError("Invalid attribute attr: %d ", attr); + hip_error = hipErrorInvalidValue; + break; + } + + HIP_RETURN(hip_error); } hipError_t hipDeviceCanAccessPeer(int* canAccess, int deviceId, int peerDeviceId) { From fd783c1e887d11f4aaba15b89472edb42b682e80 Mon Sep 17 00:00:00 2001 From: kjayapra-amd Date: Fri, 17 Jul 2020 17:59:54 -0400 Subject: [PATCH 09/41] SWDEV-236178 - Remove use of old routines functions()/modules(). Change-Id: I3a1ed967227c91b0d8cdf39e1360ade685e0bc73 --- hipamd/rocclr/hip_platform.cpp | 221 ++------------------------------- 1 file changed, 12 insertions(+), 209 deletions(-) diff --git a/hipamd/rocclr/hip_platform.cpp b/hipamd/rocclr/hip_platform.cpp index e41a8af26a..cb3cdf7f97 100755 --- a/hipamd/rocclr/hip_platform.cpp +++ b/hipamd/rocclr/hip_platform.cpp @@ -561,202 +561,6 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks, namespace hip_impl { -struct dl_phdr_info { - ELFIO::Elf64_Addr dlpi_addr; - const char *dlpi_name; - const ELFIO::Elf64_Phdr *dlpi_phdr; - ELFIO::Elf64_Half dlpi_phnum; -}; - -extern "C" int dl_iterate_phdr( - int (*callback) (struct dl_phdr_info *info, size_t size, void *data), void *data -); - -struct Symbol { - std::string name; - ELFIO::Elf64_Addr value = 0; - ELFIO::Elf_Xword size = 0; - ELFIO::Elf_Half sect_idx = 0; - uint8_t bind = 0; - uint8_t type = 0; - uint8_t other = 0; -}; - -inline Symbol read_symbol(const ELFIO::symbol_section_accessor& section, unsigned int idx) { - assert(idx < section.get_symbols_num()); - - Symbol r; - section.get_symbol(idx, r.name, r.value, r.size, r.bind, r.type, r.sect_idx, r.other); - - return r; -} - -template -inline ELFIO::section* find_section_if(ELFIO::elfio& reader, P p) { - const auto it = find_if(reader.sections.begin(), reader.sections.end(), std::move(p)); - - return it != reader.sections.end() ? *it : nullptr; -} - -std::vector> function_names_for(const ELFIO::elfio& reader, - ELFIO::section* symtab) { - std::vector> r; - ELFIO::symbol_section_accessor symbols{reader, symtab}; - - for (auto i = 0u; i != symbols.get_symbols_num(); ++i) { - auto tmp = read_symbol(symbols, i); - - if (tmp.type == STT_FUNC && tmp.sect_idx != SHN_UNDEF && !tmp.name.empty()) { - r.emplace_back(tmp.value, tmp.name); - } - } - - return r; -} - -const std::vector>& function_names_for_process() { - static constexpr const char self[] = "/proc/self/exe"; - - static std::vector> r; - static std::once_flag f; - - std::call_once(f, []() { - ELFIO::elfio reader; - - if (reader.load(self)) { - const auto it = find_section_if( - reader, [](const ELFIO::section* x) { return x->get_type() == SHT_SYMTAB; }); - - if (it) r = function_names_for(reader, it); - } - }); - - return r; -} - - -const std::unordered_map& function_names() -{ - static std::unordered_map r{ - function_names_for_process().cbegin(), - function_names_for_process().cend()}; - static std::once_flag f; - - std::call_once(f, []() { - dl_iterate_phdr([](dl_phdr_info* info, size_t, void*) { - ELFIO::elfio reader; - - if (reader.load(info->dlpi_name)) { - const auto it = find_section_if( - reader, [](const ELFIO::section* x) { return x->get_type() == SHT_SYMTAB; }); - - if (it) { - auto n = function_names_for(reader, it); - - for (auto&& f : n) f.first += info->dlpi_addr; - - r.insert(make_move_iterator(n.begin()), make_move_iterator(n.end())); - } - } - return 0; - }, - nullptr); - }); - - return r; -} - -std::vector bundles_for_process() { - static constexpr const char self[] = "/proc/self/exe"; - static constexpr const char kernel_section[] = ".kernel"; - std::vector r; - - ELFIO::elfio reader; - - if (reader.load(self)) { - auto it = find_section_if( - reader, [](const ELFIO::section* x) { return x->get_name() == kernel_section; }); - - if (it) r.insert(r.end(), it->get_data(), it->get_data() + it->get_size()); - } - - return r; -} - -const std::vector& modules() { - static std::vector r; - static std::once_flag f; - - std::call_once(f, []() { - static std::vector> bundles{bundles_for_process()}; - - dl_iterate_phdr( - [](dl_phdr_info* info, std::size_t, void*) { - ELFIO::elfio tmp; - if (tmp.load(info->dlpi_name)) { - const auto it = find_section_if( - tmp, [](const ELFIO::section* x) { return x->get_name() == ".kernel"; }); - - if (it) bundles.emplace_back(it->get_data(), it->get_data() + it->get_size()); - } - return 0; - }, - nullptr); - - for (auto&& bundle : bundles) { - if (bundle.empty()) { - continue; - } - std::string magic(&bundle[0], sizeof(CLANG_OFFLOAD_BUNDLER_MAGIC_STR) - 1); - if (magic.compare(CLANG_OFFLOAD_BUNDLER_MAGIC_STR)) - continue; - - const auto obheader = reinterpret_cast(&bundle[0]); - const auto* desc = &obheader->desc[0]; - for (uint64_t i = 0; i < obheader->numBundles; ++i, - desc = reinterpret_cast( - reinterpret_cast(&desc->triple[0]) + desc->tripleSize)) { - - std::string triple(desc->triple, sizeof(HCC_AMDGCN_AMDHSA_TRIPLE) - 1); - if (triple.compare(HCC_AMDGCN_AMDHSA_TRIPLE)) - continue; - - std::string target(desc->triple + sizeof(HCC_AMDGCN_AMDHSA_TRIPLE), - desc->tripleSize - sizeof(HCC_AMDGCN_AMDHSA_TRIPLE)); - - if (isCompatibleCodeObject(target, hip::getCurrentDevice()->devices()[0]->info().name_)) { - hipModule_t module; - if (hipSuccess == hipModuleLoadData(&module, reinterpret_cast( - reinterpret_cast(obheader) + desc->offset))) - r.push_back(module); - break; - } - } - } - }); - - return r; -} - -const std::unordered_map& functions() -{ - static std::unordered_map r; - static std::once_flag f; - - std::call_once(f, []() { - for (auto&& function : function_names()) { - for (auto&& module : modules()) { - hipFunction_t f; - if (hipSuccess == hipModuleGetFunction(&f, module, function.second.c_str())) { - r[function.first] = f; - } - } - } - }); - - return r; -} - void hipLaunchKernelGGLImpl( uintptr_t function_address, const dim3& numBlocks, @@ -767,11 +571,19 @@ void hipLaunchKernelGGLImpl( { HIP_INIT(); - const auto it = functions().find(function_address); - if (it == functions().cend()) - assert(0); + hip::Stream* s = reinterpret_cast(stream); + int deviceId = (s != nullptr)? s->DeviceId() : ihipGetDevice(); + if (deviceId == -1) { + DevLogPrintfError("Wrong Device Id: %d \n", deviceId); + } - hipModuleLaunchKernel(it->second, + hipFunction_t func = nullptr; + hipError_t hip_error = PlatformState::instance().getStatFunc(&func, reinterpret_cast(function_address), deviceId); + if ((hip_error != hipSuccess) || (func == nullptr)) { + DevLogPrintfError("Cannot find the static function: 0x%x", function_address); + } + + hipModuleLaunchKernel(func, numBlocks.x, numBlocks.y, numBlocks.z, dimBlocks.x, dimBlocks.y, dimBlocks.z, sharedMemBytes, stream, nullptr, kernarg); @@ -815,16 +627,7 @@ hipError_t ihipLaunchKernel(const void* hostFunction, hipFunction_t func = nullptr; hipError_t hip_error = PlatformState::instance().getStatFunc(&func, hostFunction, deviceId); if ((hip_error != hipSuccess) || (func == nullptr)) { -#ifdef ATI_OS_LINUX - const auto it = hip_impl::functions().find(reinterpret_cast(hostFunction)); - if (it == hip_impl::functions().cend()) { - DevLogPrintfError("Cannot find function: 0x%x \n", hostFunction); - HIP_RETURN(hipErrorInvalidDeviceFunction); - } - func = it->second; -#else HIP_RETURN(hipErrorInvalidDeviceFunction); -#endif } HIP_RETURN(ihipModuleLaunchKernel(func, (gridDim.x * blockDim.x), (gridDim.y * blockDim.y), (gridDim.z * blockDim.z), blockDim.x, blockDim.y, blockDim.z, From f7cc49c0b21af0c6bf1cfb21fd9322eaca196d79 Mon Sep 17 00:00:00 2001 From: Tao Sang Date: Tue, 14 Jul 2020 20:16:32 -0400 Subject: [PATCH 10/41] Apply constexpr on global constant varaibles When HIP_ENABLE_DEFERRED_LOADING=0, many global variables will be referenced but they are not initialized in that early time. The patch will use constexpr to initialze global constant varables in compile time. Change-Id: I9837c42e6bab38a894ece4e0f34b64f81b38fef0 --- hipamd/lpl_ca/pstreams/pstream.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hipamd/lpl_ca/pstreams/pstream.h b/hipamd/lpl_ca/pstreams/pstream.h index 363df0cf45..5b3afd7d35 100644 --- a/hipamd/lpl_ca/pstreams/pstream.h +++ b/hipamd/lpl_ca/pstreams/pstream.h @@ -70,12 +70,12 @@ struct pstreams { /// Type used for file descriptors. typedef int fd_type; - static const pmode pstdin = std::ios_base::out; ///< Write to stdin - static const pmode pstdout = std::ios_base::in; ///< Read from stdout - static const pmode pstderr = std::ios_base::app; ///< Read from stderr + static constexpr pmode pstdin = std::ios_base::out; ///< Write to stdin + static constexpr pmode pstdout = std::ios_base::in; ///< Read from stdout + static constexpr pmode pstderr = std::ios_base::app; ///< Read from stderr /// Create a new process group for the child process. - static const pmode newpg = std::ios_base::trunc; + static constexpr pmode newpg = std::ios_base::trunc; protected: enum { bufsz = 32 }; ///< Size of pstreambuf buffers. From fe6b645f3b7b9d77b8625f27483b0a8cd185d73d Mon Sep 17 00:00:00 2001 From: Vlad Sytchenko Date: Mon, 20 Jul 2020 11:02:48 -0400 Subject: [PATCH 11/41] 2D/3D copy optimizations SWDEV-244798 If {src/dst} ptr is marked as hipMemoryTypeHost, check if the memory was prepinned. If it was, upgrade the copy type to hipMemoryTypeDevice to avoid extra pinning. Change-Id: Id287ef5b14ae67dfbcf80c4caa1b08a311191948 --- hipamd/rocclr/hip_memory.cpp | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/hipamd/rocclr/hip_memory.cpp b/hipamd/rocclr/hip_memory.cpp index a7b23bcc4c..cf3d17b574 100755 --- a/hipamd/rocclr/hip_memory.cpp +++ b/hipamd/rocclr/hip_memory.cpp @@ -1404,10 +1404,34 @@ hipError_t ihipMemcpyParam3D(const HIP_MEMCPY3D* pCopy, hipMemoryType srcMemoryType = pCopy->srcMemoryType; if (srcMemoryType == hipMemoryTypeUnified) { srcMemoryType = amd::MemObjMap::FindMemObj(pCopy->srcDevice) ? hipMemoryTypeDevice : hipMemoryTypeHost; + if (srcMemoryType == hipMemoryTypeHost) { + // {src/dst}Host may be unitialized. Copy over {src/dst}Device into it if we detect system memory. + const_cast(pCopy)->srcHost = pCopy->srcDevice; + } } hipMemoryType dstMemoryType = pCopy->dstMemoryType; if (dstMemoryType == hipMemoryTypeUnified) { dstMemoryType = amd::MemObjMap::FindMemObj(pCopy->dstDevice) ? hipMemoryTypeDevice : hipMemoryTypeHost; + if (srcMemoryType == hipMemoryTypeHost) { + const_cast(pCopy)->dstHost = pCopy->dstDevice; + } + } + + // If {src/dst}MemoryType is hipMemoryTypeHost, check if the memory was prepinned. + // In that case upgrade the copy type to hipMemoryTypeDevice to avoid extra pinning. + if (srcMemoryType == hipMemoryTypeHost) { + amd::Memory* mem = amd::MemObjMap::FindMemObj(pCopy->srcHost); + srcMemoryType = mem ? hipMemoryTypeDevice : hipMemoryTypeHost; + if (srcMemoryType == hipMemoryTypeDevice) { + const_cast(pCopy)->srcDevice = const_cast(pCopy->srcHost); + } + } + if (dstMemoryType == hipMemoryTypeHost) { + amd::Memory* mem = amd::MemObjMap::FindMemObj(pCopy->dstHost); + dstMemoryType = mem ? hipMemoryTypeDevice : hipMemoryTypeHost; + if (dstMemoryType == hipMemoryTypeDevice) { + const_cast(pCopy)->dstDevice = const_cast(pCopy->dstDevice); + } } amd::Coord3D srcOrigin = {pCopy->srcXInBytes, pCopy->srcY, pCopy->srcZ}; From e9cc570be8d00bd20c6c5d097dc8374e7c248307 Mon Sep 17 00:00:00 2001 From: Tao Sang Date: Tue, 21 Jul 2020 18:37:05 -0400 Subject: [PATCH 12/41] Remove __gnu_h2f_ieee and __gnu_f2h_ieee Change-Id: Ide24d245d851e20961020323e52f33322a33fff9 --- hipamd/rocclr/hip_hcc.def.in | 2 -- hipamd/rocclr/hip_hcc.map.in | 2 -- hipamd/rocclr/hip_platform.cpp | 40 ---------------------------------- 3 files changed, 44 deletions(-) diff --git a/hipamd/rocclr/hip_hcc.def.in b/hipamd/rocclr/hip_hcc.def.in index 5e45689c1a..253352f2cb 100755 --- a/hipamd/rocclr/hip_hcc.def.in +++ b/hipamd/rocclr/hip_hcc.def.in @@ -176,8 +176,6 @@ __hipRegisterVar __hipRegisterSurface __hipRegisterTexture __hipUnregisterFatBinary -__gnu_h2f_ieee -__gnu_f2h_ieee hipConfigureCall hipSetupArgument hipLaunchByPtr diff --git a/hipamd/rocclr/hip_hcc.map.in b/hipamd/rocclr/hip_hcc.map.in index f355a4e14a..be83e6d134 100755 --- a/hipamd/rocclr/hip_hcc.map.in +++ b/hipamd/rocclr/hip_hcc.map.in @@ -176,8 +176,6 @@ global: __hipRegisterSurface; __hipRegisterTexture; __hipUnregisterFatBinary; - __gnu_h2f_ieee; - __gnu_f2h_ieee; hipConfigureCall; hipSetupArgument; hipLaunchByPtr; diff --git a/hipamd/rocclr/hip_platform.cpp b/hipamd/rocclr/hip_platform.cpp index cb3cdf7f97..0dafd268a4 100755 --- a/hipamd/rocclr/hip_platform.cpp +++ b/hipamd/rocclr/hip_platform.cpp @@ -635,46 +635,6 @@ hipError_t ihipLaunchKernel(const void* hostFunction, flags)); } -// conversion routines between float and half precision -static inline std::uint32_t f32_as_u32(float f) { union { float f; std::uint32_t u; } v; v.f = f; return v.u; } -static inline float u32_as_f32(std::uint32_t u) { union { float f; std::uint32_t u; } v; v.u = u; return v.f; } -static inline int clamp_int(int i, int l, int h) { return std::min(std::max(i, l), h); } - -// half float, the f16 is in the low 16 bits of the input argument -static inline float __convert_half_to_float(std::uint32_t a) noexcept { - std::uint32_t u = ((a << 13) + 0x70000000U) & 0x8fffe000U; - std::uint32_t v = f32_as_u32(u32_as_f32(u) * u32_as_f32(0x77800000U)/*0x1.0p+112f*/) + 0x38000000U; - u = (a & 0x7fff) != 0 ? v : u; - return u32_as_f32(u) * u32_as_f32(0x07800000U)/*0x1.0p-112f*/; -} - -// float half with nearest even rounding -// The lower 16 bits of the result is the bit pattern for the f16 -static inline std::uint32_t __convert_float_to_half(float a) noexcept { - std::uint32_t u = f32_as_u32(a); - int e = static_cast((u >> 23) & 0xff) - 127 + 15; - std::uint32_t m = ((u >> 11) & 0xffe) | ((u & 0xfff) != 0); - std::uint32_t i = 0x7c00 | (m != 0 ? 0x0200 : 0); - std::uint32_t n = ((std::uint32_t)e << 12) | m; - std::uint32_t s = (u >> 16) & 0x8000; - int b = clamp_int(1-e, 0, 13); - std::uint32_t d = (0x1000 | m) >> b; - d |= (d << b) != (0x1000 | m); - std::uint32_t v = e < 1 ? d : n; - v = (v >> 2) + (((v & 0x7) == 3) | ((v & 0x7) > 5)); - v = e > 30 ? 0x7c00 : v; - v = e == 143 ? i : v; - return s | v; -} - -extern "C" float __gnu_h2f_ieee(unsigned short h){ - return __convert_half_to_float((std::uint32_t) h); -} - -extern "C" unsigned short __gnu_f2h_ieee(float f){ - return (unsigned short)__convert_float_to_half(f); -} - void PlatformState::init() { amd::ScopedLock lock(lock_); From c1d65e57a1201019eccf86b341138c482c774d9d Mon Sep 17 00:00:00 2001 From: jujiang Date: Mon, 20 Jul 2020 12:03:04 -0400 Subject: [PATCH 13/41] SWDEV-242344-hipMemcpyAll failed on MGPU Change-Id: I01671e322c84213964d3d48bd6b5dc704dba731a --- hipamd/rocclr/hip_memory.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/hipamd/rocclr/hip_memory.cpp b/hipamd/rocclr/hip_memory.cpp index cf3d17b574..b7596e8b0f 100755 --- a/hipamd/rocclr/hip_memory.cpp +++ b/hipamd/rocclr/hip_memory.cpp @@ -178,9 +178,12 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin *srcMemory->asBuffer(), sOffset, sizeBytes, dst); isAsync = false; } else if ((srcMemory != nullptr) && (dstMemory != nullptr)) { - // Check if the queue device doesn't match the device on any memory object. Hence - // it's a P2P transfer, because the app has requested access to another GPU - if (srcMemory->getContext().devices()[0] != dstMemory->getContext().devices()[0]) { + // Check if the queue device doesn't match the device on any memory object. + // And any of them are not host allocation. + // Hence it's a P2P transfer, because the app has requested access to another GPU + if ((srcMemory->getContext().devices()[0] != dstMemory->getContext().devices()[0]) && + ((srcMemory->getContext().devices().size() == 1) && + (dstMemory->getContext().devices().size() == 1))) { command = new amd::CopyMemoryP2PCommand(queue, CL_COMMAND_COPY_BUFFER, waitList, *srcMemory->asBuffer(), *dstMemory->asBuffer(), sOffset, dOffset, sizeBytes); if (command == nullptr) { From c919a8879507f13b88e1f63dafbe351d0eadb049 Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Wed, 17 Jun 2020 20:37:46 +0000 Subject: [PATCH 14/41] Remove libhip_hcc symlinks to libamdhip64 Change-Id: I7b5b47fc7ca9760316c2424c16b454d7919c665c --- hipamd/bin/hipcc | 6 +++--- hipamd/packaging/hip-rocclr.txt | 2 -- hipamd/rocclr/CMakeLists.txt | 8 -------- hipamd/tests/src/g++/hipMalloc.cpp | 2 +- hipamd/tests/src/gcc/LaunchKernel.c | 2 +- hipamd/tests/src/gcc/hipMalloc.c | 2 +- 6 files changed, 6 insertions(+), 16 deletions(-) diff --git a/hipamd/bin/hipcc b/hipamd/bin/hipcc index ddf67242cf..5f1445cc6a 100755 --- a/hipamd/bin/hipcc +++ b/hipamd/bin/hipcc @@ -798,9 +798,9 @@ if ($setStdLib eq 0 and $HIP_PLATFORM eq 'hcc' and $HIP_COMPILER eq 'hcc') if ($needHipHcc) { if ($linkType eq 0) { - substr($HIPLDFLAGS,0,0) = " $HIP_LIB_PATH/libhip_hcc_static.a " ; + substr($HIPLDFLAGS,0,0) = " $HIP_LIB_PATH/libamdhip64.a " ; } else { - substr($HIPLDFLAGS,0,0) = " -Wl,--enable-new-dtags -Wl,--rpath=$HIP_LIB_PATH:$ROCM_PATH/lib $HIP_LIB_PATH/libhip_hcc.so "; + substr($HIPLDFLAGS,0,0) = " -Wl,--enable-new-dtags -Wl,--rpath=$HIP_LIB_PATH:$ROCM_PATH/lib $HIP_LIB_PATH/libamdhip64.so "; } } @@ -839,7 +839,7 @@ if ($HIP_PLATFORM eq "hcc" and $HIP_COMPILER eq "clang") { if ($linkType eq 0) { $toolArgs .= " -L$HIP_LIB_PATH -lamdhip64 -L$ROCM_PATH/lib -lhsa-runtime64 -ldl -lnuma "; } else { - $toolArgs .= " -Wl,--enable-new-dtags -Wl,--rpath=$HIP_LIB_PATH:$ROCM_PATH/lib -lhip_hcc "; + $toolArgs .= " -Wl,--enable-new-dtags -Wl,--rpath=$HIP_LIB_PATH:$ROCM_PATH/lib -lamdhip64 "; } # To support __fp16 and _Float16, explicitly link with compiler-rt $toolArgs .= " -L$HIP_CLANG_PATH/../lib/clang/$HIP_CLANG_VERSION/lib/linux -lclang_rt.builtins-x86_64 " diff --git a/hipamd/packaging/hip-rocclr.txt b/hipamd/packaging/hip-rocclr.txt index 9cf89f042b..e284d6fc7f 100644 --- a/hipamd/packaging/hip-rocclr.txt +++ b/hipamd/packaging/hip-rocclr.txt @@ -5,8 +5,6 @@ if(@BUILD_SHARED_LIBS@) install(FILES @PROJECT_BINARY_DIR@/lib/libamdhip64.so DESTINATION lib) install(FILES @PROJECT_BINARY_DIR@/lib/libamdhip64.so.@HIP_LIB_VERSION_MAJOR@ DESTINATION lib) install(FILES @PROJECT_BINARY_DIR@/lib/libamdhip64.so.@HIP_LIB_VERSION_STRING@ DESTINATION lib) - install(FILES @PROJECT_BINARY_DIR@/lib/libhip_hcc.so DESTINATION lib) - install(FILES @PROJECT_BINARY_DIR@/lib/libhiprtc.so DESTINATION lib) else() install(FILES @PROJECT_BINARY_DIR@/lib/libamdhip64.a DESTINATION lib) endif() diff --git a/hipamd/rocclr/CMakeLists.txt b/hipamd/rocclr/CMakeLists.txt index 2687043b42..bef5de347c 100755 --- a/hipamd/rocclr/CMakeLists.txt +++ b/hipamd/rocclr/CMakeLists.txt @@ -207,17 +207,9 @@ target_link_libraries(device INTERFACE host) # FIXME: Linux convention is to create static library with same base # filename. - if(${BUILD_SHARED_LIBS}) target_link_libraries(amdhip64 PRIVATE amdrocclr_static Threads::Threads dl hsa-runtime64::hsa-runtime64) INSTALL(PROGRAMS $ DESTINATION lib COMPONENT MAIN) - INSTALL(CODE "execute_process( COMMAND ${CMAKE_COMMAND} -E create_symlink libamdhip64.so lib/libhip_hcc.so )" DESTINATION lib COMPONENT MAIN) - - INSTALL(CODE "execute_process( COMMAND ${CMAKE_COMMAND} -E create_symlink libamdhip64.so lib/libhiprtc.so )" DESTINATION lib COMPONENT MAIN) - INSTALL(FILES ${CMAKE_BINARY_DIR}/lib/libhip_hcc.so DESTINATION lib COMPONENT MAIN) - - INSTALL(FILES ${CMAKE_BINARY_DIR}/lib/libhiprtc.so DESTINATION lib COMPONENT MAIN) - else() target_link_libraries(amdhip64 PRIVATE Threads::Threads dl hsa-runtime64::hsa-runtime64 amd_comgr) diff --git a/hipamd/tests/src/g++/hipMalloc.cpp b/hipamd/tests/src/g++/hipMalloc.cpp index 3aab48aad5..d527db3570 100644 --- a/hipamd/tests/src/g++/hipMalloc.cpp +++ b/hipamd/tests/src/g++/hipMalloc.cpp @@ -18,7 +18,7 @@ * */ /* HIT_START - * BUILD_CMD: hipMalloc %cxx -D__HIP_PLATFORM_HCC__ -I%hip-path/include -I/opt/rocm/include %S/%s -Wl,--rpath=%hip-path/lib %hip-path/lib/libhip_hcc.so -o %T/%t -std=c++11 EXCLUDE_HIP_PLATFORM nvcc + * BUILD_CMD: hipMalloc %cxx -D__HIP_PLATFORM_HCC__ -I%hip-path/include -I/opt/rocm/include %S/%s -Wl,--rpath=%hip-path/lib %hip-path/lib/libamdhip64.so -o %T/%t -std=c++11 EXCLUDE_HIP_PLATFORM nvcc * TEST: %t EXCLUDE_HIP_PLATFORM nvcc * HIT_END */ diff --git a/hipamd/tests/src/gcc/LaunchKernel.c b/hipamd/tests/src/gcc/LaunchKernel.c index 1791d52d25..08aca3e2fe 100644 --- a/hipamd/tests/src/gcc/LaunchKernel.c +++ b/hipamd/tests/src/gcc/LaunchKernel.c @@ -21,7 +21,7 @@ /* HIT_START * BUILD_CMD: gpu.o %hc -I%hip-path/include -g -c %S/gpu.cpp -o %T/gpu.o EXCLUDE_HIP_PLATFORM nvcc rocclr * BUILD_CMD: launchkernel.o %hc -D__HIP_PLATFORM_HCC__ -g -I%hip-path/include -c %S/LaunchKernel.c -o %T/launchkernel.o EXCLUDE_HIP_PLATFORM nvcc rocclr - * BUILD_CMD: LaunchKernel %hc %T/launchkernel.o %T/gpu.o -g -Wl,--rpath=%hip-path/lib %hip-path/lib/libhip_hcc.so -o %T/%t DEPENDS gpu.o launchkernel.o EXCLUDE_HIP_PLATFORM nvcc rocclr + * BUILD_CMD: LaunchKernel %hc %T/launchkernel.o %T/gpu.o -g -Wl,--rpath=%hip-path/lib %hip-path/lib/libamdhip64.so -o %T/%t DEPENDS gpu.o launchkernel.o EXCLUDE_HIP_PLATFORM nvcc rocclr * TEST: %t EXCLUDE_HIP_PLATFORM nvcc rocclr * HIT_END */ diff --git a/hipamd/tests/src/gcc/hipMalloc.c b/hipamd/tests/src/gcc/hipMalloc.c index f54071f907..2e5deb2a16 100644 --- a/hipamd/tests/src/gcc/hipMalloc.c +++ b/hipamd/tests/src/gcc/hipMalloc.c @@ -19,7 +19,7 @@ /* HIT_START * BUILD_CMD: hipMalloc %cc -D__HIP_PLATFORM_NVCC__ -I%hip-path/include -I/usr/local/cuda/include %S/%s -o %T/hipMalloc_nv -L/usr/local/cuda/lib64 -lcudart EXCLUDE_HIP_PLATFORM hcc rocclr - * BUILD_CMD: hipMalloc %cc -D__HIP_PLATFORM_HCC__ -I%hip-path/include %S/%s -Wl,--rpath=%hip-path/lib %hip-path/lib/libhip_hcc.so -o %T/hipMalloc_hcc EXCLUDE_HIP_PLATFORM nvcc rocclr + * BUILD_CMD: hipMalloc %cc -D__HIP_PLATFORM_HCC__ -I%hip-path/include %S/%s -Wl,--rpath=%hip-path/lib %hip-path/lib/libamdhip64.so -o %T/hipMalloc_hcc EXCLUDE_HIP_PLATFORM nvcc rocclr * TEST: hipMalloc_nv EXCLUDE_HIP_PLATFORM hcc rocclr * TEST: hipMalloc_hcc EXCLUDE_HIP_PLATFORM nvcc rocclr * HIT_END From 78fcbab8c20375defcf57715c44a2761eef23bbf Mon Sep 17 00:00:00 2001 From: Jatin Chaudhary Date: Wed, 22 Jul 2020 03:28:31 -0400 Subject: [PATCH 15/41] Removing redundant .hipInfo Install Change-Id: Id8f7a4668d8b6965a47a464c94c19bc1036f3537 --- hipamd/CMakeLists.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/hipamd/CMakeLists.txt b/hipamd/CMakeLists.txt index 565ce37519..055543a245 100755 --- a/hipamd/CMakeLists.txt +++ b/hipamd/CMakeLists.txt @@ -477,9 +477,6 @@ if(HIP_PLATFORM STREQUAL "hcc" OR HIP_PLATFORM STREQUAL "rocclr") install(FILES ${PROJECT_BINARY_DIR}/.hipInfo DESTINATION lib) endif() -# Install .hipInfo -install(FILES ${PROJECT_BINARY_DIR}/.hipInfo DESTINATION lib) - # Install .hipVersion install(FILES ${PROJECT_BINARY_DIR}/.hipVersion DESTINATION bin) From e430506f706c71fe7fc93115079aff4cade4542f Mon Sep 17 00:00:00 2001 From: Jason Tang Date: Tue, 21 Jul 2020 12:10:43 -0400 Subject: [PATCH 16/41] SWDEV-243576 - Fix hipMemcpy regression. If the queue device doesn't match the device on any memory object, use the queue device from the memory object. Change-Id: I5fdcf00494f8391574f4443332c01788b8da44ef --- hipamd/rocclr/hip_memory.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/hipamd/rocclr/hip_memory.cpp b/hipamd/rocclr/hip_memory.cpp index b7596e8b0f..c909928a05 100755 --- a/hipamd/rocclr/hip_memory.cpp +++ b/hipamd/rocclr/hip_memory.cpp @@ -196,7 +196,16 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin return hipErrorInvalidValue; } } else { - command = new amd::CopyMemoryCommand(queue, CL_COMMAND_COPY_BUFFER, waitList, + amd::HostQueue* pQueue = &queue; + if (queueDevice != srcMemory->getContext().devices()[0]) { + pQueue = hip::getNullStream(srcMemory->getContext()); + amd::Command* cmd = queue.getLastQueuedCommand(true); + if (cmd != nullptr) { + waitList.push_back(cmd); + } + } + + command = new amd::CopyMemoryCommand(*pQueue, CL_COMMAND_COPY_BUFFER, waitList, *srcMemory->asBuffer(), *dstMemory->asBuffer(), sOffset, dOffset, sizeBytes); } } From b0438e6e9994f11be01f662e309d65744b452376 Mon Sep 17 00:00:00 2001 From: Freddy Paul Date: Mon, 27 Jul 2020 10:30:09 -0700 Subject: [PATCH 17/41] hip-rocclr:Remove dependency to rocm-utils rocm-utils is a meta-pacakge that pull in lot more pacakges than what is required for hip. Correct dependency to set as of today will be rocminfo. Change-Id: I065ca74b4520e0c9ce183d82d6026554ff43e1f0 --- hipamd/packaging/hip-rocclr.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hipamd/packaging/hip-rocclr.txt b/hipamd/packaging/hip-rocclr.txt index e284d6fc7f..6f5c16bb96 100644 --- a/hipamd/packaging/hip-rocclr.txt +++ b/hipamd/packaging/hip-rocclr.txt @@ -43,7 +43,7 @@ set(CPACK_GENERATOR "TGZ;DEB;RPM") set(CPACK_BINARY_DEB "ON") set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb) set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm") -set(CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev, rocm-utils, hip-base (= ${CPACK_PACKAGE_VERSION}), comgr (>= 1.1), llvm-amdgpu") +set(CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev, rocminfo, hip-base (= ${CPACK_PACKAGE_VERSION}), comgr (>= 1.1), llvm-amdgpu") set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-hcc (= ${CPACK_PACKAGE_VERSION})") set(CPACK_BINARY_RPM "ON") @@ -53,7 +53,7 @@ set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst") set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION}) -set(CPACK_RPM_PACKAGE_REQUIRES "hsa-rocr-dev, rocm-utils, hip-base = ${HIP_BASE_VERSION}, comgr >= 1.1, llvm-amdgpu") +set(CPACK_RPM_PACKAGE_REQUIRES "hsa-rocr-dev, rocminfo, hip-base = ${HIP_BASE_VERSION}, comgr >= 1.1, llvm-amdgpu") set(CPACK_RPM_PACKAGE_PROVIDES "hip-hcc = ${HIP_BASE_VERSION}") set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") set(CPACK_SOURCE_GENERATOR "TGZ") From d251c2db575eb36502378b13d66fd3773c5f5b51 Mon Sep 17 00:00:00 2001 From: Jatin Chaudhary Date: Wed, 22 Jul 2020 09:27:16 -0400 Subject: [PATCH 18/41] Fixing/Disabling tests on nvcc path Change-Id: I7ed4a1246943867482ff595d5f98bb25bd79074b --- hipamd/tests/src/cg/hipCGThreadBlockType.cpp | 2 +- hipamd/tests/src/deviceLib/hipHalf2Comparision.cpp | 3 ++- hipamd/tests/src/ipc/hipSimpleIpc.cpp | 3 ++- .../src/runtimeApi/device/hipDeviceGetPCIBusId.cpp | 7 ++++++- hipamd/tests/src/runtimeApi/memory/hipMemset.cpp | 4 ++-- .../module/hipModuleLoadDataMultThreadOnMultGPU.cpp | 5 +++-- .../module/hipModuleLoadDataMultThreaded.cpp | 5 +++-- hipamd/tests/src/runtimeApi/module/tex2d_kernel.cpp | 2 +- .../runtimeApi/stream/hipStreamACb_StrmSyncTiming.cpp | 10 +++++++++- .../src/runtimeApi/stream/hipStreamGetPriority.cpp | 2 +- 10 files changed, 30 insertions(+), 13 deletions(-) diff --git a/hipamd/tests/src/cg/hipCGThreadBlockType.cpp b/hipamd/tests/src/cg/hipCGThreadBlockType.cpp index ab9492c609..14c2e3ce2a 100644 --- a/hipamd/tests/src/cg/hipCGThreadBlockType.cpp +++ b/hipamd/tests/src/cg/hipCGThreadBlockType.cpp @@ -22,7 +22,7 @@ THE SOFTWARE. /* HIT_START - * BUILD: %t %s ../test_common.cpp + * BUILD: %t %s ../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc * TEST: %t * HIT_END */ diff --git a/hipamd/tests/src/deviceLib/hipHalf2Comparision.cpp b/hipamd/tests/src/deviceLib/hipHalf2Comparision.cpp index e9e6c843cf..c9b100ccae 100644 --- a/hipamd/tests/src/deviceLib/hipHalf2Comparision.cpp +++ b/hipamd/tests/src/deviceLib/hipHalf2Comparision.cpp @@ -19,12 +19,13 @@ THE SOFTWARE. /* HIT_START - * BUILD: %t %s ../test_common.cpp NVCC_OPTIONS -std=c++11 + * BUILD: %t %s ../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM nvcc * TEST: %t * HIT_END */ #include "test_common.h" +#include #include "hip/hip_fp16.h" #define test_passed(test_name) \ diff --git a/hipamd/tests/src/ipc/hipSimpleIpc.cpp b/hipamd/tests/src/ipc/hipSimpleIpc.cpp index 074c06692b..4caea82aab 100755 --- a/hipamd/tests/src/ipc/hipSimpleIpc.cpp +++ b/hipamd/tests/src/ipc/hipSimpleIpc.cpp @@ -18,7 +18,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../test_common.cpp + * BUILD: %t %s ../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc * TEST: %t * HIT_END */ @@ -43,6 +43,7 @@ void single_process() { // Negative, Make sure we return error when an offset of original ptr is passed ipc_offset_dptr = ipc_dptr + (OFFSET * sizeof(int)); + // HIP API return value differs from CUDA's return type assert(hipErrorInvalidDevicePointer == hipIpcGetMemHandle(&ipc_offset_handle, ipc_offset_dptr)); // Get handle for the device_ptr diff --git a/hipamd/tests/src/runtimeApi/device/hipDeviceGetPCIBusId.cpp b/hipamd/tests/src/runtimeApi/device/hipDeviceGetPCIBusId.cpp index ae944ff122..874f8bc44c 100644 --- a/hipamd/tests/src/runtimeApi/device/hipDeviceGetPCIBusId.cpp +++ b/hipamd/tests/src/runtimeApi/device/hipDeviceGetPCIBusId.cpp @@ -26,7 +26,7 @@ /* HIT_START * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 * TEST_NAMED: %t hipDeviceGetPCIBusId-vs-hipDeviceGetAttribute --tests 0x1 - * TEST_NAMED: %t hipDeviceGetPCIBusId-vs-lspci --tests 0x2 + * TEST_NAMED: %t hipDeviceGetPCIBusId-vs-lspci --tests 0x2 EXCLUDE_HIP_PLATFORM nvcc * HIT_END */ @@ -106,8 +106,13 @@ bool compareHipDeviceGetPCIBusIdWithLspci() { getPciBusId(deviceCount, hipDeviceList); // Get lspci device list and compare with hip device list +#if defined(__CUDA_ARCH__) + char const *command = "lspci -D | grep controller | grep NVIDIA | " + "cut -d ' ' -f 1"; +#else char const *command = "lspci -D | grep controller | grep AMD/ATI | " "cut -d ' ' -f 1"; +#endif fpipe = popen(command, "r"); if (fpipe == nullptr) { diff --git a/hipamd/tests/src/runtimeApi/memory/hipMemset.cpp b/hipamd/tests/src/runtimeApi/memory/hipMemset.cpp index f08b6c921f..9f5b9092f9 100644 --- a/hipamd/tests/src/runtimeApi/memory/hipMemset.cpp +++ b/hipamd/tests/src/runtimeApi/memory/hipMemset.cpp @@ -166,11 +166,11 @@ bool testhipMemset2AsyncOps() { hipStream_t s; hipStreamCreate(&s); hipMemsetAsync(p2, 0, 32*32*4, s); - hipMemsetD32Async(p3, 0x3fe00000, 32*32, s ); + hipMemsetD32Async((hipDeviceptr_t)p3, 0x3fe00000, 32*32, s ); hipStreamSynchronize(s); for (int i = 0; i < 256; ++i) { hipMemsetAsync(p2, 0, 32*32*4, s); - hipMemsetD32Async(p3, 0x3fe00000, 32*32, s ); + hipMemsetD32Async((hipDeviceptr_t)p3, 0x3fe00000, 32*32, s ); } hipStreamSynchronize(s); hipDeviceSynchronize(); diff --git a/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreadOnMultGPU.cpp b/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreadOnMultGPU.cpp index ce78590147..cc976ced42 100644 --- a/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreadOnMultGPU.cpp +++ b/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreadOnMultGPU.cpp @@ -60,8 +60,6 @@ void run(const std::vector& buffer, int deviceNo) { hipSetDevice(deviceNo); hipModule_t Module; hipFunction_t Function; - HIPCHECK(hipModuleLoadData(&Module, &buffer[0])); - HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name)); float *A, *B, *Ad, *Bd; A = new float[LEN]; @@ -78,6 +76,9 @@ void run(const std::vector& buffer, int deviceNo) { HIPCHECK(hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice)); HIPCHECK(hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice)); + HIPCHECK(hipModuleLoadData(&Module, &buffer[0])); + HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name)); + hipStream_t stream; HIPCHECK(hipStreamCreate(&stream)); diff --git a/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp b/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp index 6ae1b92ab3..840e9b6975 100644 --- a/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp +++ b/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp @@ -56,8 +56,6 @@ std::vector load_file() { void run(const std::vector& buffer) { hipModule_t Module; hipFunction_t Function; - HIPCHECK(hipModuleLoadData(&Module, &buffer[0])); - HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name)); float *A, *B, *Ad, *Bd; A = new float[LEN]; @@ -74,6 +72,9 @@ void run(const std::vector& buffer) { HIPCHECK(hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice)); HIPCHECK(hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice)); + HIPCHECK(hipModuleLoadData(&Module, &buffer[0])); + HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name)); + hipStream_t stream; HIPCHECK(hipStreamCreate(&stream)); diff --git a/hipamd/tests/src/runtimeApi/module/tex2d_kernel.cpp b/hipamd/tests/src/runtimeApi/module/tex2d_kernel.cpp index 560f27e741..579714566d 100755 --- a/hipamd/tests/src/runtimeApi/module/tex2d_kernel.cpp +++ b/hipamd/tests/src/runtimeApi/module/tex2d_kernel.cpp @@ -21,7 +21,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD_CMD: tex2d_kernel.code %hc --genco %S/tex2d_kernel.cpp -o tex2d_kernel.code EXCLUDE_HIP_PLATFORM rocclr + * BUILD_CMD: tex2d_kernel.code %hc --genco %S/tex2d_kernel.cpp -o tex2d_kernel.code EXCLUDE_HIP_PLATFORM rocclr nvcc * HIT_END */ diff --git a/hipamd/tests/src/runtimeApi/stream/hipStreamACb_StrmSyncTiming.cpp b/hipamd/tests/src/runtimeApi/stream/hipStreamACb_StrmSyncTiming.cpp index 8c2fe2603b..d21ea5da54 100644 --- a/hipamd/tests/src/runtimeApi/stream/hipStreamACb_StrmSyncTiming.cpp +++ b/hipamd/tests/src/runtimeApi/stream/hipStreamACb_StrmSyncTiming.cpp @@ -81,6 +81,13 @@ static void HIPRT_CB Callback1(hipStream_t stream, hipError_t status, sleep(SECONDS_TO_WAIT); } +bool rangedCompare(long a, long b) { + auto diff = b - a; + if (diff < 0) diff *= -1; + if (diff < 500) return true; + return false; +} + int main(int argc, char* argv[]) { float *A_d, *C_d; @@ -139,7 +146,8 @@ int main(int argc, char* argv[]) { // completes the execution. Therefore the hipStreamSynchronize() in the // main thread should hardly take any time to complete. - if (duration.count() < SECONDS_TO_WAIT * TO_MICROSECONDS) { + if ((duration.count() < (SECONDS_TO_WAIT * TO_MICROSECONDS)) || + (rangedCompare(duration.count(), SECONDS_TO_WAIT * TO_MICROSECONDS))) { passed(); } else { failed("hipStreamSynchronize is waiting untill Callback() completes."); diff --git a/hipamd/tests/src/runtimeApi/stream/hipStreamGetPriority.cpp b/hipamd/tests/src/runtimeApi/stream/hipStreamGetPriority.cpp index d1c3de08fe..8da2c2f8a5 100644 --- a/hipamd/tests/src/runtimeApi/stream/hipStreamGetPriority.cpp +++ b/hipamd/tests/src/runtimeApi/stream/hipStreamGetPriority.cpp @@ -45,7 +45,7 @@ int main(int argc, char *argv[]) { // Check if priorities are indeed supported if ((priority_low + priority_high) != 0) { - failed("Priorities are not supported"); + passed(); // exit the test since priorities are not supported } // Checking Priority of default stream From 21408334214ddb3ca754f9823f30551bb9dfa3f1 Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Tue, 28 Jul 2020 16:06:59 +0000 Subject: [PATCH 19/41] Bump version to 3.8 Change-Id: I710ee1456aa108eb979eb5bef8f32081dda593d9 --- hipamd/bin/hipconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hipamd/bin/hipconfig b/hipamd/bin/hipconfig index b975a5b08a..8659669f5f 100755 --- a/hipamd/bin/hipconfig +++ b/hipamd/bin/hipconfig @@ -1,7 +1,7 @@ #!/usr/bin/perl -w $HIP_BASE_VERSION_MAJOR = "3"; -$HIP_BASE_VERSION_MINOR = "7"; +$HIP_BASE_VERSION_MINOR = "8"; # Need perl > 5.10 to use logic-defined or use 5.006; use v5.10.1; From e3617167f5d8dda140faeb9e5a27081c360758ae Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Tue, 9 Jun 2020 15:52:59 +0000 Subject: [PATCH 20/41] Support empty sources in add_executable and add_library Change-Id: I1441f2c045aeda2bba99bffe1c99fcc4a59cea9b --- hipamd/cmake/FindHIP.cmake | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/hipamd/cmake/FindHIP.cmake b/hipamd/cmake/FindHIP.cmake index 498b5e4570..b4a5cb239e 100644 --- a/hipamd/cmake/FindHIP.cmake +++ b/hipamd/cmake/FindHIP.cmake @@ -638,7 +638,11 @@ macro(HIP_ADD_EXECUTABLE hip_target) endif() set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HIP_CLANG_PATH} ${HIP_CLANG_PARALLEL_BUILD_LINK_OPTIONS} -o ") endif() - add_executable(${hip_target} ${_cmake_options} ${_generated_files} ${_sources}) + if ("${_sources}" STREQUAL "") + add_executable(${hip_target} ${_cmake_options} ${_generated_files} "") + else() + add_executable(${hip_target} ${_cmake_options} ${_generated_files} ${_sources}) + endif() set_target_properties(${hip_target} PROPERTIES LINKER_LANGUAGE HIP) endmacro() @@ -652,7 +656,11 @@ macro(HIP_ADD_LIBRARY hip_target) if(_source_files) list(REMOVE_ITEM _sources ${_source_files}) endif() - add_library(${hip_target} ${_cmake_options} ${_generated_files} ${_sources}) + if ("${_sources}" STREQUAL "") + add_library(${hip_target} ${_cmake_options} ${_generated_files} "") + else() + add_library(${hip_target} ${_cmake_options} ${_generated_files} ${_sources}) + endif() set_target_properties(${hip_target} PROPERTIES LINKER_LANGUAGE ${HIP_C_OR_CXX}) endmacro() From 5ac8672fb271cec60beb20314a6879371e26848d Mon Sep 17 00:00:00 2001 From: jujiang Date: Mon, 27 Jul 2020 11:26:42 -0400 Subject: [PATCH 21/41] SWDEV-242207-Update deprecated APIs Change-Id: Icb64227be0eb44497de99240c0044a162ff3c085 --- hipamd/docs/markdown/hip_deprecated_api_list.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/hipamd/docs/markdown/hip_deprecated_api_list.md b/hipamd/docs/markdown/hip_deprecated_api_list.md index b9e895d402..783ea4a390 100644 --- a/hipamd/docs/markdown/hip_deprecated_api_list.md +++ b/hipamd/docs/markdown/hip_deprecated_api_list.md @@ -4,8 +4,6 @@ CUDA supports cuCtx API, the Driver API that defines "Context" and "Devices" as separate entities. Contexts contain a single device, and a device can theoretically have multiple contexts. HIP initially added limited support for these API to facilitate easy porting from existing driver codes. These API are marked as deprecated now since there are better alternate interface (such as hipSetDevice or the stream API) to achieve the required functions. -### hipCtxCreate -### hipCtxDestroy ### hipCtxPopCurrent ### hipCtxPushCurrent ### hipCtxSetCurrent @@ -21,7 +19,7 @@ CUDA supports cuCtx API, the Driver API that defines "Context" and "Devices" as ### hipCtxEnablePeerAccess ### hipCtxDisablePeerAccess -## HIP Management APIs +## HIP Memory Management APIs ### hipMallocHost Should use "hipHostMalloc" instead. From cc0f0bae74175fa2486b18557de4e4f2763a64f8 Mon Sep 17 00:00:00 2001 From: Aryan Salmanpour Date: Thu, 30 Jul 2020 16:15:51 -0400 Subject: [PATCH 22/41] [dtest] disable hipStreamCreateWithPriority test temporarily as it fails randomly on Jenkins Change-Id: I4edcd555653ef8e7562245da3a6f310d60884fe0 --- .../tests/src/runtimeApi/stream/hipStreamCreateWithPriority.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hipamd/tests/src/runtimeApi/stream/hipStreamCreateWithPriority.cpp b/hipamd/tests/src/runtimeApi/stream/hipStreamCreateWithPriority.cpp index 11cd8d95b5..e7e3261d64 100644 --- a/hipamd/tests/src/runtimeApi/stream/hipStreamCreateWithPriority.cpp +++ b/hipamd/tests/src/runtimeApi/stream/hipStreamCreateWithPriority.cpp @@ -18,7 +18,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../../test_common.cpp + * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM all * TEST: %t * HIT_END */ From 8481524416a4c0bb166568b913745cc097e3a70f Mon Sep 17 00:00:00 2001 From: Anusha Godavarthy Surya Date: Tue, 14 Jul 2020 06:41:34 -0400 Subject: [PATCH 23/41] SWDEV-243961 Added overflow check for globalWorkSize Change-Id: I153d99398eb4619baad8c0a52466e0d5d5f898ca --- hipamd/rocclr/hip_module.cpp | 75 ++++++++++++++++++++++------------ hipamd/rocclr/hip_platform.cpp | 14 ++++++- 2 files changed, 62 insertions(+), 27 deletions(-) diff --git a/hipamd/rocclr/hip_module.cpp b/hipamd/rocclr/hip_module.cpp index 70dbc02f3a..07944c68ad 100755 --- a/hipamd/rocclr/hip_module.cpp +++ b/hipamd/rocclr/hip_module.cpp @@ -193,16 +193,17 @@ hipError_t hipFuncGetAttributes(hipFuncAttributes* attr, const void* func) HIP_RETURN(hipSuccess); } -hipError_t ihipModuleLaunchKernel(hipFunction_t f, - uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ, +hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, + uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ, uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ, uint32_t sharedMemBytes, hipStream_t hStream, void **kernelParams, void **extra, hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags = 0, uint32_t params = 0, uint32_t gridId = 0, uint32_t numGrids = 0, uint64_t prevGridSum = 0, uint64_t allGridSum = 0, uint32_t firstDevice = 0) { - HIP_INIT_API(ihipModuleLaunchKernel, f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, - sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, flags, params); + HIP_INIT_API(ihipModuleLaunchKernel, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, + blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, + stopEvent, flags, params); hip::DeviceFunc* function = hip::DeviceFunc::asFunction(f); amd::Kernel* kernel = function->kernel(); @@ -229,7 +230,8 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, int block_size = blockDimX * blockDimY * blockDimZ; hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor( &num_blocks, &max_blocks_per_grid, &best_block_size, device, f, block_size, sharedMemBytes, true); - if (((gridDimX * gridDimY * gridDimZ) / block_size) > unsigned(max_blocks_per_grid)) { + if (((globalWorkSizeX * globalWorkSizeY * globalWorkSizeZ) / block_size) > + unsigned(max_blocks_per_grid)) { return hipErrorCooperativeLaunchTooLarge; } } @@ -243,7 +245,7 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, } size_t globalWorkOffset[3] = {0}; - size_t globalWorkSize[3] = { gridDimX, gridDimY, gridDimZ }; + size_t globalWorkSize[3] = { globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ }; size_t localWorkSize[3] = { blockDimX, blockDimY, blockDimZ }; amd::NDRangeContainer ndrange(3, globalWorkOffset, globalWorkSize, localWorkSize); amd::Command::EventWaitList waitList; @@ -290,11 +292,11 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, command->enqueue(); - if(startEvent != nullptr) { + if (startEvent != nullptr) { eStart->addMarker(queue, command, false); command->retain(); } - if(stopEvent != nullptr) { + if (stopEvent != nullptr) { eStop->addMarker(queue, command, false); command->retain(); } @@ -313,8 +315,17 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra); - - HIP_RETURN(ihipModuleLaunchKernel(f, gridDimX * blockDimX, gridDimY * blockDimY, gridDimZ * blockDimZ, + size_t globalWorkSizeX = gridDimX * blockDimX; + size_t globalWorkSizeY = gridDimY * blockDimY; + size_t globalWorkSizeZ = gridDimZ * blockDimZ; + if (globalWorkSizeX > std::numeric_limits::max() || + globalWorkSizeY > std::numeric_limits::max() || + globalWorkSizeZ > std::numeric_limits::max()) { + HIP_RETURN(hipErrorInvalidConfiguration); + } + HIP_RETURN(ihipModuleLaunchKernel(f, static_cast(globalWorkSizeX), + static_cast(globalWorkSizeY), + static_cast(globalWorkSizeZ), blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra, nullptr, nullptr)); } @@ -337,37 +348,37 @@ hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, -hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t gridDimX, - uint32_t gridDimY, uint32_t gridDimZ, +hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, + uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ, uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ, size_t sharedMemBytes, hipStream_t hStream, void** kernelParams, void** extra, hipEvent_t startEvent, hipEvent_t stopEvent) { - HIP_INIT_API(hipHccModuleLaunchKernel, f, gridDimX, gridDimY, gridDimZ, + HIP_INIT_API(hipHccModuleLaunchKernel, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent); - HIP_RETURN(ihipModuleLaunchKernel(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, + HIP_RETURN(ihipModuleLaunchKernel(f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent)); } -hipError_t hipModuleLaunchKernelExt(hipFunction_t f, uint32_t gridDimX, - uint32_t gridDimY, uint32_t gridDimZ, +hipError_t hipModuleLaunchKernelExt(hipFunction_t f, uint32_t globalWorkSizeX, + uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ, uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ, size_t sharedMemBytes, hipStream_t hStream, void** kernelParams, void** extra, hipEvent_t startEvent, hipEvent_t stopEvent) { - HIP_INIT_API(hipModuleLaunchKernelExt, f, gridDimX, gridDimY, gridDimZ, + HIP_INIT_API(hipModuleLaunchKernelExt, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent); - HIP_RETURN(ihipModuleLaunchKernel(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, + HIP_RETURN(ihipModuleLaunchKernel(f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent)); } @@ -406,8 +417,17 @@ hipError_t hipLaunchCooperativeKernel(const void* f, int deviceId = ihipGetDevice(); hipFunction_t func = nullptr; HIP_RETURN_ONFAIL(PlatformState::instance().getStatFunc(&func, f, deviceId)); - - HIP_RETURN(ihipModuleLaunchKernel(func, gridDim.x * blockDim.x, gridDim.y * blockDim.y, gridDim.z * blockDim.z, + size_t globalWorkSizeX = gridDim.x * blockDim.x; + size_t globalWorkSizeY = gridDim.y * blockDim.y; + size_t globalWorkSizeZ = gridDim.z * blockDim.z; + if (globalWorkSizeX > std::numeric_limits::max() || + globalWorkSizeY > std::numeric_limits::max() || + globalWorkSizeZ > std::numeric_limits::max()) { + HIP_RETURN(hipErrorInvalidConfiguration); + } + HIP_RETURN(ihipModuleLaunchKernel(func, static_cast(globalWorkSizeX), + static_cast(globalWorkSizeY), + static_cast(globalWorkSizeZ), blockDim.x, blockDim.y, blockDim.z, sharedMemBytes, hStream, kernelParams, nullptr, nullptr, nullptr, 0, amd::NDRangeKernelCommand::CooperativeGroups)); @@ -481,11 +501,16 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL result = hipErrorInvalidDeviceFunction; HIP_RETURN(result); } - - result = ihipModuleLaunchKernel(func, - launch.gridDim.x * launch.blockDim.x, - launch.gridDim.y * launch.blockDim.y, - launch.gridDim.z * launch.blockDim.z, + size_t globalWorkSizeX = launch.gridDim.x * launch.blockDim.x; + size_t globalWorkSizeY = launch.gridDim.y * launch.blockDim.y; + size_t globalWorkSizeZ = launch.gridDim.z * launch.blockDim.z; + if (globalWorkSizeX > std::numeric_limits::max() || + globalWorkSizeY > std::numeric_limits::max() || + globalWorkSizeZ > std::numeric_limits::max()) { + HIP_RETURN(hipErrorInvalidConfiguration); + } + result = ihipModuleLaunchKernel(func, static_cast(globalWorkSizeX), + static_cast(globalWorkSizeY), static_cast(globalWorkSizeZ), launch.blockDim.x, launch.blockDim.y, launch.blockDim.z, launch.sharedMem, launch.stream, launch.args, nullptr, nullptr, nullptr, flags, extFlags, i, numDevices, prevGridSize, allGridSize, firstDevice); diff --git a/hipamd/rocclr/hip_platform.cpp b/hipamd/rocclr/hip_platform.cpp index 0dafd268a4..e03753e77c 100755 --- a/hipamd/rocclr/hip_platform.cpp +++ b/hipamd/rocclr/hip_platform.cpp @@ -629,8 +629,18 @@ hipError_t ihipLaunchKernel(const void* hostFunction, if ((hip_error != hipSuccess) || (func == nullptr)) { HIP_RETURN(hipErrorInvalidDeviceFunction); } - HIP_RETURN(ihipModuleLaunchKernel(func, (gridDim.x * blockDim.x), (gridDim.y * blockDim.y), - (gridDim.z * blockDim.z), blockDim.x, blockDim.y, blockDim.z, + size_t globalWorkSizeX = gridDim.x * blockDim.x; + size_t globalWorkSizeY = gridDim.y * blockDim.y; + size_t globalWorkSizeZ = gridDim.z * blockDim.z; + if (globalWorkSizeX > std::numeric_limits::max() || + globalWorkSizeY > std::numeric_limits::max() || + globalWorkSizeZ > std::numeric_limits::max()) { + HIP_RETURN(hipErrorInvalidConfiguration); + } + HIP_RETURN(ihipModuleLaunchKernel(func, static_cast(globalWorkSizeX), + static_cast(globalWorkSizeY), + static_cast(globalWorkSizeZ), + blockDim.x, blockDim.y, blockDim.z, sharedMemBytes, stream, args, nullptr, startEvent, stopEvent, flags)); } From bc76760452b4b94b47a3e386debe59b68b00f5ef Mon Sep 17 00:00:00 2001 From: Sarbojit Sarkar Date: Thu, 30 Jul 2020 12:32:15 -0400 Subject: [PATCH 24/41] Added free device memory info Recently there were few OOM(out of Memory) issues poped. Adding trace will help know genuine OOM issue. Change-Id: Ie2a56bba95cead24caf1af5c807e73c383c2f5d9 --- hipamd/rocclr/hip_memory.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hipamd/rocclr/hip_memory.cpp b/hipamd/rocclr/hip_memory.cpp index c909928a05..c26643071c 100755 --- a/hipamd/rocclr/hip_memory.cpp +++ b/hipamd/rocclr/hip_memory.cpp @@ -122,6 +122,9 @@ hipError_t ihipMalloc(void** ptr, size_t sizeBytes, unsigned int flags) *ptr = amd::SvmBuffer::malloc(*amdContext, flags, sizeBytes, amdContext->devices()[0]->info().memBaseAddrAlign_, useHostDevice ? curDevContext->svmDevices()[0] : nullptr); if (*ptr == nullptr) { + size_t free = 0, total =0; + hipMemGetInfo(&free, &total); + LogPrintfError("Allocation failed : Device memory : required :%u | free :%u | total :%u \n", sizeBytes, free, total); return hipErrorOutOfMemory; } From c24f884c767aac3eb70e42bc370edb977b9b25a9 Mon Sep 17 00:00:00 2001 From: Payam Date: Tue, 28 Jul 2020 09:37:10 -0400 Subject: [PATCH 25/41] adding duration to print log for hip api calls Change-Id: Icffa8fcacfb8ba24861d980b9553faeee4efec8c --- hipamd/rocclr/hip_internal.hpp | 19 +++++++- hipamd/rocclr/hip_memory.cpp | 84 +++++++++++++++++----------------- 2 files changed, 59 insertions(+), 44 deletions(-) diff --git a/hipamd/rocclr/hip_internal.hpp b/hipamd/rocclr/hip_internal.hpp index 6dbf77baed..a950961ea7 100755 --- a/hipamd/rocclr/hip_internal.hpp +++ b/hipamd/rocclr/hip_internal.hpp @@ -37,6 +37,15 @@ #include #endif +#define KNRM "\x1B[0m" +#define KRED "\x1B[31m" +#define KGRN "\x1B[32m" +#define KYEL "\x1B[33m" +#define KBLU "\x1B[34m" +#define KMAG "\x1B[35m" +#define KCYN "\x1B[36m" +#define KWHT "\x1B[37m" + /*! IHIP IPC MEMORY Structure */ #define IHIP_IPC_MEM_HANDLE_SIZE 32 #define IHIP_IPC_MEM_RESERVED_SIZE LP64_SWITCH(28,24) @@ -58,8 +67,8 @@ typedef struct ihipIpcMemHandle_st { } #define HIP_API_PRINT(...) \ - ClPrint(amd::LOG_INFO, amd::LOG_API, "%-5d: [%zx] %s ( %s )", getpid(), std::this_thread::get_id(), \ - __func__, ToString( __VA_ARGS__ ).c_str()); + uint64_t startTimeUs=0 ; HIPPrintDuration(amd::LOG_INFO, amd::LOG_API, &startTimeUs, "%-5d: [%zx] %s%s ( %s )%s", getpid(), std::this_thread::get_id(), KGRN, \ + __func__, ToString( __VA_ARGS__ ).c_str(),KNRM); #define HIP_ERROR_PRINT(err, ...) \ ClPrint(amd::LOG_INFO, amd::LOG_API, "%-5d: [%zx] %s: Returned %s : %s", getpid(), std::this_thread::get_id(), \ @@ -75,6 +84,12 @@ typedef struct ihipIpcMemHandle_st { HIP_INIT() \ HIP_CB_SPAWNER_OBJECT(cid); +#define HIP_RETURN_DURATION(ret, ...) \ + hip::g_lastError = ret; \ + HIPPrintDuration(amd::LOG_INFO, amd::LOG_API, &startTimeUs, "%-5d: [%zx] %s: Returned %s : %s", getpid(), std::this_thread::get_id(), \ + __func__, hipGetErrorName(hip::g_lastError), ToString( __VA_ARGS__ ).c_str()); \ + return hip::g_lastError; + #define HIP_RETURN(ret, ...) \ hip::g_lastError = ret; \ HIP_ERROR_PRINT(hip::g_lastError, __VA_ARGS__) \ diff --git a/hipamd/rocclr/hip_memory.cpp b/hipamd/rocclr/hip_memory.cpp index c26643071c..a24c8f93df 100755 --- a/hipamd/rocclr/hip_memory.cpp +++ b/hipamd/rocclr/hip_memory.cpp @@ -243,7 +243,7 @@ hipError_t hipExtMallocWithFlags(void** ptr, size_t sizeBytes, unsigned int flag hipError_t hipMalloc(void** ptr, size_t sizeBytes) { HIP_INIT_API(hipMalloc, ptr, sizeBytes); - HIP_RETURN(ihipMalloc(ptr, sizeBytes, 0), *ptr); + HIP_RETURN_DURATION(ihipMalloc(ptr, sizeBytes, 0), *ptr); } hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) { @@ -275,7 +275,7 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) { ihipFlags |= CL_MEM_FOLLOW_USER_NUMA_POLICY; } - HIP_RETURN(ihipMalloc(ptr, sizeBytes, ihipFlags), *ptr); + HIP_RETURN_DURATION(ihipMalloc(ptr, sizeBytes, ihipFlags), *ptr); } hipError_t hipFree(void* ptr) { @@ -288,7 +288,7 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind HIP_INIT_API(hipMemcpy, dst, src, sizeBytes, kind); amd::HostQueue* queue = hip::getNullStream(); - HIP_RETURN(ihipMemcpy(dst, src, sizeBytes, kind, *queue)); + HIP_RETURN_DURATION(ihipMemcpy(dst, src, sizeBytes, kind, *queue)); } hipError_t hipMemcpyWithStream(void* dst, const void* src, size_t sizeBytes, @@ -297,7 +297,7 @@ hipError_t hipMemcpyWithStream(void* dst, const void* src, size_t sizeBytes, amd::HostQueue* queue = hip::getQueue(stream); - HIP_RETURN(ihipMemcpy(dst, src, sizeBytes, kind, *queue, false)); + HIP_RETURN_DURATION(ihipMemcpy(dst, src, sizeBytes, kind, *queue, false)); } hipError_t hipMemPtrGetInfo(void *ptr, size_t *size) { @@ -721,7 +721,7 @@ hipError_t hipHostRegister(void* hostPtr, size_t sizeBytes, unsigned int flags) amd::MemObjMap::AddMemObj(hostPtr, mem); HIP_RETURN(hipSuccess); } else { - HIP_RETURN(ihipMalloc(&hostPtr, sizeBytes, flags), hostPtr); + HIP_RETURN_DURATION(ihipMalloc(&hostPtr, sizeBytes, flags), hostPtr); } } @@ -784,7 +784,7 @@ hipError_t hipMemcpyToSymbol(const void* symbol, const void* src, size_t sizeByt device_ptr = reinterpret_cast
(device_ptr) + offset; /* Copy memory from source to destination address */ - HIP_RETURN(hipMemcpy(device_ptr, src, sizeBytes, kind)); + HIP_RETURN_DURATION(hipMemcpy(device_ptr, src, sizeBytes, kind)); } hipError_t hipMemcpyFromSymbol(void* dst, const void* symbol, size_t sizeBytes, @@ -806,7 +806,7 @@ hipError_t hipMemcpyFromSymbol(void* dst, const void* symbol, size_t sizeBytes, device_ptr = reinterpret_cast
(device_ptr) + offset; /* Copy memory from source to destination address */ - HIP_RETURN(hipMemcpy(dst, device_ptr, sizeBytes, kind)); + HIP_RETURN_DURATION(hipMemcpy(dst, device_ptr, sizeBytes, kind)); } hipError_t hipMemcpyToSymbolAsync(const void* symbol, const void* src, size_t sizeBytes, @@ -828,7 +828,7 @@ hipError_t hipMemcpyToSymbolAsync(const void* symbol, const void* src, size_t si device_ptr = reinterpret_cast
(device_ptr) + offset; /* Copy memory from source to destination address */ - HIP_RETURN(hipMemcpyAsync(device_ptr, src, sizeBytes, kind, stream)); + HIP_RETURN_DURATION(hipMemcpyAsync(device_ptr, src, sizeBytes, kind, stream)); } hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbol, size_t sizeBytes, @@ -850,7 +850,7 @@ hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbol, size_t sizeBy device_ptr = reinterpret_cast
(device_ptr) + offset; /* Copy memory from source to destination address */ - HIP_RETURN(hipMemcpyAsync(dst, device_ptr, sizeBytes, kind, stream)); + HIP_RETURN_DURATION(hipMemcpyAsync(dst, device_ptr, sizeBytes, kind, stream)); } hipError_t hipMemcpyHtoD(hipDeviceptr_t dstDevice, @@ -858,7 +858,7 @@ hipError_t hipMemcpyHtoD(hipDeviceptr_t dstDevice, size_t ByteCount) { HIP_INIT_API(hipMemcpyHtoD, dstDevice, srcHost, ByteCount); - HIP_RETURN(ihipMemcpy(dstDevice, srcHost, ByteCount, hipMemcpyHostToDevice, *hip::getQueue(nullptr))); + HIP_RETURN_DURATION(ihipMemcpy(dstDevice, srcHost, ByteCount, hipMemcpyHostToDevice, *hip::getQueue(nullptr))); } hipError_t hipMemcpyDtoH(void* dstHost, @@ -866,7 +866,7 @@ hipError_t hipMemcpyDtoH(void* dstHost, size_t ByteCount) { HIP_INIT_API(hipMemcpyDtoH, dstHost, srcDevice, ByteCount); - HIP_RETURN(ihipMemcpy(dstHost, srcDevice, ByteCount, hipMemcpyDeviceToHost, *hip::getQueue(nullptr))); + HIP_RETURN_DURATION(ihipMemcpy(dstHost, srcDevice, ByteCount, hipMemcpyDeviceToHost, *hip::getQueue(nullptr))); } hipError_t hipMemcpyDtoD(hipDeviceptr_t dstDevice, @@ -874,7 +874,7 @@ hipError_t hipMemcpyDtoD(hipDeviceptr_t dstDevice, size_t ByteCount) { HIP_INIT_API(hipMemcpyDtoD, dstDevice, srcDevice, ByteCount); - HIP_RETURN(ihipMemcpy(dstDevice, srcDevice, ByteCount, hipMemcpyDeviceToDevice, *hip::getQueue(nullptr))); + HIP_RETURN_DURATION(ihipMemcpy(dstDevice, srcDevice, ByteCount, hipMemcpyDeviceToDevice, *hip::getQueue(nullptr))); } hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, @@ -883,7 +883,7 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, amd::HostQueue* queue = hip::getQueue(stream); - HIP_RETURN(ihipMemcpy(dst, src, sizeBytes, kind, *queue, true)); + HIP_RETURN_DURATION(ihipMemcpy(dst, src, sizeBytes, kind, *queue, true)); } hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dstDevice, @@ -892,7 +892,7 @@ hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dstDevice, hipStream_t stream) { HIP_INIT_API(hipMemcpyHtoDAsync, dstDevice, srcHost, ByteCount, stream); - HIP_RETURN(ihipMemcpy(dstDevice, srcHost, ByteCount, hipMemcpyHostToDevice, *hip::getQueue(stream), true)); + HIP_RETURN_DURATION(ihipMemcpy(dstDevice, srcHost, ByteCount, hipMemcpyHostToDevice, *hip::getQueue(stream), true)); } hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dstDevice, @@ -901,7 +901,7 @@ hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dstDevice, hipStream_t stream) { HIP_INIT_API(hipMemcpyDtoDAsync, dstDevice, srcDevice, ByteCount, stream); - HIP_RETURN(ihipMemcpy(dstDevice, srcDevice, ByteCount, hipMemcpyDeviceToDevice, *hip::getQueue(stream), true)); + HIP_RETURN_DURATION(ihipMemcpy(dstDevice, srcDevice, ByteCount, hipMemcpyDeviceToDevice, *hip::getQueue(stream), true)); } hipError_t hipMemcpyDtoHAsync(void* dstHost, @@ -910,7 +910,7 @@ hipError_t hipMemcpyDtoHAsync(void* dstHost, hipStream_t stream) { HIP_INIT_API(hipMemcpyDtoHAsync, dstHost, srcDevice, ByteCount, stream); - HIP_RETURN(ihipMemcpy(dstHost, srcDevice, ByteCount, hipMemcpyDeviceToHost, *hip::getQueue(stream), true)); + HIP_RETURN_DURATION(ihipMemcpy(dstHost, srcDevice, ByteCount, hipMemcpyDeviceToHost, *hip::getQueue(stream), true)); } hipError_t ihipMemcpyAtoD(hipArray* srcArray, @@ -1524,21 +1524,21 @@ hipError_t ihipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch hipError_t hipMemcpyParam2D(const hip_Memcpy2D* pCopy) { HIP_INIT_API(hipMemcpyParam2D, pCopy); - HIP_RETURN(ihipMemcpyParam2D(pCopy, nullptr)); + HIP_RETURN_DURATION(ihipMemcpyParam2D(pCopy, nullptr)); } hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind) { HIP_INIT_API(hipMemcpy2D, dst, dpitch, src, spitch, width, height, kind); - HIP_RETURN(ihipMemcpy2D(dst, dpitch, src, spitch, width, height, kind, nullptr)); + HIP_RETURN_DURATION(ihipMemcpy2D(dst, dpitch, src, spitch, width, height, kind, nullptr)); } hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream) { HIP_INIT_API(hipMemcpy2DAsync, dst, dpitch, src, spitch, width, height, kind, stream); - HIP_RETURN(ihipMemcpy2D(dst, dpitch, src, spitch, width, height, kind, stream, true)); + HIP_RETURN_DURATION(ihipMemcpy2D(dst, dpitch, src, spitch, width, height, kind, stream, true)); } hipError_t ihipMemcpy2DToArray(hipArray_t dst, size_t wOffset, size_t hOffset, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream, bool isAsync = false) { @@ -1569,7 +1569,7 @@ hipError_t ihipMemcpy2DToArray(hipArray_t dst, size_t wOffset, size_t hOffset, c hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind) { HIP_INIT_API(hipMemcpy2DToArray, dst, wOffset, hOffset, src, spitch, width, height, kind); - HIP_RETURN(ihipMemcpy2DToArray(dst, wOffset, hOffset, src, spitch, width, height, kind, nullptr)); + HIP_RETURN_DURATION(ihipMemcpy2DToArray(dst, wOffset, hOffset, src, spitch, width, height, kind, nullptr)); } hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, size_t count, hipMemcpyKind kind) { @@ -1584,7 +1584,7 @@ hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset, const const size_t height = (count / dst->width) / hip::getElementSize(dst); - HIP_RETURN(ihipMemcpy2DToArray(dst, wOffset, hOffset, src, 0 /* spitch */, witdthInBytes, height, kind, nullptr)); + HIP_RETURN_DURATION(ihipMemcpy2DToArray(dst, wOffset, hOffset, src, 0 /* spitch */, witdthInBytes, height, kind, nullptr)); } hipError_t ihipMemcpy2DFromArray(void* dst, size_t dpitch, hipArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream, bool isAsync = false) { @@ -1624,7 +1624,7 @@ hipError_t hipMemcpyFromArray(void* dst, hipArray_const_t src, size_t wOffsetSrc const size_t height = (count / src->width) / hip::getElementSize(src); - HIP_RETURN(ihipMemcpy2DFromArray(dst, 0 /* dpitch */, src, wOffsetSrc, hOffset, witdthInBytes, height, kind, nullptr)); + HIP_RETURN_DURATION(ihipMemcpy2DFromArray(dst, 0 /* dpitch */, src, wOffsetSrc, hOffset, witdthInBytes, height, kind, nullptr)); } hipError_t hipMemcpyHtoA(hipArray* dstArray, @@ -1633,7 +1633,7 @@ hipError_t hipMemcpyHtoA(hipArray* dstArray, size_t ByteCount) { HIP_INIT_API(hipMemcpyHtoA, dstArray, dstOffset, srcHost, ByteCount); - HIP_RETURN(ihipMemcpyHtoA(srcHost, dstArray, {0, 0, 0}, {dstOffset, 0, 0}, {ByteCount, 1, 1}, 0, 0, nullptr)); + HIP_RETURN_DURATION(ihipMemcpyHtoA(srcHost, dstArray, {0, 0, 0}, {dstOffset, 0, 0}, {ByteCount, 1, 1}, 0, 0, nullptr)); } hipError_t hipMemcpyAtoH(void* dstHost, @@ -1642,7 +1642,7 @@ hipError_t hipMemcpyAtoH(void* dstHost, size_t ByteCount) { HIP_INIT_API(hipMemcpyAtoH, dstHost, srcArray, srcOffset, ByteCount); - HIP_RETURN(ihipMemcpyAtoH(srcArray, dstHost, {srcOffset, 0, 0}, {0, 0, 0}, {ByteCount, 1, 1}, 0, 0, nullptr)); + HIP_RETURN_DURATION(ihipMemcpyAtoH(srcArray, dstHost, {srcOffset, 0, 0}, {0, 0, 0}, {ByteCount, 1, 1}, 0, 0, nullptr)); } hipError_t ihipMemcpy3D(const hipMemcpy3DParms* p, @@ -1669,25 +1669,25 @@ hipError_t ihipMemcpy3D(const hipMemcpy3DParms* p, hipError_t hipMemcpy3D(const hipMemcpy3DParms* p) { HIP_INIT_API(hipMemcpy3D, p); - HIP_RETURN(ihipMemcpy3D(p, nullptr)); + HIP_RETURN_DURATION(ihipMemcpy3D(p, nullptr)); } hipError_t hipMemcpy3DAsync(const hipMemcpy3DParms* p, hipStream_t stream) { HIP_INIT_API(hipMemcpy3DAsync, p, stream); - HIP_RETURN(ihipMemcpy3D(p, stream, true)); + HIP_RETURN_DURATION(ihipMemcpy3D(p, stream, true)); } hipError_t hipDrvMemcpy3D(const HIP_MEMCPY3D* pCopy) { HIP_INIT_API(hipDrvMemcpy3D, pCopy); - HIP_RETURN(ihipMemcpyParam3D(pCopy, nullptr)); + HIP_RETURN_DURATION(ihipMemcpyParam3D(pCopy, nullptr)); } hipError_t hipDrvMemcpy3DAsync(const HIP_MEMCPY3D* pCopy, hipStream_t stream) { HIP_INIT_API(hipDrvMemcpy3DAsync, pCopy, stream); - HIP_RETURN(ihipMemcpyParam3D(pCopy, stream, true)); + HIP_RETURN_DURATION(ihipMemcpyParam3D(pCopy, stream, true)); } hipError_t packFillMemoryCommand(amd::Memory* memory, size_t offset, int64_t value, size_t valueSize, @@ -1936,7 +1936,7 @@ hipError_t hipMemAllocPitch(hipDeviceptr_t* dptr, size_t* pitch, size_t widthInB hipError_t hipMemAllocHost(void** ptr, size_t size) { HIP_INIT_API(hipMemAllocHost, ptr, size); - HIP_RETURN(hipHostMalloc(ptr, size, 0)); + HIP_RETURN_DURATION(hipHostMalloc(ptr, size, 0)); } hipError_t hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* dev_ptr) { @@ -2118,25 +2118,25 @@ hipError_t ihipMemcpy2DArrayToArray(hipArray_t dst, size_t wOffsetDst, size_t hO hipError_t hipMemcpy2DArrayToArray(hipArray_t dst, size_t wOffsetDst, size_t hOffsetDst, hipArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, hipMemcpyKind kind) { HIP_INIT_API(hipMemcpy2DArrayToArray, dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc, width, height, kind); - HIP_RETURN(ihipMemcpy2DArrayToArray(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc, width, height, kind, nullptr)); + HIP_RETURN_DURATION(ihipMemcpy2DArrayToArray(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc, width, height, kind, nullptr)); } hipError_t hipMemcpyArrayToArray(hipArray_t dst, size_t wOffsetDst, size_t hOffsetDst, hipArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, hipMemcpyKind kind) { HIP_INIT_API(hipMemcpyArrayToArray, dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc, width, height, kind); - HIP_RETURN(ihipMemcpy2DArrayToArray(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc, width, height, kind, nullptr)); + HIP_RETURN_DURATION(ihipMemcpy2DArrayToArray(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc, width, height, kind, nullptr)); } hipError_t hipMemcpy2DFromArray(void* dst, size_t dpitch, hipArray_const_t src, size_t wOffsetSrc, size_t hOffset, size_t width, size_t height, hipMemcpyKind kind) { HIP_INIT_API(hipMemcpy2DFromArray, dst, dpitch, src, wOffsetSrc, hOffset, width, height, kind); - HIP_RETURN(ihipMemcpy2DFromArray(dst, dpitch, src, wOffsetSrc, hOffset, width, height, kind, nullptr)); + HIP_RETURN_DURATION(ihipMemcpy2DFromArray(dst, dpitch, src, wOffsetSrc, hOffset, width, height, kind, nullptr)); } hipError_t hipMemcpy2DFromArrayAsync(void* dst, size_t dpitch, hipArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream) { HIP_INIT_API(hipMemcpy2DFromArrayAsync, dst, dpitch, src, wOffsetSrc, hOffsetSrc, width, height, kind, stream); - HIP_RETURN(ihipMemcpy2DFromArray(dst, dpitch, src, wOffsetSrc, hOffsetSrc, width, height, kind, stream, true)); + HIP_RETURN_DURATION(ihipMemcpy2DFromArray(dst, dpitch, src, wOffsetSrc, hOffsetSrc, width, height, kind, stream, true)); } hipError_t hipMemcpyFromArrayAsync(void* dst, hipArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, hipMemcpyKind kind, hipStream_t stream) { @@ -2151,13 +2151,13 @@ hipError_t hipMemcpyFromArrayAsync(void* dst, hipArray_const_t src, size_t wOffs const size_t height = (count / src->width) / hip::getElementSize(src); - HIP_RETURN(ihipMemcpy2DFromArray(dst, 0 /* dpitch */, src, wOffsetSrc, hOffsetSrc, widthInBytes, height, kind, stream, true)); + HIP_RETURN_DURATION(ihipMemcpy2DFromArray(dst, 0 /* dpitch */, src, wOffsetSrc, hOffsetSrc, widthInBytes, height, kind, stream, true)); } hipError_t hipMemcpy2DToArrayAsync(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream) { HIP_INIT_API(hipMemcpy2DToArrayAsync, dst, wOffset, hOffset, src, spitch, width, height, kind); - HIP_RETURN(ihipMemcpy2DToArray(dst, wOffset, hOffset, src, spitch, width, height, kind, stream, true)); + HIP_RETURN_DURATION(ihipMemcpy2DToArray(dst, wOffset, hOffset, src, spitch, width, height, kind, stream, true)); } hipError_t hipMemcpyToArrayAsync(hipArray_t dst, size_t wOffset, size_t hOffset, const void* src, size_t count, hipMemcpyKind kind, hipStream_t stream) { @@ -2172,7 +2172,7 @@ hipError_t hipMemcpyToArrayAsync(hipArray_t dst, size_t wOffset, size_t hOffset, const size_t height = (count / dst->width) / hip::getElementSize(dst); - HIP_RETURN(ihipMemcpy2DToArray(dst, wOffset, hOffset, src, 0 /* spitch */, widthInBytes, height, kind, stream, true)); + HIP_RETURN_DURATION(ihipMemcpy2DToArray(dst, wOffset, hOffset, src, 0 /* spitch */, widthInBytes, height, kind, stream, true)); } hipError_t hipMemcpyAtoA(hipArray* dstArray, @@ -2182,7 +2182,7 @@ hipError_t hipMemcpyAtoA(hipArray* dstArray, size_t ByteCount) { HIP_INIT_API(hipMemcpyAtoA, dstArray, dstOffset, srcArray, srcOffset, ByteCount); - HIP_RETURN(ihipMemcpyAtoA(srcArray, dstArray, {srcOffset, 0, 0}, {dstOffset, 0, 0}, {ByteCount, 1, 1}, nullptr)); + HIP_RETURN_DURATION(ihipMemcpyAtoA(srcArray, dstArray, {srcOffset, 0, 0}, {dstOffset, 0, 0}, {ByteCount, 1, 1}, nullptr)); } hipError_t hipMemcpyAtoD(hipDeviceptr_t dstDevice, @@ -2191,7 +2191,7 @@ hipError_t hipMemcpyAtoD(hipDeviceptr_t dstDevice, size_t ByteCount) { HIP_INIT_API(hipMemcpyAtoD, dstDevice, srcArray, srcOffset, ByteCount); - HIP_RETURN(ihipMemcpyAtoD(srcArray, dstDevice, {srcOffset, 0, 0}, {0, 0, 0}, {ByteCount, 1, 1}, 0, 0, nullptr)); + HIP_RETURN_DURATION(ihipMemcpyAtoD(srcArray, dstDevice, {srcOffset, 0, 0}, {0, 0, 0}, {ByteCount, 1, 1}, 0, 0, nullptr)); } hipError_t hipMemcpyAtoHAsync(void* dstHost, @@ -2201,7 +2201,7 @@ hipError_t hipMemcpyAtoHAsync(void* dstHost, hipStream_t stream) { HIP_INIT_API(hipMemcpyAtoHAsync, dstHost, srcArray, srcOffset, ByteCount, stream); - HIP_RETURN(ihipMemcpyAtoH(srcArray, dstHost, {srcOffset, 0, 0}, {0, 0, 0}, {ByteCount, 1, 1}, 0, 0, stream, true)); + HIP_RETURN_DURATION(ihipMemcpyAtoH(srcArray, dstHost, {srcOffset, 0, 0}, {0, 0, 0}, {ByteCount, 1, 1}, 0, 0, stream, true)); } hipError_t hipMemcpyDtoA(hipArray* dstArray, @@ -2210,7 +2210,7 @@ hipError_t hipMemcpyDtoA(hipArray* dstArray, size_t ByteCount) { HIP_INIT_API(hipMemcpyDtoA, dstArray, dstOffset, srcDevice, ByteCount); - HIP_RETURN(ihipMemcpyDtoA(srcDevice, dstArray, {0, 0, 0}, {dstOffset, 0, 0}, {ByteCount, 1, 1}, 0, 0, nullptr)); + HIP_RETURN_DURATION(ihipMemcpyDtoA(srcDevice, dstArray, {0, 0, 0}, {dstOffset, 0, 0}, {ByteCount, 1, 1}, 0, 0, nullptr)); } hipError_t hipMemcpyHtoAAsync(hipArray* dstArray, @@ -2220,7 +2220,7 @@ hipError_t hipMemcpyHtoAAsync(hipArray* dstArray, hipStream_t stream) { HIP_INIT_API(hipMemcpyHtoAAsync, dstArray, dstOffset, srcHost, ByteCount, stream); - HIP_RETURN(ihipMemcpyHtoA(srcHost, dstArray, {0, 0, 0}, {dstOffset, 0, 0}, {ByteCount, 1, 1}, 0, 0, stream, true)); + HIP_RETURN_DURATION(ihipMemcpyHtoA(srcHost, dstArray, {0, 0, 0}, {dstOffset, 0, 0}, {ByteCount, 1, 1}, 0, 0, stream, true)); } hipError_t hipMipmappedArrayCreate(hipMipmappedArray_t* pHandle, @@ -2277,7 +2277,7 @@ hipError_t hipMallocHost(void** ptr, HIP_RETURN(hipErrorInvalidValue); } - HIP_RETURN(ihipMalloc(ptr, size, CL_MEM_SVM_FINE_GRAIN_BUFFER), *ptr); + HIP_RETURN_DURATION(ihipMalloc(ptr, size, CL_MEM_SVM_FINE_GRAIN_BUFFER), *ptr); } hipError_t hipFreeHost(void *ptr) { From dec95e58e348987ef461da3fd870d5a7c04655b0 Mon Sep 17 00:00:00 2001 From: Saleel Kudchadker Date: Wed, 22 Jul 2020 11:35:15 -0700 Subject: [PATCH 26/41] Enable queue profile only if we attach a profiler Submit explicit profile marker for hipEventRecord to record timestamps. Enable explicit signal profiling if the API specifies start and stop events. Toggle this with env var HIP_FORCE_QUEUE_PROFILING=0 Change-Id: Iae449a63ec3ebf6c2880e65d7b1dd1031a29018f --- hipamd/rocclr/hip_event.cpp | 22 ++++++++++++++-------- hipamd/rocclr/hip_event.hpp | 7 ++++--- hipamd/rocclr/hip_module.cpp | 13 ++++++++----- hipamd/rocclr/hip_stream.cpp | 15 ++++++++++++--- 4 files changed, 38 insertions(+), 19 deletions(-) diff --git a/hipamd/rocclr/hip_event.cpp b/hipamd/rocclr/hip_event.cpp index a9ea30e15c..858309d638 100644 --- a/hipamd/rocclr/hip_event.cpp +++ b/hipamd/rocclr/hip_event.cpp @@ -140,6 +140,19 @@ hipError_t Event::streamWait(amd::HostQueue* hostQueue, uint flags) { void Event::addMarker(amd::HostQueue* queue, amd::Command* command, bool record) { amd::ScopedLock lock(lock_); + if (queue->properties().test(CL_QUEUE_PROFILING_ENABLE)) { + if (command == nullptr) { + command = queue->getLastQueuedCommand(true); + if (command == nullptr) { + command = new amd::Marker(*queue, kMarkerDisableFlush); + command->enqueue(); + } + } + } else if (command == nullptr) { + command = new hip::ProfileMarker(*queue, false); + command->enqueue(); + } + if (event_ == &command->event()) return; if (event_ != nullptr) { @@ -239,16 +252,9 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream) { } hip::Event* e = reinterpret_cast(event); - amd::ScopedLock lock(e->lock()); - amd::HostQueue* queue = hip::getQueue(stream); - amd::Command* command = queue->getLastQueuedCommand(true); - if (command == nullptr) { - command = new amd::Marker(*queue, kMarkerDisableFlush); - command->enqueue(); - } - e->addMarker(queue, command, true); + e->addMarker(queue, nullptr, true); HIP_RETURN(hipSuccess); } diff --git a/hipamd/rocclr/hip_event.hpp b/hipamd/rocclr/hip_event.hpp index dbd43b1a5c..dccd4e884a 100644 --- a/hipamd/rocclr/hip_event.hpp +++ b/hipamd/rocclr/hip_event.hpp @@ -26,12 +26,13 @@ namespace hip { -class TimerMarker: public amd::Marker { +class ProfileMarker: public amd::Marker { public: - TimerMarker(amd::HostQueue& queue) : amd::Marker(queue, false) { + ProfileMarker(amd::HostQueue& queue, bool disableFlush) + : amd::Marker(queue, disableFlush) { profilingInfo_.enabled_ = true; profilingInfo_.callback_ = nullptr; - profilingInfo_.start_ = profilingInfo_.end_ = 0; + profilingInfo_.clear(); } }; diff --git a/hipamd/rocclr/hip_module.cpp b/hipamd/rocclr/hip_module.cpp index 07944c68ad..07eeb55a84 100755 --- a/hipamd/rocclr/hip_module.cpp +++ b/hipamd/rocclr/hip_module.cpp @@ -249,7 +249,7 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, size_t localWorkSize[3] = { blockDimX, blockDimY, blockDimZ }; amd::NDRangeContainer ndrange(3, globalWorkOffset, globalWorkSize, localWorkSize); amd::Command::EventWaitList waitList; - + bool profileNDRange = false; address kernargs = nullptr; // 'extra' is a struct that contains the following info: { @@ -273,13 +273,16 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, desc.type_ == T_POINTER/*svmBound*/); } else { assert(extra == nullptr); - kernel->parameters().set(i, desc.size_, kernelParams[i], desc.type_ == T_POINTER/*svmBound*/); + kernel->parameters().set(i, desc.size_, kernelParams[i], + desc.type_ == T_POINTER/*svmBound*/); } } + profileNDRange = (startEvent != nullptr && stopEvent != nullptr); + amd::NDRangeKernelCommand* command = new amd::NDRangeKernelCommand( *queue, waitList, *kernel, ndrange, sharedMemBytes, - params, gridId, numGrids, prevGridSum, allGridSum, firstDevice); + params, gridId, numGrids, prevGridSum, allGridSum, firstDevice, profileNDRange); if (!command) { return hipErrorOutOfMemory; } @@ -472,7 +475,7 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL uint64_t prevGridSize = 0; uint32_t firstDevice = 0; - // Sync the execution streams on all devices + // Sync the execution streams on all devices if ((flags & hipCooperativeLaunchMultiDeviceNoPreSync) == 0) { for (int i = 0; i < numDevices; ++i) { amd::HostQueue* queue = @@ -520,7 +523,7 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL prevGridSize += launch.gridDim.x * launch.gridDim.y * launch.gridDim.z; } - // Sync the execution streams on all devices + // Sync the execution streams on all devices if ((flags & hipCooperativeLaunchMultiDeviceNoPostSync) == 0) { for (int i = 0; i < numDevices; ++i) { amd::HostQueue* queue = diff --git a/hipamd/rocclr/hip_stream.cpp b/hipamd/rocclr/hip_stream.cpp index 9da673f023..379954ef5c 100755 --- a/hipamd/rocclr/hip_stream.cpp +++ b/hipamd/rocclr/hip_stream.cpp @@ -22,6 +22,9 @@ #include "hip_internal.hpp" #include "hip_event.hpp" #include "thread/monitor.hpp" +#include "hip_prof_api.h" + +extern api_callbacks_table_t callbacks_table; static amd::Monitor streamSetLock{"Guards global stream set"}; static std::unordered_set streamSet; @@ -50,7 +53,12 @@ Stream::Stream(hip::Device* dev, Priority p, // ================================================================================================ bool Stream::Create() { - cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE; + // Enable queue profiling if a profiler is attached which sets the callback_table flag + // or if we force it with env var. This would enable time stamp collection for every + // command submitted to the stream(queue). + cl_command_queue_properties properties = (callbacks_table.is_enabled() || + HIP_FORCE_QUEUE_PROFILING) ? + CL_QUEUE_PROFILING_ENABLE : 0; amd::CommandQueue::Priority p; switch (priority_) { case Priority::High: @@ -64,8 +72,9 @@ bool Stream::Create() { p = amd::CommandQueue::Priority::Normal; break; } - amd::HostQueue* queue = new amd::HostQueue(*device_->asContext(), *device_->devices()[0], properties, - amd::CommandQueue::RealTimeDisabled, p, cuMask_); + amd::HostQueue* queue = new amd::HostQueue(*device_->asContext(), *device_->devices()[0], + properties, amd::CommandQueue::RealTimeDisabled, + p, cuMask_); // Create a host queue bool result = (queue != nullptr) ? queue->create() : false; From 94bfd485ef92305a8569ab421bfbfc2abdfa4d70 Mon Sep 17 00:00:00 2001 From: Jatin Chaudhary Date: Tue, 28 Jul 2020 08:16:58 -0400 Subject: [PATCH 27/41] Cuda 11 changes: https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaPointerAttributes.html\#structcudaPointerAttributes Change-Id: I8a5389b88df286043c365a734983a4c5de352102 --- .../include/hip/hcc_detail/hip_runtime_api.h | 51 +++++++++----- .../include/hip/nvcc_detail/hip_runtime_api.h | 66 ++++++++++++------- 2 files changed, 80 insertions(+), 37 deletions(-) diff --git a/hipamd/include/hip/hcc_detail/hip_runtime_api.h b/hipamd/include/hip/hcc_detail/hip_runtime_api.h index 3c8a775289..853c3ce6a8 100755 --- a/hipamd/include/hip/hcc_detail/hip_runtime_api.h +++ b/hipamd/include/hip/hcc_detail/hip_runtime_api.h @@ -2173,6 +2173,7 @@ hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset, con * @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol, * hipMemcpyAsync */ +DEPRECATED(DEPRECATED_MSG) hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, size_t count, hipMemcpyKind kind); @@ -2191,6 +2192,7 @@ hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset, const * @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol, * hipMemcpyAsync */ +DEPRECATED(DEPRECATED_MSG) hipError_t hipMemcpyFromArray(void* dst, hipArray_const_t srcArray, size_t wOffset, size_t hOffset, size_t count, hipMemcpyKind kind); @@ -3531,6 +3533,7 @@ hipError_t hipExtLaunchKernel(const void* function_address, dim3 numBlocks, dim3 void** args, size_t sharedMemBytes, hipStream_t stream, hipEvent_t startEvent, hipEvent_t stopEvent, int flags); +DEPRECATED(DEPRECATED_MSG) hipError_t hipBindTexture( size_t* offset, const textureReference* tex, @@ -3538,6 +3541,7 @@ hipError_t hipBindTexture( const hipChannelFormatDesc* desc, size_t size __dparm(UINT_MAX)); +DEPRECATED(DEPRECATED_MSG) hipError_t hipBindTexture2D( size_t* offset, const textureReference* tex, @@ -3547,6 +3551,7 @@ hipError_t hipBindTexture2D( size_t height, size_t pitch); +DEPRECATED(DEPRECATED_MSG) hipError_t hipBindTextureToArray( const textureReference* tex, hipArray_const_t array, @@ -3557,6 +3562,7 @@ hipError_t hipBindTextureToMipmappedArray( hipMipmappedArray_const_t mipmappedArray, const hipChannelFormatDesc* desc); +DEPRECATED(DEPRECATED_MSG) hipError_t hipGetTextureAlignmentOffset( size_t* offset, const textureReference* texref); @@ -3565,6 +3571,7 @@ hipError_t hipGetTextureReference( const textureReference** texref, const void* symbol); +DEPRECATED(DEPRECATED_MSG) hipError_t hipUnbindTexture(const textureReference* tex); hipError_t hipCreateTextureObject( @@ -3834,6 +3841,7 @@ inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( class TlsData; #if !__HIP_ROCclr__ +DEPRECATED(DEPRECATED_MSG) hipError_t hipBindTexture(size_t* offset, textureReference* tex, const void* devPtr, const hipChannelFormatDesc* desc, size_t size = UINT_MAX); #endif @@ -3861,6 +3869,7 @@ hipError_t ihipBindTextureImpl(TlsData *tls, int dim, enum hipTextureReadMode re **/ #if !__HIP_ROCclr__ template +DEPRECATED(DEPRECATED_MSG) hipError_t hipBindTexture(size_t* offset, struct texture& tex, const void* devPtr, const struct hipChannelFormatDesc& desc, size_t size = UINT_MAX) { return ihipBindTextureImpl(nullptr, dim, readMode, offset, devPtr, &desc, size, &tex); @@ -3883,6 +3892,7 @@ hipError_t hipBindTexture(size_t* offset, struct texture& tex, **/ #if !__HIP_ROCclr__ template +DEPRECATED(DEPRECATED_MSG) hipError_t hipBindTexture(size_t* offset, struct texture& tex, const void* devPtr, size_t size = UINT_MAX) { return ihipBindTextureImpl(nullptr, dim, readMode, offset, devPtr, &(tex.channelDesc), size, &tex); @@ -3891,6 +3901,7 @@ hipError_t hipBindTexture(size_t* offset, struct texture& tex, // C API #if !__HIP_ROCclr__ +DEPRECATED(DEPRECATED_MSG) hipError_t hipBindTexture2D(size_t* offset, textureReference* tex, const void* devPtr, const hipChannelFormatDesc* desc, size_t width, size_t height, size_t pitch); @@ -3904,6 +3915,7 @@ hipError_t ihipBindTexture2DImpl(int dim, enum hipTextureReadMode readMode, size #if !__HIP_ROCclr__ template +DEPRECATED(DEPRECATED_MSG) hipError_t hipBindTexture2D(size_t* offset, struct texture& tex, const void* devPtr, size_t width, size_t height, size_t pitch) { return ihipBindTexture2DImpl(dim, readMode, offset, devPtr, &(tex.channelDesc), width, height, @@ -3913,6 +3925,7 @@ hipError_t hipBindTexture2D(size_t* offset, struct texture& te #if !__HIP_ROCclr__ template +DEPRECATED(DEPRECATED_MSG) hipError_t hipBindTexture2D(size_t* offset, struct texture& tex, const void* devPtr, const struct hipChannelFormatDesc& desc, size_t width, size_t height, size_t pitch) { @@ -3922,6 +3935,7 @@ hipError_t hipBindTexture2D(size_t* offset, struct texture& te // C API #if !__HIP_ROCclr__ +DEPRECATED(DEPRECATED_MSG) hipError_t hipBindTextureToArray(textureReference* tex, hipArray_const_t array, const hipChannelFormatDesc* desc); #endif @@ -3935,6 +3949,7 @@ hipError_t ihipBindTextureToArrayImpl(TlsData *tls, int dim, enum hipTextureRead #if !__HIP_ROCclr__ template +DEPRECATED(DEPRECATED_MSG) hipError_t hipBindTextureToArray(struct texture& tex, hipArray_const_t array) { return ihipBindTextureToArrayImpl(nullptr, dim, readMode, array, tex.channelDesc, &tex); } @@ -3942,6 +3957,7 @@ hipError_t hipBindTextureToArray(struct texture& tex, hipArray #if !__HIP_ROCclr__ template +DEPRECATED(DEPRECATED_MSG) hipError_t hipBindTextureToArray(struct texture& tex, hipArray_const_t array, const struct hipChannelFormatDesc& desc) { return ihipBindTextureToArrayImpl(nullptr, dim, readMode, array, desc, &tex); @@ -3950,6 +3966,7 @@ hipError_t hipBindTextureToArray(struct texture& tex, hipArray #if !__HIP_ROCclr__ template +DEPRECATED(DEPRECATED_MSG) inline static hipError_t hipBindTextureToArray(struct texture *tex, hipArray_const_t array, const struct hipChannelFormatDesc* desc) { @@ -4019,6 +4036,7 @@ inline hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchPara * @return #hipSuccess **/ #if !__HIP_ROCclr__ +DEPRECATED(DEPRECATED_MSG) hipError_t hipUnbindTexture(const textureReference* tex); #endif @@ -4028,6 +4046,7 @@ extern hipError_t ihipUnbindTextureImpl(const hipTextureObject_t& textureObject) #if !__HIP_ROCclr__ template +DEPRECATED(DEPRECATED_MSG) hipError_t hipUnbindTexture(struct texture& tex) { return ihipUnbindTextureImpl(tex.textureObject); } @@ -4035,7 +4054,10 @@ hipError_t hipUnbindTexture(struct texture& tex) { #if !__HIP_ROCclr__ hipError_t hipGetChannelDesc(hipChannelFormatDesc* desc, hipArray_const_t array); + +DEPRECATED(DEPRECATED_MSG) hipError_t hipGetTextureAlignmentOffset(size_t* offset, const textureReference* texref); + hipError_t hipGetTextureReference(const textureReference** texref, const void* symbol); hipError_t hipCreateTextureObject(hipTextureObject_t* pTexObject, const hipResourceDesc* pResDesc, @@ -4078,28 +4100,23 @@ hipError_t hipCreateSurfaceObject(hipSurfaceObject_t* pSurfObject, const hipReso hipError_t hipDestroySurfaceObject(hipSurfaceObject_t surfaceObject); #if __HIP_ROCclr__ -template -static inline hipError_t hipBindTexture( - size_t *offset, - const struct texture &tex, - const void *devPtr, - size_t size = UINT_MAX) -{ +template +DEPRECATED(DEPRECATED_MSG) +static inline hipError_t hipBindTexture(size_t* offset, const struct texture& tex, + const void* devPtr, size_t size = UINT_MAX) { return hipBindTexture(offset, &tex, devPtr, &tex.channelDesc, size); } -template -static inline hipError_t hipBindTexture( - size_t *offset, - const struct texture &tex, - const void *devPtr, - const struct hipChannelFormatDesc &desc, - size_t size = UINT_MAX) -{ +template +DEPRECATED(DEPRECATED_MSG) +static inline hipError_t + hipBindTexture(size_t* offset, const struct texture& tex, const void* devPtr, + const struct hipChannelFormatDesc& desc, size_t size = UINT_MAX) { return hipBindTexture(offset, &tex, devPtr, &desc, size); } template +DEPRECATED(DEPRECATED_MSG) static inline hipError_t hipBindTexture2D( size_t *offset, const struct texture &tex, @@ -4112,6 +4129,7 @@ static inline hipError_t hipBindTexture2D( } template +DEPRECATED(DEPRECATED_MSG) static inline hipError_t hipBindTexture2D( size_t *offset, const struct texture &tex, @@ -4125,6 +4143,7 @@ static inline hipError_t hipBindTexture2D( } template +DEPRECATED(DEPRECATED_MSG) static inline hipError_t hipBindTextureToArray( const struct texture &tex, hipArray_const_t array) @@ -4135,6 +4154,7 @@ static inline hipError_t hipBindTextureToArray( } template +DEPRECATED(DEPRECATED_MSG) static inline hipError_t hipBindTextureToArray( const struct texture &tex, hipArray_const_t array, @@ -4168,6 +4188,7 @@ static inline hipError_t hipBindTextureToMipmappedArray( } template +DEPRECATED(DEPRECATED_MSG) static inline hipError_t hipUnbindTexture( const struct texture &tex) { diff --git a/hipamd/include/hip/nvcc_detail/hip_runtime_api.h b/hipamd/include/hip/nvcc_detail/hip_runtime_api.h index 3744c6740c..f9a2992cd1 100755 --- a/hipamd/include/hip/nvcc_detail/hip_runtime_api.h +++ b/hipamd/include/hip/nvcc_detail/hip_runtime_api.h @@ -37,6 +37,18 @@ extern "C" { #define __dparm(x) #endif +// Add Deprecated Support for CUDA Mapped HIP APIs +#if defined(__DOXYGEN_ONLY__) || defined(HIP_ENABLE_DEPRECATED) +#define __HIP_DEPRECATED +#elif defined(_MSC_VER) +#define __HIP_DEPRECATED __declspec(deprecated) +#elif defined(__GNUC__) +#define __HIP_DEPRECATED __attribute__((deprecated)) +#else +#define __HIP_DEPRECATED +#endif + + // TODO -move to include/hip_runtime_api.h as a common implementation. /** * Memory copy types @@ -963,14 +975,16 @@ inline static hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_ height, hipMemcpyKindToCudaMemcpyKind(kind))); } -inline static hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset, - const void* src, size_t count, hipMemcpyKind kind) { +__HIP_DEPRECATED inline static hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, + size_t hOffset, const void* src, + size_t count, hipMemcpyKind kind) { return hipCUDAErrorTohipError( cudaMemcpyToArray(dst, wOffset, hOffset, src, count, hipMemcpyKindToCudaMemcpyKind(kind))); } -inline static hipError_t hipMemcpyFromArray(void* dst, hipArray_const_t srcArray, size_t wOffset, - size_t hOffset, size_t count, hipMemcpyKind kind) { +__HIP_DEPRECATED inline static hipError_t hipMemcpyFromArray(void* dst, hipArray_const_t srcArray, + size_t wOffset, size_t hOffset, + size_t count, hipMemcpyKind kind) { return hipCUDAErrorTohipError(cudaMemcpyFromArray(dst, srcArray, wOffset, hOffset, count, hipMemcpyKindToCudaMemcpyKind(kind))); } @@ -1353,7 +1367,12 @@ inline static hipError_t hipPointerGetAttributes(hipPointerAttribute_t* attribut struct cudaPointerAttributes cPA; hipError_t err = hipCUDAErrorTohipError(cudaPointerGetAttributes(&cPA, ptr)); if (err == hipSuccess) { - switch (cPA.memoryType) { +#if (CUDART_VERSION >= 11000) + auto memType = cPA.type; +#else + unsigned memType = cPA.memoryType; // No auto because cuda 10.2 doesnt force c++11 +#endif + switch (memType) { case cudaMemoryTypeDevice: attributes->memoryType = hipMemoryTypeDevice; break; @@ -1695,14 +1714,17 @@ inline static hipError_t hipFuncSetCacheConfig(const void* func, hipFuncCache_t return hipCUDAErrorTohipError(cudaFuncSetCacheConfig(func, cacheConfig)); } -inline static hipError_t hipBindTexture(size_t* offset, struct textureReference* tex, const void* devPtr, - const hipChannelFormatDesc* desc, size_t size __dparm(UINT_MAX)){ +__HIP_DEPRECATED inline static hipError_t hipBindTexture(size_t* offset, + struct textureReference* tex, + const void* devPtr, + const hipChannelFormatDesc* desc, + size_t size __dparm(UINT_MAX)) { return hipCUDAErrorTohipError(cudaBindTexture(offset, tex, devPtr, desc, size)); } -inline static hipError_t hipBindTexture2D(size_t* offset, struct textureReference* tex, const void* devPtr, - const hipChannelFormatDesc* desc, size_t width, size_t height, - size_t pitch) { +__HIP_DEPRECATED inline static hipError_t hipBindTexture2D( + size_t* offset, struct textureReference* tex, const void* devPtr, + const hipChannelFormatDesc* desc, size_t width, size_t height, size_t pitch) { return hipCUDAErrorTohipError(cudaBindTexture2D(offset, tex, devPtr, desc, width, height, pitch)); } @@ -1737,8 +1759,8 @@ inline static hipError_t hipGetTextureObjectResourceDesc(hipResourceDesc* pResDe return hipCUDAErrorTohipError(cudaGetTextureObjectResourceDesc( pResDesc, textureObject)); } -inline static hipError_t hipGetTextureAlignmentOffset(size_t* offset, const struct textureReference* texref) -{ +__HIP_DEPRECATED inline static hipError_t hipGetTextureAlignmentOffset( + size_t* offset, const struct textureReference* texref) { return hipCUDAErrorTohipError(cudaGetTextureAlignmentOffset(offset,texref)); } @@ -1811,32 +1833,32 @@ inline static hipError_t hipBindTexture(size_t* offset, struct texture -inline static hipError_t hipUnbindTexture(struct texture* tex) { +__HIP_DEPRECATED inline static hipError_t hipUnbindTexture(struct texture* tex) { return hipCUDAErrorTohipError(cudaUnbindTexture(tex)); } template -inline static hipError_t hipUnbindTexture(struct texture &tex) { +__HIP_DEPRECATED inline static hipError_t hipUnbindTexture(struct texture& tex) { return hipCUDAErrorTohipError(cudaUnbindTexture(tex)); } template -inline static hipError_t hipBindTextureToArray(struct texture& tex, - hipArray_const_t array, - const hipChannelFormatDesc& desc) { +__HIP_DEPRECATED inline static hipError_t hipBindTextureToArray( + struct texture& tex, hipArray_const_t array, + const hipChannelFormatDesc& desc) { return hipCUDAErrorTohipError(cudaBindTextureToArray(tex, array, desc)); } template -inline static hipError_t hipBindTextureToArray(struct texture *tex, - hipArray_const_t array, - const hipChannelFormatDesc* desc) { +__HIP_DEPRECATED inline static hipError_t hipBindTextureToArray( + struct texture* tex, hipArray_const_t array, + const hipChannelFormatDesc* desc) { return hipCUDAErrorTohipError(cudaBindTextureToArray(tex, array, desc)); } template -inline static hipError_t hipBindTextureToArray(struct texture& tex, - hipArray_const_t array) { +__HIP_DEPRECATED inline static hipError_t hipBindTextureToArray( + struct texture& tex, hipArray_const_t array) { return hipCUDAErrorTohipError(cudaBindTextureToArray(tex, array)); } From f0d9df0b71d82447adf993e6219e45d308d5009f Mon Sep 17 00:00:00 2001 From: Jatin Chaudhary Date: Thu, 6 Aug 2020 08:46:07 -0400 Subject: [PATCH 28/41] Fixing the memchannel calculation Change-Id: Ib5f5931cc0e1ee5b7cb356c2ede0cb951ee8c9de --- hipamd/rocclr/hip_device.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hipamd/rocclr/hip_device.cpp b/hipamd/rocclr/hip_device.cpp index 2d8900cded..f36fde4680 100644 --- a/hipamd/rocclr/hip_device.cpp +++ b/hipamd/rocclr/hip_device.cpp @@ -175,7 +175,7 @@ hipError_t hipGetDeviceProperties ( hipDeviceProp_t* props, hipDevice_t device ) deviceProps.maxGridSize[2] = INT32_MAX; deviceProps.clockRate = info.maxEngineClockFrequency_ * 1000; deviceProps.memoryClockRate = info.maxMemoryClockFrequency_ * 1000; - deviceProps.memoryBusWidth = info.globalMemChannels_ * 32; + deviceProps.memoryBusWidth = info.globalMemChannels_; deviceProps.totalConstMem = info.maxConstantBufferSize_; deviceProps.major = info.gfxipVersion_ / 100; deviceProps.minor = info.gfxipVersion_ % 100; From fb2760a8aa6c06f29f7c7135763459ecadd0be67 Mon Sep 17 00:00:00 2001 From: Jatin Chaudhary Date: Tue, 4 Aug 2020 09:31:04 -0400 Subject: [PATCH 29/41] Bring .clang-format file in sync with ROCclr's Change-Id: I1990722c0779d61f66d79a5d0649d1b64e42eaf7 --- hipamd/.clang-format | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/hipamd/.clang-format b/hipamd/.clang-format index 1793af2ba2..5572a72cdd 100644 --- a/hipamd/.clang-format +++ b/hipamd/.clang-format @@ -1,20 +1,10 @@ ---- Language: Cpp BasedOnStyle: Google AlignEscapedNewlinesLeft: false +AlignOperands: false ColumnLimit: 100 +AlwaysBreakTemplateDeclarations: false DerivePointerAlignment: false -IndentWrappedFunctionNames: false +IndentFunctionDeclarationAfterType: false MaxEmptyLinesToKeep: 2 SortIncludes: false -IndentWidth: 4 ---- -Language: ObjC -BasedOnStyle: Google -AlignEscapedNewlinesLeft: false -ColumnLimit: 100 -DerivePointerAlignment: false -IndentWrappedFunctionNames: false -MaxEmptyLinesToKeep: 2 -SortIncludes: false -IndentWidth: 4 From c1a498c2191d87b1b83ff05dcc42667100cd5450 Mon Sep 17 00:00:00 2001 From: "Yaxun (Sam) Liu" Date: Tue, 4 Aug 2020 15:01:52 -0400 Subject: [PATCH 30/41] Let hipcc not escape space between obj files extracted from library hipcc extracts bundles from static libraries and pass them to clang. It should not escape spaces between objects extracted from library. Fixes: SWDEV-246544 Change-Id: Iad3902f28a43a3986917ee37a49f10382c329940 --- hipamd/bin/hipcc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/hipamd/bin/hipcc b/hipamd/bin/hipcc index 5f1445cc6a..22fd249bfd 100755 --- a/hipamd/bin/hipcc +++ b/hipamd/bin/hipcc @@ -426,6 +426,7 @@ foreach $arg (@ARGV) # TODO: why are we removing it here? $trimarg =~ s/^\s+|\s+$//g; # Remive whitespace my $swallowArg = 0; + my $escapeArg = 1; if ($arg eq '-c' or $arg eq '--genco' or $arg eq '-E') { $compileOnly = 1; $needLDFLAGS = 0; @@ -578,6 +579,7 @@ foreach $arg (@ARGV) close $in; close $out; $arg = "$new_arg -Wl,\@$new_file"; + $escapeArg = 0; } elsif (($arg =~ m/\.a$/ || $arg =~ m/\.lo$/) && $HIP_PLATFORM eq 'hcc' and $HIP_COMPILER eq 'clang') { ## process static library for hip-clang @@ -624,6 +626,7 @@ foreach $arg (@ARGV) $new_arg .= " $tmpdir/$libBaseName"; } $arg = "$new_arg"; + $escapeArg = 0; if ($toolArgs =~ m/-Xlinker$/) { $toolArgs = substr $toolArgs, 0, -8; chomp $toolArgs; @@ -703,7 +706,7 @@ foreach $arg (@ARGV) # common characters such as alphanumerics. # Do the quoting here because sometimes the $arg is changed in the loop # Important to have all of '-Xlinker' in the set of unquoted characters. - if (not $isWindows) { # Windows needs different quoting, ignore for now + if (not $isWindows and $escapeArg) { # Windows needs different quoting, ignore for now $arg =~ s/[^-a-zA-Z0-9_=+,.\/]/\\$&/g; } $toolArgs .= " $arg" unless $swallowArg; From 61dafa0397b8f8d9d53af836600399f6d7e41db2 Mon Sep 17 00:00:00 2001 From: Aryan Salmanpour Date: Fri, 7 Aug 2020 09:57:11 -0400 Subject: [PATCH 31/41] [HipPerf] add a test for measuring shared memory read speed SWDEV-245290/SWDEV-247330 Change-Id: If78fa0879ff58aab84775e412a86665c7e5959e7 --- .../memory/hipPerfSharedMemReadSpeed.cpp | 250 ++++++++++++++++++ 1 file changed, 250 insertions(+) create mode 100644 hipamd/tests/performance/memory/hipPerfSharedMemReadSpeed.cpp diff --git a/hipamd/tests/performance/memory/hipPerfSharedMemReadSpeed.cpp b/hipamd/tests/performance/memory/hipPerfSharedMemReadSpeed.cpp new file mode 100644 index 0000000000..86adc5f354 --- /dev/null +++ b/hipamd/tests/performance/memory/hipPerfSharedMemReadSpeed.cpp @@ -0,0 +1,250 @@ +/* + Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + */ + +/* HIT_START + * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc + * TEST: %t + * HIT_END + */ + +#include +#include +#include "test_common.h" + +using namespace std; + +#define sharedMemSize1 2048 +#define sharedMemSize2 256 + +__global__ void sharedMemReadSpeed1(float *outBuf, ulong N) { + + size_t gid = (blockIdx.x * blockDim.x + threadIdx.x); + size_t lid = threadIdx.x; + __shared__ float local[sharedMemSize1]; + + float val1 = 0; + float val2 = 0; + float val3 = 0; + float val4 = 0; + + for (int i = 0; i < (sharedMemSize1 / 64); i++) { + local[lid + i * 64] = lid; + } + + __syncthreads(); + + val1 += local[lid]; + val2 += local[lid + 64]; + val3 += local[lid + 128]; + val4 += local[lid + 192]; + val1 += local[lid + 256]; + val2 += local[lid + 320]; + val3 += local[lid + 384]; + val4 += local[lid + 448]; + val1 += local[lid + 512]; + val2 += local[lid + 576]; + val3 += local[lid + 640]; + val4 += local[lid + 704]; + val1 += local[lid + 768]; + val2 += local[lid + 832]; + val3 += local[lid + 896]; + val4 += local[lid + 960]; + val1 += local[lid + 1024]; + val2 += local[lid + 1088]; + val3 += local[lid + 1152]; + val4 += local[lid + 1216]; + val1 += local[lid + 1280]; + val2 += local[lid + 1344]; + val3 += local[lid + 1408]; + val4 += local[lid + 1472]; + val1 += local[lid + 1536]; + val2 += local[lid + 1600]; + val3 += local[lid + 1664]; + val4 += local[lid + 1728]; + val1 += local[lid + 1792]; + val2 += local[lid + 1856]; + val3 += local[lid + 1920]; + val4 += local[lid + 1984]; + + if (gid < N) { + outBuf[gid] = val1 + val2 + val3 + val4; + } +}; + +__global__ void sharedMemReadSpeed2(float *outBuf, ulong N) { + size_t gid = (blockIdx.x * blockDim.x + threadIdx.x); + size_t lid = threadIdx.x; + __shared__ float local[sharedMemSize2]; + + float val0 = 0.0f; + float val1 = 0.0f; + + for (int i = 0; i < (sharedMemSize2 / 64); i++) { + local[lid + i * 64] = lid; + } + + __syncthreads(); + +#pragma nounroll + for (uint i = 0; i < 32; i++) { + val0 += local[8 * i + 0]; + val1 += local[8 * i + 1]; + val0 += local[8 * i + 2]; + val1 += local[8 * i + 3]; + val0 += local[8 * i + 4]; + val1 += local[8 * i + 5]; + val0 += local[8 * i + 6]; + val1 += local[8 * i + 7]; + } + + if (gid < N) { + outBuf[gid] = val0 + val1; + } +}; + +int main(int argc, char *argv[]) { + float *dDst; + float *hDst; + hipStream_t stream; + constexpr uint numSizes = 4; + constexpr uint Sizes[numSizes] = {262144, 1048576, 4194304, 16777216}; + uint numReads1 = 32; + uint numReads2 = 256; + uint sharedMemSizeBytes1 = sharedMemSize1 * sizeof(float); + uint sharedMemSizeBytes2 = sharedMemSize2 * sizeof(float); + int nIter = 1000; + const unsigned threadsPerBlock = 64; + + int nGpu = 0; + HIPCHECK(hipGetDeviceCount(&nGpu)); + if (nGpu < 1) { + cout << "info: didn't find any GPU! skipping the test!\n"; + passed(); + return 0; + } + + static int device = 0; + HIPCHECK(hipSetDevice(device)); + hipDeviceProp_t props; + HIPCHECK(hipGetDeviceProperties(&props, device)); + cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name + << " with " << props.multiProcessorCount << " CUs" << endl; + + HIPCHECK(hipStreamCreate(&stream)); + + for (int nTest = 0; nTest < numSizes; nTest++) { + uint nBytes = Sizes[nTest % numSizes]; + ulong N = nBytes / sizeof(float); + const unsigned blocks = N / threadsPerBlock; + + hDst = new float[nBytes]; + HIPCHECK(hDst == 0 ? hipErrorOutOfMemory : hipSuccess); + memset(hDst, 0, nBytes); + + HIPCHECK(hipMalloc(&dDst, nBytes)); + HIPCHECK(hipMemcpy(dDst, hDst, nBytes, hipMemcpyHostToDevice)); + + hipLaunchKernelGGL(sharedMemReadSpeed1, dim3(blocks), dim3(threadsPerBlock), + 0, stream, dDst, N); + HIPCHECK(hipMemcpy(hDst, dDst, nBytes, hipMemcpyDeviceToHost)); + hipDeviceSynchronize(); + + int tmp = 0; + for (int i = 0; i < N; i++) { + if (i % threadsPerBlock == 0) { + tmp = 0; + } + if (hDst[i] != tmp) { + cout << "info: Data validation failed for warm up run!" << endl; + cout << "info: expected " << tmp << " got " << hDst[i] << endl; + HIPCHECK (hipErrorUnknown); + } + tmp += threadsPerBlock / 2; + } + + auto all_start = chrono::steady_clock::now(); + for (int i = 0; i < nIter; i++) { + hipLaunchKernelGGL(sharedMemReadSpeed1, dim3(blocks), + dim3(threadsPerBlock), 0, stream, dDst, N); + } + hipDeviceSynchronize(); + + auto all_end = chrono::steady_clock::now(); + chrono::duration all_kernel_time = all_end - all_start; + + // read speed in GB/s + double perf = ((double) blocks * threadsPerBlock + * (numReads1 * sizeof(float) + sharedMemSizeBytes1 / 64) * nIter + * (double) (1e-09)) / all_kernel_time.count(); + + cout << "info: read speed = " << setw(8) << perf << " GB/s for " + << sharedMemSizeBytes1 / 1024 << " KB shared memory" + " with " << setw(8) << blocks * threadsPerBlock << " threads, " + << setw(4) << numReads1 << " reads in sharedMemReadSpeed1 kernel" << endl; + + delete[] hDst; + hipFree(dDst); + } + + + for (int nTest = 0; nTest < numSizes; nTest++) { + uint nBytes = Sizes[nTest % numSizes]; + ulong N = nBytes / sizeof(float); + const unsigned blocks = N / threadsPerBlock; + + hDst = new float[nBytes]; + HIPCHECK(hDst == 0 ? hipErrorOutOfMemory : hipSuccess); + memset(hDst, 0, nBytes); + + HIPCHECK(hipMalloc(&dDst, nBytes)); + HIPCHECK(hipMemcpy(dDst, hDst, nBytes, hipMemcpyHostToDevice)); + + hipLaunchKernelGGL(sharedMemReadSpeed2, dim3(blocks), dim3(threadsPerBlock), + 0, stream, dDst, N); + HIPCHECK(hipMemcpy(hDst, dDst, nBytes, hipMemcpyDeviceToHost)); + hipDeviceSynchronize(); + + auto all_start = chrono::steady_clock::now(); + for (int i = 0; i < nIter; i++) { + hipLaunchKernelGGL(sharedMemReadSpeed2, dim3(blocks), + dim3(threadsPerBlock), 0, stream, dDst, N); + } + hipDeviceSynchronize(); + + auto all_end = chrono::steady_clock::now(); + chrono::duration all_kernel_time = all_end - all_start; + + // read speed in GB/s + double perf = ((double) blocks * threadsPerBlock + * (numReads2 * sizeof(float) + sharedMemSizeBytes2 / 64) * nIter + * (double) (1e-09)) / all_kernel_time.count(); + + cout << "info: read speed = " << setw(8) << perf << " GB/s for " + << sharedMemSizeBytes2 / 1024 << " KB shared memory" + " with " << setw(8) << blocks * threadsPerBlock << " threads, " + << setw(4) << numReads2 << " reads in sharedMemReadSpeed2 kernel" << endl; + + delete[] hDst; + hipFree(dDst); + } + + HIPCHECK(hipStreamDestroy(stream)); + + passed(); +} From 996df70de93d581ed7859c33e586a14fce025ba8 Mon Sep 17 00:00:00 2001 From: David Salinas Date: Wed, 22 Jul 2020 10:38:10 -0400 Subject: [PATCH 32/41] correct -x option in hip::device Change-Id: I8c288e678b4357019144a024395cda1fcead61fc --- hipamd/hip-config.cmake.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hipamd/hip-config.cmake.in b/hipamd/hip-config.cmake.in index 27e7c03fbf..ba340134f7 100644 --- a/hipamd/hip-config.cmake.in +++ b/hipamd/hip-config.cmake.in @@ -156,13 +156,13 @@ if(HIP_COMPILER STREQUAL "clang") if (EXISTS ${AMD_DEVICE_LIBS_PREFIX}/amdgcn/bitcode) set_property(TARGET hip::device APPEND PROPERTY - INTERFACE_COMPILE_OPTIONS -x hip + INTERFACE_COMPILE_OPTIONS -xhip ) else() # This path is to support an older build of the device library # TODO: To be removed in the future. set_property(TARGET hip::device APPEND PROPERTY - INTERFACE_COMPILE_OPTIONS -x hip --hip-device-lib-path=${AMD_DEVICE_LIBS_PREFIX}/lib + INTERFACE_COMPILE_OPTIONS -xhip --hip-device-lib-path=${AMD_DEVICE_LIBS_PREFIX}/lib ) endif() From b98727505d6eaa36ad4b81364f5c4810abb11505 Mon Sep 17 00:00:00 2001 From: Aryan Salmanpour Date: Wed, 29 Jul 2020 21:26:10 -0400 Subject: [PATCH 33/41] [HipPerf] add two subtests for measuring maximum device memory read/write speed SWDEV-245290 / SWDEV-246220 [HIPPerf] Port OCLPerfDevMemWriteSpeed/OCLPerfDevMemReadSpeed into hip performance subtests Change-Id: I5dc323c75cebbc17596dcb4ed9492e18c5246868 --- .../memory/hipPerfDevMemReadSpeed.cpp | 165 ++++++++++++++++++ .../memory/hipPerfDevMemWriteSpeed.cpp | 155 ++++++++++++++++ 2 files changed, 320 insertions(+) create mode 100644 hipamd/tests/performance/memory/hipPerfDevMemReadSpeed.cpp create mode 100644 hipamd/tests/performance/memory/hipPerfDevMemWriteSpeed.cpp diff --git a/hipamd/tests/performance/memory/hipPerfDevMemReadSpeed.cpp b/hipamd/tests/performance/memory/hipPerfDevMemReadSpeed.cpp new file mode 100644 index 0000000000..f740d50ace --- /dev/null +++ b/hipamd/tests/performance/memory/hipPerfDevMemReadSpeed.cpp @@ -0,0 +1,165 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* HIT_START + * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc + * TEST: %t + * HIT_END + */ + +#include +#include +#include "test_common.h" + +using namespace std; + +#define arraySize 16 + +typedef struct d_uint16 { + uint data[arraySize]; +} d_uint16; + +__global__ void read_kernel(d_uint16 *src, ulong N, uint *dst) { + + size_t idx = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x ; + + uint tmp = 0; + for (size_t i = idx; i < N; i += stride) { + for (size_t j = 0; j < arraySize; j++) { + tmp += src[i].data[j]; + } + } + + atomicAdd(dst, tmp); +} + +int main(int argc, char* argv[]) { + d_uint16 *dSrc; + d_uint16 *hSrc; + uint *dDst; + uint *hDst; + hipStream_t stream; + ulong N = 4 * 1024 * 1024; + uint nBytes = N * sizeof(d_uint16); + + int nGpu = 0; + HIPCHECK(hipGetDeviceCount(&nGpu)); + if (nGpu < 1) { + cout << "info: didn't find any GPU! skipping the test!\n"; + passed(); + return 0; + } + + static int device = 0; + HIPCHECK(hipSetDevice(device)); + hipDeviceProp_t props; + HIPCHECK(hipGetDeviceProperties(&props, device)); + cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name << + " with " << props.multiProcessorCount << " CUs" << endl; + + const unsigned threadsPerBlock = 64; + const unsigned blocks = props.multiProcessorCount * 4; + + uint inputData = 0x1; + int nIter = 1000; + + hSrc = new d_uint16[nBytes]; + HIPCHECK(hSrc == 0 ? hipErrorOutOfMemory : hipSuccess); + hDst = new uint; + hDst[0] = 0; + HIPCHECK(hDst == 0 ? hipErrorOutOfMemory : hipSuccess); + for (size_t i = 0; i < N; i++) { + for (int j = 0; j < arraySize; j++) { + hSrc[i].data[j] = inputData; + } + } + + HIPCHECK(hipMalloc(&dSrc, nBytes)); + HIPCHECK(hipMalloc(&dDst, sizeof(uint))); + + HIPCHECK(hipStreamCreate(&stream)); + + HIPCHECK(hipMemcpy(dSrc, hSrc, nBytes, hipMemcpyHostToDevice)); + HIPCHECK(hipMemcpy(dDst, hDst, sizeof(uint), hipMemcpyHostToDevice)); + + cout << "info: warm up launch for 'read_kernel' on the stream " << stream << endl; + + hipLaunchKernelGGL(read_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dSrc, N, dDst); + HIPCHECK(hipMemcpy(hDst, dDst, sizeof(uint), hipMemcpyDeviceToHost)); + hipDeviceSynchronize(); + + if (hDst[0] != (nBytes / sizeof(uint))) { + cout << "info: Data validation failed for warm up run!" << endl; + cout << "info: expected " << nBytes / sizeof(uint) << " got " << hDst[0] << endl; + HIPCHECK(hipErrorUnknown); + } + + cout << "info: data validated for warm up launch for 'read_kernel'" << endl; + cout << "info: launching 'read_kernel' on the stream " << stream << " for "<< nIter << " iterations"<< endl; + + // measure performance based on host time + auto all_start = chrono::steady_clock::now(); + + for(int i = 0; i < nIter; i++) { + hipLaunchKernelGGL(read_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dSrc, N, dDst); + } + hipDeviceSynchronize(); + + auto all_end = chrono::steady_clock::now(); + chrono::duration all_kernel_time = all_end - all_start; + + // read speed in GB/s + double perf = ((double)nBytes * nIter * (double)(1e-09)) / all_kernel_time.count(); + + cout << "info: average read speed of " << perf << " GB/s " << "achieved for memory size of " << + nBytes / (1024 * 1024) << " MB, calculated based on host time" << endl; + + // measure performance based on events time + hipEvent_t start, stop; + HIPCHECK(hipEventCreate(&start)); + HIPCHECK(hipEventCreate(&stop)); + float allEventMs = 0; + for(int i = 0; i < nIter; i++) { + HIPCHECK(hipEventRecord(start, NULL)); + + hipLaunchKernelGGL(read_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dSrc, N, dDst); + + HIPCHECK(hipEventRecord(stop, NULL)); + HIPCHECK(hipEventSynchronize(stop)); + + float eventMs = 1.0f; + HIPCHECK(hipEventElapsedTime(&eventMs, start, stop)); + + allEventMs += eventMs; + + } + + double perfe = ((double)nBytes * nIter * (double)(1e-06)) / allEventMs; + cout << "info: average read speed of " << perfe << " GB/s " << "achieved for memory size of " << + nBytes / (1024 * 1024) << " MB, calculated based on events time" << endl; + + delete [] hSrc; + delete hDst; + hipFree(dSrc); + hipFree(dDst); + HIPCHECK(hipStreamDestroy(stream)); + + passed(); +} diff --git a/hipamd/tests/performance/memory/hipPerfDevMemWriteSpeed.cpp b/hipamd/tests/performance/memory/hipPerfDevMemWriteSpeed.cpp new file mode 100644 index 0000000000..9760a161c7 --- /dev/null +++ b/hipamd/tests/performance/memory/hipPerfDevMemWriteSpeed.cpp @@ -0,0 +1,155 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* HIT_START + * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc + * TEST: %t + * HIT_END + */ + +#include +#include +#include "test_common.h" + +using namespace std; + +#define arraySize 16 + +typedef struct d_uint16 { + uint data[arraySize]; +} d_uint16; + +__global__ void write_kernel(d_uint16 *dst, ulong N, d_uint16 pval) { + size_t idx = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x; + for (size_t i = idx; i < N; i += stride) { + dst[i] = pval; + } +}; + +int main(int argc, char* argv[]) { + d_uint16 *dDst; + d_uint16 *hDst; + hipStream_t stream; + ulong N = 4 * 1024 * 1024; + uint nBytes = N * sizeof(d_uint16); + d_uint16 pval; + + for (int i = 0; i < arraySize; i++) { + pval.data[i] = 0xabababab; + } + + int nGpu = 0; + HIPCHECK(hipGetDeviceCount(&nGpu)); + if (nGpu < 1) { + cout << "info: didn't find any GPU! skipping the test!\n"; + passed(); + return 0; + } + + static int device = 0; + HIPCHECK(hipSetDevice(device)); + hipDeviceProp_t props; + HIPCHECK(hipGetDeviceProperties(&props, device)); + cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name << + " with " << props.multiProcessorCount << " CUs" << endl; + + size_t threadsPerBlock = 64; + size_t blocks = props.multiProcessorCount * 4; + + uint inputData = 0xabababab; + int nIter = 1000; + + hDst = new d_uint16[nBytes]; + HIPCHECK(hDst == 0 ? hipErrorOutOfMemory : hipSuccess); + for (size_t i = 0; i < N; i++) { + for (size_t j = 0; j < arraySize; j++) { + hDst[i].data[j] = 0; + } + } + + HIPCHECK(hipMalloc(&dDst, nBytes)); + + HIPCHECK(hipStreamCreate(&stream)); + + + cout << "info: warm up launch for 'write_kernel' on the stream " << stream << endl; + + hipLaunchKernelGGL(write_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dDst, N, pval); + HIPCHECK(hipMemcpy(hDst, dDst, nBytes , hipMemcpyDeviceToHost)); + hipDeviceSynchronize(); + + for (uint i = 0; i < N; i++) { + for (uint j = 0; j < arraySize; j++) { + if (hDst[i].data[j] != inputData) { + cout << "info: Data validation failed for warm up run! " << endl; + cout << "at index i: " << i << " element j: " << j << endl; + cout << hex << "expected 0x" << inputData << " but got 0x" << hDst[i].data[j] << endl; + HIPCHECK(hipErrorUnknown); + } + } + } + + cout << "info: data validated for warm up launch for 'write_kernel" << endl; + cout << "info: launching 'write_kernel' on the stream " << stream << " for "<< nIter << " iterations"<< endl; + + auto all_start = chrono::steady_clock::now(); + for(int i = 0; i < nIter; i++) { + hipLaunchKernelGGL(write_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dDst, N, pval); + } + hipDeviceSynchronize(); + auto all_end = chrono::steady_clock::now(); + chrono::duration all_kernel_time = all_end - all_start; + + // read speed in GB/s + double perf = ((double)nBytes * nIter * (double)(1e-09)) / all_kernel_time.count(); + + cout << "info: average write speed of " << perf << " GB/s " << "achieved for memory size of " << + nBytes / (1024 * 1024) << " MB, calculated based on host time" << endl; + + // measure performance based on events time + hipEvent_t start, stop; + HIPCHECK(hipEventCreate(&start)); + HIPCHECK(hipEventCreate(&stop)); + float allEventMs = 0; + for(int i = 0; i < nIter; i++) { + HIPCHECK(hipEventRecord(start, NULL)); + + hipLaunchKernelGGL(write_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dDst, N, pval); + + HIPCHECK(hipEventRecord(stop, NULL)); + HIPCHECK(hipEventSynchronize(stop)); + + float eventMs = 1.0f; + HIPCHECK(hipEventElapsedTime(&eventMs, start, stop)); + + allEventMs += eventMs; + + } + + double perfe = ((double)nBytes * nIter * (double)(1e-06)) / allEventMs; + cout << "info: average write speed of " << perfe << " GB/s " << "achieved for memory size of " << + nBytes / (1024 * 1024) << " MB, calculated based on events time" << endl; + + delete [] hDst; + hipFree(dDst); + HIPCHECK(hipStreamDestroy(stream)); + + passed(); +} From e93f7b6a56c100f7c5f4d06405b6ff96d31d155f Mon Sep 17 00:00:00 2001 From: Julia jiang Date: Wed, 5 Aug 2020 15:46:07 -0400 Subject: [PATCH 34/41] SWDEV-246878-Update document HIP_porting_guide Change-Id: I7f4c92637f31da76293048133fff5af816c752ca --- hipamd/docs/markdown/hip_porting_guide.md | 152 +++++++++++----------- 1 file changed, 75 insertions(+), 77 deletions(-) diff --git a/hipamd/docs/markdown/hip_porting_guide.md b/hipamd/docs/markdown/hip_porting_guide.md index 9806d841a5..23e5058061 100644 --- a/hipamd/docs/markdown/hip_porting_guide.md +++ b/hipamd/docs/markdown/hip_porting_guide.md @@ -14,7 +14,7 @@ and provides practical suggestions on how to port CUDA code and work through com * [CUDA to HIP Math Library Equivalents](#library-equivalents) - [Distinguishing Compiler Modes](#distinguishing-compiler-modes) * [Identifying HIP Target Platform](#identifying-hip-target-platform) - * [Identifying the Compiler: hcc, hip-clang, or nvcc](#identifying-the-compiler-hcc-hip-clang-or-nvcc) + * [Identifying the Compiler: hip-clang, or nvcc](#identifying-the-compiler-hip-clang-or-nvcc) * [Identifying Current Compilation Pass: Host or Device](#identifying-current-compilation-pass-host-or-device) * [Compiler Defines: Summary](#compiler-defines-summary) - [Identifying Architecture Features](#identifying-architecture-features) @@ -41,12 +41,10 @@ and provides practical suggestions on how to port CUDA code and work through com - [threadfence_system](#threadfence_system) * [Textures and Cache Control](#textures-and-cache-control) - [More Tips](#more-tips) - * [HIPTRACE Mode](#hiptrace-mode) - * [Environment Variables](#environment-variables) + * [HIP Logging](#hip-logging) * [Debugging hipcc](#debugging-hipcc) * [What Does This Error Mean?](#what-does-this-error-mean) + [/usr/include/c++/v1/memory:5172:15: error: call to implicitly deleted default constructor of 'std::__1::bad_weak_ptr' throw bad_weak_ptr();](#usrincludecv1memory517215-error-call-to-implicitly-deleted-default-constructor-of-std__1bad_weak_ptr-throw-bad_weak_ptr) - * [HIP Environment Variables](#hip-environment-variables) * [Editor Highlighting](#editor-highlighting) @@ -163,17 +161,19 @@ Many projects use a mixture of an accelerator compiler (AMD or NVIDIA) and a sta -### Identifying the Compiler: hcc, hip-clang or nvcc -Often, it's useful to know whether the underlying compiler is hcc, HIP-Clang or nvcc. This knowledge can guard platform-specific code or aid in platform-specific performance tuning. - +### Identifying the Compiler: hip-clang or nvcc +Often, it's useful to know whether the underlying compiler is HIP-Clang or nvcc. This knowledge can guard platform-specific code or aid in platform-specific performance tuning. ``` -#ifdef __HCC__ -// Compiled with hcc +#ifdef __HIP_PLATFORM_HCC__ +// Compiled with HIP-Clang ``` + ``` -#ifdef __HIP__ +#if defined(__HCC__) || (defined(__clang__) && defined(__HIP__)) +#define __HIP_PLATFORM_HCC__ +#endif // Compiled with HIP-Clang ``` @@ -198,7 +198,7 @@ Compiler directly generates the host code (using the Clang x86 target) and passe nvcc makes two passes over the code: one for host code and one for device code. HIP-Clang will have multiple passes over the code: one for the host code, and one for each architecture on the device code. -`__HIP_DEVICE_COMPILE__` is set to a nonzero value when the compiler (hcc, HIP-Clang or nvcc) is compiling code for a device inside a `__global__` kernel or for a device function. `__HIP_DEVICE_COMPILE__` can replace #ifdef checks on the `__CUDA_ARCH__` define. +`__HIP_DEVICE_COMPILE__` is set to a nonzero value when the compiler (HIP-Clang or nvcc) is compiling code for a device inside a `__global__` kernel or for a device function. `__HIP_DEVICE_COMPILE__` can replace #ifdef checks on the `__CUDA_ARCH__` define. ``` // #ifdef __CUDA_ARCH__ @@ -209,24 +209,21 @@ Unlike `__CUDA_ARCH__`, the `__HIP_DEVICE_COMPILE__` value is 1 or undefined, an ### Compiler Defines: Summary -|Define | hcc | HIP-Clang | nvcc | Other (GCC, ICC, Clang, etc.) -|--- | --- | --- | --- |---| +|Define | HIP-Clang | nvcc | Other (GCC, ICC, Clang, etc.) +|--- | --- | --- |---| |HIP-related defines:| -|`__HIP_PLATFORM_HCC__`| Defined | Defined | Undefined | Defined if targeting hcc platform; undefined otherwise | -|`__HIP_PLATFORM_NVCC__`| Undefined | Undefined | Defined | Defined if targeting nvcc platform; undefined otherwise | -|`__HIP_DEVICE_COMPILE__` | 1 if compiling for device; undefined if compiling for host | 1 if compiling for device; undefined if compiling for host |1 if compiling for device; undefined if compiling for host | Undefined -|`__HIPCC__` | Defined | Defined | Defined | Undefined -|`__HIP_ARCH_*` | 0 or 1 depending on feature support (see below) |0 or 1 depending on feature support (see below) | 0 or 1 depending on feature support (see below) | 0 +|`__HIP_PLATFORM_HCC__`| Defined | Undefined | Defined if targeting AMD platform; undefined otherwise | +|`__HIP_PLATFORM_NVCC__`| Undefined | Defined | Defined if targeting nvcc platform; undefined otherwise | +|`__HIP_DEVICE_COMPILE__` | 1 if compiling for device; undefined if compiling for host |1 if compiling for device; undefined if compiling for host | Undefined +|`__HIPCC__` | Defined | Defined | Undefined +|`__HIP_ARCH_*` |0 or 1 depending on feature support (see below) | 0 or 1 depending on feature support (see below) | 0 |nvcc-related defines:| -|`__CUDACC__` | Undefined | Undefined | Defined if source code is compiled by nvcc; undefined otherwise | Undefined -|`__NVCC__` | Undefined | Undefined | Defined | Undefined -|`__CUDA_ARCH__` | Undefined | Undefined | Unsigned representing compute capability (e.g., "130") if in device code; 0 if in host code | Undefined -|hcc-related defines:| -|`__HCC__` | Defined | Undefined | Undefined | Undefined -|`__HCC_ACCELERATOR__` | Nonzero if in device code; otherwise undefined | Undefined | Undefined | Undefined +|`__CUDACC__` | Defined if source code is compiled by nvcc; undefined otherwise | Undefined +|`__NVCC__` | Undefined | Defined | Undefined +|`__CUDA_ARCH__` | Undefined | Unsigned representing compute capability (e.g., "130") if in device code; 0 if in host code | Undefined |hip-clang-related defines:| -|`__HIP__` | Undefined | Defined | Undefined | Undefined -|hcc/HIP-Clang common defines:| +|`__HIP__` | Defined | Undefined | Undefined +|HIP-Clang common defines:| |`__clang__` | Defined | Defined | Undefined | Defined if using Clang; otherwise undefined ## Identifying Architecture Features @@ -274,23 +271,23 @@ The table below shows the full set of architectural properties that HIP supports |`__HIP_ARCH_HAS_SHARED_INT32_ATOMICS__` | hasSharedInt32Atomics |32-bit integer atomics for shared memory |`__HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__` | hasSharedFloatAtomicExch |32-bit float atomic exchange for shared memory |`__HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__` | hasFloatAtomicAdd |32-bit float atomic add in global and shared memory -|64-bit atomics: | | -|`__HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__` | hasGlobalInt64Atomics |64-bit integer atomics for global memory +|64-bit atomics: | | +|`__HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__` | hasGlobalInt64Atomics |64-bit integer atomics for global memory |`__HIP_ARCH_HAS_SHARED_INT64_ATOMICS__` | hasSharedInt64Atomics |64-bit integer atomics for shared memory |Doubles: | | -|`__HIP_ARCH_HAS_DOUBLES__` | hasDoubles |Double-precision floating point -|Warp cross-lane operations: | | -|`__HIP_ARCH_HAS_WARP_VOTE__` | hasWarpVote |Warp vote instructions (any, all) -|`__HIP_ARCH_HAS_WARP_BALLOT__` | hasWarpBallot |Warp ballot instructions -|`__HIP_ARCH_HAS_WARP_SHUFFLE__` | hasWarpShuffle |Warp shuffle operations (shfl\_\*) +|`__HIP_ARCH_HAS_DOUBLES__` | hasDoubles |Double-precision floating point +|Warp cross-lane operations: | | +|`__HIP_ARCH_HAS_WARP_VOTE__` | hasWarpVote |Warp vote instructions (any, all) +|`__HIP_ARCH_HAS_WARP_BALLOT__` | hasWarpBallot |Warp ballot instructions +|`__HIP_ARCH_HAS_WARP_SHUFFLE__` | hasWarpShuffle |Warp shuffle operations (shfl\_\*) |`__HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__` | hasFunnelShift |Funnel shift two input words into one |Sync: | | |`__HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__` | hasThreadFenceSystem |threadfence\_system -|`__HIP_ARCH_HAS_SYNC_THREAD_EXT__` | hasSyncThreadsExt |syncthreads\_count, syncthreads\_and, syncthreads\_or -|Miscellaneous: | | -|`__HIP_ARCH_HAS_SURFACE_FUNCS__` | hasSurfaceFuncs | -|`__HIP_ARCH_HAS_3DGRID__` | has3dGrid | Grids and groups are 3D -|`__HIP_ARCH_HAS_DYNAMIC_PARALLEL__` | hasDynamicParallelism | +|`__HIP_ARCH_HAS_SYNC_THREAD_EXT__` | hasSyncThreadsExt |syncthreads\_count, syncthreads\_and, syncthreads\_or +|Miscellaneous: | | +|`__HIP_ARCH_HAS_SURFACE_FUNCS__` | hasSurfaceFuncs | +|`__HIP_ARCH_HAS_3DGRID__` | has3dGrid | Grids and groups are 3D +|`__HIP_ARCH_HAS_DYNAMIC_PARALLEL__` | hasDynamicParallelism | ## Finding HIP @@ -498,19 +495,15 @@ int main() std::cout<<"Passed"< Date: Mon, 27 Jul 2020 13:22:19 -0400 Subject: [PATCH 35/41] Don't align 2D image buffer size Cuda allows creating 2D textures from memory with unaligned size. Aligning it will cause us to overshoot the size of the underlying allocation. Change-Id: Ia20dec2105e36ca4a8448c9c6299d0001b388d25 --- hipamd/rocclr/hip_texture.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hipamd/rocclr/hip_texture.cpp b/hipamd/rocclr/hip_texture.cpp index 4980f87eaf..8217dfb6f8 100755 --- a/hipamd/rocclr/hip_texture.cpp +++ b/hipamd/rocclr/hip_texture.cpp @@ -269,8 +269,10 @@ hipError_t ihipCreateTextureObject(hipTextureObject_t* pTexObject, case hipResourceTypePitch2D: { const cl_channel_order channelOrder = hip::getCLChannelOrder(hip::getNumChannels(pResDesc->res.pitch2D.desc), pTexDesc->sRGB); const cl_channel_type channelType = hip::getCLChannelType(hip::getArrayFormat(pResDesc->res.pitch2D.desc), pTexDesc->readMode); + const amd::Image::Format imageFormat({channelOrder, channelType}); const cl_mem_object_type imageType = hip::getCLMemObjectType(pResDesc->resType); - const size_t imageSizeInBytes = pResDesc->res.pitch2D.pitchInBytes * pResDesc->res.pitch2D.height; + const size_t imageSizeInBytes = pResDesc->res.pitch2D.width * imageFormat.getElementSize() + + pResDesc->res.pitch2D.pitchInBytes * (pResDesc->res.pitch2D.height - 1); amd::Memory* buffer = getMemoryObjectWithOffset(pResDesc->res.pitch2D.devPtr, imageSizeInBytes); image = ihipImageCreate(channelOrder, channelType, From 236ef9416627f14565f8c826f9b2b1668ed67f19 Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Mon, 10 Aug 2020 18:25:31 +0000 Subject: [PATCH 36/41] Return error from hipMallocManaged for size =0 Change-Id: Ida0cecf05e1df731176d41e59422d1e606cc9bd9 --- hipamd/rocclr/hip_hmm.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hipamd/rocclr/hip_hmm.cpp b/hipamd/rocclr/hip_hmm.cpp index ab787e195e..b9df3b25d6 100644 --- a/hipamd/rocclr/hip_hmm.cpp +++ b/hipamd/rocclr/hip_hmm.cpp @@ -59,7 +59,7 @@ static_assert(static_cast(hipMemRangeAttributeLastPrefetchLocation) == hipError_t hipMallocManaged(void** dev_ptr, size_t size, unsigned int flags) { HIP_INIT_API(hipMallocManaged, dev_ptr, size, flags); - if ((dev_ptr == nullptr) || (flags != hipMemAttachGlobal)) { + if ((dev_ptr == nullptr) || (size == 0) || (flags != hipMemAttachGlobal)) { HIP_RETURN(hipErrorInvalidValue); } @@ -213,4 +213,4 @@ static hipError_t ihipMallocManaged(void** ptr, size_t size) { ClPrint(amd::LOG_INFO, amd::LOG_API, "%-5d: [%zx] ihipMallocManaged ptr=0x%zx", getpid(), std::this_thread::get_id(), *ptr); return hipSuccess; -} \ No newline at end of file +} From b73673bc7406ef7a4aee1243ae660b5a64e0218a Mon Sep 17 00:00:00 2001 From: German Andryeyev Date: Mon, 10 Aug 2020 14:59:44 -0400 Subject: [PATCH 37/41] Allow prefetch on the null stream Change-Id: I87e950460361fab65ab140988fc2288bcb194f08 --- hipamd/rocclr/hip_hmm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hipamd/rocclr/hip_hmm.cpp b/hipamd/rocclr/hip_hmm.cpp index b9df3b25d6..82d16b3562 100644 --- a/hipamd/rocclr/hip_hmm.cpp +++ b/hipamd/rocclr/hip_hmm.cpp @@ -71,7 +71,7 @@ hipError_t hipMemPrefetchAsync(const void* dev_ptr, size_t count, int device, hipStream_t stream) { HIP_INIT_API(hipMemPrefetchAsync, dev_ptr, count, device, stream); - if ((dev_ptr == nullptr) || (count == 0) || (stream == nullptr)) { + if ((dev_ptr == nullptr) || (count == 0)) { HIP_RETURN(hipErrorInvalidValue); } amd::HostQueue* queue = nullptr; From d5c97e0c31ed6e059003e25846a34452d33800fd Mon Sep 17 00:00:00 2001 From: Jason Tang Date: Fri, 7 Aug 2020 11:47:13 -0400 Subject: [PATCH 38/41] SWDEV-232197 - Remove refering to gfxipVersion_ Change-Id: I91b6b972e26b18dabf6deda5a142bff491c8bb45 --- hipamd/rocclr/hip_device.cpp | 10 +++++----- hipamd/rocclr/hip_platform.cpp | 9 ++++----- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/hipamd/rocclr/hip_device.cpp b/hipamd/rocclr/hip_device.cpp index f36fde4680..b116819846 100644 --- a/hipamd/rocclr/hip_device.cpp +++ b/hipamd/rocclr/hip_device.cpp @@ -92,8 +92,8 @@ hipError_t hipDeviceComputeCapability(int *major, int *minor, hipDevice_t device auto* deviceHandle = g_devices[device]->devices()[0]; const auto& info = deviceHandle->info(); - *major = info.gfxipVersion_ / 100; - *minor = info.gfxipVersion_ % 100; + *major = info.gfxipMajor_; + *minor = info.gfxipMinor_; HIP_RETURN(hipSuccess); } @@ -177,8 +177,8 @@ hipError_t hipGetDeviceProperties ( hipDeviceProp_t* props, hipDevice_t device ) deviceProps.memoryClockRate = info.maxMemoryClockFrequency_ * 1000; deviceProps.memoryBusWidth = info.globalMemChannels_; deviceProps.totalConstMem = info.maxConstantBufferSize_; - deviceProps.major = info.gfxipVersion_ / 100; - deviceProps.minor = info.gfxipVersion_ % 100; + deviceProps.major = info.gfxipMajor_; + deviceProps.minor = info.gfxipMinor_; deviceProps.multiProcessorCount = info.maxComputeUnits_; deviceProps.l2CacheSize = info.l2CacheSize_; deviceProps.maxThreadsPerMultiProcessor = info.maxThreadsPerCU_; @@ -208,7 +208,7 @@ hipError_t hipGetDeviceProperties ( hipDeviceProp_t* props, hipDevice_t device ) deviceProps.maxSharedMemoryPerMultiProcessor = info.localMemSizePerCU_; //deviceProps.isMultiGpuBoard = info.; deviceProps.canMapHostMemory = 1; - deviceProps.gcnArch = info.gfxipVersion_; + deviceProps.gcnArch = info.gfxipMajor_ * 100 + info.gfxipMinor_ * 10 + info.gfxipStepping_; sprintf(deviceProps.gcnArchName, "gfx%d%d%x", info.gfxipMajor_, info.gfxipMinor_, info.gfxipStepping_); deviceProps.cooperativeLaunch = info.cooperativeGroups_; deviceProps.cooperativeMultiDeviceLaunch = info.cooperativeMultiDeviceGroups_; diff --git a/hipamd/rocclr/hip_platform.cpp b/hipamd/rocclr/hip_platform.cpp index e03753e77c..3935e4d5f8 100755 --- a/hipamd/rocclr/hip_platform.cpp +++ b/hipamd/rocclr/hip_platform.cpp @@ -351,10 +351,10 @@ hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor( size_t GprWaves = VgprWaves; if (wrkGrpInfo->usedSGPRs_ > 0) { size_t maxSGPRs; - if (device.info().gfxipVersion_ < 800) { + if (device.info().gfxipMajor_ < 8) { maxSGPRs = 512; } - else if (device.info().gfxipVersion_ < 1000) { + else if (device.info().gfxipMajor_ < 10) { maxSGPRs = 800; } else { @@ -467,7 +467,7 @@ hipError_t hipModuleOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int* HIP_RETURN(ret); } -hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, +hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk) { HIP_INIT_API(hipModuleOccupancyMaxActiveBlocksPerMultiprocessor, f, blockSize, dynSharedMemPerBlk); @@ -486,7 +486,7 @@ hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, } hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks, - hipFunction_t f, int blockSize, + hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags) { HIP_INIT_API(hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, f, blockSize, dynSharedMemPerBlk, flags); @@ -847,4 +847,3 @@ void PlatformState::popExec(ihipExec_t& exec) { exec = std::move(execStack_.top()); execStack_.pop(); } - From 1a2622dd6028045d6c11041b068c31715bcd6b7e Mon Sep 17 00:00:00 2001 From: Aryan Salmanpour Date: Thu, 13 Aug 2020 14:23:58 -0400 Subject: [PATCH 39/41] [HipPerf] report performance based on wall time only for hipPerfDevMemReadSpeed/hipPerfDevMemWriteSpeed Change-Id: I1fda2ec76da6fad6852d328e0a3fc39e28af57bb --- .../memory/hipPerfDevMemReadSpeed.cpp | 31 +------------------ .../memory/hipPerfDevMemWriteSpeed.cpp | 31 +------------------ 2 files changed, 2 insertions(+), 60 deletions(-) diff --git a/hipamd/tests/performance/memory/hipPerfDevMemReadSpeed.cpp b/hipamd/tests/performance/memory/hipPerfDevMemReadSpeed.cpp index f740d50ace..181cd37f24 100644 --- a/hipamd/tests/performance/memory/hipPerfDevMemReadSpeed.cpp +++ b/hipamd/tests/performance/memory/hipPerfDevMemReadSpeed.cpp @@ -99,8 +99,6 @@ int main(int argc, char* argv[]) { HIPCHECK(hipMemcpy(dSrc, hSrc, nBytes, hipMemcpyHostToDevice)); HIPCHECK(hipMemcpy(dDst, hDst, sizeof(uint), hipMemcpyHostToDevice)); - cout << "info: warm up launch for 'read_kernel' on the stream " << stream << endl; - hipLaunchKernelGGL(read_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dSrc, N, dDst); HIPCHECK(hipMemcpy(hDst, dDst, sizeof(uint), hipMemcpyDeviceToHost)); hipDeviceSynchronize(); @@ -111,9 +109,6 @@ int main(int argc, char* argv[]) { HIPCHECK(hipErrorUnknown); } - cout << "info: data validated for warm up launch for 'read_kernel'" << endl; - cout << "info: launching 'read_kernel' on the stream " << stream << " for "<< nIter << " iterations"<< endl; - // measure performance based on host time auto all_start = chrono::steady_clock::now(); @@ -129,31 +124,7 @@ int main(int argc, char* argv[]) { double perf = ((double)nBytes * nIter * (double)(1e-09)) / all_kernel_time.count(); cout << "info: average read speed of " << perf << " GB/s " << "achieved for memory size of " << - nBytes / (1024 * 1024) << " MB, calculated based on host time" << endl; - - // measure performance based on events time - hipEvent_t start, stop; - HIPCHECK(hipEventCreate(&start)); - HIPCHECK(hipEventCreate(&stop)); - float allEventMs = 0; - for(int i = 0; i < nIter; i++) { - HIPCHECK(hipEventRecord(start, NULL)); - - hipLaunchKernelGGL(read_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dSrc, N, dDst); - - HIPCHECK(hipEventRecord(stop, NULL)); - HIPCHECK(hipEventSynchronize(stop)); - - float eventMs = 1.0f; - HIPCHECK(hipEventElapsedTime(&eventMs, start, stop)); - - allEventMs += eventMs; - - } - - double perfe = ((double)nBytes * nIter * (double)(1e-06)) / allEventMs; - cout << "info: average read speed of " << perfe << " GB/s " << "achieved for memory size of " << - nBytes / (1024 * 1024) << " MB, calculated based on events time" << endl; + nBytes / (1024 * 1024) << " MB" << endl; delete [] hSrc; delete hDst; diff --git a/hipamd/tests/performance/memory/hipPerfDevMemWriteSpeed.cpp b/hipamd/tests/performance/memory/hipPerfDevMemWriteSpeed.cpp index 9760a161c7..4d706cdde9 100644 --- a/hipamd/tests/performance/memory/hipPerfDevMemWriteSpeed.cpp +++ b/hipamd/tests/performance/memory/hipPerfDevMemWriteSpeed.cpp @@ -88,9 +88,6 @@ int main(int argc, char* argv[]) { HIPCHECK(hipStreamCreate(&stream)); - - cout << "info: warm up launch for 'write_kernel' on the stream " << stream << endl; - hipLaunchKernelGGL(write_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dDst, N, pval); HIPCHECK(hipMemcpy(hDst, dDst, nBytes , hipMemcpyDeviceToHost)); hipDeviceSynchronize(); @@ -106,9 +103,6 @@ int main(int argc, char* argv[]) { } } - cout << "info: data validated for warm up launch for 'write_kernel" << endl; - cout << "info: launching 'write_kernel' on the stream " << stream << " for "<< nIter << " iterations"<< endl; - auto all_start = chrono::steady_clock::now(); for(int i = 0; i < nIter; i++) { hipLaunchKernelGGL(write_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dDst, N, pval); @@ -121,31 +115,8 @@ int main(int argc, char* argv[]) { double perf = ((double)nBytes * nIter * (double)(1e-09)) / all_kernel_time.count(); cout << "info: average write speed of " << perf << " GB/s " << "achieved for memory size of " << - nBytes / (1024 * 1024) << " MB, calculated based on host time" << endl; + nBytes / (1024 * 1024) << " MB" << endl; - // measure performance based on events time - hipEvent_t start, stop; - HIPCHECK(hipEventCreate(&start)); - HIPCHECK(hipEventCreate(&stop)); - float allEventMs = 0; - for(int i = 0; i < nIter; i++) { - HIPCHECK(hipEventRecord(start, NULL)); - - hipLaunchKernelGGL(write_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dDst, N, pval); - - HIPCHECK(hipEventRecord(stop, NULL)); - HIPCHECK(hipEventSynchronize(stop)); - - float eventMs = 1.0f; - HIPCHECK(hipEventElapsedTime(&eventMs, start, stop)); - - allEventMs += eventMs; - - } - - double perfe = ((double)nBytes * nIter * (double)(1e-06)) / allEventMs; - cout << "info: average write speed of " << perfe << " GB/s " << "achieved for memory size of " << - nBytes / (1024 * 1024) << " MB, calculated based on events time" << endl; delete [] hDst; hipFree(dDst); From de1e14b25268c46b67103376fdc6e04287208ce4 Mon Sep 17 00:00:00 2001 From: Aryan Salmanpour Date: Thu, 13 Aug 2020 17:26:44 -0400 Subject: [PATCH 40/41] [HipPerf] relocating/renaming some hip perf tests Change-Id: Ie85d242cd68cd14a858e07ed27875a5196014688 --- .../{perfDispatch => dispatch}/hipPerfDispatchSpeed.cpp | 2 +- .../{perfDispatch => memory}/hipPerfBufferCopyRectSpeed.cpp | 2 +- .../{perfDispatch => memory}/hipPerfBufferCopySpeed.cpp | 2 +- .../memory/{hipHostNumaAlloc.cpp => hipPerfHostNumaAlloc.cpp} | 2 +- hipamd/tests/{performance/perfDispatch => src}/timer.cpp | 0 hipamd/tests/{performance/perfDispatch => src}/timer.h | 0 6 files changed, 4 insertions(+), 4 deletions(-) rename hipamd/tests/performance/{perfDispatch => dispatch}/hipPerfDispatchSpeed.cpp (98%) rename hipamd/tests/performance/{perfDispatch => memory}/hipPerfBufferCopyRectSpeed.cpp (98%) rename hipamd/tests/performance/{perfDispatch => memory}/hipPerfBufferCopySpeed.cpp (98%) rename hipamd/tests/performance/memory/{hipHostNumaAlloc.cpp => hipPerfHostNumaAlloc.cpp} (97%) rename hipamd/tests/{performance/perfDispatch => src}/timer.cpp (100%) rename hipamd/tests/{performance/perfDispatch => src}/timer.h (100%) diff --git a/hipamd/tests/performance/perfDispatch/hipPerfDispatchSpeed.cpp b/hipamd/tests/performance/dispatch/hipPerfDispatchSpeed.cpp similarity index 98% rename from hipamd/tests/performance/perfDispatch/hipPerfDispatchSpeed.cpp rename to hipamd/tests/performance/dispatch/hipPerfDispatchSpeed.cpp index 84ba73c3aa..12999ebc33 100644 --- a/hipamd/tests/performance/perfDispatch/hipPerfDispatchSpeed.cpp +++ b/hipamd/tests/performance/dispatch/hipPerfDispatchSpeed.cpp @@ -7,7 +7,7 @@ #include "test_common.h" /* HIT_START - * BUILD: %t %s ../../src/test_common.cpp timer.cpp EXCLUDE_HIP_PLATFORM nvcc + * BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp EXCLUDE_HIP_PLATFORM nvcc * TEST: %t * HIT_END */ diff --git a/hipamd/tests/performance/perfDispatch/hipPerfBufferCopyRectSpeed.cpp b/hipamd/tests/performance/memory/hipPerfBufferCopyRectSpeed.cpp similarity index 98% rename from hipamd/tests/performance/perfDispatch/hipPerfBufferCopyRectSpeed.cpp rename to hipamd/tests/performance/memory/hipPerfBufferCopyRectSpeed.cpp index 5000904af9..3cb3243e80 100644 --- a/hipamd/tests/performance/perfDispatch/hipPerfBufferCopyRectSpeed.cpp +++ b/hipamd/tests/performance/memory/hipPerfBufferCopyRectSpeed.cpp @@ -7,7 +7,7 @@ #include "test_common.h" /* HIT_START - * BUILD: %t %s ../../src/test_common.cpp timer.cpp EXCLUDE_HIP_PLATFORM nvcc + * BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp EXCLUDE_HIP_PLATFORM nvcc * TEST: %t * HIT_END */ diff --git a/hipamd/tests/performance/perfDispatch/hipPerfBufferCopySpeed.cpp b/hipamd/tests/performance/memory/hipPerfBufferCopySpeed.cpp similarity index 98% rename from hipamd/tests/performance/perfDispatch/hipPerfBufferCopySpeed.cpp rename to hipamd/tests/performance/memory/hipPerfBufferCopySpeed.cpp index 6f284ae7fb..d9a2d443a2 100644 --- a/hipamd/tests/performance/perfDispatch/hipPerfBufferCopySpeed.cpp +++ b/hipamd/tests/performance/memory/hipPerfBufferCopySpeed.cpp @@ -7,7 +7,7 @@ #include "test_common.h" /* HIT_START - * BUILD: %t %s ../../src/test_common.cpp timer.cpp EXCLUDE_HIP_PLATFORM nvcc + * BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp EXCLUDE_HIP_PLATFORM nvcc * TEST: %t * HIT_END */ diff --git a/hipamd/tests/performance/memory/hipHostNumaAlloc.cpp b/hipamd/tests/performance/memory/hipPerfHostNumaAlloc.cpp similarity index 97% rename from hipamd/tests/performance/memory/hipHostNumaAlloc.cpp rename to hipamd/tests/performance/memory/hipPerfHostNumaAlloc.cpp index 38401c8046..a5e60c8549 100644 --- a/hipamd/tests/performance/memory/hipHostNumaAlloc.cpp +++ b/hipamd/tests/performance/memory/hipPerfHostNumaAlloc.cpp @@ -34,7 +34,7 @@ THE SOFTWARE. #include #include "hip/hip_runtime.h" /* HIT_START - * BUILD_CMD: hipHostNumaAlloc %hc -I%S/../../src %S/%s %S/../../src/test_common.cpp -lnuma -o %T/%t EXCLUDE_HIP_PLATFORM nvcc + * BUILD_CMD: hipPerfHostNumaAlloc %hc -I%S/../../src %S/%s %S/../../src/test_common.cpp -lnuma -o %T/%t EXCLUDE_HIP_PLATFORM nvcc * TEST: %t * HIT_END */ diff --git a/hipamd/tests/performance/perfDispatch/timer.cpp b/hipamd/tests/src/timer.cpp similarity index 100% rename from hipamd/tests/performance/perfDispatch/timer.cpp rename to hipamd/tests/src/timer.cpp diff --git a/hipamd/tests/performance/perfDispatch/timer.h b/hipamd/tests/src/timer.h similarity index 100% rename from hipamd/tests/performance/perfDispatch/timer.h rename to hipamd/tests/src/timer.h From 95729c31e88c83e17b24eb4f1c23bf096f7a0101 Mon Sep 17 00:00:00 2001 From: Freddy Paul Date: Wed, 29 Jul 2020 14:29:08 -0700 Subject: [PATCH 41/41] Enable developers to build hip without CMAKE_PREFIX_PATH With recent changes in ROCclr to take HSA cmake target it require HIP to have CMAKE_PREFIX_PATH added for install location for HSA, amd_comgr,llvm. User can provide CMAKE_PREFIX_PATH if above packages are built/installed at custom location as: cmake -DCMAKE_PREFIX_PATH=" " For default ROCm installation adding a default path will help. Change-Id: Iefa443dc198da76b57422a641c9821c63cde4922 --- hipamd/rocclr/CMakeLists.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/hipamd/rocclr/CMakeLists.txt b/hipamd/rocclr/CMakeLists.txt index bef5de347c..e6c4225984 100755 --- a/hipamd/rocclr/CMakeLists.txt +++ b/hipamd/rocclr/CMakeLists.txt @@ -66,6 +66,14 @@ if(NOT DEFINED ROCclr_DIR OR NOT DEFINED LIBOCL_STATIC_DIR OR NOT DEFINED LIBROC # message(FATAL_ERROR "define ROCclr_DIR, LIBOCL_STATIC_DIR\n") endif() + +#APPEND default path for CMAKE_PREFIX_PATH +#User provided will be searched first since defualt path is at end. +#Custom install path can be provided at compile time as cmake parameter(-DCMAKE_PREFIX_PATH="") +#/opt/rocm:default:For amd_comgr,hsa-runtime64 +#/opt/rocm/llvm/:default:For llvm/clang pulled in as dependency from hsa/comgr +list( APPEND CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH} "/opt/rocm" "/opt/rocm/llvm") + list ( APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules" ) set(CMAKE_MODULE_PATH${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake" "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules")