From a50fa0f78e56e790b3574b52a1ad8e0119dc92a7 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 9 Feb 2016 08:39:08 -0600 Subject: [PATCH 01/32] Fix bug in device bounds comparison. Shows up in multi-GPU. [ROCm/clr commit: f1bc9af294a647199de6a6afbd477d3a23ee6260] --- projects/clr/hipamd/src/hip_hcc.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index bdfbdb230b..f9f3b1b1f5 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -40,7 +40,7 @@ THE SOFTWARE. #define USE_PINNED_HOST (__hcc_workweek__ >= 1601) -//#define USE_ASYNC_COPY +#define USE_ASYNC_COPY 0 #define INLINE static inline @@ -338,8 +338,7 @@ hipError_t ihipDevice_t::getProperties(hipDeviceProp_t* prop) DeviceErrorCheck(err); prop->l2CacheSize = cache_size[1]; - /* Computemode for HSA Devices is always : cudaComputeModeDefault :/ - Default compute mode (Multiple threads can use cudaSetDevice() with this device) */ + /* Computemode for HSA Devices is always : cudaComputeModeDefault */ prop->computeMode = 0; // Get Max Threads Per Multiprocessor @@ -760,7 +759,7 @@ hipError_t hipSetDevice(int device) { std::call_once(hip_initialized, ihipInit); - if ((device < 0) || (device > g_devices.size())) { + if ((device < 0) || (device >= g_devices.size())) { return ihipLogStatus(hipErrorInvalidDevice); } else { tls_defaultDevice = device; @@ -1428,7 +1427,7 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind hipError_t e = hipSuccess; -#ifdef USE_ASYNC_COPY +#if USE_ASYNC_COPY if (ihipIsValidDevice(stream->_device_index)) { ihipDevice_t *device = &g_devices[stream->_device_index]; From fe67be1134bda50a7620f691720726315d19c019 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Wed, 10 Feb 2016 11:52:42 -0600 Subject: [PATCH 02/32] Create address tracker for am_alloc. Tracks device where memory is allocated, pinned-host or device, and more. 
Uses memory-range-based lookups - so pointers that exist anywhere in the range of hostPtr + size will find the associated AmPointerInfo. The insertions and lookups use a self-balancing binary tree and should support O(logN) lookup speed. [ROCm/clr commit: 721508cc2f6d9d3e3875b05c85fe72b0c5572aa8] --- projects/clr/hipamd/bin/hipcc | 1 + projects/clr/hipamd/bin/hipify | 2 + projects/clr/hipamd/include/hcc_detail/AM.h | 92 +++++ .../include/hcc_detail/hip_runtime_api.h | 8 + projects/clr/hipamd/include/hip_runtime_api.h | 24 ++ projects/clr/hipamd/src/hc_AM.cpp | 219 ++++++++++++ projects/clr/hipamd/src/hip_hcc.cpp | 71 +++- projects/clr/hipamd/tests/src/CMakeLists.txt | 2 + .../clr/hipamd/tests/src/hipPointerAttrib.cpp | 319 ++++++++++++++++++ projects/clr/hipamd/tests/src/test_common.cpp | 14 +- projects/clr/hipamd/tests/src/test_common.h | 2 + 11 files changed, 743 insertions(+), 11 deletions(-) create mode 100644 projects/clr/hipamd/include/hcc_detail/AM.h create mode 100644 projects/clr/hipamd/src/hc_AM.cpp create mode 100644 projects/clr/hipamd/tests/src/hipPointerAttrib.cpp diff --git a/projects/clr/hipamd/bin/hipcc b/projects/clr/hipamd/bin/hipcc index d001c6febe..7537750ff6 100755 --- a/projects/clr/hipamd/bin/hipcc +++ b/projects/clr/hipamd/bin/hipcc @@ -71,6 +71,7 @@ if ($HIP_PLATFORM eq "hcc") { $HIPLDFLAGS .= " -L$HSA_PATH/lib -lhsa-runtime64 -lhc_am"; # Add C++ libs for GCC. 
$HIPLDFLAGS .= " -lstdc++"; + $HIPLDFLAGS .= " -lm"; if ($verbose & 0x2) { print ("HSA_PATH=$HSA_PATH\n"); diff --git a/projects/clr/hipamd/bin/hipify b/projects/clr/hipamd/bin/hipify index d143bdff37..e3b6c64c88 100755 --- a/projects/clr/hipamd/bin/hipify +++ b/projects/clr/hipamd/bin/hipify @@ -277,6 +277,8 @@ while (@ARGV) { $ft{'mem'} += s/\bcudaMemcpyKind\b/hipMemcpyKind/g; + $ft{'mem'} += s/\bcudaPointerAttributes\b/hipPointerAttribute_t/g; + #-------- # Memory management: diff --git a/projects/clr/hipamd/include/hcc_detail/AM.h b/projects/clr/hipamd/include/hcc_detail/AM.h new file mode 100644 index 0000000000..1cfcf2dfb2 --- /dev/null +++ b/projects/clr/hipamd/include/hcc_detail/AM.h @@ -0,0 +1,92 @@ +#pragma once + +#include + +typedef int am_status_t; +#define AM_SUCCESS 0 +// TODO - provide better mapping of HSA error conditions to HC error codes. +#define AM_ERROR_MISC -1 /** Miscellaneous error */ + +// Flags for am_alloc API: +#define amHostPinned 0x1 + + +namespace hc { + +// This is the data that is maintained for each pointer: +struct AmPointerInfo { + bool _isDeviceMem; + void * _hostPointer; + void * _devicePointer; + size_t _sizeBytes; + hc::accelerator _acc; + unsigned _allocationFlags; + + AmPointerInfo() {}; + + AmPointerInfo(bool isDeviceMem, void *hostPointer, void *devicePointer, size_t sizeBytes, hc::accelerator acc, unsigned allocationFlags) : + _isDeviceMem(isDeviceMem), + _hostPointer(hostPointer), + _devicePointer(devicePointer), + _sizeBytes(sizeBytes), + _acc(acc), + _allocationFlags(allocationFlags) {}; +}; +} + + + +namespace hc { + + +/** + * Allocates a block of @p size bytes of memory on the specified @p acc. + * + * The contents of the newly allocated block of memory are not initialized. + * + * If @p size == 0, 0 is returned. + * + * @p flags must be 0 (device memory) or amHostPinned (pinned host memory). + * + * @returns : On success, pointer to the newly allocated memory is returned. + * The pointer is typecast to the desired return type. 
+ * + * If an error occurred trying to allocate the requested memory, 0 is returned. + * + * @see am_free, am_copy + */ +auto_voidp AM_alloc(size_t size, hc::accelerator acc, unsigned flags); + +/** + * Frees a block of memory previously allocated with am_alloc. + * + * @see am_alloc, am_copy + */ +am_status_t AM_free(void* ptr); + + +/** + * Copies @p size bytes of memory from @p src to @p dst. The memory areas (src+size and dst+size) must not overlap. + * + * @returns AM_SUCCESS on success or AM_ERROR_MISC if an error occurs. + * @see am_alloc, am_free + */ +am_status_t AM_copy(void* dst, const void* src, size_t size); + +am_status_t AM_get_pointer_info(hc::AmPointerInfo *info, void *ptr); + + +// TODO-implement these: +//am_status_t AM_track_pointer(void* ptr, size_t size, bool isDeviceMem=false, unsigned allocationFlags=0x0); +//am_status_t AM_untrack_pointer(void* ptr); + +/** + * Prints the contents of the memory tracker table to stderr + * + * Intended primarily for debug purposes. + **/ +void AM_print_tracker(); + + +}; // namespace hc + diff --git a/projects/clr/hipamd/include/hcc_detail/hip_runtime_api.h b/projects/clr/hipamd/include/hcc_detail/hip_runtime_api.h index 225b065654..a0c676987b 100644 --- a/projects/clr/hipamd/include/hcc_detail/hip_runtime_api.h +++ b/projects/clr/hipamd/include/hcc_detail/hip_runtime_api.h @@ -105,6 +105,8 @@ enum hipMemcpyKind { } ; + + // Doxygen end group GlobalDefs /** @} */ @@ -128,6 +130,7 @@ typedef struct hipEvent_t { + #ifdef __cplusplus } /* extern "C" */ #endif @@ -634,6 +637,11 @@ hipError_t hipEventQuery(hipEvent_t event) ; */ +/** + * @brief Return attributes for the specified pointer + */ +hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) ; + /** * @brief Allocate memory on the default accelerator diff --git a/projects/clr/hipamd/include/hip_runtime_api.h b/projects/clr/hipamd/include/hip_runtime_api.h index 882103a1f4..41ad338d6d 100644 --- 
a/projects/clr/hipamd/include/hip_runtime_api.h +++ b/projects/clr/hipamd/include/hip_runtime_api.h @@ -97,6 +97,30 @@ typedef struct hipDeviceProp_t { } hipDeviceProp_t; +/** + * Memory type (for pointer attributes) + */ +enum hipMemoryType { + hipMemoryTypeHost, ///< Memory is physically located on host + hipMemoryTypeDevice ///< Memory is physically located on device. (see deviceId for specific device) +}; + + + +/** + * Pointer attributes + */ +typedef struct hipPointerAttribute_t { + enum hipMemoryType memoryType; + int device; + void *devicePointer; + void *hostPointer; + int isManaged; + unsigned allocationFlags; /* flags specified when memory was allocated*/ + /* peers? */ +} hipPointerAttribute_t; + + // hack to get these to show up in Doxygen: /** * @defgroup GlobalDefs Global enum and defines diff --git a/projects/clr/hipamd/src/hc_AM.cpp b/projects/clr/hipamd/src/hc_AM.cpp new file mode 100644 index 0000000000..87e29e4bcc --- /dev/null +++ b/projects/clr/hipamd/src/hc_AM.cpp @@ -0,0 +1,219 @@ + +#include "hc_am.hpp" +#include "hsa.h" + + +#include "hcc_detail/AM.h" // TODO - Remove me. + +#define DB_TRACKER 1 + +#if DB_TRACKER +#define mprintf( ...) {\ + fprintf (stderr, __VA_ARGS__);\ + }; +#else +#define mprintf( ...) 
+#endif + +//========================================================================================================= +// Pointer Tracker Structures: +//========================================================================================================= +#include +#include +//#include + +struct AmMemoryRange { + void * _basePointer; + void * _endPointer; + AmMemoryRange(void *basePointer, size_t sizeBytes) : + _basePointer(basePointer), _endPointer((unsigned char*)basePointer + sizeBytes - 1) {}; +}; + +// Functor to compare ranges: +struct AmMemoryRangeCompare { + // Return true if the LHS range lies entirely below the RHS range - used to order the tracker map. + bool operator()(const AmMemoryRange &lhs, const AmMemoryRange &rhs) const + { + return lhs._endPointer < rhs._basePointer; + } + +}; + + +std::ostream &operator<<(std::ostream &os, const hc::AmPointerInfo &ap) +{ + os << "hostPointer:" << ap._hostPointer << " devicePointer:"<< ap._devicePointer << " sizeBytes:" << ap._sizeBytes + << " isDeviceMem:" << ap._isDeviceMem << " allocFlags:" << ap._allocationFlags; + return os; +} + + + +// This structure tracks information for each pointer. +// Uses memory-range-based lookups - so pointers that exist anywhere in the range of hostPtr + size will find the associated AmPointerInfo. +// The insertions and lookups use a self-balancing binary tree and should support O(logN) lookup speed. +// The structure is thread-safe - writers obtain a mutex before modifying the tree. Multiple simulatenous readers are supported. 
+class AmPointerTracker { +typedef std::map MapTrackerType; +public: + + void insert(void *pointer, const hc::AmPointerInfo &p); + int remove(void *pointer); + + MapTrackerType::iterator find(void *hostPtr); + + MapTrackerType::iterator end() { return _tracker.end(); }; + + std::ostream & print (std::ostream &os); +private: + MapTrackerType _tracker; + //std::shared_timed_mutex _mut; +}; + + +//--- +void AmPointerTracker::insert (void *pointer, const hc::AmPointerInfo &p) +{ + // TODO-mutex - write lock. + mprintf ("insert: %p + %zu\n", pointer, p._sizeBytes); + _tracker.insert(std::make_pair(AmMemoryRange(pointer, p._sizeBytes), p)); + + +} + + +//--- +// Return 1 if removed or 0 if not found. +int AmPointerTracker::remove (void *pointer) +{ + // TODO-mutex - write lock. + mprintf ("remove: %p\n", pointer); + return _tracker.erase(AmMemoryRange(pointer,1)); +} + + +//--- +AmPointerTracker::MapTrackerType::iterator AmPointerTracker::find (void *pointer) +{ + // TODO-mutex- read lock + auto iter = _tracker.find(AmMemoryRange(pointer,1)); + mprintf ("find: %p\n", pointer); + return iter; +} + + +std::ostream & AmPointerTracker::print (std::ostream &os) +{ + for (auto iter = _tracker.begin() ; iter != _tracker.end(); iter++) { + os << " " << iter->first._basePointer << "..." << iter->first._endPointer << ":: "; + os << iter->second << std::endl; + } + + return os; +} + + +//========================================================================================================= +// Global var defs: +//========================================================================================================= +AmPointerTracker g_amPointerTracker; // Track all am pointer allocations. + + +//========================================================================================================= +// API Definitions. 
+//========================================================================================================= +// +// + +namespace hc { + +// Allocate accelerator memory, return NULL if memory could not be allocated: +auto_voidp AM_alloc(size_t sizeBytes, hc::accelerator acc, unsigned flags) +{ + + void *ptr = NULL; + + if (sizeBytes != 0 ) { + if (acc.is_hsa_accelerator()) { + hsa_agent_t *hsa_agent = static_cast (acc.get_default_view().get_hsa_agent()); + hsa_region_t *alloc_region; + if (flags & amHostPinned) { + alloc_region = static_cast(acc.get_hsa_am_system_region()); + } else { + alloc_region = static_cast(acc.get_hsa_am_region()); + } + + if (alloc_region->handle != -1) { + + hsa_status_t s1 = hsa_memory_allocate(*alloc_region, sizeBytes, &ptr); + hsa_status_t s2 = hsa_memory_assign_agent(ptr, *hsa_agent, HSA_ACCESS_PERMISSION_RW); + + if ((s1 != HSA_STATUS_SUCCESS) || (s2 != HSA_STATUS_SUCCESS)) { + ptr = NULL; + } else { + if (flags & amHostPinned) { + g_amPointerTracker.insert(ptr, + hc::AmPointerInfo(false/*isDevice*/, ptr/*hostPointer*/, ptr /*devicePointer*/, sizeBytes, acc, flags)); + } else { + g_amPointerTracker.insert(ptr, + hc::AmPointerInfo(true/*isDevice*/, NULL/*hostPointer*/, ptr /*devicePointer*/, sizeBytes, acc, flags)); + } + } + } + } + } + + return ptr; +}; + + +am_status_t AM_free(void* ptr) +{ + am_status_t status = AM_SUCCESS; + + if (ptr != NULL) { + hsa_memory_free(ptr); + + size_t numRemoved = g_amPointerTracker.remove(ptr) ; + if (numRemoved == 0) { + status = AM_ERROR_MISC; + } + } + return status; +} + + + +am_status_t AM_copy(void* dst, const void* src, size_t sizeBytes) +{ + am_status_t am_status = AM_ERROR_MISC; + hsa_status_t err = hsa_memory_copy(dst, src, sizeBytes); + + if (err == HSA_STATUS_SUCCESS) { + am_status = AM_SUCCESS; + } else { + am_status = AM_ERROR_MISC; + } + + return am_status; +} + + +am_status_t AM_get_pointer_info(hc::AmPointerInfo *info, void *ptr) +{ + auto infoI = g_amPointerTracker.find(ptr); + if 
(infoI != g_amPointerTracker.end()) { + *info = infoI->second; + return AM_SUCCESS; + } else { + return AM_ERROR_MISC; + } +} + +void AM_print_tracker() +{ + g_amPointerTracker.print(std::cerr); +} + + +} // end namespace hc. diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index f9f3b1b1f5..e9ee4c41dc 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -31,6 +31,8 @@ THE SOFTWARE. #include #include #include +#include + #include #include @@ -38,6 +40,9 @@ THE SOFTWARE. #include "hsa_ext_amd.h" + +#include "hc_AM.cpp" + #define USE_PINNED_HOST (__hcc_workweek__ >= 1601) #define USE_ASYNC_COPY 0 @@ -466,7 +471,8 @@ void ihipInit() g_devices.reserve(accs.size()); for (int i=0; imemoryType = amPointerInfo._isDeviceMem ? hipMemoryTypeDevice: hipMemoryTypeHost; + attributes->hostPointer = amPointerInfo._hostPointer; + attributes->devicePointer = amPointerInfo._devicePointer; + attributes->isManaged = 0; + attributes->allocationFlags = amPointerInfo._allocationFlags; + + + attributes->device = -1; + e = hipErrorInvalidDevice; + for (int i=0; idevice = i; + e = hipSuccess; + break; + } + } + } else { + attributes->memoryType = hipMemoryTypeDevice; + attributes->hostPointer = 0; + attributes->devicePointer = 0; + attributes->device = -1; + attributes->isManaged = 0; + attributes->allocationFlags = 0; + + e = hipErrorInvalidValue; + } + + return ihipLogStatus(e); +} + + // kernel for launching memcpy operations: template @@ -1345,9 +1398,9 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) hipError_t hip_status = hipSuccess; const unsigned am_flags = 0; - *ptr = hc::am_alloc(sizeBytes, ihipGetTlsDefaultDevice()->_acc, am_flags); + *ptr = hc::AM_alloc(sizeBytes, ihipGetTlsDefaultDevice()->_acc, am_flags); - if (*ptr == NULL) { + if (sizeBytes && (*ptr == NULL)) { hip_status = hipErrorMemoryAllocation; } else { hip_status = hipSuccess; @@ -1367,9 +1420,9 @@ hipError_t hipMallocHost(void** ptr, 
size_t sizeBytes) const unsigned am_flags = amHostPinned; - *ptr = hc::am_alloc(sizeBytes, ihipGetTlsDefaultDevice()->_acc, am_flags); + *ptr = hc::AM_alloc(sizeBytes, ihipGetTlsDefaultDevice()->_acc, am_flags); hipError_t hip_status = hipSuccess; - if (*ptr == NULL) { + if (sizeBytes && (*ptr == NULL)) { hip_status = hipErrorMemoryAllocation; } else { hip_status = hipSuccess; @@ -1444,7 +1497,7 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind #else // TODO-hsart - what synchronization does hsa_copy provide? - hc::am_copy(dst, src, sizeBytes); + hc::AM_copy(dst, src, sizeBytes); e = hipSuccess; #endif @@ -1475,7 +1528,7 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcp // TODO-hsart This routine needs to ensure that dst and src are mapped on the GPU. // This is a synchronous copy - remove and replace with code below when we have appropriate LOCK APIs. - hc::am_copy(dst, src, sizeBytes); + hc::AM_copy(dst, src, sizeBytes); #if 0 @@ -1592,7 +1645,7 @@ hipError_t hipFree(void* ptr) ihipWaitAllStreams(ihipGetTlsDefaultDevice()); if (ptr) { - hc::am_free(ptr); + hc::AM_free(ptr); } return ihipLogStatus(hipSuccess); @@ -1606,7 +1659,7 @@ hipError_t hipFreeHost(void* ptr) if (ptr) { #if USE_PINNED_HOST tprintf (TRACE_MEM, " %s: %p\n", __func__, ptr); - hc::am_free(ptr); + hc::AM_free(ptr); #else free(ptr); #endif diff --git a/projects/clr/hipamd/tests/src/CMakeLists.txt b/projects/clr/hipamd/tests/src/CMakeLists.txt index 0ec287b334..bf05fc8407 100644 --- a/projects/clr/hipamd/tests/src/CMakeLists.txt +++ b/projects/clr/hipamd/tests/src/CMakeLists.txt @@ -114,6 +114,7 @@ make_hip_executable (hipSimpleAtomicsTest hipSimpleAtomicsTest.cpp) make_hip_executable (hipMathFunctionsHost hipMathFunctions.cpp hipSinglePrecisionMathHost.cpp hipDoublePrecisionMathHost.cpp) make_hip_executable (hipMathFunctionsDevice hipMathFunctions.cpp hipSinglePrecisionMathDevice.cpp hipDoublePrecisionMathDevice.cpp) 
make_hip_executable (hipIntrinsics hipMathFunctions.cpp hipSinglePrecisionIntrinsics.cpp hipDoublePrecisionIntrinsics.cpp hipIntegerIntrinsics.cpp) +make_hip_executable (hipPointerAttrib hipPointerAttrib.cpp) target_link_libraries(hipMathFunctionsHost m) make_test(hip_ballot " " ) @@ -128,6 +129,7 @@ make_test(hipMemset --N 10 --memsetval 0x42 ) # small copy, just 10 bytes. make_test(hipMemset --N 10013 --memsetval 0x5a ) # oddball size. make_test(hipMemset --N 256M --memsetval 0xa6 ) # big copy make_test(hipGridLaunch " " ) +make_test(hipPointerAttrib " " ) make_test(hipMemcpy " " ) diff --git a/projects/clr/hipamd/tests/src/hipPointerAttrib.cpp b/projects/clr/hipamd/tests/src/hipPointerAttrib.cpp new file mode 100644 index 0000000000..9d147d8183 --- /dev/null +++ b/projects/clr/hipamd/tests/src/hipPointerAttrib.cpp @@ -0,0 +1,319 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + + +// Test pointer tracking logic: allocate memory and retrieve stats with hipPointerGetAttributes + +#include "hip_runtime.h" +#include "test_common.h" + +#ifdef __HIP_PLATFORM_HCC__ +#include "hcc_detail/AM.h" +#endif + +size_t Nbytes = 0; + +//================================================================================================= +// Utility Functions: +//================================================================================================= + +bool operator==(const hipPointerAttribute_t &lhs, const hipPointerAttribute_t &rhs) +{ + return ((lhs.hostPointer == rhs.hostPointer) && + (lhs.devicePointer == rhs.devicePointer) && + (lhs.memoryType == rhs.memoryType) && + (lhs.device == rhs.device) && + (lhs.allocationFlags == rhs.allocationFlags) + ) ; + +}; + + +bool operator!=(const hipPointerAttribute_t &lhs, const hipPointerAttribute_t &rhs) +{ + return ! (lhs == rhs); +} + + +const char *memoryTypeToString(hipMemoryType memoryType) +{ + switch (memoryType) { + case hipMemoryTypeHost : return "[Host]"; + case hipMemoryTypeDevice : return "[Device]"; + default: return "[Unknown]"; + }; +} + + +void resetAttribs(hipPointerAttribute_t *attribs) +{ + attribs->hostPointer = (void*) (-1); + attribs->devicePointer = (void*) (-1); + attribs->memoryType = hipMemoryTypeHost; + attribs->device = -2; + attribs->isManaged = -1; + attribs->allocationFlags = 0xffff; +}; + + +void printAttribs(hipPointerAttribute_t *attribs) +{ + printf ("hostPointer:%p devicePointer:%p memoryType:%s deviceId:%d isManaged:%d allocationFlags:%u\n", + attribs->hostPointer, + attribs->devicePointer, + memoryTypeToString(attribs->memoryType), + attribs->device, + attribs->isManaged, + attribs->allocationFlags + ); +}; + + +inline int zrand(int max) +{ + return rand() % max; +} + + +//================================================================================================= +// Functins to run tests 
+//================================================================================================= +// +//Run through a couple simple cases to test lookups and hostd pointer arithmetic: +void simpleTests() +{ + char *A_d; + char *A_Pinned_h; + char *A_OSAlloc_h; + hipError_t e; + + HIPCHECK ( hipMalloc(&A_d, Nbytes) ); + HIPCHECK ( hipMallocHost(&A_Pinned_h, Nbytes) ); + A_OSAlloc_h = (char*)malloc(Nbytes); + + + hipPointerAttribute_t attribs; + hipPointerAttribute_t attribs2; + + // Device memory + printf ("\nDevice memory (hipMalloc)\n"); + HIPCHECK( hipPointerGetAttributes(&attribs, A_d)); + printf("getAttr:%-20s", "A_d"); printAttribs(&attribs); + + // Check pointer arithmetic cases: + resetAttribs(&attribs2); + HIPCHECK( hipPointerGetAttributes(&attribs2, A_d+100)); + printf("getAttr:%-20s", "A_d+100"); printAttribs(&attribs2); + HIPASSERT(attribs == attribs2); + + // Corner case at end of array: + resetAttribs(&attribs2); + HIPCHECK( hipPointerGetAttributes(&attribs2, A_d+Nbytes-1)); + printf("getAttr:%-20s", "A_d+NBytes-1"); printAttribs(&attribs2); + HIPASSERT(attribs == attribs2); + + // Pointer just beyond array - must be invalid or at least a different pointer + resetAttribs(&attribs2); + e = hipPointerGetAttributes(&attribs2, A_d+Nbytes+1); + printf("getAttr:%-20s err=%d (%s), neg-test expected\n", "A_d+NBytes", e, hipGetErrorString(e)); + if (e != hipErrorInvalidValue) { + // We might have strayed into another pointer area. + printf("getAttr:%-20s", "A_d+NBytes"); printAttribs(&attribs2); + HIPASSERT(attribs.devicePointer != attribs2.devicePointer); + } + + + resetAttribs(&attribs2); + e = hipPointerGetAttributes(&attribs2, A_d+Nbytes); + if (e != hipErrorInvalidValue) { + printf("%-20s", "A_d+Nbytes"); printAttribs(&attribs2); + HIPASSERT(attribs.devicePointer != attribs2.devicePointer); + } + + hipFree(A_d); + e = hipPointerGetAttributes(&attribs, A_d); + HIPASSERT(e == hipErrorInvalidValue); // Just freed the pointer, this should return an error. 
+ + + // Device-visible host memory + printf ("\nDevice-visible host memory (hipMallocHost)\n"); + HIPCHECK( hipPointerGetAttributes(&attribs, A_Pinned_h)); + printf("getAttr:%-20s", "A_pinned_h"); printAttribs(&attribs); + + resetAttribs(&attribs2); + HIPCHECK( hipPointerGetAttributes(&attribs2, A_Pinned_h+Nbytes/2)); + printf("getAttr:%-20s", "A_pinned_h+NBytes/2"); printAttribs(&attribs2); + HIPASSERT(attribs == attribs2); + + + hipFreeHost(A_Pinned_h); + e = hipPointerGetAttributes(&attribs, A_Pinned_h); + HIPASSERT(e == hipErrorInvalidValue); // Just freed the pointer, this should return an error. + printf("getAttr:%-20s err=%d (%s), neg-test expected\n", "A_d+NBytes", e, hipGetErrorString(e)); + + + // OS memory + printf ("\nOS-allocated memory (malloc)\n"); + e = hipPointerGetAttributes(&attribs, A_OSAlloc_h); + printf("getAttr:%-20s err=%d (%s), neg-test expected\n", "A_OSAlloc_h", e, hipGetErrorString(e)); + HIPASSERT(e == hipErrorInvalidValue); // OS-allocated pointers should return hipErrorInvalidValue. 
+} + + + + +struct SuperPointerAttribute { + void * _pointer; + size_t _sizeBytes; + hipPointerAttribute_t _attrib; +}; + + + +void checkPointer(SuperPointerAttribute &ref, int major, int minor, void *pointer) +{ + hipPointerAttribute_t attribs; + resetAttribs(&attribs); + + HIPCHECK(hipPointerGetAttributes(&attribs, pointer)); + if (attribs != ref._attrib) { + printf("Test %d.%d", major, minor); + printf(" ref :: "); printAttribs(&ref._attrib); + printf(" getattr:: "); printAttribs(&attribs); + + HIPASSERT(attribs == ref._attrib); + } else { + if (p_verbose & 0x1) { + printf("#%4d.%d GOOD:%p getattr :: ",major, minor, pointer); printAttribs(&attribs); + } + } +} + + +void clusterAllocs(int numAllocs, size_t minSize, size_t maxSize) +{ + printf ("===========================================================================\n"); + printf ("clusterAllocs numAllocs=%d size=%lu..%lu\n", numAllocs, minSize, maxSize); + printf ("===========================================================================\n"); + std::vector reference(numAllocs); + + HIPASSERT(minSize > 0); + HIPASSERT(maxSize >= minSize); + + int numDevices; + HIPCHECK(hipGetDeviceCount(&numDevices)); + + //--- + //Populate with device and host allocations. 
+ for (int i=0; i 1) { + checkPointer(ref, i, 2, (char *)ref._pointer + ref._sizeBytes-1); + } + + } +} + + +void testMultiThreaded() +{ + std::thread t1(clusterAllocs, 1000, 101, 1000); + std::thread t2(clusterAllocs, 1000, 11, 100); + std::thread t3(clusterAllocs, 1000, 5, 10); + std::thread t4(clusterAllocs, 1000, 1, 4); + + t1.join(); + t2.join(); + t3.join(); + t4.join(); +} + + +int main(int argc, char *argv[]) +{ + + N= 1000000; + HipTest::parseStandardArguments(argc, argv, true); + + HIPCHECK(hipSetDevice(p_gpuDevice)); + + Nbytes = N*sizeof(char); + + printf ("N=%zu (%6.2f MB) device=%d\n", N, Nbytes/(1024.0*1024.0), p_gpuDevice); + + + if (p_tests & 0x1) { + simpleTests(); + } + + if (p_tests & 0x2) { + srand(0x100); + clusterAllocs(100, 1024*1, 1024*1024); + } + + if (p_tests & 0x4) { + srand(0x200); + clusterAllocs(1000, 1, 10); // Many tiny allocations; + } + + if (p_tests & 0x8) { + testMultiThreaded(); + } + + printf ("\n"); + passed(); +} diff --git a/projects/clr/hipamd/tests/src/test_common.cpp b/projects/clr/hipamd/tests/src/test_common.cpp index d7a108a11b..02deb51c85 100644 --- a/projects/clr/hipamd/tests/src/test_common.cpp +++ b/projects/clr/hipamd/tests/src/test_common.cpp @@ -28,6 +28,8 @@ int iterations = 1; unsigned blocksPerCU = 6; // to hide latency unsigned threadsPerBlock = 256; int p_gpuDevice = 0; +unsigned p_verbose = 0; +int p_tests = -1; /*which tests to run. Interpretation is left to each test. 
default:all*/ @@ -114,8 +116,16 @@ int parseStandardArguments(int argc, char *argv[], bool failOnUndefinedArg) failed("Bad gpuDevice argument"); } - } - else { + } else if (!strcmp(arg, "--verbose") || (!strcmp(arg, "-v"))) { + if (++i >= argc || !HipTest::parseUInt(argv[i], &p_verbose)) { + failed("Bad verbose argument"); + } + } else if (!strcmp(arg, "--tests") || (!strcmp(arg, "-t"))) { + if (++i >= argc || !HipTest::parseInt(argv[i], &p_tests)) { + failed("Bad tests argument"); + } + + } else { if (failOnUndefinedArg) { failed("Bad argument '%s'", arg); } else { diff --git a/projects/clr/hipamd/tests/src/test_common.h b/projects/clr/hipamd/tests/src/test_common.h index fee052c1ad..57d2ebc831 100644 --- a/projects/clr/hipamd/tests/src/test_common.h +++ b/projects/clr/hipamd/tests/src/test_common.h @@ -53,6 +53,8 @@ extern int iterations; extern unsigned blocksPerCU; extern unsigned threadsPerBlock; extern int p_gpuDevice; +extern unsigned p_verbose; +extern int p_tests; namespace HipTest { From 2089e549eb2bec9958d44336026341467a3b10f2 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 11 Feb 2016 22:03:01 -0600 Subject: [PATCH 03/32] Tracker improvements - add API to add / remove user-pointers from the tracker. - test for thread-safety with MultiThreadtest_2 - rapid insertions/removal. - add mutex to provide thread-safety. - rename tracker interface to "memtracker_..." for consistency. - add am_memtracker_reset, connect to hipDeviceReset. 
- [ROCm/clr commit: 7216727fba00e2c210eb1c05c4fd3c29e50b1b88] --- projects/clr/hipamd/include/hcc_detail/AM.h | 60 ++++-- projects/clr/hipamd/src/hc_AM.cpp | 135 ++++++++++-- projects/clr/hipamd/src/hip_hcc.cpp | 15 +- .../clr/hipamd/tests/src/hipPointerAttrib.cpp | 197 +++++++++++++++--- 4 files changed, 353 insertions(+), 54 deletions(-) diff --git a/projects/clr/hipamd/include/hcc_detail/AM.h b/projects/clr/hipamd/include/hcc_detail/AM.h index 1cfcf2dfb2..d41fed317a 100644 --- a/projects/clr/hipamd/include/hcc_detail/AM.h +++ b/projects/clr/hipamd/include/hcc_detail/AM.h @@ -15,22 +15,27 @@ namespace hc { // This is the data that is maintained for each pointer: struct AmPointerInfo { - bool _isDeviceMem; - void * _hostPointer; - void * _devicePointer; - size_t _sizeBytes; - hc::accelerator _acc; - unsigned _allocationFlags; + void * _hostPointer; ///< Host pointer. If host access is not allowed, NULL. + void * _devicePointer; ///< Device pointer. + size_t _sizeBytes; ///< Size of allocation. + hc::accelerator _acc; ///< Device / Accelerator to use. + bool _isInDeviceMem; ///< Memory is physically resident on a device (if false, memory is located on host) + bool _isAmManaged; ///< Memory was allocated by AM and should be freed when am_reset is called. + + int _appId; ///< App-specific storage. Used by HIP to store deviceID. + unsigned _appAllocationFlags; ///< App-specific allocation flags. Used by HIP to store allocation flags. 
AmPointerInfo() {}; - AmPointerInfo(bool isDeviceMem, void *hostPointer, void *devicePointer, size_t sizeBytes, hc::accelerator acc, unsigned allocationFlags) : - _isDeviceMem(isDeviceMem), + AmPointerInfo(void *hostPointer, void *devicePointer, size_t sizeBytes, hc::accelerator acc, bool isInDeviceMem, bool isAmManaged) : _hostPointer(hostPointer), _devicePointer(devicePointer), _sizeBytes(sizeBytes), _acc(acc), - _allocationFlags(allocationFlags) {}; + _isInDeviceMem(isInDeviceMem), + _isAmManaged(isAmManaged), + _appId(-1), + _appAllocationFlags(0) {}; }; } @@ -73,19 +78,46 @@ am_status_t AM_free(void* ptr); */ am_status_t AM_copy(void* dst, const void* src, size_t size); -am_status_t AM_get_pointer_info(hc::AmPointerInfo *info, void *ptr); +/** + * Return information about tracked pointer. + * + * AM tracks pointers when they are allocated or added to tracker with am_memtracker_add. + * The tracker tracks the base pointer as well as the size of the allocation, and will + * find the information for a pointer anywhere in the tracked range. + * + * @returns AM_ERROR_MISC if pointer is not currently being tracked. + * @returns AM_SUCCESS if pointer is tracked and writes info to @p info. + * + * @see am_memtracker_add + */ +am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, void *ptr); +//TODO-doc +am_status_t am_memtracker_update(void* ptr, int appId, unsigned allocationFlags); -// TODO-implement these: -//am_status_t AM_track_pointer(void* ptr, size_t size, bool isDeviceMem=false, unsigned allocationFlags=0x0); -//am_status_t AM_untrack_pointer(void* ptr); +am_status_t am_memtracker_add(void* ptr, size_t sizeBytes, hc::accelerator acc, bool isDeviceMem=false); + +/** + * Remove the pointer from the tracker structure. + * + * @p ptr may be anywhere in a tracked memory range. + * + * @returns AM_ERROR_MISC if pointer is not found in tracker. + * @returns AM_SUCCESS if pointer was found in tracker and removed. 
+ */ +am_status_t am_memtracker_remove(void* ptr); + +/** + * Remove all memory allocations associated with specified accelerator. + */ +size_t am_memtracker_reset(hc::accelerator acc); /** * Prints the contents of the memory tracker table to stderr * * Intended primarily for debug purposes. **/ -void AM_print_tracker(); +void am_memtracker_print(); }; // namespace hc diff --git a/projects/clr/hipamd/src/hc_AM.cpp b/projects/clr/hipamd/src/hc_AM.cpp index 87e29e4bcc..36c8abf193 100644 --- a/projects/clr/hipamd/src/hc_AM.cpp +++ b/projects/clr/hipamd/src/hc_AM.cpp @@ -5,7 +5,8 @@ #include "hcc_detail/AM.h" // TODO - Remove me. -#define DB_TRACKER 1 +#define DB_TRACKER 0 +#define MUTEX_LOCK 1 #if DB_TRACKER #define mprintf( ...) {\ @@ -43,14 +44,16 @@ struct AmMemoryRangeCompare { std::ostream &operator<<(std::ostream &os, const hc::AmPointerInfo &ap) { os << "hostPointer:" << ap._hostPointer << " devicePointer:"<< ap._devicePointer << " sizeBytes:" << ap._sizeBytes - << " isDeviceMem:" << ap._isDeviceMem << " allocFlags:" << ap._allocationFlags; + << " isInDeviceMem:" << ap._isInDeviceMem << " isAmManaged:" << ap._isAmManaged + << " appId:" << ap._appId << " appAllocFlags:" << ap._appAllocationFlags; return os; } - +//------------------------------------------------------------------------------------------------- // This structure tracks information for each pointer. -// Uses memory-range-based lookups - so pointers that exist anywhere in the range of hostPtr + size will find the associated AmPointerInfo. +// Uses memory-range-based lookups - so pointers that exist anywhere in the range of hostPtr + size +// will find the associated AmPointerInfo. // The insertions and lookups use a self-balancing binary tree and should support O(logN) lookup speed. // The structure is thread-safe - writers obtain a mutex before modifying the tree. Multiple simulatenous readers are supported. 
class AmPointerTracker { @@ -64,9 +67,18 @@ public: MapTrackerType::iterator end() { return _tracker.end(); }; + size_t reset (hc::accelerator acc); + std::ostream & print (std::ostream &os); private: + // TODO - use or remove. + inline void writeLock(); + inline void writeUnlock(); + inline void readLock(); + inline void readUnlock(); + MapTrackerType _tracker; + std::mutex _mutex; //std::shared_timed_mutex _mut; }; @@ -74,11 +86,10 @@ private: //--- void AmPointerTracker::insert (void *pointer, const hc::AmPointerInfo &p) { - // TODO-mutex - write lock. + std::lock_guard l (_mutex); + mprintf ("insert: %p + %zu\n", pointer, p._sizeBytes); _tracker.insert(std::make_pair(AmMemoryRange(pointer, p._sizeBytes), p)); - - } @@ -87,6 +98,7 @@ void AmPointerTracker::insert (void *pointer, const hc::AmPointerInfo &p) int AmPointerTracker::remove (void *pointer) { // TODO-mutex - write lock. + std::lock_guard l (_mutex); mprintf ("remove: %p\n", pointer); return _tracker.erase(AmMemoryRange(pointer,1)); } @@ -96,14 +108,17 @@ int AmPointerTracker::remove (void *pointer) AmPointerTracker::MapTrackerType::iterator AmPointerTracker::find (void *pointer) { // TODO-mutex- read lock + std::lock_guard l (_mutex); auto iter = _tracker.find(AmMemoryRange(pointer,1)); mprintf ("find: %p\n", pointer); return iter; } +//--- std::ostream & AmPointerTracker::print (std::ostream &os) { + std::lock_guard l (_mutex); for (auto iter = _tracker.begin() ; iter != _tracker.end(); iter++) { os << " " << iter->first._basePointer << "..." << iter->first._endPointer << ":: "; os << iter->second << std::endl; @@ -112,6 +127,65 @@ std::ostream & AmPointerTracker::print (std::ostream &os) return os; } +//--- +// Remove all tracked locations, and free the associated memory (if the range was originally allocated by AM). +// Returns count of ranges removed. 
+size_t AmPointerTracker::reset (hc::accelerator acc) +{ + std::lock_guard l (_mutex); + mprintf ("reset: \n"); + + size_t count = 0; + // relies on C++11 (erase returns iterator) + for (auto iter = _tracker.begin() ; iter != _tracker.end(); ) { + if (iter->second._acc == acc) { + if (iter->second._isAmManaged) { + hsa_memory_free(iter->first._basePointer); + } + count++; + + iter = _tracker.erase(iter); + } else { + iter++; + } + } + + return count; +} + + + +//--- +void AmPointerTracker::writeLock () +{ + _mutex.lock(); +} + + +//--- +void AmPointerTracker::writeUnlock () +{ + _mutex.unlock(); +} + + +//--- +// TODO - support multiple concurrent reader +void AmPointerTracker::readLock () +{ + _mutex.lock(); +} + + +//--- +// TODO - support multiple concurrent reader +void AmPointerTracker::readUnlock () +{ + _mutex.unlock(); +} + + + //========================================================================================================= // Global var defs: @@ -153,10 +227,10 @@ auto_voidp AM_alloc(size_t sizeBytes, hc::accelerator acc, unsigned flags) } else { if (flags & amHostPinned) { g_amPointerTracker.insert(ptr, - hc::AmPointerInfo(false/*isDevice*/, ptr/*hostPointer*/, ptr /*devicePointer*/, sizeBytes, acc, flags)); + hc::AmPointerInfo(ptr/*hostPointer*/, ptr /*devicePointer*/, sizeBytes, acc, false/*isDevice*/, true /*isAMManaged*/)); } else { g_amPointerTracker.insert(ptr, - hc::AmPointerInfo(true/*isDevice*/, NULL/*hostPointer*/, ptr /*devicePointer*/, sizeBytes, acc, flags)); + hc::AmPointerInfo(NULL/*hostPointer*/, ptr /*devicePointer*/, sizeBytes, acc, true/*isDevice*/, true /*isAMManaged*/)); } } } @@ -172,9 +246,10 @@ am_status_t AM_free(void* ptr) am_status_t status = AM_SUCCESS; if (ptr != NULL) { + // See also tracker::reset which can free memory. 
hsa_memory_free(ptr); - size_t numRemoved = g_amPointerTracker.remove(ptr) ; + int numRemoved = g_amPointerTracker.remove(ptr) ; if (numRemoved == 0) { status = AM_ERROR_MISC; } @@ -199,7 +274,7 @@ am_status_t AM_copy(void* dst, const void* src, size_t sizeBytes) } -am_status_t AM_get_pointer_info(hc::AmPointerInfo *info, void *ptr) +am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, void *ptr) { auto infoI = g_amPointerTracker.find(ptr); if (infoI != g_amPointerTracker.end()) { @@ -210,10 +285,46 @@ am_status_t AM_get_pointer_info(hc::AmPointerInfo *info, void *ptr) } } -void AM_print_tracker() + +am_status_t am_memtracker_update(void* ptr, int appId, unsigned allocationFlags); + + +am_status_t am_memtracker_add(void* ptr, size_t sizeBytes, hc::accelerator acc, bool isDeviceMem) +{ + if (isDeviceMem) { + g_amPointerTracker.insert(ptr, hc::AmPointerInfo(ptr/*hostPointer*/, ptr /*devicePointer*/, sizeBytes, acc, true/*isDevice*/, false /*isAMManaged*/)); + } else { + g_amPointerTracker.insert(ptr, hc::AmPointerInfo(NULL/*hostPointer*/, ptr /*devicePointer*/, sizeBytes, acc, false/*isDevice*/, false /*isAMManaged*/)); + } + + return AM_SUCCESS; +} + + +am_status_t am_memtracker_remove(void* ptr) +{ + am_status_t status = AM_SUCCESS; + + int numRemoved = g_amPointerTracker.remove(ptr) ; + if (numRemoved == 0) { + status = AM_ERROR_MISC; + } + + return status; +} + +//--- +void am_memtracker_print() { g_amPointerTracker.print(std::cerr); } +//--- +size_t am_memtracker_reset(hc::accelerator acc) +{ + return g_amPointerTracker.reset(acc); +} + + } // end namespace hc. diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index e9ee4c41dc..a4246dc9cb 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -44,8 +44,8 @@ THE SOFTWARE. 
#include "hc_AM.cpp" #define USE_PINNED_HOST (__hcc_workweek__ >= 1601) - #define USE_ASYNC_COPY 0 +#define USE_AM_TRACKER 1 /* use new AM memory tracker features */ #define INLINE static inline @@ -802,6 +802,13 @@ hipError_t hipDeviceReset(void) // It should destroy and clean up all resources allocated with the default device in the current process. // and needs to destroy all queues as well. // +#if USE_AM_TRACKER + // TODO - remove bug above. + ihipDevice_t *device = ihipGetTlsDefaultDevice(); + if (device) { + am_memtracker_reset(device->_acc); + } +#endif return ihipLogStatus(hipSuccess); } @@ -1281,14 +1288,14 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) hipError_t e = hipSuccess; hc::AmPointerInfo amPointerInfo; - am_status_t status = hc::AM_get_pointer_info(&amPointerInfo, ptr); + am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, ptr); if (status == AM_SUCCESS) { - attributes->memoryType = amPointerInfo._isDeviceMem ? hipMemoryTypeDevice: hipMemoryTypeHost; + attributes->memoryType = amPointerInfo._isInDeviceMem ? 
hipMemoryTypeDevice: hipMemoryTypeHost; attributes->hostPointer = amPointerInfo._hostPointer; attributes->devicePointer = amPointerInfo._devicePointer; attributes->isManaged = 0; - attributes->allocationFlags = amPointerInfo._allocationFlags; + attributes->allocationFlags = amPointerInfo._appAllocationFlags; attributes->device = -1; diff --git a/projects/clr/hipamd/tests/src/hipPointerAttrib.cpp b/projects/clr/hipamd/tests/src/hipPointerAttrib.cpp index 9d147d8183..93d503af65 100644 --- a/projects/clr/hipamd/tests/src/hipPointerAttrib.cpp +++ b/projects/clr/hipamd/tests/src/hipPointerAttrib.cpp @@ -75,7 +75,7 @@ void resetAttribs(hipPointerAttribute_t *attribs) }; -void printAttribs(hipPointerAttribute_t *attribs) +void printAttribs(const hipPointerAttribute_t *attribs) { printf ("hostPointer:%p devicePointer:%p memoryType:%s deviceId:%d isManaged:%d allocationFlags:%u\n", attribs->hostPointer, @@ -99,8 +99,13 @@ inline int zrand(int max) //================================================================================================= // //Run through a couple simple cases to test lookups and hostd pointer arithmetic: -void simpleTests() +void testSimple() { + printf ("\n"); + printf ("===========================================================================\n"); + printf ("Simple Tests\n"); + printf ("===========================================================================\n"); + char *A_d; char *A_Pinned_h; char *A_OSAlloc_h; @@ -179,8 +184,24 @@ void simpleTests() } +void resetTracker () +{ + if (p_verbose & 0x1) { + printf ("info: reset tracker for all devices in platform\n"); + } + + int numDevices; + HIPCHECK(hipGetDeviceCount(&numDevices)); + + // Clean up: + for (int i=0; i reference(numAllocs); HIPASSERT(minSize > 0); @@ -244,14 +264,15 @@ void clusterAllocs(int numAllocs, size_t minSize, size_t maxSize) reference[i]._attrib.memoryType = hipMemoryTypeHost; reference[i]._attrib.devicePointer = ptr; reference[i]._attrib.hostPointer = ptr; - 
reference[i]._attrib.allocationFlags = 1; // TODO-randomize these. + reference[i]._attrib.allocationFlags = 0; // TODO-randomize these. } reference[i]._pointer = ptr; } #ifdef __HIP_PLATFORM_HCC__ if (p_verbose & 0x2) { - hc::AM_print_tracker(); + printf ("Tracker after insertions:\n"); + hc::am_memtracker_print(); } #endif @@ -265,27 +286,143 @@ void clusterAllocs(int numAllocs, size_t minSize, size_t maxSize) checkPointer(ref, i, 2, (char *)ref._pointer + ref._sizeBytes-1); } + if (ref._attrib.memoryType == hipMemoryTypeDevice) { + hipFree(ref._pointer); + } else { + hipFreeHost(ref._pointer); + } + + } + + + +#ifdef __HIP_PLATFORM_HCC__ + if (p_verbose & 0x2) { + printf ("Tracker after cleanup:\n"); + hc::am_memtracker_print(); + } +#endif +} + + +void testMultiThreaded_1(bool serialize=false) +{ + printf ("\n===========================================================================\n"); + printf ("MultiThreaded_1\n"); + if (serialize) printf ("[SERIALIZE]\n"); + printf ("===========================================================================\n"); + std::thread t1(clusterAllocs, 1000, 101, 1000); + if (serialize) t1.join(); + + std::thread t2(clusterAllocs, 1000, 11, 100); + if (serialize) t2.join(); + + std::thread t3(clusterAllocs, 1000, 5, 10); + if (serialize) t3.join(); + + std::thread t4(clusterAllocs, 1000, 1, 4); + if (serialize) t4.join(); + + if (!serialize) { + t1.join(); + t2.join(); + t3.join(); + t4.join(); + } + + resetTracker(); +} + + +///================================================================================================ + + +// Add pointers to tracker very quickly. 
+void thread_query(void *ptr, const hipPointerAttribute_t *refAttrib) +{ + int count = 0; + + for (int count=0; count< 1000000; count++) { + hipPointerAttribute_t a; + hipError_t e = hipPointerGetAttributes(&a, ptr); + if ((e != hipSuccess) || (a!= *refAttrib)) { + printf("Test %d (err=%d)\n", count, e); + HIPCHECK(e); + + printf(" ref :: "); printAttribs(refAttrib); + printf(" getattr:: "); printAttribs(&a); + } } } -void testMultiThreaded() +enum Dir {Up, Down}; +void thread_noise_generator(int iters, size_t numBuffers, Dir addDir, Dir removeDir) { - std::thread t1(clusterAllocs, 1000, 101, 1000); - std::thread t2(clusterAllocs, 1000, 11, 100); - std::thread t3(clusterAllocs, 1000, 5, 10); - std::thread t4(clusterAllocs, 1000, 1, 4); + const size_t bufferSize = 16; + size_t maxSize = numBuffers*bufferSize; + HIPASSERT((maxSize % bufferSize) == 0); // loop logic assumes this is true + + + for (int i=0; i inflight(2); + + printf ("\n===========================================================================\n"); + printf ("MultiThreaded_2\n"); + printf ("===========================================================================\n"); + + hipSetDevice(0); + hipDeviceReset(); + + // Create some entries in the tracker: + for (int i=0; i<1000; i++) { + void *C_d; + HIPCHECK(hipMalloc(&C_d, 32)); + } + + + // Allocate a pointer that we will repeatedly lookup: + void *A_d; + HIPCHECK(hipMalloc(&A_d, 10000)); + hipPointerAttribute_t attrib1; + HIPCHECK(hipPointerGetAttributes(&attrib1, A_d)); + std::thread t1(thread_query, A_d, &attrib1); + + std::thread t2(thread_noise_generator, 10000, 1000, Up, Up); t1.join(); t2.join(); - t3.join(); - t4.join(); + + hipSetDevice(0); + hipDeviceReset(); } int main(int argc, char *argv[]) { - N= 1000000; HipTest::parseStandardArguments(argc, argv, true); @@ -296,22 +433,34 @@ int main(int argc, char *argv[]) printf ("N=%zu (%6.2f MB) device=%d\n", N, Nbytes/(1024.0*1024.0), p_gpuDevice); - if (p_tests & 0x1) { - simpleTests(); + if 
(p_tests & 0x01) { + testSimple(); } - if (p_tests & 0x2) { + if (p_tests & 0x02) { srand(0x100); + printf ("\n===========================================================================\n"); clusterAllocs(100, 1024*1, 1024*1024); + resetTracker(); } - if (p_tests & 0x4) { + if (p_tests & 0x04) { srand(0x200); + printf ("\n===========================================================================\n"); clusterAllocs(1000, 1, 10); // Many tiny allocations; + resetTracker(); } - if (p_tests & 0x8) { - testMultiThreaded(); + if (p_tests & 0x08) { + srand(0x300); + testMultiThreaded_1(true); + testMultiThreaded_1(false); + } + + if (p_tests & 0x10) { + srand(0x400); + testMultiThreaded_2(); + resetTracker(); } printf ("\n"); From 712750e1a5fd25553e02a34a295a201459510a32 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 11 Feb 2016 23:07:19 -0600 Subject: [PATCH 04/32] Use memtracker 'appID' to store deviceID associated with ptr [ROCm/clr commit: c04b5d3afb22cbfb7a21a1a72c2abf4caca4af54] --- projects/clr/hipamd/src/hc_AM.cpp | 12 +++- projects/clr/hipamd/src/hip_hcc.cpp | 87 +++++++++++++++++++---------- 2 files changed, 70 insertions(+), 29 deletions(-) diff --git a/projects/clr/hipamd/src/hc_AM.cpp b/projects/clr/hipamd/src/hc_AM.cpp index 36c8abf193..92310164c0 100644 --- a/projects/clr/hipamd/src/hc_AM.cpp +++ b/projects/clr/hipamd/src/hc_AM.cpp @@ -286,7 +286,17 @@ am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, void *ptr) } -am_status_t am_memtracker_update(void* ptr, int appId, unsigned allocationFlags); +am_status_t am_memtracker_update(void* ptr, int appId, unsigned allocationFlags) +{ + auto iter = g_amPointerTracker.find(ptr); + if (iter != g_amPointerTracker.end()) { + iter->second._appId = appId; + iter->second._appAllocationFlags = allocationFlags; + return AM_SUCCESS; + } else { + return AM_ERROR_MISC; + } +} am_status_t am_memtracker_add(void* ptr, size_t sizeBytes, hc::accelerator acc, bool isDeviceMem) diff --git 
a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index a4246dc9cb..21da73d1da 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -1295,18 +1295,11 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) attributes->hostPointer = amPointerInfo._hostPointer; attributes->devicePointer = amPointerInfo._devicePointer; attributes->isManaged = 0; + attributes->allocationFlags = amPointerInfo._appAllocationFlags; + attributes->device = amPointerInfo._appId; - attributes->device = -1; - e = hipErrorInvalidDevice; - for (int i=0; idevice = i; - e = hipSuccess; - break; - } - } } else { attributes->memoryType = hipMemoryTypeDevice; attributes->hostPointer = 0; @@ -1322,6 +1315,36 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) } +// TODO - test this function: +/** + * @returns #hipSuccess, + * @returns #hipErrorInvalidValue if flags are not 0 + * @returns #hipErrorMemoryAllocation if hostPointer is not a tracked allocation. 
+ */ +hipError_t hipHostGetDevicePointer(void **devicePointer, void *hostPointer, unsigned flags) +{ + std::call_once(hip_initialized, ihipInit); + + hipError_t e = hipSuccess; + + // Flags must be 0: + if (flags == 0) { + e = hipErrorInvalidValue; + } else { + hc::AmPointerInfo amPointerInfo; + am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, hostPointer); + if (status == AM_SUCCESS) { + *devicePointer = amPointerInfo._devicePointer; + } else { + e = hipErrorMemoryAllocation; + *devicePointer = NULL; + } + } + + return ihipLogStatus(e); +} + + // kernel for launching memcpy operations: template @@ -1398,24 +1421,31 @@ ihipMemsetKernel(hipStream_t stream, T * ptr, T val, size_t sizeBytes) } //--- +/** + * @returns #hipSuccess #hipErrorMemoryAllocation + */ hipError_t hipMalloc(void** ptr, size_t sizeBytes) { std::call_once(hip_initialized, ihipInit); hipError_t hip_status = hipSuccess; - const unsigned am_flags = 0; - *ptr = hc::AM_alloc(sizeBytes, ihipGetTlsDefaultDevice()->_acc, am_flags); + auto device = ihipGetTlsDefaultDevice(); - if (sizeBytes && (*ptr == NULL)) { - hip_status = hipErrorMemoryAllocation; + if (device) { + const unsigned am_flags = 0; + *ptr = hc::AM_alloc(sizeBytes, device->_acc, am_flags); + + if (sizeBytes && (*ptr == NULL)) { + hip_status = hipErrorMemoryAllocation; + } else { + hc::am_memtracker_update(*ptr, device->_device_index, 0); + } } else { - hip_status = hipSuccess; + hip_status = hipErrorMemoryAllocation; } - ihipLogStatus(hip_status); - - return hip_status; + return ihipLogStatus(hip_status); } @@ -1423,23 +1453,24 @@ hipError_t hipMallocHost(void** ptr, size_t sizeBytes) { std::call_once(hip_initialized, ihipInit); + hipError_t hip_status = hipSuccess; #if USE_PINNED_HOST const unsigned am_flags = amHostPinned; + auto device = ihipGetTlsDefaultDevice(); - *ptr = hc::AM_alloc(sizeBytes, ihipGetTlsDefaultDevice()->_acc, am_flags); - hipError_t hip_status = hipSuccess; - if (sizeBytes && (*ptr == NULL)) { - 
hip_status = hipErrorMemoryAllocation; - } else { - hip_status = hipSuccess; + if (device) { + *ptr = hc::AM_alloc(sizeBytes, device->_acc, am_flags); + if (sizeBytes && (*ptr == NULL)) { + hip_status = hipErrorMemoryAllocation; + } else { + hc::am_memtracker_update(*ptr, device->_device_index, 0); + } + + tprintf (TRACE_MEM, " %s: pinned ptr=%p\n", __func__, *ptr); } - tprintf (TRACE_MEM, " %s: pinned ptr=%p\n", __func__, *ptr); - - ihipLogStatus(hip_status); - - return hip_status; + return ihipLogStatus(hip_status); #else // TODO-hcc remove-me From 80d7c867d15e0913897e92b62856f0448722ea14 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 11 Feb 2016 23:13:43 -0600 Subject: [PATCH 05/32] Remove ! USE_PINNED_HOST support [ROCm/clr commit: f2c1bf3bc013aca86f8516e4208596c5529db18f] --- projects/clr/hipamd/src/hip_hcc.cpp | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index 21da73d1da..e63186692c 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -43,7 +43,6 @@ THE SOFTWARE. #include "hc_AM.cpp" -#define USE_PINNED_HOST (__hcc_workweek__ >= 1601) #define USE_ASYNC_COPY 0 #define USE_AM_TRACKER 1 /* use new AM memory tracker features */ @@ -1454,7 +1453,6 @@ hipError_t hipMallocHost(void** ptr, size_t sizeBytes) std::call_once(hip_initialized, ihipInit); hipError_t hip_status = hipSuccess; -#if USE_PINNED_HOST const unsigned am_flags = amHostPinned; auto device = ihipGetTlsDefaultDevice(); @@ -1472,20 +1470,6 @@ hipError_t hipMallocHost(void** ptr, size_t sizeBytes) return ihipLogStatus(hip_status); -#else - // TODO-hcc remove-me - - // This code only works on Kaveri: - *ptr = malloc(sizeBytes); // TODO - call am_alloc for device memory, this will only on KV HSA. - if (*ptr != NULL) { - //TODO-hsart : need memory pin APIs to implement this correctly. 
- // FOr now do our best to allocate the memory, but return an error since - // the returned pointer can only be used on the HOST not the GPU. - return ihipLogStatus(hipErrorMemoryAllocation); - } else { - return ihipLogStatus(hipErrorMemoryAllocation); - } -#endif } hipError_t hipMemcpyToSymbol(const char* symbolName, const void *src, size_t count, size_t offset, hipMemcpyKind kind) @@ -1695,12 +1679,8 @@ hipError_t hipFreeHost(void* ptr) std::call_once(hip_initialized, ihipInit); if (ptr) { -#if USE_PINNED_HOST tprintf (TRACE_MEM, " %s: %p\n", __func__, ptr); hc::AM_free(ptr); -#else - free(ptr); -#endif } return ihipLogStatus(hipSuccess); From 5978d5f372b93420e61e1fbd939e25ec07ed1872 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 12 Feb 2016 00:08:52 -0600 Subject: [PATCH 06/32] Query tracked memory sizes. Support more accurate hipMemGetInfo. Add test to hipPointerAttrib. [ROCm/clr commit: f464cedcf4d54bdc4b455fc4c405cdbcec1b4686] --- projects/clr/hipamd/include/hcc_detail/AM.h | 13 +++++- projects/clr/hipamd/src/hc_AM.cpp | 43 +++++++++++++++++-- projects/clr/hipamd/src/hip_hcc.cpp | 29 ++++++++++--- .../clr/hipamd/tests/src/hipPointerAttrib.cpp | 36 +++++++++++++--- 4 files changed, 103 insertions(+), 18 deletions(-) diff --git a/projects/clr/hipamd/include/hcc_detail/AM.h b/projects/clr/hipamd/include/hcc_detail/AM.h index d41fed317a..c183844869 100644 --- a/projects/clr/hipamd/include/hcc_detail/AM.h +++ b/projects/clr/hipamd/include/hcc_detail/AM.h @@ -78,6 +78,7 @@ am_status_t AM_free(void* ptr); */ am_status_t AM_copy(void* dst, const void* src, size_t size); + /** * Return information about tracked pointer. 
* @@ -92,10 +93,14 @@ am_status_t AM_copy(void* dst, const void* src, size_t size); */ am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, void *ptr); + +//TODO-doc +am_status_t am_memtracker_add(void* ptr, size_t sizeBytes, hc::accelerator acc, bool isDeviceMem=false); + + //TODO-doc am_status_t am_memtracker_update(void* ptr, int appId, unsigned allocationFlags); -am_status_t am_memtracker_add(void* ptr, size_t sizeBytes, hc::accelerator acc, bool isDeviceMem=false); /** * Remove the pointer from the tracker structure. @@ -109,16 +114,20 @@ am_status_t am_memtracker_remove(void* ptr); /** * Remove all memory allocations associated with specified accelerator. + * + * @returns Number of entries reset. */ size_t am_memtracker_reset(hc::accelerator acc); /** - * Prints the contents of the memory tracker table to stderr + * Prints info about the memory tracker table. * * Intended primarily for debug purposes. **/ void am_memtracker_print(); +void am_memtracker_sizeinfo(hc::accelerator acc, size_t *deviceMemSize, size_t *hostMemSize, size_t *userMemSize); + }; // namespace hc diff --git a/projects/clr/hipamd/src/hc_AM.cpp b/projects/clr/hipamd/src/hc_AM.cpp index 92310164c0..3a6d116261 100644 --- a/projects/clr/hipamd/src/hc_AM.cpp +++ b/projects/clr/hipamd/src/hc_AM.cpp @@ -63,13 +63,15 @@ public: void insert(void *pointer, const hc::AmPointerInfo &p); int remove(void *pointer); - MapTrackerType::iterator find(void *hostPtr); + MapTrackerType::iterator find(void *hostPtr) ; + + MapTrackerType::iterator readerLockBegin() { _mutex.lock(); return _tracker.begin(); } ; + MapTrackerType::iterator end() { return _tracker.end(); } ; + void readerUnlock() { _mutex.unlock(); }; - MapTrackerType::iterator end() { return _tracker.end(); }; size_t reset (hc::accelerator acc); - std::ostream & print (std::ostream &os); private: // TODO - use or remove. 
inline void writeLock(); @@ -115,6 +117,7 @@ AmPointerTracker::MapTrackerType::iterator AmPointerTracker::find (void *pointe } +#if 0 //--- std::ostream & AmPointerTracker::print (std::ostream &os) { @@ -126,6 +129,7 @@ std::ostream & AmPointerTracker::print (std::ostream &os) return os; } +#endif //--- // Remove all tracked locations, and free the associated memory (if the range was originally allocated by AM). @@ -326,7 +330,38 @@ am_status_t am_memtracker_remove(void* ptr) //--- void am_memtracker_print() { - g_amPointerTracker.print(std::cerr); + std::ostream &os = std::cerr; + + //g_amPointerTracker.print(std::cerr); + for (auto iter = g_amPointerTracker.readerLockBegin() ; iter != g_amPointerTracker.end(); iter++) { + os << " " << iter->first._basePointer << "..." << iter->first._endPointer << ":: "; + os << iter->second << std::endl; + } + + g_amPointerTracker.readerUnlock(); +} + + +//--- +void am_memtracker_sizeinfo(hc::accelerator acc, size_t *deviceMemSize, size_t *hostMemSize, size_t *userMemSize) +{ + *deviceMemSize = *hostMemSize = *userMemSize = 0; + for (auto iter = g_amPointerTracker.readerLockBegin() ; iter != g_amPointerTracker.end(); iter++) { + if (iter->second._acc == acc) { + size_t sizeBytes = iter->second._sizeBytes; + if (iter->second._isAmManaged) { + if (iter->second._isInDeviceMem) { + *deviceMemSize += sizeBytes; + } else { + *hostMemSize += sizeBytes; + } + } else { + *userMemSize += sizeBytes; + } + } + } + + g_amPointerTracker.readerUnlock(); } diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index e63186692c..fe273aa21c 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -1286,6 +1286,7 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) hipError_t e = hipSuccess; +#if USE_AM_TRACKER hc::AmPointerInfo amPointerInfo; am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, ptr); if (status == AM_SUCCESS) { @@ 
-1309,11 +1310,15 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) e = hipErrorInvalidValue; } +#else + e = hipErrorInvalidValue; +#endif return ihipLogStatus(e); } +#if USE_AM_TRACKER // TODO - test this function: /** * @returns #hipSuccess, @@ -1342,6 +1347,7 @@ hipError_t hipHostGetDevicePointer(void **devicePointer, void *hostPointer, unsi return ihipLogStatus(e); } +#endif @@ -1438,7 +1444,9 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) if (sizeBytes && (*ptr == NULL)) { hip_status = hipErrorMemoryAllocation; } else { +#ifdef USE_AM_TRACKER hc::am_memtracker_update(*ptr, device->_device_index, 0); +#endif } } else { hip_status = hipErrorMemoryAllocation; @@ -1462,7 +1470,9 @@ hipError_t hipMallocHost(void** ptr, size_t sizeBytes) if (sizeBytes && (*ptr == NULL)) { hip_status = hipErrorMemoryAllocation; } else { +#ifdef USE_AM_TRACKER hc::am_memtracker_update(*ptr, device->_device_index, 0); +#endif } tprintf (TRACE_MEM, " %s: pinned ptr=%p\n", __func__, *ptr); @@ -1627,10 +1637,10 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes ) /* - * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue (if free != NULL due to bug) - * @bug - on hcc free always returns 50% of peak regardless of current allocations. hipMemGetInfo returns hipErrorInvalidValue to indicate this. + * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue (if free != NULL due to bug) + * @warning On HCC, the free memory only accounts for memory allocated by this process and may be optimistic.
*/ -hipError_t hipMemGetInfo ( size_t * free, size_t * total ) +hipError_t hipMemGetInfo (size_t *free, size_t *total) { std::call_once(hip_initialized, ihipInit); @@ -1643,17 +1653,22 @@ hipError_t hipMemGetInfo ( size_t * free, size_t * total ) } if (free) { - *free = hipDevice->_props.totalGlobalMem * 0.5; // TODO +#if USE_AM_TRACKER + // TODO - replace with kernel-level for reporting free memory: + size_t deviceMemSize, hostMemSize, userMemSize; + hc::am_memtracker_sizeinfo(hipDevice->_acc, &deviceMemSize, &hostMemSize, &userMemSize); + *free = hipDevice->_props.totalGlobalMem - deviceMemSize; +#else + *free = hipDevice->_props.totalGlobalMem * 0.5; // TODO e=hipErrorInvalidValue; +#endif } } else { e = hipErrorInvalidDevice; } - // TODO-runtime - when we fix the 50% bug. - //return ihipLogStatus(hipErrorSuccess); - return ihipLogStatus(hipErrorInvalidValue); + return ihipLogStatus(e); } diff --git a/projects/clr/hipamd/tests/src/hipPointerAttrib.cpp b/projects/clr/hipamd/tests/src/hipPointerAttrib.cpp index 93d503af65..1418997274 100644 --- a/projects/clr/hipamd/tests/src/hipPointerAttrib.cpp +++ b/projects/clr/hipamd/tests/src/hipPointerAttrib.cpp @@ -115,6 +115,11 @@ void testSimple() HIPCHECK ( hipMallocHost(&A_Pinned_h, Nbytes) ); A_OSAlloc_h = (char*)malloc(Nbytes); + size_t free, total; + HIPCHECK(hipMemGetInfo(&free, &total)); + printf ("hipMemGetInfo: free=%zu (%4.2f) Nbytes=%lu total=%zu (%4.2f)\n", free, (float)(free/1024.0/1024.0), Nbytes, total, (float)(total/1024.0/1024.0)); + HIPASSERT(free + Nbytes <= total); + hipPointerAttribute_t attribs; hipPointerAttribute_t attribs2; @@ -244,6 +249,10 @@ void clusterAllocs(int numAllocs, size_t minSize, size_t maxSize) //--- //Populate with device and host allocations. 
+ size_t totalDeviceAllocated[numDevices]; + for (int i =0; i=0; p-=bufferSize) { hc::am_memtracker_add(p, bufferSize, acc, false); } } if (removeDir == Up) { - for (char *p = basePtr; p=0; p-=bufferSize) { + hc::am_memtracker_remove(p); + } + } } } From 89e461988ecb33ac954d9ce40c992b2d180618ef Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 12 Feb 2016 04:30:09 -0600 Subject: [PATCH 07/32] Step1 in staging buffer copy. - use StagingBuffer class for copies. - refactor g_device to use array rather than vector. (keeps pointers from moving). [ROCm/clr commit: 90af462b85afd47db723cb0191afe9c6be8cb29a] --- projects/clr/hipamd/include/hcc_detail/AM.h | 8 +- projects/clr/hipamd/src/hc_AM.cpp | 18 +- projects/clr/hipamd/src/hip_hcc.cpp | 200 +++++++++++++++--- projects/clr/hipamd/tests/src/hipMemcpy.cpp | 2 +- projects/clr/hipamd/tests/src/test_common.cpp | 2 +- projects/clr/hipamd/util/vim/hip.vim | 3 + 6 files changed, 188 insertions(+), 45 deletions(-) diff --git a/projects/clr/hipamd/include/hcc_detail/AM.h b/projects/clr/hipamd/include/hcc_detail/AM.h index c183844869..04804ffaa5 100644 --- a/projects/clr/hipamd/include/hcc_detail/AM.h +++ b/projects/clr/hipamd/include/hcc_detail/AM.h @@ -22,8 +22,8 @@ struct AmPointerInfo { bool _isInDeviceMem; ///< Memory is physically resident on a device (if false, memory is located on host) bool _isAmManaged; ///< Memory was allocated by AM and should be freed when am_reset is called. - int _appId; ///< App-specific storage. Used by HIP to store deviceID. - unsigned _appAllocationFlags; ///< App-specific allocation flags. Used by HIP to store allocation flags. + int _appId; ///< App-specific storage. (Used by HIP to store deviceID.) + unsigned _appAllocationFlags; ///< App-specific allocation flags. (Used by HIP to store allocation flags.) 
AmPointerInfo() {}; @@ -91,7 +91,7 @@ am_status_t AM_copy(void* dst, const void* src, size_t size); * * @see AM_memtracker_add, */ -am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, void *ptr); +am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, const void *ptr); //TODO-doc @@ -99,7 +99,7 @@ am_status_t am_memtracker_add(void* ptr, size_t sizeBytes, hc::accelerator acc, //TODO-doc -am_status_t am_memtracker_update(void* ptr, int appId, unsigned allocationFlags); +am_status_t am_memtracker_update(const void* ptr, int appId, unsigned allocationFlags); /** diff --git a/projects/clr/hipamd/src/hc_AM.cpp b/projects/clr/hipamd/src/hc_AM.cpp index 3a6d116261..2d22b49fd4 100644 --- a/projects/clr/hipamd/src/hc_AM.cpp +++ b/projects/clr/hipamd/src/hc_AM.cpp @@ -24,10 +24,10 @@ //#include struct AmMemoryRange { - void * _basePointer; - void * _endPointer; - AmMemoryRange(void *basePointer, size_t sizeBytes) : - _basePointer(basePointer), _endPointer((unsigned char*)basePointer + sizeBytes - 1) {}; + const void * _basePointer; + const void * _endPointer; + AmMemoryRange(const void *basePointer, size_t sizeBytes) : + _basePointer(basePointer), _endPointer((const unsigned char*)basePointer + sizeBytes - 1) {}; }; // Functor to compare ranges: @@ -63,7 +63,7 @@ public: void insert(void *pointer, const hc::AmPointerInfo &p); int remove(void *pointer); - MapTrackerType::iterator find(void *hostPtr) ; + MapTrackerType::iterator find(const void *hostPtr) ; MapTrackerType::iterator readerLockBegin() { _mutex.lock(); return _tracker.begin(); } ; MapTrackerType::iterator end() { return _tracker.end(); } ; @@ -107,7 +107,7 @@ int AmPointerTracker::remove (void *pointer) //--- -AmPointerTracker::MapTrackerType::iterator AmPointerTracker::find (void *pointer) +AmPointerTracker::MapTrackerType::iterator AmPointerTracker::find (const void *pointer) { // TODO-mutex- read lock std::lock_guard l (_mutex); @@ -144,7 +144,7 @@ size_t AmPointerTracker::reset (hc::accelerator acc) 
for (auto iter = _tracker.begin() ; iter != _tracker.end(); ) { if (iter->second._acc == acc) { if (iter->second._isAmManaged) { - hsa_memory_free(iter->first._basePointer); + hsa_memory_free(const_cast (iter->first._basePointer)); } count++; @@ -278,7 +278,7 @@ am_status_t AM_copy(void* dst, const void* src, size_t sizeBytes) } -am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, void *ptr) +am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, const void *ptr) { auto infoI = g_amPointerTracker.find(ptr); if (infoI != g_amPointerTracker.end()) { @@ -290,7 +290,7 @@ am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, void *ptr) } -am_status_t am_memtracker_update(void* ptr, int appId, unsigned allocationFlags) +am_status_t am_memtracker_update(const void* ptr, int appId, unsigned allocationFlags) { auto iter = g_amPointerTracker.find(ptr); if (iter != g_amPointerTracker.end()) { diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index fe273aa21c..08f7859271 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -43,7 +43,7 @@ THE SOFTWARE. #include "hc_AM.cpp" -#define USE_ASYNC_COPY 0 +#define USE_ASYNC_COPY 1 #define USE_AM_TRACKER 1 /* use new AM memory tracker features */ #define INLINE static inline @@ -60,10 +60,12 @@ static const int release = 1; int HIP_PRINT_ENV = 0; int HIP_TRACE_API= 0; int HIP_LAUNCH_BLOCKING = 0; +int HIP_STAGING_SIZE = 64; /* size of staging buffers, in KB */ -#define TRACE_API 0x1 /* trace API calls and return values */ -#define TRACE_SYNC 0x2 /* trace synchronization pieces */ -#define TRACE_MEM 0x4 /* trace memory allocation / deallocation */ +#define TRACE_API 0x1 /* trace API calls and return values */ +#define TRACE_SYNC 0x2 /* trace synchronization pieces */ +#define TRACE_MEM 0x4 /* trace memory allocation / deallocation */ +#define TRACE_COPY2 0x8 /* trace memory copy commands. Detailed. */ #define tprintf(trace_level, ...) 
{\ if (HIP_TRACE_API & trace_level) {\ @@ -119,6 +121,28 @@ struct ihipEvent_t { } ; +//------------------------------------------------------------------------------------------------- +struct StagingBuffer { + static const int numBuffers = 2; + + int _bufferIndex; // Operating on buffer 0 or 1? + + ihipDevice_t *_device; + size_t _bufferSize; // Size of the buffers. + + + StagingBuffer(ihipDevice_t *device, size_t bufferSize) ; + + void CopyHostToDevice(void* dst, const void* src, size_t sizeBytes); + +private: + char *_pinnedStagingBuffer[numBuffers]; + hsa_signal_t _completion_signal[numBuffers]; +}; + + + +//------------------------------------------------------------------------------------------------- struct ihipDevice_t { unsigned _device_index; // index into g_devices. @@ -135,8 +159,11 @@ struct ihipDevice_t unsigned _compute_units; + StagingBuffer *_staging_host2device; + StagingBuffer *_staging_device2host; + public: - ihipDevice_t(unsigned device_index, hc::accelerator acc); + void init(unsigned device_index, hc::accelerator acc); hipError_t getProperties(hipDeviceProp_t* prop); // TODO- create a copy constructor. 
@@ -145,10 +172,10 @@ public: //================================================================================================= -ihipDevice_t::ihipDevice_t(unsigned device_index, hc::accelerator acc) - : _device_index(device_index), - _acc(acc) +void ihipDevice_t::init(unsigned device_index, hc::accelerator acc) { + _device_index = device_index; + _acc = acc; hsa_agent_t *agent = static_cast (acc.get_default_view().get_hsa_agent()); if (agent) { int err = hsa_agent_get_info(*agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &_compute_units); @@ -166,6 +193,9 @@ ihipDevice_t::ihipDevice_t(unsigned device_index, hc::accelerator acc) _null_stream = new ihipStream_t(device_index, acc.get_default_view(), hipStreamDefault); this->_streams.push_back(_null_stream); tprintf(TRACE_SYNC, "created device with null_stream=%p\n", _null_stream); + + _staging_host2device = new StagingBuffer(this, HIP_STAGING_SIZE*1024); + _staging_device2host = NULL; }; #if 0 @@ -187,7 +217,8 @@ thread_local int tls_defaultDevice = 0; // Global initialization. std::once_flag hip_initialized; -std::vector g_devices; // Vector of all non-emulated (ie GPU) accelerators in the system. +ihipDevice_t *g_devices; // Array of all non-emulated (ie GPU) accelerators in the system. +unsigned g_deviceCnt; //================================================================================================= @@ -462,25 +493,36 @@ void ihipReadEnv_I(int *var_ptr, const char *var_name1, const char *var_name2, c //It is called with C++11 call_once, which provided thread-safety. void ihipInit() { - - /* - * Build a table of valid compute devices. 
- */ - auto accs = hc::accelerator::get_all(); - g_devices.reserve(accs.size()); - for (int i=0; i"); @@ -489,7 +531,7 @@ void ihipInit() INLINE bool ihipIsValidDevice(unsigned deviceIndex) { // deviceIndex is unsigned so always > 0 - return (deviceIndex < g_devices.size()); + return (deviceIndex < g_deviceCnt); } @@ -508,7 +550,7 @@ INLINE ihipDevice_t *ihipGetTlsDefaultDevice() //--- INLINE ihipDevice_t *ihipGetDevice(int deviceId) { - if ((deviceId >= 0) && (deviceId < g_devices.size())) { + if ((deviceId >= 0) && (deviceId < g_deviceCnt)) { return &g_devices[deviceId]; } else { return NULL; @@ -675,7 +717,7 @@ hipError_t hipGetDeviceCount(int *count) { std::call_once(hip_initialized, ihipInit); - *count = g_devices.size(); + *count = g_deviceCnt; if (*count > 0) { return ihipLogStatus(hipSuccess); @@ -764,7 +806,7 @@ hipError_t hipSetDevice(int device) { std::call_once(hip_initialized, ihipInit); - if ((device < 0) || (device >= g_devices.size())) { + if ((device < 0) || (device >= g_deviceCnt)) { return ihipLogStatus(hipErrorInvalidDevice); } else { tls_defaultDevice = device; @@ -1299,6 +1341,10 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) attributes->allocationFlags = amPointerInfo._appAllocationFlags; attributes->device = amPointerInfo._appId; + if (attributes->device < 0) { + e = hipErrorInvalidDevice; + } + } else { attributes->memoryType = hipMemoryTypeDevice; @@ -1482,6 +1528,7 @@ hipError_t hipMallocHost(void** ptr, size_t sizeBytes) } +//--- hipError_t hipMemcpyToSymbol(const char* symbolName, const void *src, size_t count, size_t offset, hipMemcpyKind kind) { #ifdef USE_MEMCPYTOSYMBOL @@ -1500,6 +1547,102 @@ hipError_t hipMemcpyToSymbol(const char* symbolName, const void *src, size_t cou } +//------------------------------------------------------------------------------------------------- +StagingBuffer::StagingBuffer(ihipDevice_t *device, size_t bufferSize) : + _bufferIndex(0), + _device(device), + 
_bufferSize(bufferSize) +{ + for (int i=0; i_acc, amHostPinned); + if (_pinnedStagingBuffer[i] == NULL) { + throw; + } + hsa_signal_create(0, 0, NULL, &_completion_signal[i]); + } +}; + + +//--- +void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeBytes) { + const char *srcp = static_cast (src); + char *dstp = static_cast (dst); + + assert(sizeBytes < UINT64_MAX/2); // TODO + for (int64_t bytesRemaining=sizeBytes; bytesRemaining>0; bytesRemaining -= _bufferSize) { + + // TODO - double-buffer these guys. + size_t theseBytes = (bytesRemaining > _bufferSize) ? _bufferSize : bytesRemaining; + + tprintf (TRACE_COPY2, "copy %zu bytes %p to stagingBuf[%d]:%p\n", theseBytes, srcp, _bufferIndex, _pinnedStagingBuffer[_bufferIndex]); + + memcpy(_pinnedStagingBuffer[_bufferIndex], srcp, theseBytes); + + tprintf (TRACE_COPY2, "async_copy %zu bytes %p to %p\n", theseBytes, _pinnedStagingBuffer[_bufferIndex], dstp); + + hsa_signal_store_relaxed(_completion_signal[_bufferIndex], 1); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _pinnedStagingBuffer[_bufferIndex], theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[_bufferIndex]); + + tprintf (TRACE_COPY2, "waiting... 
status=%d\n", hsa_status); + if (hsa_status == HSA_STATUS_SUCCESS) { + hsa_signal_wait_acquire(_completion_signal[_bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + } + + srcp += theseBytes; + dstp += theseBytes; + } +} + + + + +#if USE_AM_TRACKER +// TODO - add mutex to limit in/out: +void ihipAsyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) +{ + hc::AmPointerInfo dstPtrInfo, srcPtrInfo; + + bool dstNotTracked = (hc::am_memtracker_getinfo(&dstPtrInfo, dst) != AM_SUCCESS); + bool srcNotTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) != AM_SUCCESS); + + bool useStagingBuffer = true; + + // Resolve default to a specific Kind, since we use different algorithms: + if (kind == hipMemcpyDefault) { + bool dstIsHost = (dstNotTracked || dstPtrInfo._isInDeviceMem); + bool srcIsHost = (srcNotTracked || srcPtrInfo._isInDeviceMem); + if (srcIsHost && !dstIsHost) { + kind = hipMemcpyHostToDevice; + } else if (!srcIsHost && dstIsHost) { + kind = hipMemcpyDeviceToHost; + } else if (srcIsHost && dstIsHost) { + kind = hipMemcpyHostToHost; + } else if (srcIsHost && dstIsHost) { + kind = hipMemcpyDeviceToDevice; + } + } + + switch (kind) { + case hipMemcpyHostToDevice: + if (srcNotTracked) { + device->_staging_host2device->CopyHostToDevice(dst, src, sizeBytes); + } else { + assert(0); // TODO + //hsa_signal_wait_relaxed(completion_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + } + break; + case hipMemcpyDeviceToHost: + // TODO - optimize the copy here. + hc::AM_copy(dst, src, sizeBytes); + break; + default: + assert(0); // TODO + } +} +#endif + + + //--- hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) { @@ -1517,11 +1660,8 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind ihipDevice_t *device = &g_devices[stream->_device_index]; - hsa_signal_t completion_signal; // init/obtain from pool. 
+ ihipAsyncCopy(device, dst, src, sizeBytes, kind); - hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, src, size, device->_hsa_agent, 0, NULL, &completion_signal); - - e = (hsa_status == HSA_STATUS_SUCCESS) ? hipSuccess : hipErrorTbd; } else { e = hipErrorInvalidResourceHandle; } diff --git a/projects/clr/hipamd/tests/src/hipMemcpy.cpp b/projects/clr/hipamd/tests/src/hipMemcpy.cpp index 5db2b270d6..7664cfb581 100644 --- a/projects/clr/hipamd/tests/src/hipMemcpy.cpp +++ b/projects/clr/hipamd/tests/src/hipMemcpy.cpp @@ -30,7 +30,7 @@ int main(int argc, char *argv[]) size_t Nbytes = N*sizeof(int); - printf ("N=%zu \n", N); + printf ("N=%zu Nbytes=%6.2fMB\n", N, Nbytes/1024.0/1024.0); int *A_d, *B_d, *C_d; int *A_h, *B_h, *C_h; diff --git a/projects/clr/hipamd/tests/src/test_common.cpp b/projects/clr/hipamd/tests/src/test_common.cpp index 02deb51c85..3da5568b7c 100644 --- a/projects/clr/hipamd/tests/src/test_common.cpp +++ b/projects/clr/hipamd/tests/src/test_common.cpp @@ -88,7 +88,7 @@ int parseStandardArguments(int argc, char *argv[], bool failOnUndefinedArg) if (!strcmp(arg, " ")) { // skip NULL args. 
- } else if (!strcmp(arg, "--N")) { + } else if (!strcmp(arg, "--N") || (!strcmp(arg, "-N"))) { if (++i >= argc || !HipTest::parseSize(argv[i], &N)) { failed("Bad N size argument"); } diff --git a/projects/clr/hipamd/util/vim/hip.vim b/projects/clr/hipamd/util/vim/hip.vim index 01f3b3f2ad..e2236f4fbc 100644 --- a/projects/clr/hipamd/util/vim/hip.vim +++ b/projects/clr/hipamd/util/vim/hip.vim @@ -151,6 +151,9 @@ syn keyword hipFunctionName hipUnbindTexture syn keyword hipFlags hipFilterModePoint syn keyword hipFlags hipMemcpyHostToDevice syn keyword hipFlags hipMemcpyDeviceToHost +syn keyword hipFlags hipMemcpyHostToHost +syn keyword hipFlags hipMemcpyDeviceToDevice +syn keyword hipFlags hipMemcpyDefault syn keyword hipFlags hipReadModeElementType syn keyword hipFlags hipSuccess syn keyword hipFlags hipTextureType1D From 4dfe77a99bbf6c03e758b07cc29b62085f2ac47b Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 12 Feb 2016 17:39:44 -0600 Subject: [PATCH 08/32] Improve copy testing implementation. - add tests for (unpinned/pinned) x H2H x D2D. - Free memory at end of test. 
[ROCm/clr commit: 112861080192db61332a45874e278dfb67639c82] --- projects/clr/hipamd/src/hip_hcc.cpp | 72 ++++++++++---- projects/clr/hipamd/tests/src/hipMemcpy.cpp | 105 ++++++++++++++++++-- projects/clr/hipamd/tests/src/test_common.h | 70 +++++++++++-- projects/clr/hipamd/util/vim/hip.vim | 1 + 4 files changed, 214 insertions(+), 34 deletions(-) diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index 08f7859271..f397b02cbe 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -132,6 +132,7 @@ struct StagingBuffer { StagingBuffer(ihipDevice_t *device, size_t bufferSize) ; + ~StagingBuffer(); void CopyHostToDevice(void* dst, const void* src, size_t sizeBytes); @@ -163,6 +164,7 @@ struct ihipDevice_t StagingBuffer *_staging_device2host; public: + void reset(); void init(unsigned device_index, hc::accelerator acc); hipError_t getProperties(hipDeviceProp_t* prop); @@ -172,6 +174,17 @@ public: //================================================================================================= +// +//Reset the device - this is called from hipDeviceReset. +//Device may be reset multiple times, and may be reset after init. 
+void ihipDevice_t::reset() +{ + _staging_host2device = new StagingBuffer(this, HIP_STAGING_SIZE*1024); + _staging_device2host = NULL; +}; + + +//--- void ihipDevice_t::init(unsigned device_index, hc::accelerator acc) { _device_index = device_index; @@ -194,8 +207,7 @@ void ihipDevice_t::init(unsigned device_index, hc::accelerator acc) this->_streams.push_back(_null_stream); tprintf(TRACE_SYNC, "created device with null_stream=%p\n", _null_stream); - _staging_host2device = new StagingBuffer(this, HIP_STAGING_SIZE*1024); - _staging_device2host = NULL; + this->reset(); }; #if 0 @@ -205,6 +217,13 @@ ihipDevice_t::~ihipDevice_t() delete _null_stream; _null_stream = NULL; } + + if (_staging_device2host) { + delete _staging_device2host; + } + if (_staging_host2device){ + delete _staging_host2device; + } } #endif @@ -848,6 +867,7 @@ hipError_t hipDeviceReset(void) ihipDevice_t *device = ihipGetTlsDefaultDevice(); if (device) { am_memtracker_reset(device->_acc); + device->reset(); // re-allocate required resources. } #endif @@ -1562,6 +1582,18 @@ StagingBuffer::StagingBuffer(ihipDevice_t *device, size_t bufferSize) : } }; +//--- +StagingBuffer::~StagingBuffer() +{ + for (int i=0; i_staging_host2device->CopyHostToDevice(dst, src, sizeBytes); - } else { - assert(0); // TODO - //hsa_signal_wait_relaxed(completion_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - } - break; - case hipMemcpyDeviceToHost: - // TODO - optimize the copy here. - hc::AM_copy(dst, src, sizeBytes); - break; - default: - assert(0); // TODO + if ((kind == hipMemcpyHostToDevice) && (srcNotTracked)) { + if (useStagingBuffer) { + device->_staging_host2device->CopyHostToDevice(dst, src, sizeBytes); + } + } else if ((kind == hipMemcpyDeviceToHost) && (dstNotTracked)) { + // TODO - optimize the copy here. 
+ hc::AM_copy(dst, src, sizeBytes); + } else { + // Let HSA runtime handle it: + // TODO - need buffer pool for the signals: + hsa_signal_t completion_signal; + hsa_signal_create(1, 0, NULL, &completion_signal); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, src, sizeBytes, device->_hsa_agent, 0, NULL, completion_signal); + + if (hsa_status == HSA_STATUS_SUCCESS) { + hsa_signal_wait_relaxed(completion_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + } + + hsa_signal_destroy(completion_signal); } } #endif @@ -1815,6 +1851,7 @@ hipError_t hipMemGetInfo (size_t *free, size_t *total) //--- hipError_t hipFree(void* ptr) { + // TODO - ensure this pointer was created by hipMalloc and not hipMallocHost std::call_once(hip_initialized, ihipInit); @@ -1831,6 +1868,7 @@ hipError_t hipFree(void* ptr) hipError_t hipFreeHost(void* ptr) { + // TODO - ensure this pointer was created by hipMallocHost and not hipMalloc std::call_once(hip_initialized, ihipInit); if (ptr) { diff --git a/projects/clr/hipamd/tests/src/hipMemcpy.cpp b/projects/clr/hipamd/tests/src/hipMemcpy.cpp index 7664cfb581..241c39c2ad 100644 --- a/projects/clr/hipamd/tests/src/hipMemcpy.cpp +++ b/projects/clr/hipamd/tests/src/hipMemcpy.cpp @@ -23,24 +23,21 @@ THE SOFTWARE. #include "test_common.h" - -int main(int argc, char *argv[]) +// Test simple H2D copies and back. 
+void simpleTest1() { - HipTest::parseStandardArguments(argc, argv, true); - + printf ("test: %s\n", __func__); size_t Nbytes = N*sizeof(int); - printf ("N=%zu Nbytes=%6.2fMB\n", N, Nbytes/1024.0/1024.0); int *A_d, *B_d, *C_d; int *A_h, *B_h, *C_h; - HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N); - + HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); + printf ("A_d=%p B_d=%p C_d=%p A_h=%p B_h=%p C_h=%p\n", A_d, B_d, C_d, A_h, B_d, C_h); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); - HIPCHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); HIPCHECK ( hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); @@ -50,8 +47,98 @@ int main(int argc, char *argv[]) HIPCHECK (hipDeviceSynchronize()); - HipTest::checkVectorADD(A_h, B_h, C_h, N); + + HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, false); + HIPCHECK (hipDeviceReset()); + + printf (" %s success\n", __func__); +} + + +// Test many different kinds of memory copies: + +template +void memcpyKind(bool usePinnedHost, bool useHostToHost, bool useMemkindDefault) +{ + printf ("test: %s\n", __func__); + + + T *A_d, *B_d, *C_d; + T *A_h, *B_h, *C_h; + + size_t Nbytes = N*sizeof(T); + + HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, usePinnedHost); + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + + T *A_hh = NULL; + T *B_hh = NULL; + T *C_dd = NULL; + + // Allocate some extra arrays: + + HIPCHECK ( hipMalloc(&C_dd, Nbytes) ); + + + if (useHostToHost) { + if (usePinnedHost) { + HIPCHECK ( hipMallocHost(&A_hh, Nbytes) ); + HIPCHECK ( hipMallocHost(&B_hh, Nbytes) ); + } else { + A_hh = (T*)malloc(Nbytes); + B_hh = (T*)malloc(Nbytes); + } + + + // Do some extra host copies here to mix things up: + HIPCHECK ( hipMemcpy(A_hh, A_h, Nbytes, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); + HIPCHECK ( hipMemcpy(B_hh, B_h, Nbytes, useMemkindDefault? 
hipMemcpyDefault : hipMemcpyHostToHost)); + + + HIPCHECK ( hipMemcpy(A_d, A_hh, Nbytes, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(B_d, B_hh, Nbytes, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + } else { + HIPCHECK ( hipMemcpy(A_d, A_h, Nbytes, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(B_d, B_h, Nbytes, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + } + + hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N); + +#if 0 + // Do some extra host copies here to mix things up: + HIPCHECK ( hipMemcpy(C_dd, C_d, Nbytes, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); + + //Destroy the original C_d: + HIPCHECK ( hipMemset(C_d, 0x5A, Nbytes)); + + HIPCHECK ( hipMemcpy(C_h, C_dd, Nbytes, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); +#else + HIPCHECK ( hipMemcpy(C_h, C_d, Nbytes, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); +#endif + + HIPCHECK ( hipDeviceSynchronize() ); + HipTest::checkVectorADD(A_h, B_h, C_h, N); + + HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, usePinnedHost); + HIPCHECK ( hipDeviceReset() ); + + printf (" %s success\n", __func__); +} + + + +int main(int argc, char *argv[]) +{ + HipTest::parseStandardArguments(argc, argv, true); + + + simpleTest1(); + + memcpyKind(false, false, false); + memcpyKind(true, false, false); + //memcpyKind(true); + passed(); } diff --git a/projects/clr/hipamd/tests/src/test_common.h b/projects/clr/hipamd/tests/src/test_common.h index 57d2ebc831..5b631d2c3a 100644 --- a/projects/clr/hipamd/tests/src/test_common.h +++ b/projects/clr/hipamd/tests/src/test_common.h @@ -96,7 +96,7 @@ vectorADD(hipLaunchParm lp, template void initArrays(T **A_d, T **B_d, T **C_d, T **A_h, T **B_h, T **C_h, - size_t N) + size_t N, bool usePinnedHost=false) { size_t Nbytes = N*sizeof(T); @@ -110,14 +110,32 @@ void initArrays(T **A_d, T 
**B_d, T **C_d, HIPCHECK ( hipMalloc(C_d, Nbytes) ); } - if (A_h) - *A_h = (T*)malloc(Nbytes); - - if (B_h) - *B_h = (T*)malloc(Nbytes); + if (usePinnedHost) { + if (A_h) { + HIPCHECK ( hipMallocHost(A_h, Nbytes) ); + } + if (B_h) { + HIPCHECK ( hipMallocHost(B_h, Nbytes) ); + } + if (C_h) { + HIPCHECK ( hipMallocHost(C_h, Nbytes) ); + } + } else { + if (A_h) { + *A_h = (T*)malloc(Nbytes); + HIPASSERT(*A_h != NULL); + } + + if (B_h) { + *B_h = (T*)malloc(Nbytes); + HIPASSERT(*B_h != NULL); + } - if (C_h) - *C_h = (T*)malloc(Nbytes); + if (C_h) { + *C_h = (T*)malloc(Nbytes); + HIPASSERT(*C_h != NULL); + } + } // Initialize the host data: @@ -130,7 +148,43 @@ void initArrays(T **A_d, T **B_d, T **C_d, } +template +void freeArrays(T *A_d, T *B_d, T *C_d, + T *A_h, T *B_h, T *C_h, bool usePinnedHost) +{ + if (A_d) { + HIPCHECK ( hipFree(A_d) ); + } + if (B_d) { + HIPCHECK ( hipFree(B_d) ); + } + if (C_d) { + HIPCHECK ( hipFree(C_d) ); + } + if (usePinnedHost) { + if (A_h) { + HIPCHECK (hipFreeHost(A_h)); + } + if (B_h) { + HIPCHECK (hipFreeHost(B_h)); + } + if (C_h) { + HIPCHECK (hipFreeHost(C_h)); + } + } else { + if (A_h) { + free (A_h); + } + if (B_h) { + free (B_h); + } + if (C_h) { + free (C_h); + } + } + +} // Assumes C_h contains vector add of A_h + B_h diff --git a/projects/clr/hipamd/util/vim/hip.vim b/projects/clr/hipamd/util/vim/hip.vim index e2236f4fbc..e4ea0a4a9e 100644 --- a/projects/clr/hipamd/util/vim/hip.vim +++ b/projects/clr/hipamd/util/vim/hip.vim @@ -91,6 +91,7 @@ syn keyword hipFunctionName hipD3D9UnmapResources syn keyword hipFunctionName hipD3D9UnregisterResource syn keyword hipFunctionName hipDeviceGetProperties syn keyword hipFunctionName hipDeviceSynchronize +syn keyword hipFunctionName hipDeviceReset syn keyword hipFunctionName hipEventCreate syn keyword hipFunctionName hipEventDestroy syn keyword hipFunctionName hipEventElapsedTime From b8ea1ec374912f1c37f998f6c7748345a3494df9 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 12 Feb 2016 
18:23:55 -0600 Subject: [PATCH 09/32] Improve copy testing [ROCm/clr commit: 2353cbb028549f87024be6284a4f51b7ef4d34c7] --- projects/clr/hipamd/src/hip_hcc.cpp | 4 +- projects/clr/hipamd/tests/src/hipMemcpy.cpp | 63 +++++++++++++++------ 2 files changed, 47 insertions(+), 20 deletions(-) diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index f397b02cbe..0b7c81a365 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -1641,8 +1641,8 @@ void ihipAsyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t size // Resolve default to a specific Kind, since we use different algorithms: if (kind == hipMemcpyDefault) { - bool dstIsHost = (dstNotTracked || dstPtrInfo._isInDeviceMem); - bool srcIsHost = (srcNotTracked || srcPtrInfo._isInDeviceMem); + bool dstIsHost = (dstNotTracked || !dstPtrInfo._isInDeviceMem); + bool srcIsHost = (srcNotTracked || !srcPtrInfo._isInDeviceMem); if (srcIsHost && !dstIsHost) { kind = hipMemcpyHostToDevice; } else if (!srcIsHost && dstIsHost) { diff --git a/projects/clr/hipamd/tests/src/hipMemcpy.cpp b/projects/clr/hipamd/tests/src/hipMemcpy.cpp index 241c39c2ad..0de1b0b7a0 100644 --- a/projects/clr/hipamd/tests/src/hipMemcpy.cpp +++ b/projects/clr/hipamd/tests/src/hipMemcpy.cpp @@ -22,6 +22,8 @@ THE SOFTWARE. #include "hip_runtime.h" #include "test_common.h" +//:w #include + // Test simple H2D copies and back. 
void simpleTest1() @@ -59,9 +61,12 @@ void simpleTest1() // Test many different kinds of memory copies: template -void memcpyKind(bool usePinnedHost, bool useHostToHost, bool useMemkindDefault) +void memcpytest2(bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) { - printf ("test: %s\n", __func__); + printf ("test: %s<%s> usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n", + __func__, + typeid(T).name(), + usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); T *A_d, *B_d, *C_d; @@ -76,9 +81,6 @@ void memcpyKind(bool usePinnedHost, bool useHostToHost, bool useMemkindDefault) T *B_hh = NULL; T *C_dd = NULL; - // Allocate some extra arrays: - - HIPCHECK ( hipMalloc(&C_dd, Nbytes) ); if (useHostToHost) { @@ -91,7 +93,7 @@ void memcpyKind(bool usePinnedHost, bool useHostToHost, bool useMemkindDefault) } - // Do some extra host copies here to mix things up: + // Do some extra host-to-host copies here to mix things up: HIPCHECK ( hipMemcpy(A_hh, A_h, Nbytes, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); HIPCHECK ( hipMemcpy(B_hh, B_h, Nbytes, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); @@ -105,17 +107,19 @@ void memcpyKind(bool usePinnedHost, bool useHostToHost, bool useMemkindDefault) hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N); -#if 0 - // Do some extra host copies here to mix things up: - HIPCHECK ( hipMemcpy(C_dd, C_d, Nbytes, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); + if (useDeviceToDevice) { + HIPCHECK ( hipMalloc(&C_dd, Nbytes) ); - //Destroy the original C_d: - HIPCHECK ( hipMemset(C_d, 0x5A, Nbytes)); + // Do an extra device-to-device copies here to mix things up: + HIPCHECK ( hipMemcpy(C_dd, C_d, Nbytes, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice)); - HIPCHECK ( hipMemcpy(C_h, C_dd, Nbytes, useMemkindDefault? 
hipMemcpyDefault:hipMemcpyDeviceToHost)); -#else - HIPCHECK ( hipMemcpy(C_h, C_d, Nbytes, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); -#endif + //Destroy the original C_d: + HIPCHECK ( hipMemset(C_d, 0x5A, Nbytes)); + + HIPCHECK ( hipMemcpy(C_h, C_dd, Nbytes, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); + } else { + HIPCHECK ( hipMemcpy(C_h, C_d, Nbytes, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); + } HIPCHECK ( hipDeviceSynchronize() ); HipTest::checkVectorADD(A_h, B_h, C_h, N); @@ -127,6 +131,25 @@ void memcpyKind(bool usePinnedHost, bool useHostToHost, bool useMemkindDefault) } +template +void memcpytest2_loop() +{ + for (int usePinnedHost =0; usePinnedHost<=1; usePinnedHost++) { +#define USE_HOST_2_HOST +#ifdef USE_HOST_2_HOST + for (int useHostToHost =0; useHostToHost<=1; useHostToHost++) { // TODO +#else + for (int useHostToHost =0; useHostToHost<=0; useHostToHost++) { // TODO +#endif + for (int useDeviceToDevice =0; useDeviceToDevice<=1; useDeviceToDevice++) { + for (int useMemkindDefault =0; useMemkindDefault<=1; useMemkindDefault++) { + memcpytest2(usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); + } + } + } + } +} + int main(int argc, char *argv[]) { @@ -135,9 +158,13 @@ int main(int argc, char *argv[]) simpleTest1(); - memcpyKind(false, false, false); - memcpyKind(true, false, false); - //memcpyKind(true); + //memcpytest2(0/*usePinnedHost*/, 0/*useHostToHost*/, 0/*useDeviceToDevice*/, 1/*useMemkindDefault*/); + + memcpytest2_loop(); + memcpytest2_loop(); + memcpytest2_loop(); + memcpytest2_loop(); + passed(); From ea298fae23f56f37f2bb2e20f87902ba458f1c44 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 12 Feb 2016 21:30:43 -0600 Subject: [PATCH 10/32] Add Bus Bandwidth test, leveraged from SHOC. 
[ROCm/clr commit: ba7497b40906832f103894dfb258dfb445067cf7] --- .../1_Utils/hipBusBandwidth/LICENSE.txt | 27 + .../samples/1_Utils/hipBusBandwidth/Makefile | 16 + .../hipBusBandwidth/ResultDatabase.cpp | 520 ++++++++++++++++++ .../1_Utils/hipBusBandwidth/ResultDatabase.h | 100 ++++ .../hipBusBandwidth/hipBusBandwidth.cpp | 170 ++++++ 5 files changed, 833 insertions(+) create mode 100644 projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/LICENSE.txt create mode 100644 projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/Makefile create mode 100644 projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp create mode 100644 projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/ResultDatabase.h create mode 100644 projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp diff --git a/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/LICENSE.txt b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/LICENSE.txt new file mode 100644 index 0000000000..5d0d603232 --- /dev/null +++ b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/LICENSE.txt @@ -0,0 +1,27 @@ + +Copyright (c) 2011, UT-Battelle, LLC +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. +* Neither the name of Oak Ridge National Laboratory, nor UT-Battelle, LLC, nor + the names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/Makefile b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/Makefile new file mode 100644 index 0000000000..d233216313 --- /dev/null +++ b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/Makefile @@ -0,0 +1,16 @@ +HIP_PATH?=$(shell hipconfig -p) +HIPCC=$(HIP_PATH)/bin/hipcc + +EXE=hipBusBandwidth + +all: install + +$(EXE): hipBusBandwidth.cpp ResultDatabase.cpp + $(HIPCC) $^ -o $@ + +install: $(EXE) + cp $(EXE) $(HIP_PATH)/bin + + +clean: + rm -f *.o $(EXE) diff --git a/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp new file mode 100644 index 0000000000..f57aed11be --- /dev/null +++ b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp @@ -0,0 +1,520 @@ +#include "ResultDatabase.h" + +#include +#include +#include +#include + +using namespace std; + +bool ResultDatabase::Result::operator<(const Result &rhs) const +{ + if (test < rhs.test) + return true; + if (test > rhs.test) + return false; + if (atts < rhs.atts) + return true; + if (atts > rhs.atts) + return false; + return false; // less-operator returns false on equal +} + 
+double ResultDatabase::Result::GetMin() const +{ + double r = FLT_MAX; + for (int i=0; i= 100) + return value[n-1]; + + double index = ((n + 1.) * q / 100.) - 1; + + vector sorted = value; + sort(sorted.begin(), sorted.end()); + + if (n == 2) + return (sorted[0] * (1 - q/100.) + sorted[1] * (q/100.)); + + int index_lo = int(index); + double frac = index - index_lo; + if (frac == 0) + return sorted[index_lo]; + + double lo = sorted[index_lo]; + double hi = sorted[index_lo + 1]; + return lo + (hi-lo)*frac; +} + +double ResultDatabase::Result::GetMean() const +{ + double r = 0; + for (int i=0; i &values) +{ + for (int i=0; i= results.size()) + { + Result r; + r.test = test; + r.atts = atts; + r.unit = unit; + results.push_back(r); + } + + results[index].value.push_back(value); +} + +// **************************************************************************** +// Method: ResultDatabase::DumpDetailed +// +// Purpose: +// Writes the full results, including all trials. +// +// Arguments: +// out where to print +// +// Programmer: Jeremy Meredith +// Creation: August 14, 2009 +// +// Modifications: +// Jeremy Meredith, Wed Nov 10 14:25:17 EST 2010 +// Renamed to DumpDetailed to make room for a DumpSummary. +// +// Jeremy Meredith, Thu Nov 11 11:39:57 EST 2010 +// Added note about (*) missing value tag. +// +// Jeremy Meredith, Tue Nov 23 13:57:02 EST 2010 +// Changed note about missing values to be worded a little better. +// +// **************************************************************************** +void ResultDatabase::DumpDetailed(ostream &out) +{ + vector sorted(results); + + sort(sorted.begin(), sorted.end()); + + int maxtrials = 1; + for (int i=0; i maxtrials) + maxtrials = sorted[i].value.size(); + } + + // TODO: in big parallel runs, the "trials" are the procs + // and we really don't want to print them all out.... 
+ out << "test\t" + << "atts\t" + << "units\t" + << "median\t" + << "mean\t" + << "stddev\t" + << "min\t" + << "max\t"; + for (int i=0; i sorted(results); + + sort(sorted.begin(), sorted.end()); + + out << std::fixed << right << std::setw(9) << std::setprecision(4); + + // TODO: in big parallel runs, the "trials" are the procs + // and we really don't want to print them all out.... + out << "test\t" + << "atts\t" + << "units\t" + << "median\t" + << "mean\t" + << "stddev\t" + << "min\t" + << "max\t"; + out << endl; + + for (int i=0; i sorted(results); + + sort(sorted.begin(), sorted.end()); + + //Check to see if the file is empty - if so, add the headers + emptyFile = this->IsFileEmpty(fileName); + + //Open file and append by default + ofstream out; + out.open(fileName.c_str(), std::ofstream::out | std::ofstream::app); + + //Add headers only for empty files + if(emptyFile) + { + // TODO: in big parallel runs, the "trials" are the procs + // and we really don't want to print them all out.... 
+ out << "test, " + << "atts, " + << "units, " + << "median, " + << "mean, " + << "stddev, " + << "min, " + << "max, "; + out << endl; + } + + for (int i=0; i +ResultDatabase::GetResultsForTest(const string &test) +{ + // get only the given test results + vector retval; + for (int i=0; i & +ResultDatabase::GetResults() const +{ + return results; +} diff --git a/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/ResultDatabase.h b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/ResultDatabase.h new file mode 100644 index 0000000000..4b63a02a1f --- /dev/null +++ b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/ResultDatabase.h @@ -0,0 +1,100 @@ +#ifndef RESULT_DATABASE_H +#define RESULT_DATABASE_H + +#include +#include +#include +#include +#include +using std::string; +using std::vector; +using std::ostream; +using std::ofstream; +using std::ifstream; + + +// **************************************************************************** +// Class: ResultDatabase +// +// Purpose: +// Track numerical results as they are generated. +// Print statistics of raw results. +// +// Programmer: Jeremy Meredith +// Creation: June 12, 2009 +// +// Modifications: +// Jeremy Meredith, Wed Nov 10 14:20:47 EST 2010 +// Split timing reports into detailed and summary. E.g. for serial code, +// we might report all trial values, but skip them in parallel. +// +// Jeremy Meredith, Thu Nov 11 11:40:18 EST 2010 +// Added check for missing value tag. +// +// Jeremy Meredith, Mon Nov 22 13:37:10 EST 2010 +// Added percentile statistic. +// +// Jeremy Meredith, Fri Dec 3 16:30:31 EST 2010 +// Added a method to extract a subset of results based on test name. Also, +// the Result class is now public, so that clients can use them directly. +// Added a GetResults method as well, and made several functions const. 
+// +// **************************************************************************** +class ResultDatabase +{ + public: + // + // A performance result for a single SHOC benchmark run. + // + struct Result + { + string test; // e.g. "readback" + string atts; // e.g. "pagelocked 4k^2" + string unit; // e.g. "MB/sec" + vector value; // e.g. "837.14" + double GetMin() const; + double GetMax() const; + double GetMedian() const; + double GetPercentile(double q) const; + double GetMean() const; + double GetStdDev() const; + + bool operator<(const Result &rhs) const; + + bool HadAnyFLTMAXValues() const + { + for (int i=0; i= FLT_MAX) + return true; + } + return false; + } + }; + + protected: + vector results; + + public: + void AddResult(const string &test, + const string &atts, + const string &unit, + double value); + void AddResults(const string &test, + const string &atts, + const string &unit, + const vector &values); + vector GetResultsForTest(const string &test); + const vector &GetResults() const; + void ClearAllResults(); + void DumpDetailed(ostream&); + void DumpSummary(ostream&); + void DumpCsv(string fileName); + + private: + bool IsFileEmpty(string fileName); + +}; + + +#endif diff --git a/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp new file mode 100644 index 0000000000..8481476fc8 --- /dev/null +++ b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp @@ -0,0 +1,170 @@ +#include +#include +#include + +#include "ResultDatabase.h" + +// Cmdline parms: +const bool p_verbose = false; +const bool p_pinned = true; +const unsigned int p_iters = 10; + +#define CHECK_HIP_ERROR() \ +{ \ + hipError_t err = hipGetLastError(); \ + if (err != hipSuccess) \ + { \ + printf("error=%d name=%s at " \ + "ln: %d\n ",err,hipGetErrorString(err),__LINE__); \ + exit(EXIT_FAILURE); \ + } \ +} + + +// 
**************************************************************************** +// Function: runBenchmark +// +// Purpose: +// Measures the bandwidth of the bus connecting the host processor to the +// OpenCL device. This benchmark repeatedly transfers data chunks of various +// sizes across the bus to the OpenCL device, and calculates the bandwidth. +// +// +// Arguments: +// +// Returns: nothing +// +// Programmer: Jeremy Meredith +// Creation: September 08, 2009 +// +// Modifications: +// Jeremy Meredith, Wed Dec 1 17:05:27 EST 2010 +// Added calculation of latency estimate. +// Ben Sander - moved to standalone test +// +// **************************************************************************** +void RunBenchmark(ResultDatabase &resultDB) +{ + // Sizes are in kb + int nSizes = 20; + int sizes[20] = {1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384, + 32768,65536,131072,262144,524288}; + long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; + + // Create some host memory pattern + float *hostMem = NULL; + if (p_pinned) + { + hipMallocHost((void**)&hostMem, sizeof(float) * numMaxFloats); + while (hipGetLastError() != hipSuccess) + { + // drop the size and try again + if (p_verbose) std::cout << " - dropping size allocating pinned mem\n"; + --nSizes; + if (nSizes < 1) + { + std::cerr << "Error: Couldn't allocated any pinned buffer\n"; + return; + } + numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; + hipMallocHost((void**)&hostMem, sizeof(float) * numMaxFloats); + } + } + else + { + hostMem = new float[numMaxFloats]; + } + + for (int i = 0; i < numMaxFloats; i++) + { + hostMem[i] = i % 77; + } + + float *device; + hipMalloc((void**)&device, sizeof(float) * numMaxFloats); + while (hipGetLastError() != hipSuccess) + { + // drop the size and try again + if (p_verbose) std::cout << " - dropping size allocating device mem\n"; + --nSizes; + if (nSizes < 1) + { + std::cerr << "Error: Couldn't allocated any device buffer\n"; + return; + } + numMaxFloats = 1024 * 
(sizes[nSizes-1]) / 4; + hipMalloc((void**)&device, sizeof(float) * numMaxFloats); + } + + + hipEvent_t start, stop; + hipEventCreate(&start); + hipEventCreate(&stop); + CHECK_HIP_ERROR(); + + // Three passes, forward and backward both + for (int pass = 0; pass < p_iters; pass++) + { + // store the times temporarily to estimate latency + //float times[nSizes]; + // Step through sizes forward on even passes and backward on odd + for (int i = 0; i < nSizes; i++) + { + int sizeIndex; + if ((pass % 2) == 0) + sizeIndex = i; + else + sizeIndex = (nSizes - 1) - i; + + int nbytes = sizes[sizeIndex] * 1024; + + hipEventRecord(start, 0); + hipMemcpy(device, hostMem, nbytes, hipMemcpyHostToDevice); + hipEventRecord(stop, 0); + hipEventSynchronize(stop); + float t = 0; + hipEventElapsedTime(&t, start, stop); + //times[sizeIndex] = t; + + // Convert to GB/sec + if (p_verbose) + { + std::cerr << "size " << sizes[sizeIndex] << "k took " << t << + " ms\n"; + } + + double speed = (double(sizes[sizeIndex]) * 1024. 
/ (1000*1000)) / t; + char sizeStr[256]; + sprintf(sizeStr, "% 7dkB", sizes[sizeIndex]); + resultDB.AddResult("DownloadSpeed", sizeStr, "GB/sec", speed); + resultDB.AddResult("DownloadTime", sizeStr, "ms", t); + } + } + + // Cleanup + hipFree((void*)device); + CHECK_HIP_ERROR(); + if (p_pinned) + { + hipFreeHost((void*)hostMem); + CHECK_HIP_ERROR(); + } + else + { + delete[] hostMem; + } + hipEventDestroy(start); + hipEventDestroy(stop); +} + + + +int main(int argc, char *argv[]) +{ + ResultDatabase resultDB; + RunBenchmark(resultDB); + + resultDB.DumpSummary(std::cout); + + resultDB.DumpDetailed(std::cout); +} From 06e7be346a467cb957085aeb2209d08c5a28e657 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 12 Feb 2016 22:46:34 -0600 Subject: [PATCH 11/32] Add D2H test [ROCm/clr commit: 81c275da13b5c8a7a29318db1cbc32e11e8fd4d7] --- .../hipBusBandwidth/hipBusBandwidth.cpp | 235 +++++++++++++++++- 1 file changed, 226 insertions(+), 9 deletions(-) diff --git a/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp index 8481476fc8..c908fa655e 100644 --- a/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp +++ b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp @@ -5,9 +5,15 @@ #include "ResultDatabase.h" // Cmdline parms: -const bool p_verbose = false; -const bool p_pinned = true; -const unsigned int p_iters = 10; +bool p_verbose = false; +bool p_pinned = true; +int p_iterations = 10; +int p_device = 0; +int p_detailed = 0; + +bool p_h2d = true; +bool p_d2h = true; + #define CHECK_HIP_ERROR() \ { \ @@ -43,7 +49,7 @@ const unsigned int p_iters = 10; // Ben Sander - moved to standalone test // // **************************************************************************** -void RunBenchmark(ResultDatabase &resultDB) +void RunBenchmark_H2D(ResultDatabase &resultDB) { // Sizes are in kb int nSizes = 20; @@ -51,6 +57,8 @@ void 
RunBenchmark(ResultDatabase &resultDB) 32768,65536,131072,262144,524288}; long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; + hipSetDevice(p_device); + // Create some host memory pattern float *hostMem = NULL; if (p_pinned) @@ -103,7 +111,7 @@ void RunBenchmark(ResultDatabase &resultDB) CHECK_HIP_ERROR(); // Three passes, forward and backward both - for (int pass = 0; pass < p_iters; pass++) + for (int pass = 0; pass < p_iterations; pass++) { // store the times temporarily to estimate latency //float times[nSizes]; @@ -158,13 +166,222 @@ void RunBenchmark(ResultDatabase &resultDB) } +void RunBenchmark_D2H(ResultDatabase &resultDB) +{ + + // Sizes are in kb + int nSizes = 20; + int sizes[20] = {1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384, + 32768,65536,131072,262144,524288}; + long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; + + // Create some host memory pattern + float *hostMem1; + float *hostMem2; + if (p_pinned) + { + hipMallocHost((void**)&hostMem1, sizeof(float)*numMaxFloats); + hipError_t err1 = hipGetLastError(); + hipMallocHost((void**)&hostMem2, sizeof(float)*numMaxFloats); + hipError_t err2 = hipGetLastError(); + while (err1 != hipSuccess || err2 != hipSuccess) + { + // free the first buffer if only the second failed + if (err1 == hipSuccess) + hipFreeHost((void*)hostMem1); + + // drop the size and try again + if (p_verbose) std::cout << " - dropping size allocating pinned mem\n"; + --nSizes; + if (nSizes < 1) + { + std::cerr << "Error: Couldn't allocated any pinned buffer\n"; + return; + } + numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; + hipMallocHost((void**)&hostMem1, sizeof(float)*numMaxFloats); + err1 = hipGetLastError(); + hipMallocHost((void**)&hostMem2, sizeof(float)*numMaxFloats); + err2 = hipGetLastError(); + } + } + else + { + hostMem1 = new float[numMaxFloats]; + hostMem2 = new float[numMaxFloats]; + } + for (int i=0; i= argc || !parseInt(argv[i], &p_iterations)) { + failed("Bad iterations argument"); + } + } else if 
(!strcmp(arg, "--device") || (!strcmp(arg, "-d"))) { + if (++i >= argc || !parseInt(argv[i], &p_device)) { + failed("Bad device argument"); + } + } else if (!strcmp(arg, "--unpinned")) { + p_pinned = 0; + } else if (!strcmp(arg, "--h2d")) { + p_h2d = true; + p_d2h = false; + + } else if (!strcmp(arg, "--d2h")) { + p_h2d = false; + p_d2h = true; + + } else if (!strcmp(arg, "--help") || (!strcmp(arg, "-h"))) { + help(); + + } else if (!strcmp(arg, "--verbose")) { + p_verbose = 1; + } else if (!strcmp(arg, "--detailed")) { + p_detailed = 1; + } else { + failed("Bad argument '%s'", arg); + } + } + + return 0; +}; + + int main(int argc, char *argv[]) { - ResultDatabase resultDB; - RunBenchmark(resultDB); + parseStandardArguments(argc, argv); - resultDB.DumpSummary(std::cout); + if (p_h2d) { + ResultDatabase resultDB; + RunBenchmark_H2D(resultDB); - resultDB.DumpDetailed(std::cout); + resultDB.DumpSummary(std::cout); + + if (p_detailed) { + resultDB.DumpDetailed(std::cout); + } + } + + if (p_d2h) { + ResultDatabase resultDB; + RunBenchmark_D2H(resultDB); + + resultDB.DumpSummary(std::cout); + + if (p_detailed) { + resultDB.DumpDetailed(std::cout); + } + } } From 6096173cd164152dfef44fcd3b4a243efc5e003c Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 12 Feb 2016 22:47:26 -0600 Subject: [PATCH 12/32] Add D2H test [ROCm/clr commit: 1cd35f6371afeb88e72ffb412308e966d6742fbe] --- .../samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp index c908fa655e..b847f8db40 100644 --- a/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp +++ b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp @@ -144,8 +144,8 @@ void RunBenchmark_H2D(ResultDatabase &resultDB) double speed = (double(sizes[sizeIndex]) * 1024. 
/ (1000*1000)) / t; char sizeStr[256]; sprintf(sizeStr, "% 7dkB", sizes[sizeIndex]); - resultDB.AddResult("DownloadSpeed", sizeStr, "GB/sec", speed); - resultDB.AddResult("DownloadTime", sizeStr, "ms", t); + resultDB.AddResult("H2D_Bandwidth", sizeStr, "GB/sec", speed); + resultDB.AddResult("H2D_Time", sizeStr, "ms", t); } } @@ -273,8 +273,8 @@ void RunBenchmark_D2H(ResultDatabase &resultDB) double speed = (double(sizes[sizeIndex]) * 1024. / (1000*1000)) / t; char sizeStr[256]; sprintf(sizeStr, "% 7dkB", sizes[sizeIndex]); - resultDB.AddResult("ReadbackSpeed", sizeStr, "GB/sec", speed); - resultDB.AddResult("ReadbackTime", sizeStr, "ms", t); + resultDB.AddResult("D2H_Bandwidth", sizeStr, "GB/sec", speed); + resultDB.AddResult("D2H_Time", sizeStr, "ms", t); } //resultDB.AddResult("ReadbackLatencyEstimate", "1-2kb", "ms", times[0]-(times[1]-times[0])/1.); //resultDB.AddResult("ReadbackLatencyEstimate", "1-4kb", "ms", times[0]-(times[2]-times[0])/3.); From e7666bd9d0efc03ed36d861b245d5e06e71b013f Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sat, 13 Feb 2016 01:14:01 -0600 Subject: [PATCH 13/32] Result formatting [ROCm/clr commit: 8e3dd664eb1171a0a32ec253dded0975a7bd8455] --- .../samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp index f57aed11be..7d2f3aef84 100644 --- a/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp +++ b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp @@ -278,13 +278,16 @@ void ResultDatabase::DumpSummary(ostream &out) { vector sorted(results); + int testW = 15 ; + const int fieldW = 9; + sort(sorted.begin(), sorted.end()); - out << std::fixed << right << std::setw(9) << std::setprecision(4); + out << std::fixed << right << std::setprecision(4); // TODO: in big parallel runs, the 
"trials" are the procs // and we really don't want to print them all out.... - out << "test\t" + out << setw(testW) << "test\t" << setw(fieldW) << "atts\t" << "units\t" << "median\t" @@ -297,7 +300,7 @@ void ResultDatabase::DumpSummary(ostream &out) for (int i=0; i Date: Sat, 13 Feb 2016 01:15:23 -0600 Subject: [PATCH 14/32] D2H multi-buffer [ROCm/clr commit: a002833a89b060658deb92c18fb1c3a4059c6628] --- projects/clr/hipamd/src/hip_hcc.cpp | 138 +++++++++++++++----- projects/clr/hipamd/tests/src/hipMemcpy.cpp | 136 ++++++++++++++----- 2 files changed, 212 insertions(+), 62 deletions(-) diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index 0b7c81a365..4921a61c72 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -31,7 +31,6 @@ THE SOFTWARE. #include #include #include -#include #include #include @@ -61,6 +60,7 @@ int HIP_PRINT_ENV = 0; int HIP_TRACE_API= 0; int HIP_LAUNCH_BLOCKING = 0; int HIP_STAGING_SIZE = 64; /* size of staging buffers, in KB */ +int HIP_STAGING_DOUBLE_BUFFER = 1; #define TRACE_API 0x1 /* trace API calls and return values */ #define TRACE_SYNC 0x2 /* trace synchronization pieces */ @@ -123,22 +123,23 @@ struct ihipEvent_t { //------------------------------------------------------------------------------------------------- struct StagingBuffer { - static const int numBuffers = 2; - - int _bufferIndex; // Operating on buffer 0 or 1? - - ihipDevice_t *_device; - size_t _bufferSize; // Size of the buffers. 
+ static const int _numBuffers = 2; - StagingBuffer(ihipDevice_t *device, size_t bufferSize) ; + + StagingBuffer(ihipDevice_t *device, size_t bufferSize, bool doubleBuffer) ; ~StagingBuffer(); + void CopyDeviceToHost(void* dst, const void* src, size_t sizeBytes); void CopyHostToDevice(void* dst, const void* src, size_t sizeBytes); private: - char *_pinnedStagingBuffer[numBuffers]; - hsa_signal_t _completion_signal[numBuffers]; + ihipDevice_t *_device; + size_t _bufferSize; // Size of the buffers. + bool _double_buffer; + + char *_pinnedStagingBuffer[_numBuffers]; + hsa_signal_t _completion_signal[_numBuffers]; }; @@ -179,7 +180,7 @@ public: //Device may be reset multiple times, and may be reset after init. void ihipDevice_t::reset() { - _staging_host2device = new StagingBuffer(this, HIP_STAGING_SIZE*1024); + _staging_host2device = new StagingBuffer(this, HIP_STAGING_SIZE*1024, HIP_STAGING_DOUBLE_BUFFER); _staging_device2host = NULL; }; @@ -519,6 +520,7 @@ void ihipInit() READ_ENV_I(release, HIP_TRACE_API, 0, "Trace each HIP API call. Print function name and return code to stderr as program executes."); READ_ENV_I(release, HIP_LAUNCH_BLOCKING, CUDA_LAUNCH_BLOCKING, "Make HIP APIs 'host-synchronous', so they block until any kernel launches or data copy commands complete. Alias: CUDA_LAUNCH_BLOCKING." ); READ_ENV_I(release, HIP_STAGING_SIZE, 0, "Size of staging buffer, in KB" ); + READ_ENV_I(release, HIP_STAGING_DOUBLE_BUFFER, 0, "Double-buffer copies to device" ); /* * Build a table of valid compute devices. 
@@ -1568,12 +1570,13 @@ hipError_t hipMemcpyToSymbol(const char* symbolName, const void *src, size_t cou //------------------------------------------------------------------------------------------------- -StagingBuffer::StagingBuffer(ihipDevice_t *device, size_t bufferSize) : - _bufferIndex(0), +StagingBuffer::StagingBuffer(ihipDevice_t *device, size_t bufferSize, bool doubleBuffer) : _device(device), - _bufferSize(bufferSize) + _bufferSize(bufferSize), + _double_buffer(doubleBuffer) { - for (int i=0; i_acc, amHostPinned); if (_pinnedStagingBuffer[i] == NULL) { throw; @@ -1585,7 +1588,7 @@ StagingBuffer::StagingBuffer(ihipDevice_t *device, size_t bufferSize) : //--- StagingBuffer::~StagingBuffer() { - for (int i=0; i (src); char *dstp = static_cast (dst); - assert(sizeBytes < UINT64_MAX/2); // TODO - for (int64_t bytesRemaining=sizeBytes; bytesRemaining>0; bytesRemaining -= _bufferSize) { + for (int i=0; i<_numBuffers; i++) { + hsa_signal_store_relaxed(_completion_signal[i], 0); + } + + assert(sizeBytes < UINT64_MAX/2); // TODO + int bufferIndex = 0; + for (int64_t bytesRemaining=sizeBytes; bytesRemaining>0 ; bytesRemaining -= _bufferSize) { - // TODO - double-buffer these guys. size_t theseBytes = (bytesRemaining > _bufferSize) ? _bufferSize : bytesRemaining; - tprintf (TRACE_COPY2, "copy %zu bytes %p to stagingBuf[%d]:%p\n", theseBytes, srcp, _bufferIndex, _pinnedStagingBuffer[_bufferIndex]); + tprintf (TRACE_COPY2, "waiting... on completion signal\n"); + hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - memcpy(_pinnedStagingBuffer[_bufferIndex], srcp, theseBytes); + tprintf (TRACE_COPY2, "copy %zu bytes %p to stagingBuf[%d]:%p\n", theseBytes, srcp, bufferIndex, _pinnedStagingBuffer[bufferIndex]); + // TODO - use uncached memcpy, someday. 
+ memcpy(_pinnedStagingBuffer[bufferIndex], srcp, theseBytes); - tprintf (TRACE_COPY2, "async_copy %zu bytes %p to %p\n", theseBytes, _pinnedStagingBuffer[_bufferIndex], dstp); + tprintf (TRACE_COPY2, "async_copy %zu bytes %p to %p\n", theseBytes, _pinnedStagingBuffer[bufferIndex], dstp); - hsa_signal_store_relaxed(_completion_signal[_bufferIndex], 1); - hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _pinnedStagingBuffer[_bufferIndex], theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[_bufferIndex]); + hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _pinnedStagingBuffer[bufferIndex], theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[bufferIndex]); - tprintf (TRACE_COPY2, "waiting... status=%d\n", hsa_status); - if (hsa_status == HSA_STATUS_SUCCESS) { - hsa_signal_wait_acquire(_completion_signal[_bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - } + assert(hsa_status == HSA_STATUS_SUCCESS); // TODO - throw srcp += theseBytes; dstp += theseBytes; + if (_double_buffer) { + bufferIndex = (bufferIndex + 1) % _numBuffers; + } } + + + for (int i=0; i<_numBuffers; i++) { + hsa_signal_wait_acquire(_completion_signal[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + } +} + +//--- +void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeBytes) +{ + const char *srcp0 = static_cast (src); + char *dstp1 = static_cast (dst); + + int numBuffers = _double_buffer ? _numBuffers : 1; + + for (int i=0; i 0) { + // First launch the async copies to copy from dest to host + for (int bufferIndex = 0; (bytesRemaining0>0) && (bufferIndex < numBuffers); bytesRemaining0 -= _bufferSize, bufferIndex++) { + + size_t theseBytes = (bytesRemaining0 > _bufferSize) ? 
_bufferSize : bytesRemaining0; + + tprintf (TRACE_COPY2, "D2H: async_copy %zu bytes src:%p to staging:%p\n", theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]); + hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], srcp0, theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[bufferIndex]); + assert(hsa_status == HSA_STATUS_SUCCESS); // TODO - throw + + srcp0 += theseBytes; + } + + // Now unload the staging buffers: + for (int bufferIndex=0; (bytesRemaining1>0) && (bufferIndex < numBuffers); bytesRemaining1 -= _bufferSize, bufferIndex++) { + + size_t theseBytes = (bytesRemaining1 > _bufferSize) ? _bufferSize : bytesRemaining1; + + tprintf (TRACE_COPY2, "D2H: wait_completion[%d] bytesRemaining=%zu\n", bufferIndex, bytesRemaining1); + hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + + tprintf (TRACE_COPY2, "D2H: copy %zu bytes stagingBuf[%d]:%p to dst:%p\n", theseBytes, bufferIndex, _pinnedStagingBuffer[bufferIndex], dstp1); + memcpy(dstp1, _pinnedStagingBuffer[bufferIndex], theseBytes); + + dstp1 += theseBytes; + } + } + + + //for (int i=0; i<_numBuffers; i++) { + // hsa_signal_wait_acquire(_completion_signal[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + //} } @@ -1657,10 +1725,18 @@ void ihipAsyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t size if ((kind == hipMemcpyHostToDevice) && (srcNotTracked)) { if (useStagingBuffer) { device->_staging_host2device->CopyHostToDevice(dst, src, sizeBytes); + } else { + hc::AM_copy(dst, src, sizeBytes); } } else if ((kind == hipMemcpyDeviceToHost) && (dstNotTracked)) { - // TODO - optimize the copy here. 
- hc::AM_copy(dst, src, sizeBytes); + if (useStagingBuffer) { + device->_staging_host2device->CopyDeviceToHost(dst, src, sizeBytes); + } else { + hc::AM_copy(dst, src, sizeBytes); + } + } else if (kind == hipMemcpyHostToHost) { + memcpy(dst, src, sizeBytes); + } else { // Let HSA runtime handle it: // TODO - need buffer pool for the signals: diff --git a/projects/clr/hipamd/tests/src/hipMemcpy.cpp b/projects/clr/hipamd/tests/src/hipMemcpy.cpp index 0de1b0b7a0..509f4a1177 100644 --- a/projects/clr/hipamd/tests/src/hipMemcpy.cpp +++ b/projects/clr/hipamd/tests/src/hipMemcpy.cpp @@ -22,8 +22,11 @@ THE SOFTWARE. #include "hip_runtime.h" #include "test_common.h" -//:w #include +void printSep() +{ + printf ("======================================================================================\n"); +} // Test simple H2D copies and back. void simpleTest1() @@ -61,21 +64,22 @@ void simpleTest1() // Test many different kinds of memory copies: template -void memcpytest2(bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) +void memcpytest2(size_t numElements, bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) { - printf ("test: %s<%s> usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n", + size_t sizeElements = numElements * sizeof(T); + printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n", __func__, typeid(T).name(), + sizeElements, sizeElements/1024.0/1024.0, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); T *A_d, *B_d, *C_d; T *A_h, *B_h, *C_h; - size_t Nbytes = N*sizeof(T); - HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, usePinnedHost); - unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, numElements, usePinnedHost); + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, 
numElements); T *A_hh = NULL; T *B_hh = NULL; @@ -85,44 +89,44 @@ void memcpytest2(bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, if (useHostToHost) { if (usePinnedHost) { - HIPCHECK ( hipMallocHost(&A_hh, Nbytes) ); - HIPCHECK ( hipMallocHost(&B_hh, Nbytes) ); + HIPCHECK ( hipMallocHost(&A_hh, sizeElements) ); + HIPCHECK ( hipMallocHost(&B_hh, sizeElements) ); } else { - A_hh = (T*)malloc(Nbytes); - B_hh = (T*)malloc(Nbytes); + A_hh = (T*)malloc(sizeElements); + B_hh = (T*)malloc(sizeElements); } // Do some extra host-to-host copies here to mix things up: - HIPCHECK ( hipMemcpy(A_hh, A_h, Nbytes, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); - HIPCHECK ( hipMemcpy(B_hh, B_h, Nbytes, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); + HIPCHECK ( hipMemcpy(A_hh, A_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); + HIPCHECK ( hipMemcpy(B_hh, B_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); - HIPCHECK ( hipMemcpy(A_d, A_hh, Nbytes, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); - HIPCHECK ( hipMemcpy(B_d, B_hh, Nbytes, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(A_d, A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(B_d, B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); } else { - HIPCHECK ( hipMemcpy(A_d, A_h, Nbytes, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); - HIPCHECK ( hipMemcpy(B_d, B_h, Nbytes, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(A_d, A_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(B_d, B_h, sizeElements, useMemkindDefault ? 
hipMemcpyDefault : hipMemcpyHostToDevice)); } - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N); + hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); if (useDeviceToDevice) { - HIPCHECK ( hipMalloc(&C_dd, Nbytes) ); + HIPCHECK ( hipMalloc(&C_dd, sizeElements) ); // Do an extra device-to-device copies here to mix things up: - HIPCHECK ( hipMemcpy(C_dd, C_d, Nbytes, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice)); + HIPCHECK ( hipMemcpy(C_dd, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice)); //Destroy the original C_d: - HIPCHECK ( hipMemset(C_d, 0x5A, Nbytes)); + HIPCHECK ( hipMemset(C_d, 0x5A, sizeElements)); - HIPCHECK ( hipMemcpy(C_h, C_dd, Nbytes, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); + HIPCHECK ( hipMemcpy(C_h, C_dd, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); } else { - HIPCHECK ( hipMemcpy(C_h, C_d, Nbytes, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); + HIPCHECK ( hipMemcpy(C_h, C_d, sizeElements, useMemkindDefault? 
hipMemcpyDefault:hipMemcpyDeviceToHost)); } HIPCHECK ( hipDeviceSynchronize() ); - HipTest::checkVectorADD(A_h, B_h, C_h, N); + HipTest::checkVectorADD(A_h, B_h, C_h, numElements); HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, usePinnedHost); HIPCHECK ( hipDeviceReset() ); @@ -132,8 +136,10 @@ void memcpytest2(bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, template -void memcpytest2_loop() +void memcpytest2_loop(size_t numElements) { + printSep(); + for (int usePinnedHost =0; usePinnedHost<=1; usePinnedHost++) { #define USE_HOST_2_HOST #ifdef USE_HOST_2_HOST @@ -143,7 +149,7 @@ void memcpytest2_loop() #endif for (int useDeviceToDevice =0; useDeviceToDevice<=1; useDeviceToDevice++) { for (int useMemkindDefault =0; useMemkindDefault<=1; useMemkindDefault++) { - memcpytest2(usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); + memcpytest2(numElements, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); } } } @@ -151,20 +157,88 @@ void memcpytest2_loop() } +template +void memcpytest2_sizes(size_t maxElem=0, size_t offset=0) +{ + printSep(); + printf ("test: %s<%s>\n", __func__, typeid(T).name()); + + int deviceId; + HIPCHECK(hipGetDevice(&deviceId)); + + size_t free, total; + HIPCHECK(hipMemGetInfo(&free, &total)); + + if (maxElem == 0) { + maxElem = free/sizeof(T)/5; + } + + printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB offset=%lu\n", + deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0, offset); + + for (size_t elem=64; elem+offset<=maxElem; elem*=2) { + memcpytest2(elem+offset, 0, 1, 1, 0); // unpinned host + memcpytest2(elem+offset, 1, 1, 1, 0); // pinned host + } +} + + +template +void multiThread_1(bool serialize) +{ + printSep(); + printf ("test: %s<%s> serialize=%d\n", __func__, typeid(T).name(), serialize); + std::thread t1 (memcpytest2,N, 0,0,0,0); + if (serialize) { + t1.join(); + } + + + 
std::thread t2 (memcpytest2,N, 0,0,0,0); + if (serialize) { + t2.join(); + } + + if (!serialize) { + t1.join(); + t2.join(); + } +} + + + int main(int argc, char *argv[]) { HipTest::parseStandardArguments(argc, argv, true); - simpleTest1(); + if (p_tests & 0x1) { + simpleTest1(); + } - //memcpytest2(0/*usePinnedHost*/, 0/*useHostToHost*/, 0/*useDeviceToDevice*/, 1/*useMemkindDefault*/); + if (p_tests & 0x2) { + memcpytest2_loop(N); + memcpytest2_loop(N); + memcpytest2_loop(N); + memcpytest2_loop(N); + } - memcpytest2_loop(); - memcpytest2_loop(); - memcpytest2_loop(); - memcpytest2_loop(); + if (p_tests & 0x4) { + printSep(); + memcpytest2_sizes(0,0); + printSep(); + memcpytest2_sizes(0,64); + printSep(); + memcpytest2_sizes(1024*1024, 13); + printSep(); + memcpytest2_sizes(1024*1024, 50); + } + if (p_tests & 0x8) { + printSep(); + multiThread_1(true); + multiThread_1(false); + } passed(); From 275a36ee47d76333f4a23d350a2ce85d5f8ca10d Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sat, 13 Feb 2016 03:17:42 -0600 Subject: [PATCH 15/32] Enable -O3, style points on array size [ROCm/clr commit: 56b3d2e7c499996538786ed9d24bf9041df373ed] --- .../clr/hipamd/samples/1_Utils/hipBusBandwidth/Makefile | 3 ++- .../samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/Makefile b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/Makefile index d233216313..77a92fb1a6 100644 --- a/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/Makefile +++ b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/Makefile @@ -2,11 +2,12 @@ HIP_PATH?=$(shell hipconfig -p) HIPCC=$(HIP_PATH)/bin/hipcc EXE=hipBusBandwidth +CXXFLAGS = -O3 -g all: install $(EXE): hipBusBandwidth.cpp ResultDatabase.cpp - $(HIPCC) $^ -o $@ + $(HIPCC) $(CXXFLAGS) $^ -o $@ install: $(EXE) cp $(EXE) $(HIP_PATH)/bin diff --git a/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp 
b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp index b847f8db40..d276725921 100644 --- a/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp +++ b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp @@ -52,9 +52,9 @@ bool p_d2h = true; void RunBenchmark_H2D(ResultDatabase &resultDB) { // Sizes are in kb - int nSizes = 20; - int sizes[20] = {1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384, - 32768,65536,131072,262144,524288}; + int sizes[] = {1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384, 32768,65536,131072,262144,524288}; + int nSizes = sizeof(sizes) / sizeof(int); + long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; hipSetDevice(p_device); From ed618e637e3a3c1176114eb8968f9a4926b10fee Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sat, 13 Feb 2016 03:18:01 -0600 Subject: [PATCH 16/32] Add multi-threading synchonization on staging buffers and signals. Also pre-allocate a couple signals for copies. [ROCm/clr commit: 8939b4f0e552747fe45666fbcaac3467a3250ad0] --- projects/clr/hipamd/src/hip_hcc.cpp | 85 ++++++++++++--------- projects/clr/hipamd/tests/src/hipMemcpy.cpp | 20 +++-- 2 files changed, 60 insertions(+), 45 deletions(-) diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index 4921a61c72..4f95320ac3 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -60,7 +60,7 @@ int HIP_PRINT_ENV = 0; int HIP_TRACE_API= 0; int HIP_LAUNCH_BLOCKING = 0; int HIP_STAGING_SIZE = 64; /* size of staging buffers, in KB */ -int HIP_STAGING_DOUBLE_BUFFER = 1; +int HIP_STAGING_BUFFERS = 2; #define TRACE_API 0x1 /* trace API calls and return values */ #define TRACE_SYNC 0x2 /* trace synchronization pieces */ @@ -123,11 +123,10 @@ struct ihipEvent_t { //------------------------------------------------------------------------------------------------- struct StagingBuffer { - static const int _numBuffers = 2; + static const 
int _max_buffers = 4; - - StagingBuffer(ihipDevice_t *device, size_t bufferSize, bool doubleBuffer) ; + StagingBuffer(ihipDevice_t *device, size_t bufferSize, int numBuffers) ; ~StagingBuffer(); void CopyDeviceToHost(void* dst, const void* src, size_t sizeBytes); @@ -136,10 +135,10 @@ struct StagingBuffer { private: ihipDevice_t *_device; size_t _bufferSize; // Size of the buffers. - bool _double_buffer; + int _numBuffers; - char *_pinnedStagingBuffer[_numBuffers]; - hsa_signal_t _completion_signal[_numBuffers]; + char *_pinnedStagingBuffer[_max_buffers]; + hsa_signal_t _completion_signal[_max_buffers]; }; @@ -161,8 +160,9 @@ struct ihipDevice_t unsigned _compute_units; - StagingBuffer *_staging_host2device; - StagingBuffer *_staging_device2host; + hsa_signal_t _copy_signal; // signal to use for copies + std::mutex _copy_lock[2]; // mutex for each direction. + StagingBuffer *_staging_buffer[2]; // one buffer for each direction. public: void reset(); @@ -170,7 +170,7 @@ public: hipError_t getProperties(hipDeviceProp_t* prop); // TODO- create a copy constructor. - //~ihipDevice_t(); + ~ihipDevice_t(); }; @@ -180,8 +180,8 @@ public: //Device may be reset multiple times, and may be reset after init. 
void ihipDevice_t::reset() { - _staging_host2device = new StagingBuffer(this, HIP_STAGING_SIZE*1024, HIP_STAGING_DOUBLE_BUFFER); - _staging_device2host = NULL; + _staging_buffer[0] = new StagingBuffer(this, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS); + _staging_buffer[1] = new StagingBuffer(this, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS); }; @@ -208,10 +208,13 @@ void ihipDevice_t::init(unsigned device_index, hc::accelerator acc) this->_streams.push_back(_null_stream); tprintf(TRACE_SYNC, "created device with null_stream=%p\n", _null_stream); + hsa_signal_create(0, 0, NULL, &_copy_signal); + this->reset(); }; -#if 0 +#if 1 +// TODO-remove #ifdef ihipDevice_t::~ihipDevice_t() { if (_null_stream) { @@ -219,12 +222,12 @@ ihipDevice_t::~ihipDevice_t() _null_stream = NULL; } - if (_staging_device2host) { - delete _staging_device2host; - } - if (_staging_host2device){ - delete _staging_host2device; + for (int i=0; i<2; i++) { + if (_staging_buffer[i]) { + delete _staging_buffer[i]; + } } + hsa_signal_destroy(_copy_signal); } #endif @@ -519,8 +522,8 @@ void ihipInit() READ_ENV_I(release, HIP_PRINT_ENV, 0, "Print HIP environment variables."); READ_ENV_I(release, HIP_TRACE_API, 0, "Trace each HIP API call. Print function name and return code to stderr as program executes."); READ_ENV_I(release, HIP_LAUNCH_BLOCKING, CUDA_LAUNCH_BLOCKING, "Make HIP APIs 'host-synchronous', so they block until any kernel launches or data copy commands complete. Alias: CUDA_LAUNCH_BLOCKING." ); - READ_ENV_I(release, HIP_STAGING_SIZE, 0, "Size of staging buffer, in KB" ); - READ_ENV_I(release, HIP_STAGING_DOUBLE_BUFFER, 0, "Double-buffer copies to device" ); + READ_ENV_I(release, HIP_STAGING_SIZE, 0, "Size of each staging buffer (in KB)." ); + READ_ENV_I(release, HIP_STAGING_BUFFERS, 0, "Number of staging buffers to use in each direction."); /* * Build a table of valid compute devices. 
@@ -1570,11 +1573,14 @@ hipError_t hipMemcpyToSymbol(const char* symbolName, const void *src, size_t cou //------------------------------------------------------------------------------------------------- -StagingBuffer::StagingBuffer(ihipDevice_t *device, size_t bufferSize, bool doubleBuffer) : +StagingBuffer::StagingBuffer(ihipDevice_t *device, size_t bufferSize, int numBuffers) : _device(device), _bufferSize(bufferSize), - _double_buffer(doubleBuffer) + _numBuffers(numBuffers > _max_buffers ? _max_buffers : numBuffers) { + + + for (int i=0; i<_numBuffers; i++) { // TODO - experiment with alignment here. _pinnedStagingBuffer[i] = hc::AM_alloc(_bufferSize, device->_acc, amHostPinned); @@ -1630,8 +1636,8 @@ void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeByte srcp += theseBytes; dstp += theseBytes; - if (_double_buffer) { - bufferIndex = (bufferIndex + 1) % _numBuffers; + if (++bufferIndex >= _numBuffers) { + bufferIndex = 0; } } @@ -1647,9 +1653,7 @@ void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeByte const char *srcp0 = static_cast (src); char *dstp1 = static_cast (dst); - int numBuffers = _double_buffer ? _numBuffers : 1; - - for (int i=0; i 0) { // First launch the async copies to copy from dest to host - for (int bufferIndex = 0; (bytesRemaining0>0) && (bufferIndex < numBuffers); bytesRemaining0 -= _bufferSize, bufferIndex++) { + for (int bufferIndex = 0; (bytesRemaining0>0) && (bufferIndex < _numBuffers); bytesRemaining0 -= _bufferSize, bufferIndex++) { size_t theseBytes = (bytesRemaining0 > _bufferSize) ? 
_bufferSize : bytesRemaining0; @@ -1673,7 +1677,7 @@ void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeByte } // Now unload the staging buffers: - for (int bufferIndex=0; (bytesRemaining1>0) && (bufferIndex < numBuffers); bytesRemaining1 -= _bufferSize, bufferIndex++) { + for (int bufferIndex=0; (bytesRemaining1>0) && (bufferIndex < _numBuffers); bytesRemaining1 -= _bufferSize, bufferIndex++) { size_t theseBytes = (bytesRemaining1 > _bufferSize) ? _bufferSize : bytesRemaining1; @@ -1705,7 +1709,7 @@ void ihipAsyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t size bool dstNotTracked = (hc::am_memtracker_getinfo(&dstPtrInfo, dst) != AM_SUCCESS); bool srcNotTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) != AM_SUCCESS); - bool useStagingBuffer = true; + bool useStagingBuffer = true; // TODO - remove when new copy bakes a bit. // Resolve default to a specific Kind, since we use different algorithms: if (kind == hipMemcpyDefault) { @@ -1724,31 +1728,36 @@ void ihipAsyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t size if ((kind == hipMemcpyHostToDevice) && (srcNotTracked)) { if (useStagingBuffer) { - device->_staging_host2device->CopyHostToDevice(dst, src, sizeBytes); + std::lock_guard l (device->_copy_lock[0]); + device->_staging_buffer[0]->CopyHostToDevice(dst, src, sizeBytes); } else { hc::AM_copy(dst, src, sizeBytes); } } else if ((kind == hipMemcpyDeviceToHost) && (dstNotTracked)) { if (useStagingBuffer) { - device->_staging_host2device->CopyDeviceToHost(dst, src, sizeBytes); + std::lock_guard l (device->_copy_lock[1]); + device->_staging_buffer[1]->CopyDeviceToHost(dst, src, sizeBytes); } else { hc::AM_copy(dst, src, sizeBytes); } } else if (kind == hipMemcpyHostToHost) { - memcpy(dst, src, sizeBytes); + memcpy(dst, src, sizeBytes); // TODO - not async. 
} else { // Let HSA runtime handle it: // TODO - need buffer pool for the signals: - hsa_signal_t completion_signal; - hsa_signal_create(1, 0, NULL, &completion_signal); - hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, src, sizeBytes, device->_hsa_agent, 0, NULL, completion_signal); + + device->_copy_lock[1].lock(); + + hsa_signal_store_relaxed(device->_copy_signal, 1); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, src, sizeBytes, device->_hsa_agent, 0, NULL, device->_copy_signal); if (hsa_status == HSA_STATUS_SUCCESS) { - hsa_signal_wait_relaxed(completion_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + hsa_signal_wait_relaxed(device->_copy_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); } - hsa_signal_destroy(completion_signal); + device->_copy_lock[1].unlock(); + } } #endif diff --git a/projects/clr/hipamd/tests/src/hipMemcpy.cpp b/projects/clr/hipamd/tests/src/hipMemcpy.cpp index 509f4a1177..3502b81e9d 100644 --- a/projects/clr/hipamd/tests/src/hipMemcpy.cpp +++ b/projects/clr/hipamd/tests/src/hipMemcpy.cpp @@ -129,7 +129,6 @@ void memcpytest2(size_t numElements, bool usePinnedHost, bool useHostToHost, boo HipTest::checkVectorADD(A_h, B_h, C_h, numElements); HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, usePinnedHost); - HIPCHECK ( hipDeviceReset() ); printf (" %s success\n", __func__); } @@ -177,24 +176,26 @@ void memcpytest2_sizes(size_t maxElem=0, size_t offset=0) deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0, offset); for (size_t elem=64; elem+offset<=maxElem; elem*=2) { + HIPCHECK ( hipDeviceReset() ); memcpytest2(elem+offset, 0, 1, 1, 0); // unpinned host + HIPCHECK ( hipDeviceReset() ); memcpytest2(elem+offset, 1, 1, 1, 0); // pinned host } } template -void multiThread_1(bool serialize) +void multiThread_1(bool serialize, bool usePinnedHost) { printSep(); - printf ("test: %s<%s> serialize=%d\n", __func__, 
typeid(T).name(), serialize); - std::thread t1 (memcpytest2,N, 0,0,0,0); + printf ("test: %s<%s> serialize=%d usePinnedHost=%d\n", __func__, typeid(T).name(), serialize, usePinnedHost); + std::thread t1 (memcpytest2,N, usePinnedHost,0,0,0); if (serialize) { t1.join(); } - std::thread t2 (memcpytest2,N, 0,0,0,0); + std::thread t2 (memcpytest2,N, usePinnedHost,0,0,0); if (serialize) { t2.join(); } @@ -213,10 +214,12 @@ int main(int argc, char *argv[]) if (p_tests & 0x1) { + HIPCHECK ( hipDeviceReset() ); simpleTest1(); } if (p_tests & 0x2) { + HIPCHECK ( hipDeviceReset() ); memcpytest2_loop(N); memcpytest2_loop(N); memcpytest2_loop(N); @@ -224,6 +227,7 @@ int main(int argc, char *argv[]) } if (p_tests & 0x4) { + HIPCHECK ( hipDeviceReset() ); printSep(); memcpytest2_sizes(0,0); printSep(); @@ -235,9 +239,11 @@ int main(int argc, char *argv[]) } if (p_tests & 0x8) { + HIPCHECK ( hipDeviceReset() ); printSep(); - multiThread_1(true); - multiThread_1(false); + multiThread_1(true, true); + multiThread_1(false, true); + multiThread_1(false, false); // TODO } passed(); From 7e46e9059184b8d826e2c745d12d0bebbdf7a5e9 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 15 Feb 2016 05:19:52 -0600 Subject: [PATCH 17/32] Fix tests to account for multi-gpu [ROCm/clr commit: afbe451b0da2a25b1e5f05497f04e6286e45f12c] --- projects/clr/hipamd/tests/src/hipMemcpy.cpp | 3 +++ projects/clr/hipamd/tests/src/hipPointerAttrib.cpp | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/projects/clr/hipamd/tests/src/hipMemcpy.cpp b/projects/clr/hipamd/tests/src/hipMemcpy.cpp index 3502b81e9d..b76f98c687 100644 --- a/projects/clr/hipamd/tests/src/hipMemcpy.cpp +++ b/projects/clr/hipamd/tests/src/hipMemcpy.cpp @@ -212,6 +212,9 @@ int main(int argc, char *argv[]) { HipTest::parseStandardArguments(argc, argv, true); + printf ("info: set device to %d\n", p_gpuDevice); + HIPCHECK(hipSetDevice(p_gpuDevice)); + if (p_tests & 0x1) { HIPCHECK ( hipDeviceReset() ); diff --git 
a/projects/clr/hipamd/tests/src/hipPointerAttrib.cpp b/projects/clr/hipamd/tests/src/hipPointerAttrib.cpp index 1418997274..12856da984 100644 --- a/projects/clr/hipamd/tests/src/hipPointerAttrib.cpp +++ b/projects/clr/hipamd/tests/src/hipPointerAttrib.cpp @@ -289,6 +289,7 @@ void clusterAllocs(int numAllocs, size_t minSize, size_t maxSize) for (int i =0; i Date: Mon, 15 Feb 2016 05:40:12 -0600 Subject: [PATCH 18/32] Update docs, cleanup [ROCm/clr commit: 322a3bd9b242f57c57c25232f459b4258747ef49] --- projects/clr/hipamd/include/hcc_detail/AM.h | 31 +++++++-- projects/clr/hipamd/src/hc_AM.cpp | 76 +++------------------ 2 files changed, 38 insertions(+), 69 deletions(-) diff --git a/projects/clr/hipamd/include/hcc_detail/AM.h b/projects/clr/hipamd/include/hcc_detail/AM.h index 04804ffaa5..40d9ea8382 100644 --- a/projects/clr/hipamd/include/hcc_detail/AM.h +++ b/projects/clr/hipamd/include/hcc_detail/AM.h @@ -79,6 +79,7 @@ am_status_t AM_free(void* ptr); am_status_t AM_copy(void* dst, const void* src, size_t size); + /** * Return information about tracked pointer. * @@ -94,11 +95,23 @@ am_status_t AM_copy(void* dst, const void* src, size_t size); am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, const void *ptr); -//TODO-doc +/** + * Adds a pointer to the memory tracker. + * + * @return AM_SUCCESS + * @see am_memtracker_getinfo + */ am_status_t am_memtracker_add(void* ptr, size_t sizeBytes, hc::accelerator acc, bool isDeviceMem=false); -//TODO-doc +/* + * Updates infor for an existing pointer in the memory tracker. + * + * @returns AM_ERROR_MISC if pointer is not found in tracker. + * @returns AM_SUCCESS if pointer is not found in tracker. + * + * @see am_memtracker_getinfo, am_memtracker_add + */ am_status_t am_memtracker_update(const void* ptr, int appId, unsigned allocationFlags); @@ -109,23 +122,33 @@ am_status_t am_memtracker_update(const void* ptr, int appId, unsigned allocation * * @returns AM_ERROR_MISC if pointer is not found in tracker. 
* @returns AM_SUCCESS if pointer is not found in tracker. + * + * @see am_memtracker_getinfo, am_memtracker_add */ am_status_t am_memtracker_remove(void* ptr); /** - * Remove all memory allocations associated with specified accelerator. + * Remove all memory allocations associated with specified accelerator from the memory tracker. * * @returns Number of entries reset. + * @see am_memtracker_getinfo */ size_t am_memtracker_reset(hc::accelerator acc); /** - * Prints info about the memory tracker table. + * Prints the entries in the memory tracker table. * * Intended primarily for debug purposes. + * @see am_memtracker_getinfo **/ void am_memtracker_print(); + +/** + * Returns total sizes of device, host, and user memory allocated by the application + * + * User memory is registered with am_tracker_add. + **/ void am_memtracker_sizeinfo(hc::accelerator acc, size_t *deviceMemSize, size_t *hostMemSize, size_t *userMemSize); diff --git a/projects/clr/hipamd/src/hc_AM.cpp b/projects/clr/hipamd/src/hc_AM.cpp index 2d22b49fd4..221322f4b0 100644 --- a/projects/clr/hipamd/src/hc_AM.cpp +++ b/projects/clr/hipamd/src/hc_AM.cpp @@ -73,12 +73,6 @@ public: size_t reset (hc::accelerator acc); private: - // TODO - use or remove. - inline void writeLock(); - inline void writeUnlock(); - inline void readLock(); - inline void readUnlock(); - MapTrackerType _tracker; std::mutex _mutex; //std::shared_timed_mutex _mut; @@ -117,20 +111,6 @@ AmPointerTracker::MapTrackerType::iterator AmPointerTracker::find (const void * } -#if 0 -//--- -std::ostream & AmPointerTracker::print (std::ostream &os) -{ - std::lock_guard l (_mutex); - for (auto iter = _tracker.begin() ; iter != _tracker.end(); iter++) { - os << " " << iter->first._basePointer << "..." << iter->first._endPointer << ":: "; - os << iter->second << std::endl; - } - - return os; -} -#endif - //--- // Remove all tracked locations, and free the associated memory (if the range was originally allocated by AM). 
// Returns count of ranges removed. @@ -158,39 +138,6 @@ size_t AmPointerTracker::reset (hc::accelerator acc) } - -//--- -void AmPointerTracker::writeLock () -{ - _mutex.lock(); -} - - -//--- -void AmPointerTracker::writeUnlock () -{ - _mutex.unlock(); -} - - -//--- -// TODO - support multiple concurrent reader -void AmPointerTracker::readLock () -{ - _mutex.lock(); -} - - -//--- -// TODO - support multiple concurrent reader -void AmPointerTracker::readUnlock () -{ - _mutex.unlock(); -} - - - - //========================================================================================================= // Global var defs: //========================================================================================================= @@ -289,6 +236,17 @@ am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, const void *ptr) } } +am_status_t am_memtracker_add(void* ptr, size_t sizeBytes, hc::accelerator acc, bool isDeviceMem) +{ + if (isDeviceMem) { + g_amPointerTracker.insert(ptr, hc::AmPointerInfo(ptr/*hostPointer*/, ptr /*devicePointer*/, sizeBytes, acc, true/*isDevice*/, false /*isAMManaged*/)); + } else { + g_amPointerTracker.insert(ptr, hc::AmPointerInfo(NULL/*hostPointer*/, ptr /*devicePointer*/, sizeBytes, acc, false/*isDevice*/, false /*isAMManaged*/)); + } + + return AM_SUCCESS; +} + am_status_t am_memtracker_update(const void* ptr, int appId, unsigned allocationFlags) { @@ -303,18 +261,6 @@ am_status_t am_memtracker_update(const void* ptr, int appId, unsigned allocation } -am_status_t am_memtracker_add(void* ptr, size_t sizeBytes, hc::accelerator acc, bool isDeviceMem) -{ - if (isDeviceMem) { - g_amPointerTracker.insert(ptr, hc::AmPointerInfo(ptr/*hostPointer*/, ptr /*devicePointer*/, sizeBytes, acc, true/*isDevice*/, false /*isAMManaged*/)); - } else { - g_amPointerTracker.insert(ptr, hc::AmPointerInfo(NULL/*hostPointer*/, ptr /*devicePointer*/, sizeBytes, acc, false/*isDevice*/, false /*isAMManaged*/)); - } - - return AM_SUCCESS; -} - - am_status_t 
am_memtracker_remove(void* ptr) { am_status_t status = AM_SUCCESS; From dc6f0ef3a6dbdbde295d3b0a6afe0004c478f99c Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 15 Feb 2016 05:40:30 -0600 Subject: [PATCH 19/32] Remove old include path. [ROCm/clr commit: 3b2d4acabc3257fced3da3d7922aea34fc33f980] --- projects/clr/hipamd/tests/src/CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/projects/clr/hipamd/tests/src/CMakeLists.txt b/projects/clr/hipamd/tests/src/CMakeLists.txt index bf05fc8407..7e4736a99a 100644 --- a/projects/clr/hipamd/tests/src/CMakeLists.txt +++ b/projects/clr/hipamd/tests/src/CMakeLists.txt @@ -19,7 +19,6 @@ MESSAGE ("HIP_PATH=" ${HIP_PATH}) if (${HIP_PLATFORM} STREQUAL "hcc") MESSAGE ("HIP_PLATFORM=hcc") - set (HC_PATH ${HIP_PATH}/hc) set (HSA_PATH /opt/hsa) #--- @@ -30,7 +29,7 @@ if (${HIP_PLATFORM} STREQUAL "hcc") #These includes are used for all files. #Include HIP and HC since the tests need both of these: #Note below HSA path is surgically included only where necessary. - include_directories(${HIP_PATH}/include ${HC_PATH}/include) + include_directories(${HIP_PATH}/include) # hip_hcc.o: add_library(hip_hcc OBJECT ${HIP_PATH}/src/hip_hcc.cpp) From 93c07bc3d1d8485908e8aca58fc741d4ebdca450 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 15 Feb 2016 05:41:09 -0600 Subject: [PATCH 20/32] Move warpSize to header, have shuffles use default warpsize. [ROCm/clr commit: bd7e3b83b9979525a24340cdadda323b2466e177] --- .../hipamd/include/hcc_detail/hip_runtime.h | 23 +++++++++++-------- projects/clr/hipamd/src/hip_hcc.cpp | 6 ----- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/projects/clr/hipamd/include/hcc_detail/hip_runtime.h b/projects/clr/hipamd/include/hcc_detail/hip_runtime.h index 8474f066df..7c5a2f2e36 100644 --- a/projects/clr/hipamd/include/hcc_detail/hip_runtime.h +++ b/projects/clr/hipamd/include/hcc_detail/hip_runtime.h @@ -108,6 +108,12 @@ THE SOFTWARE. 
#define __HCC_C__ #endif + +// TODO - hipify-clang - change to use the function call. +//#define warpSize hc::__wavesize() +const int warpSize = 64; + + #define clock_t long long int __device__ inline long long int clock64() { return (long long int)hc::__clock_u64(); }; __device__ inline clock_t clock() { return (clock_t)hc::__clock_u64(); }; @@ -344,42 +350,42 @@ __device__ inline unsigned long long int __ballot( int input) } // warp shuffle functions -__device__ inline int __shfl(int input, int lane, int width) +__device__ inline int __shfl(int input, int lane, int width=warpSize) { return hc::__shfl(input,lane,width); } -__device__ inline int __shfl_up(int input, unsigned int lane_delta, int width) +__device__ inline int __shfl_up(int input, unsigned int lane_delta, int width=warpSize) { return hc::__shfl_up(input,lane_delta,width); } -__device__ inline int __shfl_down(int input, unsigned int lane_delta, int width) +__device__ inline int __shfl_down(int input, unsigned int lane_delta, int width=warpSize) { return hc::__shfl_down(input,lane_delta,width); } -__device__ inline int __shfl_xor(int input, int lane_mask, int width) +__device__ inline int __shfl_xor(int input, int lane_mask, int width=warpSize) { return hc::__shfl_xor(input,lane_mask,width); } -__device__ inline float __shfl(float input, int lane, int width) +__device__ inline float __shfl(float input, int lane, int width=warpSize) { return hc::__shfl(input,lane,width); } -__device__ inline float __shfl_up(float input, unsigned int lane_delta, int width) +__device__ inline float __shfl_up(float input, unsigned int lane_delta, int width=warpSize) { return hc::__shfl_up(input,lane_delta,width); } -__device__ inline float __shfl_down(float input, unsigned int lane_delta, int width) +__device__ inline float __shfl_down(float input, unsigned int lane_delta, int width=warpSize) { return hc::__shfl_down(input,lane_delta,width); } -__device__ inline float __shfl_xor(float input, int lane_mask, int width) 
+__device__ inline float __shfl_xor(float input, int lane_mask, int width=warpSize) { return hc::__shfl_xor(input,lane_mask,width); } @@ -438,7 +444,6 @@ __device__ inline float __dsqrt_rz(double x) {return hc::fast_math::sqrt(x); }; #define hipGridDim_z (hc_get_num_groups(0)) -extern int warpSize ; #define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE) diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index 4f95320ac3..d4a6857559 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -49,8 +49,6 @@ THE SOFTWARE. //--- // Environment variables: -// TODO-HCC - map this to the HC instruction that uses HSAIL to get the wave size. -int warpSize = 64; // Intended to distinguish whether an environment variable should be visible only in debug mode, or in debug+release. //static const int debug = 0; @@ -169,7 +167,6 @@ public: void init(unsigned device_index, hc::accelerator acc); hipError_t getProperties(hipDeviceProp_t* prop); - // TODO- create a copy constructor. ~ihipDevice_t(); }; @@ -213,8 +210,6 @@ void ihipDevice_t::init(unsigned device_index, hc::accelerator acc) this->reset(); }; -#if 1 -// TODO-remove #ifdef ihipDevice_t::~ihipDevice_t() { if (_null_stream) { @@ -229,7 +224,6 @@ ihipDevice_t::~ihipDevice_t() } hsa_signal_destroy(_copy_signal); } -#endif //---- From f8f40e07bf77c1aadb6df5574015b840f78ff262 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 15 Feb 2016 21:16:00 -0600 Subject: [PATCH 21/32] Update before checkin to HCC. Add support for USE_AM_TRACKER=2 (HCC version). Add AM_ALLOC, AM_FREE indirection to ease swapping AM implementations. 
[ROCm/clr commit: 1ed431c0f66791f7b0df78cf57e5cc8f6bf83427] --- projects/clr/hipamd/include/hcc_detail/AM.h | 23 +++---- projects/clr/hipamd/src/hc_AM.cpp | 2 - projects/clr/hipamd/src/hip_hcc.cpp | 69 ++++++++++++++++----- 3 files changed, 66 insertions(+), 28 deletions(-) diff --git a/projects/clr/hipamd/include/hcc_detail/AM.h b/projects/clr/hipamd/include/hcc_detail/AM.h index 40d9ea8382..74542789af 100644 --- a/projects/clr/hipamd/include/hcc_detail/AM.h +++ b/projects/clr/hipamd/include/hcc_detail/AM.h @@ -13,7 +13,7 @@ typedef int am_status_t; namespace hc { -// This is the data that is maintained for each pointer: +// Info for each pointer in the memory tracker: struct AmPointerInfo { void * _hostPointer; ///< Host pointer. If host access is not allowed, NULL. void * _devicePointer; ///< Device pointer. @@ -45,7 +45,7 @@ namespace hc { /** - * Allocates a block of @p size bytes of memory on the specified @p acc. + * Allocate a block of @p size bytes of memory on the specified @p acc. * * The contents of the newly allocated block of memory are not initialized. * @@ -53,7 +53,7 @@ namespace hc { * * Flags must be 0. * - * @returns : On success, pointer to the newly allocated memory is returned. + * @return : On success, pointer to the newly allocated memory is returned. * The pointer is typecast to the desired return type. * * If an error occurred trying to allocate the requested memory, 0 is returned. @@ -63,17 +63,18 @@ namespace hc { auto_voidp AM_alloc(size_t size, hc::accelerator acc, unsigned flags); /** - * Frees a block of memory previously allocated with am_alloc. + * Free a block of memory previously allocated with am_alloc. * + * @return AM_SUCCESS * @see am_alloc, am_copy */ am_status_t AM_free(void* ptr); /** - * Copies @p size bytes of memory from @p src to @ dst. The memory areas (src+size and dst+size) must not overlap. + * Copy @p size bytes of memory from @p src to @p dst. The memory areas (src+size and dst+size) must not overlap. 
* - * @returns AM_SUCCESS on error or AM_ERROR_MISC if an error occurs. + * @return AM_SUCCESS on success or AM_ERROR_MISC if an error occurs. * @see am_alloc, am_free */ am_status_t AM_copy(void* dst, const void* src, size_t size); @@ -96,7 +97,7 @@ am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, const void *ptr); /** - * Adds a pointer to the memory tracker. + * Add a pointer to the memory tracker. * * @return AM_SUCCESS * @see am_memtracker_getinfo @@ -105,7 +106,7 @@ am_status_t am_memtracker_add(void* ptr, size_t sizeBytes, hc::accelerator acc, /* - * Updates infor for an existing pointer in the memory tracker. + * Update info for an existing pointer in the memory tracker. * * @returns AM_ERROR_MISC if pointer is not found in tracker. * @returns AM_SUCCESS if pointer is not found in tracker. @@ -116,7 +117,7 @@ am_status_t am_memtracker_update(const void* ptr, int appId, unsigned allocation /** - * Remove the pointer from the tracker structure. + * Remove @p ptr from the tracker structure. * * @p ptr may be anywhere in a tracked memory range. * @@ -136,7 +137,7 @@ am_status_t am_memtracker_remove(void* ptr); size_t am_memtracker_reset(hc::accelerator acc); /** - * Prints the entries in the memory tracker table. + * Print the entries in the memory tracker table. * * Intended primarily for debug purposes. * @see am_memtracker_getinfo @@ -145,7 +146,7 @@ void am_memtracker_print(); /** - * Returns total sizes of device, host, and user memory allocated by the application + * Return total sizes of device, host, and user memory allocated by the application * * User memory is registered with am_tracker_add. **/ diff --git a/projects/clr/hipamd/src/hc_AM.cpp b/projects/clr/hipamd/src/hc_AM.cpp index 221322f4b0..272024cfe7 100644 --- a/projects/clr/hipamd/src/hc_AM.cpp +++ b/projects/clr/hipamd/src/hc_AM.cpp @@ -93,7 +93,6 @@ void AmPointerTracker::insert (void *pointer, const hc::AmPointerInfo &p) // Return 1 if removed or 0 if not found. 
int AmPointerTracker::remove (void *pointer) { - // TODO-mutex - write lock. std::lock_guard l (_mutex); mprintf ("remove: %p\n", pointer); return _tracker.erase(AmMemoryRange(pointer,1)); @@ -103,7 +102,6 @@ int AmPointerTracker::remove (void *pointer) //--- AmPointerTracker::MapTrackerType::iterator AmPointerTracker::find (const void *pointer) { - // TODO-mutex- read lock std::lock_guard l (_mutex); auto iter = _tracker.find(AmMemoryRange(pointer,1)); mprintf ("find: %p\n", pointer); diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index d4a6857559..30a0d993e0 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -40,10 +40,18 @@ THE SOFTWARE. #include "hsa_ext_amd.h" -#include "hc_AM.cpp" #define USE_ASYNC_COPY 1 -#define USE_AM_TRACKER 1 /* use new AM memory tracker features */ +#define USE_AM_TRACKER 2 /* >0 = use new AM memory tracker features. 1= use HIP impl, 2=use HCC impl */ + +#if USE_AM_TRACKER==1 +#include "hc_AM.cpp" +#define AM_ALLOC hc::AM_alloc +#define AM_FREE hc::AM_free +#else +#define AM_ALLOC hc::am_alloc +#define AM_FREE hc::am_free +#endif #define INLINE static inline @@ -1504,7 +1512,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) if (device) { const unsigned am_flags = 0; - *ptr = hc::AM_alloc(sizeBytes, device->_acc, am_flags); + *ptr = AM_ALLOC(sizeBytes, device->_acc, am_flags); if (sizeBytes && (*ptr == NULL)) { hip_status = hipErrorMemoryAllocation; @@ -1531,7 +1539,7 @@ hipError_t hipMallocHost(void** ptr, size_t sizeBytes) auto device = ihipGetTlsDefaultDevice(); if (device) { - *ptr = hc::AM_alloc(sizeBytes, device->_acc, am_flags); + *ptr = AM_ALLOC(sizeBytes, device->_acc, am_flags); if (sizeBytes && (*ptr == NULL)) { hip_status = hipErrorMemoryAllocation; } else { @@ -1577,7 +1585,7 @@ StagingBuffer::StagingBuffer(ihipDevice_t *device, size_t bufferSize, int numBuf for (int i=0; i<_numBuffers; i++) { // TODO - experiment with alignment here. 
- _pinnedStagingBuffer[i] = hc::AM_alloc(_bufferSize, device->_acc, amHostPinned); + _pinnedStagingBuffer[i] = AM_ALLOC(_bufferSize, device->_acc, amHostPinned); if (_pinnedStagingBuffer[i] == NULL) { throw; } @@ -1590,7 +1598,7 @@ StagingBuffer::~StagingBuffer() { for (int i=0; i<_numBuffers; i++) { if (_pinnedStagingBuffer[i]) { - hc::AM_free(_pinnedStagingBuffer[i]); + AM_FREE(_pinnedStagingBuffer[i]); _pinnedStagingBuffer[i] = NULL; } hsa_signal_destroy(_completion_signal[i]); @@ -1695,8 +1703,7 @@ void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeByte #if USE_AM_TRACKER -// TODO - add mutex to limit in/out: -void ihipAsyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) +void ihipSyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) { hc::AmPointerInfo dstPtrInfo, srcPtrInfo; @@ -1725,14 +1732,16 @@ void ihipAsyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t size std::lock_guard l (device->_copy_lock[0]); device->_staging_buffer[0]->CopyHostToDevice(dst, src, sizeBytes); } else { - hc::AM_copy(dst, src, sizeBytes); + // TODO - remove, slow path. + hc::am_copy(dst, src, sizeBytes); } } else if ((kind == hipMemcpyDeviceToHost) && (dstNotTracked)) { if (useStagingBuffer) { std::lock_guard l (device->_copy_lock[1]); device->_staging_buffer[1]->CopyDeviceToHost(dst, src, sizeBytes); } else { - hc::AM_copy(dst, src, sizeBytes); + // TODO - remove, slow path. + hc::am_copy(dst, src, sizeBytes); } } else if (kind == hipMemcpyHostToHost) { memcpy(dst, src, sizeBytes); // TODO - not async. @@ -1757,6 +1766,36 @@ void ihipAsyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t size #endif +#if 0 // USE_AM_TRACKER +void ihipAsyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) +{ + bool useStagingBuffer = true; // TODO - remove when new copy bakes a bit. 
+ + hipStatus_t e = hipSuccess; + + // TODO - check kind is not default. + if (kind == hipMemcpyDefault) { + e = hipErrorInvalidMemoryDirection; + } else { + // Let HSA runtime handle it: + // TODO - need buffer pool for the signals: + + device->_copy_lock[1].lock(); + + hsa_signal_store_relaxed(device->_copy_signal, 1); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, src, sizeBytes, device->_hsa_agent, 0, NULL, device->_copy_signal); + + if (hsa_status == HSA_STATUS_SUCCESS) { + hsa_signal_wait_relaxed(device->_copy_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + } + + device->_copy_lock[1].unlock(); + + } +} +#endif + + //--- hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) @@ -1775,7 +1814,7 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind ihipDevice_t *device = &g_devices[stream->_device_index]; - ihipAsyncCopy(device, dst, src, sizeBytes, kind); + ihipSyncCopy(device, dst, src, sizeBytes, kind); } else { e = hipErrorInvalidResourceHandle; @@ -1784,7 +1823,7 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind #else // TODO-hsart - what synchronization does hsa_copy provide? - hc::AM_copy(dst, src, sizeBytes); + hc::am_copy(dst, src, sizeBytes); e = hipSuccess; #endif @@ -1815,7 +1854,7 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcp // TODO-hsart This routine needs to ensure that dst and src are mapped on the GPU. // This is a synchronous copy - remove and replace with code below when we have appropriate LOCK APIs. 
- hc::AM_copy(dst, src, sizeBytes); + hc::am_copy(dst, src, sizeBytes); #if 0 @@ -1938,7 +1977,7 @@ hipError_t hipFree(void* ptr) ihipWaitAllStreams(ihipGetTlsDefaultDevice()); if (ptr) { - hc::AM_free(ptr); + AM_FREE(ptr); } return ihipLogStatus(hipSuccess); @@ -1952,7 +1991,7 @@ hipError_t hipFreeHost(void* ptr) if (ptr) { tprintf (TRACE_MEM, " %s: %p\n", __func__, ptr); - hc::AM_free(ptr); + AM_FREE(ptr); } return ihipLogStatus(hipSuccess); From d0b29d9192ae615f16baa2ad2cdbe6f19613c0c8 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 16 Feb 2016 01:58:24 -0600 Subject: [PATCH 22/32] Add comments to tests [ROCm/clr commit: 731a2a58d331490b32457c8ee5ea5e94dbff4db3] --- projects/clr/hipamd/tests/src/hipMemcpy.cpp | 25 +++++++--- .../clr/hipamd/tests/src/hipPointerAttrib.cpp | 48 +++++++++++++++---- 2 files changed, 56 insertions(+), 17 deletions(-) diff --git a/projects/clr/hipamd/tests/src/hipMemcpy.cpp b/projects/clr/hipamd/tests/src/hipMemcpy.cpp index 3502b81e9d..97a372304a 100644 --- a/projects/clr/hipamd/tests/src/hipMemcpy.cpp +++ b/projects/clr/hipamd/tests/src/hipMemcpy.cpp @@ -28,7 +28,9 @@ void printSep() printf ("======================================================================================\n"); } +//--- // Test simple H2D copies and back. +// Designed to stress a small number of simple smoke tests void simpleTest1() { printf ("test: %s\n", __func__); @@ -61,8 +63,16 @@ void simpleTest1() } -// Test many different kinds of memory copies: - +//--- +// Test many different kinds of memory copies. +// THe subroutine allocates memory , copies to device, runs a vector add kernel, copies back, and checks the result. +// +// IN: numElements controls the number of elements used for allocations. +// IN: usePinnedHost : If true, allocate host with hipMallocHost and is pinned ; else allocate host memory with malloc. +// IN: useHostToHost : If true, add an extra host-to-host copy. 
+// IN: useDeviceToDevice : If true, add an extra deviceto-device copy after result is produced. +// IN: useMemkindDefault : If true, use memkinddefault (runtime figures out direction). if false, use explicit memcpy direction. +// template void memcpytest2(size_t numElements, bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) { @@ -134,18 +144,15 @@ void memcpytest2(size_t numElements, bool usePinnedHost, bool useHostToHost, boo } +//--- +//Try all the 16 possible combinations to memcpytest2 - usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault template void memcpytest2_loop(size_t numElements) { printSep(); for (int usePinnedHost =0; usePinnedHost<=1; usePinnedHost++) { -#define USE_HOST_2_HOST -#ifdef USE_HOST_2_HOST for (int useHostToHost =0; useHostToHost<=1; useHostToHost++) { // TODO -#else - for (int useHostToHost =0; useHostToHost<=0; useHostToHost++) { // TODO -#endif for (int useDeviceToDevice =0; useDeviceToDevice<=1; useDeviceToDevice++) { for (int useMemkindDefault =0; useMemkindDefault<=1; useMemkindDefault++) { memcpytest2(numElements, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); @@ -156,6 +163,8 @@ void memcpytest2_loop(size_t numElements) } +//--- +//Try many different sizes to memory copy. template void memcpytest2_sizes(size_t maxElem=0, size_t offset=0) { @@ -184,6 +193,8 @@ void memcpytest2_sizes(size_t maxElem=0, size_t offset=0) } +//--- +//Create multiple threads to stress multi-thread locking behavior in the allocation/deallocation/tracking logic: template void multiThread_1(bool serialize, bool usePinnedHost) { diff --git a/projects/clr/hipamd/tests/src/hipPointerAttrib.cpp b/projects/clr/hipamd/tests/src/hipPointerAttrib.cpp index 1418997274..586b2af5b5 100644 --- a/projects/clr/hipamd/tests/src/hipPointerAttrib.cpp +++ b/projects/clr/hipamd/tests/src/hipPointerAttrib.cpp @@ -27,7 +27,9 @@ THE SOFTWARE. 
#include "test_common.h" #ifdef __HIP_PLATFORM_HCC__ -#include "hcc_detail/AM.h" +//#include "hcc_detail/AM.h" +#include "hc_am.hpp" + #endif size_t Nbytes = 0; @@ -97,8 +99,8 @@ inline int zrand(int max) //================================================================================================= // Functins to run tests //================================================================================================= -// -//Run through a couple simple cases to test lookups and hostd pointer arithmetic: +//-- +//Run through a couple simple cases to test lookups and host pointer arithmetic: void testSimple() { printf ("\n"); @@ -188,7 +190,10 @@ void testSimple() HIPASSERT(e == hipErrorInvalidValue); // OS-allocated pointers should return hipErrorInvalidValue. } - +//--- +//Reset the memory tracker (remove allocations from all known devices): +//This frees any memory allocated through the runtime. +//The routine will not release any void resetTracker () { if (p_verbose & 0x1) { @@ -214,7 +219,8 @@ struct SuperPointerAttribute { }; - +//--- +//Support function to check result against a reference: void checkPointer(SuperPointerAttribute &ref, int major, int minor, void *pointer) { hipPointerAttribute_t attribs; @@ -236,6 +242,12 @@ void checkPointer(SuperPointerAttribute &ref, int major, int minor, void *pointe } +//--- +//Test that allocates memory across all 4 devices within the specified size range (minSize...maxSize). +//Then does lookups to make sure the info reported by the tracker matches expectations. +//Then deallocates it all. +// +//Multiple threads can call this function and in fact we do this in the testMultiThreaded_1 test. 
void clusterAllocs(int numAllocs, size_t minSize, size_t maxSize) { printf (" clusterAllocs numAllocs=%d size=%lu..%lu\n", numAllocs, minSize, maxSize); @@ -313,9 +325,6 @@ void clusterAllocs(int numAllocs, size_t minSize, size_t maxSize) } - - - #ifdef __HIP_PLATFORM_HCC__ if (p_verbose & 0x2) { printf ("Tracker after cleanup:\n"); @@ -325,6 +334,10 @@ void clusterAllocs(int numAllocs, size_t minSize, size_t maxSize) } +//--- +// Multi-threaded test with many simultaneous allocs. +// IN : serialize will force the test to run in serial fashion. +// Seems like this does not hit MT corner cases in the tracker very often - testMultiThreaded_2 below seems more effective. void testMultiThreaded_1(bool serialize=false) { printf ("\n===========================================================================\n"); @@ -356,8 +369,8 @@ void testMultiThreaded_1(bool serialize=false) ///================================================================================================ - -// Add pointers to tracker very quickly. +//--- +//Repeatedly query a single entry: void thread_query(void *ptr, const hipPointerAttribute_t *refAttrib) { int count = 0; @@ -376,6 +389,9 @@ void thread_query(void *ptr, const hipPointerAttribute_t *refAttrib) } +#ifdef __HIP_PLATFORM_HCC__ +//--- +// Add pointers to tracker very quickly, then remove them quickly: enum Dir {Up, Down}; void thread_noise_generator(int iters, size_t numBuffers, Dir addDir, Dir removeDir) { @@ -412,6 +428,13 @@ void thread_noise_generator(int iters, size_t numBuffers, Dir addDir, Dir remove } +//--- +//Multi-thread test that is effective at catching locking errors in the alloc/dealloc/tracker. +//The query thread repeatedly requests information on the same block of memory. +//Meanwhile, the thread_noise_generator registers a large number of blocks, and +//then unregisters them. 
This causes a large amount of rebalancing in the tree +//structure and will generate errors unless the locks in the tracker are preventing reading +//while writing. void testMultiThreaded_2() { std::atomic inflight(2); @@ -445,6 +468,8 @@ void testMultiThreaded_2() hipSetDevice(0); hipDeviceReset(); } +#endif + int main(int argc, char *argv[]) @@ -483,11 +508,14 @@ int main(int argc, char *argv[]) testMultiThreaded_1(false); } + +#ifdef __HIP_PLATFORM_HCC__ if (p_tests & 0x10) { srand(0x400); testMultiThreaded_2(); resetTracker(); } +#endif printf ("\n"); passed(); From 0e83efe14d21da0b69827d9c0b35072a5ece518b Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 16 Feb 2016 01:59:13 -0600 Subject: [PATCH 23/32] Add per-stream pool for hsa_signals. [ROCm/clr commit: 5d721a2649f71014d5beebc1f9c2f504e94d8050] --- .../include/hcc_detail/hip_runtime_api.h | 2 +- projects/clr/hipamd/include/hip_runtime_api.h | 3 + projects/clr/hipamd/src/hip_hcc.cpp | 209 +++++++++++++----- 3 files changed, 154 insertions(+), 60 deletions(-) diff --git a/projects/clr/hipamd/include/hcc_detail/hip_runtime_api.h b/projects/clr/hipamd/include/hcc_detail/hip_runtime_api.h index a0c676987b..5fe398b84c 100644 --- a/projects/clr/hipamd/include/hcc_detail/hip_runtime_api.h +++ b/projects/clr/hipamd/include/hcc_detail/hip_runtime_api.h @@ -115,7 +115,7 @@ enum hipMemcpyKind { // The handle allows the async commands to use the stream even if the parent hipStream_t goes out-of-scope. 
-typedef struct ihipStream_t * hipStream_t; +typedef class ihipStream_t * hipStream_t; /* diff --git a/projects/clr/hipamd/include/hip_runtime_api.h b/projects/clr/hipamd/include/hip_runtime_api.h index 41ad338d6d..de6d175039 100644 --- a/projects/clr/hipamd/include/hip_runtime_api.h +++ b/projects/clr/hipamd/include/hip_runtime_api.h @@ -134,6 +134,7 @@ typedef struct hipPointerAttribute_t { * @enum * @ingroup Enumerations */ +// Developer note - when updating these, update the hipErrorName and hipErrorString functions typedef enum hipError_t { hipSuccess = 0 ///< Successful completion. ,hipErrorMemoryAllocation ///< Memory allocation error. @@ -143,6 +144,8 @@ typedef enum hipError_t { ,hipErrorInvalidValue ///< One or more of the parameters passed to the API call is NULL or not in an acceptable range. ,hipErrorInvalidResourceHandle ///< Resource handle (hipEvent_t or hipStream_t) invalid. ,hipErrorInvalidDevice ///< DeviceID must be in range 0...#compute-devices. + ,hipErrorInvalidMemcpyDirection ///< Invalid memory copy direction + ,hipErrorNoDevice ///< Call to hipGetDeviceCount returned 0 devices ,hipErrorNotReady ///< Indicates that asynchronous operations enqueued earlier are not ready. This is not actually an error, but is used to distinguish from hipSuccess (which indicates completion). APIs that return this error include hipEventQuery and hipStreamQuery. ,hipErrorUnknown ///< Unknown error. 
diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index 30a0d993e0..ff1f39d780 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -67,6 +67,7 @@ int HIP_TRACE_API= 0; int HIP_LAUNCH_BLOCKING = 0; int HIP_STAGING_SIZE = 64; /* size of staging buffers, in KB */ int HIP_STAGING_BUFFERS = 2; +int HIP_STREAM_SIGNALS = 2; /* number of signals to use when stream is created */ #define TRACE_API 0x1 /* trace API calls and return values */ #define TRACE_SYNC 0x2 /* trace synchronization pieces */ @@ -90,18 +91,50 @@ enum ihipCommand_t { ihipCommandData, }; + +// Small wrapper around signals. +// Designed to be used from stream. +struct ihipSignal_t { + hsa_signal_t _hsa_signal; + int _refCnt; + + ihipSignal_t() : _refCnt(0) { + if (hsa_signal_create(1, 0, NULL, &_hsa_signal) != HSA_STATUS_SUCCESS) { + throw; + } + } + + ~ihipSignal_t() { + if (hsa_signal_destroy(_hsa_signal) != HSA_STATUS_SUCCESS) { + throw; + } + // _refCnt should be 0, unless we are shutting down... + _refCnt = 0; + }; +}; + + // Internal stream structure. 
-struct ihipStream_t { +class ihipStream_t { +public: unsigned _device_index; hc::accelerator_view _av; unsigned _flags; ihipCommand_t _last_command; //ihipStream_t() : _av(){ }; - ihipStream_t(unsigned device_index, hc::accelerator_view av, unsigned int flags) : - _device_index(device_index), _av(av), _flags(flags), _last_command(ihipCommandKernel) - {}; -} ; + ihipStream_t(unsigned device_index, hc::accelerator_view av, unsigned int flags); + ~ihipStream_t(); + + inline ihipDevice_t * getDevice() const; + + hsa_signal_t getSignal() ; + void releaseSignal(ihipSignal_t *signal) ; + +private: + int _signalCursor; + std::vector _signalPool; +}; @@ -179,6 +212,91 @@ public: }; +//================================================================================================= +// Global Data Structures: +//================================================================================================= +//TLS - must be initialized here. +thread_local hipError_t tls_lastHipError = hipSuccess; +thread_local int tls_defaultDevice = 0; + +// Global initialization. +std::once_flag hip_initialized; +ihipDevice_t *g_devices; // Array of all non-emulated (ie GPU) accelerators in the system. 
+unsigned g_deviceCnt; +//================================================================================================= + + +//================================================================================================= +// Implementation: +//================================================================================================= + + +//================================================================================================= +// ihipStream_t: +//================================================================================================= +//--- +ihipStream_t::ihipStream_t(unsigned device_index, hc::accelerator_view av, unsigned int flags) : + _device_index(device_index), _av(av), _flags(flags), _last_command(ihipCommandKernel), + _signalCursor(0) +{ + _signalPool.resize(HIP_STREAM_SIGNALS > 0 ? HIP_STREAM_SIGNALS : 1); + +}; + +//--- +ihipStream_t::~ihipStream_t() +{ + _signalPool.clear(); +} + + +//--- +inline ihipDevice_t * ihipStream_t::getDevice() const +{ + return &g_devices[_device_index]; +}; + + +// Allocate a new signal from the signal pool. +// Returned signals are initialized to a value of "1". +hsa_signal_t ihipStream_t::getSignal() +{ + int numToScan = _signalPool.size(); + do { + auto thisCursor = _signalCursor; + if (++_signalCursor > _signalPool.size()) { + _signalCursor = 0; + } + + if (_signalPool[thisCursor]._refCnt == 0) { + _signalPool[thisCursor]._refCnt ++; // allocate it + return _signalPool[thisCursor]._hsa_signal; + } + + numToScan--; + } while (numToScan) ; + + assert(numToScan == 0); + + // Have to grow the pool: + printf ("Grow signal pool\n"); + _signalCursor = _signalPool.size(); // set to the beginning of the new entries: + _signalPool.resize(_signalPool.size() * 2); + return getSignal(); // try again, + + // Shouldnever reach here. 
+ assert(0); +} + + +void ihipStream_t::releaseSignal(ihipSignal_t *signal) +{ + if (--signal->_refCnt <= 0) { + // restore signal to the initial value 1 + hsa_signal_store_release(signal->_hsa_signal, 1); + } +} + //================================================================================================= // //Reset the device - this is called from hipDeviceReset. @@ -235,17 +353,6 @@ ihipDevice_t::~ihipDevice_t() //---- -//================================================================================================= -//TLS - must be initialized here. -thread_local hipError_t tls_lastHipError = hipSuccess; -thread_local int tls_defaultDevice = 0; - -// Global initialization. -std::once_flag hip_initialized; -ihipDevice_t *g_devices; // Array of all non-emulated (ie GPU) accelerators in the system. -unsigned g_deviceCnt; - -//================================================================================================= @@ -524,8 +631,9 @@ void ihipInit() READ_ENV_I(release, HIP_PRINT_ENV, 0, "Print HIP environment variables."); READ_ENV_I(release, HIP_TRACE_API, 0, "Trace each HIP API call. Print function name and return code to stderr as program executes."); READ_ENV_I(release, HIP_LAUNCH_BLOCKING, CUDA_LAUNCH_BLOCKING, "Make HIP APIs 'host-synchronous', so they block until any kernel launches or data copy commands complete. Alias: CUDA_LAUNCH_BLOCKING." ); - READ_ENV_I(release, HIP_STAGING_SIZE, 0, "Size of each staging buffer (in KB)." ); - READ_ENV_I(release, HIP_STAGING_BUFFERS, 0, "Number of staging buffers to use in each direction."); + READ_ENV_I(release, HIP_STAGING_SIZE, 0, "Size of each staging buffer (in KB)" ); + READ_ENV_I(release, HIP_STAGING_BUFFERS, 0, "Number of staging buffers to use in each direction"); + READ_ENV_I(release, HIP_STREAM_SIGNALS, 0, "Number of signals to use when creating a new stream (pool can later grow)"); /* * Build a table of valid compute devices. 
@@ -1012,6 +1120,7 @@ const char *hipGetErrorName(hipError_t hip_error) case hipErrorInvalidValue : return "hipErrorInvalidValue"; case hipErrorInvalidResourceHandle : return "hipErrorInvalidResourceHandle"; case hipErrorInvalidDevice : return "hipErrorInvalidDevice"; + case hipErrorInvalidMemcpyDirection : return "hipErrorInvalidMemcpyDirection"; case hipErrorNoDevice : return "hipErrorNoDevice"; case hipErrorNotReady : return "hipErrorNotReady"; case hipErrorUnknown : return "hipErrorUnknown"; @@ -1744,7 +1853,7 @@ void ihipSyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t sizeB hc::am_copy(dst, src, sizeBytes); } } else if (kind == hipMemcpyHostToHost) { - memcpy(dst, src, sizeBytes); // TODO - not async. + memcpy(dst, src, sizeBytes); } else { // Let HSA runtime handle it: @@ -1766,37 +1875,6 @@ void ihipSyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t sizeB #endif -#if 0 // USE_AM_TRACKER -void ihipAsyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) -{ - bool useStagingBuffer = true; // TODO - remove when new copy bakes a bit. - - hipStatus_t e = hipSuccess; - - // TODO - check kind is not default. 
- if (kind == hipMemcpyDefault) { - e = hipErrorInvalidMemoryDirection; - } else { - // Let HSA runtime handle it: - // TODO - need buffer pool for the signals: - - device->_copy_lock[1].lock(); - - hsa_signal_store_relaxed(device->_copy_signal, 1); - hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, src, sizeBytes, device->_hsa_agent, 0, NULL, device->_copy_signal); - - if (hsa_status == HSA_STATUS_SUCCESS) { - hsa_signal_wait_relaxed(device->_copy_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - } - - device->_copy_lock[1].unlock(); - - } -} -#endif - - - //--- hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) { @@ -1822,13 +1900,10 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind #else - // TODO-hsart - what synchronization does hsa_copy provide? hc::am_copy(dst, src, sizeBytes); e = hipSuccess; #endif - // TODO - when am_copy becomes async, and we have HIP_LAUNCH_BLOCKING set, then we would wait for copy operation to complete here. - return ihipLogStatus(e); } @@ -1856,20 +1931,34 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcp // This is a synchronous copy - remove and replace with code below when we have appropriate LOCK APIs. 
hc::am_copy(dst, src, sizeBytes); -#if 0 - - hipStream_t s =ihipGetStream(stream); +#if USE_ASYNC_COPY + hipStream_t s = ihipSyncAndResolveStream(stream); if (s) { - hc::completion_future cf = ihipMemcpyKernel (s, static_cast (dst), static_cast (src), sizeBytes); + ihipDevice_t *device = s->getDevice(); - //cf.wait(); + if (kind == hipMemcpyDefault) { + e = hipErrorInvalidMemcpyDirection; + } else { + // Let HSA runtime handle it: + // TODO - need buffer pool for the signals rather than lock: + device->_copy_lock[1].lock(); - e = hipSuccess; + hsa_signal_store_relaxed(device->_copy_signal, 1); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, src, sizeBytes, device->_hsa_agent, 0, NULL, device->_copy_signal); + + if (hsa_status == HSA_STATUS_SUCCESS) { + hsa_signal_wait_relaxed(device->_copy_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + } + + device->_copy_lock[1].unlock(); + + } } else { e = hipErrorInvalidValue; } + #endif // TODO - if am_copy becomes async, and we have HIP_LAUNCH_BLOCKING set, then we would wait for copy operation to complete here. @@ -2113,3 +2202,5 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **a hipError_t err = hipSuccess; return ihipLogStatus(err); } + +// TODO - review signal / error reporting code. 
From 6856d28ca3b95e9cc16ef7f1c8ab01bc789085e1 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Wed, 17 Feb 2016 00:59:12 -0600 Subject: [PATCH 24/32] more work on async copies [ROCm/clr commit: 0cdbe1ff0591b3cc10bdb34500936496bc6dca74] --- projects/clr/hipamd/bin/hipcc | 2 +- projects/clr/hipamd/src/hip_hcc.cpp | 70 +++++--- projects/clr/hipamd/tests/src/CMakeLists.txt | 2 + .../clr/hipamd/tests/src/hipMemcpyAsync.cpp | 149 ++++++++++++++++++ 4 files changed, 198 insertions(+), 25 deletions(-) create mode 100644 projects/clr/hipamd/tests/src/hipMemcpyAsync.cpp diff --git a/projects/clr/hipamd/bin/hipcc b/projects/clr/hipamd/bin/hipcc index 7537750ff6..1ab4cf2759 100755 --- a/projects/clr/hipamd/bin/hipcc +++ b/projects/clr/hipamd/bin/hipcc @@ -164,7 +164,7 @@ if ($needHipHcc) { if ((not -e $object) or ((stat($source))[9] > (stat($object))[9])) { my $CMD = "$HCC $HCCFLAGS -I$HSA_PATH/include -I$HIP_PATH/include -Wall -c $source -o $object"; if ($verbose & 0x10) { - $CMD .= " -g" ; + $CMD .= " -g -O2" ; } else { $CMD .= " -O3" ; } diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index ff1f39d780..ae28947ef3 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -62,12 +62,13 @@ THE SOFTWARE. 
//static const int debug = 0; static const int release = 1; +int HIP_LAUNCH_BLOCKING = 0; + int HIP_PRINT_ENV = 0; int HIP_TRACE_API= 0; -int HIP_LAUNCH_BLOCKING = 0; int HIP_STAGING_SIZE = 64; /* size of staging buffers, in KB */ int HIP_STAGING_BUFFERS = 2; -int HIP_STREAM_SIGNALS = 2; /* number of signals to use when stream is created */ +int HIP_STREAM_SIGNALS = 2; /* number of signals to allocate at stream creation */ #define TRACE_API 0x1 /* trace API calls and return values */ #define TRACE_SYNC 0x2 /* trace synchronization pieces */ @@ -128,7 +129,7 @@ public: inline ihipDevice_t * getDevice() const; - hsa_signal_t getSignal() ; + ihipSignal_t * getSignal() ; void releaseSignal(ihipSignal_t *signal) ; private: @@ -241,6 +242,13 @@ ihipStream_t::ihipStream_t(unsigned device_index, hc::accelerator_view av, unsig { _signalPool.resize(HIP_STREAM_SIGNALS > 0 ? HIP_STREAM_SIGNALS : 1); + auto s = this; + + std::for_each(_signalPool.begin(), _signalPool.end(), + [s](ihipSignal_t &iter) { + printf (" stream:%p allocated hsa_signal=%p\n", s, (iter._hsa_signal)); + }); + }; //--- @@ -259,18 +267,18 @@ inline ihipDevice_t * ihipStream_t::getDevice() const // Allocate a new signal from the signal pool. // Returned signals are initialized to a value of "1". 
-hsa_signal_t ihipStream_t::getSignal() +ihipSignal_t *ihipStream_t::getSignal() { int numToScan = _signalPool.size(); do { auto thisCursor = _signalCursor; - if (++_signalCursor > _signalPool.size()) { + if (++_signalCursor == _signalPool.size()) { _signalCursor = 0; } if (_signalPool[thisCursor]._refCnt == 0) { _signalPool[thisCursor]._refCnt ++; // allocate it - return _signalPool[thisCursor]._hsa_signal; + return &_signalPool[thisCursor]; } numToScan--; @@ -336,6 +344,7 @@ void ihipDevice_t::init(unsigned device_index, hc::accelerator acc) this->reset(); }; + ihipDevice_t::~ihipDevice_t() { if (_null_stream) { @@ -628,12 +637,14 @@ void ihipInit() /* * Environment variables */ - READ_ENV_I(release, HIP_PRINT_ENV, 0, "Print HIP environment variables."); - READ_ENV_I(release, HIP_TRACE_API, 0, "Trace each HIP API call. Print function name and return code to stderr as program executes."); + READ_ENV_I(release, HIP_PRINT_ENV, 0, "Print HIP environment variables."); + //-- READ HIP_PRINT_ENV env first, since it has impact on later env var reading + READ_ENV_I(release, HIP_LAUNCH_BLOCKING, CUDA_LAUNCH_BLOCKING, "Make HIP APIs 'host-synchronous', so they block until any kernel launches or data copy commands complete. Alias: CUDA_LAUNCH_BLOCKING." ); + READ_ENV_I(release, HIP_TRACE_API, 0, "Trace each HIP API call. Print function name and return code to stderr as program executes."); READ_ENV_I(release, HIP_STAGING_SIZE, 0, "Size of each staging buffer (in KB)" ); READ_ENV_I(release, HIP_STAGING_BUFFERS, 0, "Number of staging buffers to use in each direction"); - READ_ENV_I(release, HIP_STREAM_SIGNALS, 0, "Number of signals to use when creating a new stream (pool can later grow)"); + READ_ENV_I(release, HIP_STREAM_SIGNALS, 0, "Number of signals to allocate when new stream is created (signal pool will grow on demand)"); /* * Build a table of valid compute devices. 
@@ -791,7 +802,10 @@ inline bool ihipCheckCommandSwitchSync(hipStream_t stream, ihipCommand_t new_com addedSync = true; *marker = stream->_av.create_marker(); - tprintf (TRACE_SYNC, "stream %p switch to %s (barrier pkt inserted)\n", (void*)stream, new_command == ihipCommandKernel ? "Kernel" : "Data"); + tprintf (TRACE_SYNC, "stream %p switch %s to %s (barrier pkt inserted)\n", + (void*)stream, + stream->_last_command == ihipCommandKernel ? "Kernel" : "Data", + new_command == ihipCommandKernel ? "Kernel" : "Data"); stream->_last_command = new_command; } @@ -1908,10 +1922,12 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind } -//--- -/* +#if USE_ASYNC_COPY==0 +/** * @warning on HCC hipMemcpyAsync uses a synchronous copy. */ +#endif +//--- hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream) { std::call_once(hip_initialized, ihipInit); @@ -1927,9 +1943,6 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcp // Async - need to set up dependency on the last command queued to the device? - // TODO-hsart This routine needs to ensure that dst and src are mapped on the GPU. - // This is a synchronous copy - remove and replace with code below when we have appropriate LOCK APIs. 
- hc::am_copy(dst, src, sizeBytes); #if USE_ASYNC_COPY @@ -1943,25 +1956,33 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcp } else { // Let HSA runtime handle it: // TODO - need buffer pool for the signals rather than lock: - device->_copy_lock[1].lock(); + ihipSignal_t *ihip_signal = stream->getSignal(); - hsa_signal_store_relaxed(device->_copy_signal, 1); - hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, src, sizeBytes, device->_hsa_agent, 0, NULL, device->_copy_signal); + //stream->saveLastSignal(ihipSignal); + + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, src, sizeBytes, device->_hsa_agent, 0, NULL, ihip_signal->_hsa_signal); if (hsa_status == HSA_STATUS_SUCCESS) { - hsa_signal_wait_relaxed(device->_copy_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + + if (HIP_LAUNCH_BLOCKING) { + hsa_signal_wait_relaxed(ihip_signal->_hsa_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + stream->releaseSignal(ihip_signal); + } + } else { + // This path can be hit if src or dst point to unpinned host memory. + // TODO - does async-copy fall back to sync if input pointers are not pinned? + e = hipErrorInvalidValue; } - - device->_copy_lock[1].unlock(); - } } else { e = hipErrorInvalidValue; } - +#else + // TODO-hsart This routine needs to ensure that dst and src are mapped on the GPU. + // This is a synchronous copy - remove and replace with code below when we have appropriate LOCK APIs. + hc::am_copy(dst, src, sizeBytes); #endif - // TODO - if am_copy becomes async, and we have HIP_LAUNCH_BLOCKING set, then we would wait for copy operation to complete here. return ihipLogStatus(e); } @@ -2015,6 +2036,7 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s hipError_t hipMemset(void* dst, int value, size_t sizeBytes ) { + // TODO - call an ihip memset so HIP_TRACE is correct. 
return hipMemsetAsync(dst, value, sizeBytes, hipStreamNull); } diff --git a/projects/clr/hipamd/tests/src/CMakeLists.txt b/projects/clr/hipamd/tests/src/CMakeLists.txt index 7e4736a99a..ec0b15ad62 100644 --- a/projects/clr/hipamd/tests/src/CMakeLists.txt +++ b/projects/clr/hipamd/tests/src/CMakeLists.txt @@ -104,6 +104,7 @@ make_hip_executable (hip_brev hip_brev.cpp) make_hip_executable (hip_ffs hip_ffs.cpp) make_hip_executable (hipGetDeviceAttribute hipGetDeviceAttribute.cpp) make_hip_executable (hipMemcpy hipMemcpy.cpp) +make_hip_executable (hipMemcpyAsync hipMemcpyAsync.cpp) make_hip_executable (hipMemset hipMemset.cpp) make_hip_executable (hipEventRecord hipEventRecord.cpp) make_hip_executable (hipLanguageExtensions hipLanguageExtensions.cpp) @@ -131,6 +132,7 @@ make_test(hipGridLaunch " " ) make_test(hipPointerAttrib " " ) make_test(hipMemcpy " " ) +make_test(hipMemcpyAsync " " ) make_test(hipHcc " " ) diff --git a/projects/clr/hipamd/tests/src/hipMemcpyAsync.cpp b/projects/clr/hipamd/tests/src/hipMemcpyAsync.cpp new file mode 100644 index 0000000000..e2968af2f2 --- /dev/null +++ b/projects/clr/hipamd/tests/src/hipMemcpyAsync.cpp @@ -0,0 +1,149 @@ +// Test under-development. Calls async mem-copy API, experiment with functionality. + +#include "hip_runtime.h" +#include "test_common.h" + +unsigned p_streams = 2; + + +void simpleNegTest() +{ + printf ("testing: %s\n",__func__); + hipError_t e; + float *A_malloc, *A_pinned, *A_d; + + size_t Nbytes = N*sizeof(float); + A_malloc = (float*)malloc(Nbytes); + HIPCHECK(hipMallocHost(&A_pinned, Nbytes)); + HIPCHECK(hipMalloc(&A_d, Nbytes)); + + + // Can't use default with async copy + e = hipMemcpyAsync(A_pinned, A_d, Nbytes, hipMemcpyDefault, NULL); + HIPASSERT (e==hipErrorInvalidMemcpyDirection); + + + // Not sure what happens here, the memory must be pinned. 
+ e = hipMemcpyAsync(A_malloc, A_d, Nbytes, hipMemcpyHostToDevice, NULL); + HIPASSERT (e==hipErrorInvalidValue); + + +} + +//--- +//Classic example showing how to overlap data transfer with compute. +//We divide the work into "chunks" and create a stream for each chunk. +//Each chunk then runs a H2D copy, followed by kernel execution, followed by D2H copyback. +//Work in separate streams is independent which enables concurrency. + +// IN: nStreams : number of streams to use for the test +// IN :useNullStream - use NULL stream. Synchronizes everything. +// IN: useSyncMemcpyH2D - use sync memcpy (no overlap) for H2D +// IN: useSyncMemcpyD2H - use sync memcpy (no overlap) for D2H +void chunkedAsyncExample(int nStreams, bool useNullStream, bool useSyncMemcpyH2D, bool useSyncMemcpyD2H) +{ + + size_t Nbytes = N*sizeof(int); + printf ("testing: %s(useNullStream=%d, useSyncMemcpyH2D=%d, useSyncMemcpyD2H=%d) ",__func__, useNullStream, useSyncMemcpyH2D, useSyncMemcpyD2H); + printf ("Nbytes=%zu (%6.1f MB)\n", Nbytes, (double)(Nbytes)/1024.0/1024.0); + + int *A_d, *B_d, *C_d; + int *A_h, *B_h, *C_h; + + HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, true); + + + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + + + hipStream_t *stream = (hipStream_t*)malloc(sizeof(hipStream_t) * nStreams); + if (useNullStream) { + nStreams = 1; + stream[0] = NULL; + } else { + for (int i = 0; i < nStreams; ++i) { + HIPCHECK (hipStreamCreate(&stream[i])); + } + } + + + size_t workLeft = N; + size_t workPerStream = N / nStreams; + for (int i = 0; i < nStreams; ++i) { + size_t work = (workLeft < workPerStream) ? 
workLeft : workPerStream; + size_t workBytes = work * sizeof(int); + + size_t offset = i*workPerStream; + + if (useSyncMemcpyH2D) { + HIPCHECK ( hipMemcpy(&A_d[offset], &A_h[offset], workBytes, hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(&B_d[offset], &B_h[offset], workBytes, hipMemcpyHostToDevice)); + } else { + HIPCHECK ( hipMemcpyAsync(&A_d[offset], &A_h[offset], workBytes, hipMemcpyHostToDevice, stream[i])); + HIPCHECK ( hipMemcpyAsync(&B_d[offset], &B_h[offset], workBytes, hipMemcpyHostToDevice, stream[i])); + }; + + hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, stream[i], &A_d[offset], &B_d[offset], &C_d[offset], work); + + if (useSyncMemcpyD2H) { + HIPCHECK ( hipMemcpy(&C_h[offset], &C_d[offset], workBytes, hipMemcpyDeviceToHost)); + } else { + HIPCHECK ( hipMemcpyAsync(&C_h[offset], &C_d[offset], workBytes, hipMemcpyDeviceToHost, stream[i])); + } + } + + + HIPCHECK (hipDeviceSynchronize()); + + + HipTest::checkVectorADD(A_h, B_h, C_h, N); + + HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, true); +}; + + +//--- +//Parse arguments specific to this test. 
+void parseMyArguments(int argc, char *argv[]) +{ + int more_argc = HipTest::parseStandardArguments(argc, argv, false); + + // parse args for this test: + for (int i = 1; i < more_argc; i++) { + const char *arg = argv[i]; + + if (!strcmp(arg, "--streams")) { + if (++i >= argc || !HipTest::parseUInt(argv[i], &p_streams)) { + failed("Bad streams argument"); + } + } else { + failed("Bad argument '%s'", arg); + } + }; +}; + + + + +int main(int argc, char *argv[]) +{ + HipTest::parseStandardArguments(argc, argv, true); + parseMyArguments(argc, argv); + + + printf ("info: set device to %d\n", p_gpuDevice); + HIPCHECK(hipSetDevice(p_gpuDevice)); + + simpleNegTest(); + + + chunkedAsyncExample(p_streams, true, true, true); // Easy sync version + chunkedAsyncExample(p_streams, false, true, true); // Easy sync version + chunkedAsyncExample(p_streams, false, false, true); // Some async + chunkedAsyncExample(p_streams, false, false, false); // All async + + + + passed(); + +} From 527d64a2da8ef59e058a142fa37a9848c58bcc6e Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Wed, 17 Feb 2016 21:22:07 -0600 Subject: [PATCH 25/32] Support HSA_PATH env, async path tweak [ROCm/clr commit: 9a82d316c353f68a786b68132296a3b6b3f7c43a] --- projects/clr/hipamd/tests/src/CMakeLists.txt | 5 ++++- projects/clr/hipamd/tests/src/hipMemcpyAsync.cpp | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/projects/clr/hipamd/tests/src/CMakeLists.txt b/projects/clr/hipamd/tests/src/CMakeLists.txt index ec0b15ad62..09c0ca7162 100644 --- a/projects/clr/hipamd/tests/src/CMakeLists.txt +++ b/projects/clr/hipamd/tests/src/CMakeLists.txt @@ -19,7 +19,10 @@ MESSAGE ("HIP_PATH=" ${HIP_PATH}) if (${HIP_PLATFORM} STREQUAL "hcc") MESSAGE ("HIP_PLATFORM=hcc") - set (HSA_PATH /opt/hsa) + set (HSA_PATH $ENV{HSA_PATH}) + if (NOT DEFINED HSA_PATH) + set (HSA_PATH /opt/hsa) + endif() #--- # Add HSA library: diff --git a/projects/clr/hipamd/tests/src/hipMemcpyAsync.cpp 
b/projects/clr/hipamd/tests/src/hipMemcpyAsync.cpp index e2968af2f2..6192940270 100644 --- a/projects/clr/hipamd/tests/src/hipMemcpyAsync.cpp +++ b/projects/clr/hipamd/tests/src/hipMemcpyAsync.cpp @@ -25,9 +25,9 @@ void simpleNegTest() // Not sure what happens here, the memory must be pinned. e = hipMemcpyAsync(A_malloc, A_d, Nbytes, hipMemcpyHostToDevice, NULL); - HIPASSERT (e==hipErrorInvalidValue); - + printf (" async memcpy of A_malloc to A_d. Result=%d\n", e); + //HIPASSERT (e==hipErrorInvalidValue); } //--- From f8d9017103b0ee8fdcfa193249f232139929de28 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Wed, 17 Feb 2016 21:22:31 -0600 Subject: [PATCH 26/32] Tweak full formatting [ROCm/clr commit: d75279c8ebe462e73f8c91d52243a7d2006dd0c1] --- projects/clr/hipamd/bin/hipconfig | 3 +++ 1 file changed, 3 insertions(+) diff --git a/projects/clr/hipamd/bin/hipconfig b/projects/clr/hipamd/bin/hipconfig index db53d6014e..1687983330 100755 --- a/projects/clr/hipamd/bin/hipconfig +++ b/projects/clr/hipamd/bin/hipconfig @@ -82,6 +82,7 @@ if ($p_full) { system("$HCC_HOME/bin/hcc-config --cxxflags"); print ("HCC-ldflags : "); system("$HCC_HOME/bin/hcc-config --ldflags"); + printf("\n"); } if ($HIP_PLATFORM eq "nvcc") { print "\n" ; @@ -98,6 +99,8 @@ if ($p_full) { print "\n" ; print "== Linux Kernel\n"; system ("uname -a"); + + print "\n" ; $printed = 1; } From 98e608a5ce8da336a8391ac472ad1cefa19b5d12 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Wed, 17 Feb 2016 21:23:36 -0600 Subject: [PATCH 27/32] USE_AM_TRACKER=0 works [ROCm/clr commit: 354c9f945aa1cd7e846602bbf57b513ee3aafe52] --- projects/clr/hipamd/src/hip_hcc.cpp | 42 ++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index ae28947ef3..83083022c2 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -41,8 +41,13 @@ THE SOFTWARE. 
-#define USE_ASYNC_COPY 1 -#define USE_AM_TRACKER 2 /* >0 = use new AM memory tracker features. 1= use HIP impl, 2=use HCC impl */ +#define USE_AM_TRACKER 0 /* >0 = use new AM memory tracker features. 2=use HCC impl */ +#define USE_ROCR_V2 0 + +#if ((USE_AM_TRACKER!=0) && (USE_AM_TRACKER!=2)) +#error (USE_AM_TRACKER must be 0 or 2) +#endif + #if USE_AM_TRACKER==1 #include "hc_AM.cpp" @@ -244,10 +249,12 @@ ihipStream_t::ihipStream_t(unsigned device_index, hc::accelerator_view av, unsig auto s = this; +#if 0 std::for_each(_signalPool.begin(), _signalPool.end(), [s](ihipSignal_t &iter) { - printf (" stream:%p allocated hsa_signal=%p\n", s, (iter._hsa_signal)); + printf (" stream:%p allocated hsa_signal=%lu\n", s, (iter._hsa_signal.handle)); }); +#endif }; @@ -1640,7 +1647,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) if (sizeBytes && (*ptr == NULL)) { hip_status = hipErrorMemoryAllocation; } else { -#ifdef USE_AM_TRACKER +#if USE_AM_TRACKER hc::am_memtracker_update(*ptr, device->_device_index, 0); #endif } @@ -1666,7 +1673,7 @@ hipError_t hipMallocHost(void** ptr, size_t sizeBytes) if (sizeBytes && (*ptr == NULL)) { hip_status = hipErrorMemoryAllocation; } else { -#ifdef USE_AM_TRACKER +#if USE_AM_TRACKER hc::am_memtracker_update(*ptr, device->_device_index, 0); #endif } @@ -1752,10 +1759,15 @@ void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeByte // TODO - use uncached memcpy, someday. 
memcpy(_pinnedStagingBuffer[bufferIndex], srcp, theseBytes); - tprintf (TRACE_COPY2, "async_copy %zu bytes %p to %p\n", theseBytes, _pinnedStagingBuffer[bufferIndex], dstp); hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); + +#if USE_ROCR_V2 + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _device->_hsa_agent, _pinnedStagingBuffer[bufferIndex], _device->_hsa_agent, theseBytes, 0, NULL, _completion_signal[bufferIndex]); +#else hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _pinnedStagingBuffer[bufferIndex], theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[bufferIndex]); +#endif + tprintf (TRACE_COPY2, "async_copy %zu bytes %p to %p status=%x\n", theseBytes, _pinnedStagingBuffer[bufferIndex], dstp, hsa_status); assert(hsa_status == HSA_STATUS_SUCCESS); // TODO - throw @@ -1795,7 +1807,11 @@ void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeByte tprintf (TRACE_COPY2, "D2H: async_copy %zu bytes src:%p to staging:%p\n", theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]); hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); +#if USE_ROCR_V2 + hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _device->_hsa_agent, srcp0, _device->_hsa_agent, theseBytes, 0, NULL, _completion_signal[bufferIndex]); +#else hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], srcp0, theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[bufferIndex]); +#endif assert(hsa_status == HSA_STATUS_SUCCESS); // TODO - throw srcp0 += theseBytes; @@ -1876,7 +1892,11 @@ void ihipSyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t sizeB device->_copy_lock[1].lock(); hsa_signal_store_relaxed(device->_copy_signal, 1); +#if USE_ROCR_V2 + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, device->_hsa_agent, src, device->_hsa_agent, sizeBytes, 0, NULL, device->_copy_signal); +#else hsa_status_t hsa_status = 
hsa_amd_memory_async_copy(dst, src, sizeBytes, device->_hsa_agent, 0, NULL, device->_copy_signal); +#endif if (hsa_status == HSA_STATUS_SUCCESS) { hsa_signal_wait_relaxed(device->_copy_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); @@ -1901,7 +1921,7 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind hipError_t e = hipSuccess; -#if USE_ASYNC_COPY +#if USE_AM_TRACKER if (ihipIsValidDevice(stream->_device_index)) { ihipDevice_t *device = &g_devices[stream->_device_index]; @@ -1922,7 +1942,7 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind } -#if USE_ASYNC_COPY==0 +#if USE_AM_TRACKER==0 /** * @warning on HCC hipMemcpyAsync uses a synchronous copy. */ @@ -1944,7 +1964,7 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcp // Async - need to set up dependency on the last command queued to the device? -#if USE_ASYNC_COPY +#if USE_AM_TRACKER hipStream_t s = ihipSyncAndResolveStream(stream); @@ -1960,7 +1980,11 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcp //stream->saveLastSignal(ihipSignal); +#if USE_ROCR_V2 + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, device->_hsa_agent, src, device->_hsa_agent, sizeBytes, 0, NULL, ihip_signal->_hsa_signal); +#else hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, src, sizeBytes, device->_hsa_agent, 0, NULL, ihip_signal->_hsa_signal); +#endif if (hsa_status == HSA_STATUS_SUCCESS) { From 8ed32daefab2725286b591c118b474eba54ca056 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Wed, 17 Feb 2016 21:33:32 -0600 Subject: [PATCH 28/32] Remove HIP-local AM tracker (now in HCC) [ROCm/clr commit: b08e468c0661df5d2076d15fce654a222025da91] --- projects/clr/hipamd/include/hcc_detail/AM.h | 157 ---------- projects/clr/hipamd/src/hc_AM.cpp | 319 -------------------- projects/clr/hipamd/src/hip_hcc.cpp | 28 +- 3 files changed, 10 insertions(+), 494 deletions(-) delete mode 
100644 projects/clr/hipamd/include/hcc_detail/AM.h delete mode 100644 projects/clr/hipamd/src/hc_AM.cpp diff --git a/projects/clr/hipamd/include/hcc_detail/AM.h b/projects/clr/hipamd/include/hcc_detail/AM.h deleted file mode 100644 index 74542789af..0000000000 --- a/projects/clr/hipamd/include/hcc_detail/AM.h +++ /dev/null @@ -1,157 +0,0 @@ -#pragma once - -#include - -typedef int am_status_t; -#define AM_SUCCESS 0 -// TODO - provide better mapping of HSA error conditions to HC error codes. -#define AM_ERROR_MISC -1 /** Misellaneous error */ - -// Flags for am_alloc API: -#define amHostPinned 0x1 - - -namespace hc { - -// Info for each pointer in the memtry tracker: -struct AmPointerInfo { - void * _hostPointer; ///< Host pointer. If host access is not allowed, NULL. - void * _devicePointer; ///< Device pointer. - size_t _sizeBytes; ///< Size of allocation. - hc::accelerator _acc; ///< Device / Accelerator to use. - bool _isInDeviceMem; ///< Memory is physically resident on a device (if false, memory is located on host) - bool _isAmManaged; ///< Memory was allocated by AM and should be freed when am_reset is called. - - int _appId; ///< App-specific storage. (Used by HIP to store deviceID.) - unsigned _appAllocationFlags; ///< App-specific allocation flags. (Used by HIP to store allocation flags.) - - AmPointerInfo() {}; - - AmPointerInfo(void *hostPointer, void *devicePointer, size_t sizeBytes, hc::accelerator acc, bool isInDeviceMem, bool isAmManaged) : - _hostPointer(hostPointer), - _devicePointer(devicePointer), - _sizeBytes(sizeBytes), - _acc(acc), - _isInDeviceMem(isInDeviceMem), - _isAmManaged(isAmManaged), - _appId(-1), - _appAllocationFlags(0) {}; -}; -} - - - -namespace hc { - - -/** - * Allocate a block of @p size bytes of memory on the specified @p acc. - * - * The contents of the newly allocated block of memory are not initialized. - * - * If @p size == 0, 0 is returned. - * - * Flags must be 0. 
- * - * @return : On success, pointer to the newly allocated memory is returned. - * The pointer is typecast to the desired return type. - * - * If an error occurred trying to allocate the requested memory, 0 is returned. - * - * @see am_free, am_copy - */ -auto_voidp AM_alloc(size_t size, hc::accelerator acc, unsigned flags); - -/** - * Free a block of memory previously allocated with am_alloc. - * - * @return AM_SUCCESS - * @see am_alloc, am_copy - */ -am_status_t AM_free(void* ptr); - - -/** - * Copy @p size bytes of memory from @p src to @ dst. The memory areas (src+size and dst+size) must not overlap. - * - * @return AM_SUCCESS on error or AM_ERROR_MISC if an error occurs. - * @see am_alloc, am_free - */ -am_status_t AM_copy(void* dst, const void* src, size_t size); - - - -/** - * Return information about tracked pointer. - * - * AM tracks pointers when they are allocated or added to tracker with am_track_pointer. - * The tracker tracks the base pointer as well as the size of the allocation, and will - * find the information for a pointer anywhere in the tracked range. - * - * @returns AM_ERROR_MISC if pointer is not currently being tracked. - * @returns AM_SUCCESS if pointer is tracked and writes info to @p info. - * - * @see AM_memtracker_add, - */ -am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, const void *ptr); - - -/** - * Add a pointer to the memory tracker. - * - * @return AM_SUCCESS - * @see am_memtracker_getinfo - */ -am_status_t am_memtracker_add(void* ptr, size_t sizeBytes, hc::accelerator acc, bool isDeviceMem=false); - - -/* - * Update info for an existing pointer in the memory tracker. - * - * @returns AM_ERROR_MISC if pointer is not found in tracker. - * @returns AM_SUCCESS if pointer is not found in tracker. - * - * @see am_memtracker_getinfo, am_memtracker_add - */ -am_status_t am_memtracker_update(const void* ptr, int appId, unsigned allocationFlags); - - -/** - * Remove @ptr from the tracker structure. 
- * - * @p ptr may be anywhere in a tracked memory range. - * - * @returns AM_ERROR_MISC if pointer is not found in tracker. - * @returns AM_SUCCESS if pointer is not found in tracker. - * - * @see am_memtracker_getinfo, am_memtracker_add - */ -am_status_t am_memtracker_remove(void* ptr); - -/** - * Remove all memory allocations associated with specified accelerator from the memory tracker. - * - * @returns Number of entries reset. - * @see am_memtracker_getinfo - */ -size_t am_memtracker_reset(hc::accelerator acc); - -/** - * Print the entries in the memory tracker table. - * - * Intended primarily for debug purposes. - * @see am_memtracker_getinfo - **/ -void am_memtracker_print(); - - -/** - * Return total sizes of device, host, and user memory allocated by the application - * - * User memory is registered with am_tracker_add. - **/ -void am_memtracker_sizeinfo(hc::accelerator acc, size_t *deviceMemSize, size_t *hostMemSize, size_t *userMemSize); - - -}; // namespace hc - diff --git a/projects/clr/hipamd/src/hc_AM.cpp b/projects/clr/hipamd/src/hc_AM.cpp deleted file mode 100644 index 272024cfe7..0000000000 --- a/projects/clr/hipamd/src/hc_AM.cpp +++ /dev/null @@ -1,319 +0,0 @@ - -#include "hc_am.hpp" -#include "hsa.h" - - -#include "hcc_detail/AM.h" // TODO - Remove me. - -#define DB_TRACKER 0 -#define MUTEX_LOCK 1 - -#if DB_TRACKER -#define mprintf( ...) {\ - fprintf (stderr, __VA_ARGS__);\ - }; -#else -#define mprintf( ...) 
-#endif - -//========================================================================================================= -// Pointer Tracker Structures: -//========================================================================================================= -#include -#include -//#include - -struct AmMemoryRange { - const void * _basePointer; - const void * _endPointer; - AmMemoryRange(const void *basePointer, size_t sizeBytes) : - _basePointer(basePointer), _endPointer((const unsigned char*)basePointer + sizeBytes - 1) {}; -}; - -// Functor to compare ranges: -struct AmMemoryRangeCompare { - // Return true is LHS range is less than RHS - used to order the - bool operator()(const AmMemoryRange &lhs, const AmMemoryRange &rhs) const - { - return lhs._endPointer < rhs._basePointer; - } - -}; - - -std::ostream &operator<<(std::ostream &os, const hc::AmPointerInfo &ap) -{ - os << "hostPointer:" << ap._hostPointer << " devicePointer:"<< ap._devicePointer << " sizeBytes:" << ap._sizeBytes - << " isInDeviceMem:" << ap._isInDeviceMem << " isAmManaged:" << ap._isAmManaged - << " appId:" << ap._appId << " appAllocFlags:" << ap._appAllocationFlags; - return os; -} - - -//------------------------------------------------------------------------------------------------- -// This structure tracks information for each pointer. -// Uses memory-range-based lookups - so pointers that exist anywhere in the range of hostPtr + size -// will find the associated AmPointerInfo. -// The insertions and lookups use a self-balancing binary tree and should support O(logN) lookup speed. -// The structure is thread-safe - writers obtain a mutex before modifying the tree. Multiple simulatenous readers are supported. 
-class AmPointerTracker { -typedef std::map MapTrackerType; -public: - - void insert(void *pointer, const hc::AmPointerInfo &p); - int remove(void *pointer); - - MapTrackerType::iterator find(const void *hostPtr) ; - - MapTrackerType::iterator readerLockBegin() { _mutex.lock(); return _tracker.begin(); } ; - MapTrackerType::iterator end() { return _tracker.end(); } ; - void readerUnlock() { _mutex.unlock(); }; - - - size_t reset (hc::accelerator acc); - -private: - MapTrackerType _tracker; - std::mutex _mutex; - //std::shared_timed_mutex _mut; -}; - - -//--- -void AmPointerTracker::insert (void *pointer, const hc::AmPointerInfo &p) -{ - std::lock_guard l (_mutex); - - mprintf ("insert: %p + %zu\n", pointer, p._sizeBytes); - _tracker.insert(std::make_pair(AmMemoryRange(pointer, p._sizeBytes), p)); -} - - -//--- -// Return 1 if removed or 0 if not found. -int AmPointerTracker::remove (void *pointer) -{ - std::lock_guard l (_mutex); - mprintf ("remove: %p\n", pointer); - return _tracker.erase(AmMemoryRange(pointer,1)); -} - - -//--- -AmPointerTracker::MapTrackerType::iterator AmPointerTracker::find (const void *pointer) -{ - std::lock_guard l (_mutex); - auto iter = _tracker.find(AmMemoryRange(pointer,1)); - mprintf ("find: %p\n", pointer); - return iter; -} - - -//--- -// Remove all tracked locations, and free the associated memory (if the range was originally allocated by AM). -// Returns count of ranges removed. 
-size_t AmPointerTracker::reset (hc::accelerator acc) -{ - std::lock_guard l (_mutex); - mprintf ("reset: \n"); - - size_t count = 0; - // relies on C++11 (erase returns iterator) - for (auto iter = _tracker.begin() ; iter != _tracker.end(); ) { - if (iter->second._acc == acc) { - if (iter->second._isAmManaged) { - hsa_memory_free(const_cast (iter->first._basePointer)); - } - count++; - - iter = _tracker.erase(iter); - } else { - iter++; - } - } - - return count; -} - - -//========================================================================================================= -// Global var defs: -//========================================================================================================= -AmPointerTracker g_amPointerTracker; // Track all am pointer allocations. - - -//========================================================================================================= -// API Definitions. -//========================================================================================================= -// -// - -namespace hc { - -// Allocate accelerator memory, return NULL if memory could not be allocated: -auto_voidp AM_alloc(size_t sizeBytes, hc::accelerator acc, unsigned flags) -{ - - void *ptr = NULL; - - if (sizeBytes != 0 ) { - if (acc.is_hsa_accelerator()) { - hsa_agent_t *hsa_agent = static_cast (acc.get_default_view().get_hsa_agent()); - hsa_region_t *alloc_region; - if (flags & amHostPinned) { - alloc_region = static_cast(acc.get_hsa_am_system_region()); - } else { - alloc_region = static_cast(acc.get_hsa_am_region()); - } - - if (alloc_region->handle != -1) { - - hsa_status_t s1 = hsa_memory_allocate(*alloc_region, sizeBytes, &ptr); - hsa_status_t s2 = hsa_memory_assign_agent(ptr, *hsa_agent, HSA_ACCESS_PERMISSION_RW); - - if ((s1 != HSA_STATUS_SUCCESS) || (s2 != HSA_STATUS_SUCCESS)) { - ptr = NULL; - } else { - if (flags & amHostPinned) { - g_amPointerTracker.insert(ptr, - hc::AmPointerInfo(ptr/*hostPointer*/, ptr /*devicePointer*/, 
sizeBytes, acc, false/*isDevice*/, true /*isAMManaged*/)); - } else { - g_amPointerTracker.insert(ptr, - hc::AmPointerInfo(NULL/*hostPointer*/, ptr /*devicePointer*/, sizeBytes, acc, true/*isDevice*/, true /*isAMManaged*/)); - } - } - } - } - } - - return ptr; -}; - - -am_status_t AM_free(void* ptr) -{ - am_status_t status = AM_SUCCESS; - - if (ptr != NULL) { - // See also tracker::reset which can free memory. - hsa_memory_free(ptr); - - int numRemoved = g_amPointerTracker.remove(ptr) ; - if (numRemoved == 0) { - status = AM_ERROR_MISC; - } - } - return status; -} - - - -am_status_t AM_copy(void* dst, const void* src, size_t sizeBytes) -{ - am_status_t am_status = AM_ERROR_MISC; - hsa_status_t err = hsa_memory_copy(dst, src, sizeBytes); - - if (err == HSA_STATUS_SUCCESS) { - am_status = AM_SUCCESS; - } else { - am_status = AM_ERROR_MISC; - } - - return am_status; -} - - -am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, const void *ptr) -{ - auto infoI = g_amPointerTracker.find(ptr); - if (infoI != g_amPointerTracker.end()) { - *info = infoI->second; - return AM_SUCCESS; - } else { - return AM_ERROR_MISC; - } -} - -am_status_t am_memtracker_add(void* ptr, size_t sizeBytes, hc::accelerator acc, bool isDeviceMem) -{ - if (isDeviceMem) { - g_amPointerTracker.insert(ptr, hc::AmPointerInfo(ptr/*hostPointer*/, ptr /*devicePointer*/, sizeBytes, acc, true/*isDevice*/, false /*isAMManaged*/)); - } else { - g_amPointerTracker.insert(ptr, hc::AmPointerInfo(NULL/*hostPointer*/, ptr /*devicePointer*/, sizeBytes, acc, false/*isDevice*/, false /*isAMManaged*/)); - } - - return AM_SUCCESS; -} - - -am_status_t am_memtracker_update(const void* ptr, int appId, unsigned allocationFlags) -{ - auto iter = g_amPointerTracker.find(ptr); - if (iter != g_amPointerTracker.end()) { - iter->second._appId = appId; - iter->second._appAllocationFlags = allocationFlags; - return AM_SUCCESS; - } else { - return AM_ERROR_MISC; - } -} - - -am_status_t am_memtracker_remove(void* ptr) -{ - 
am_status_t status = AM_SUCCESS; - - int numRemoved = g_amPointerTracker.remove(ptr) ; - if (numRemoved == 0) { - status = AM_ERROR_MISC; - } - - return status; -} - -//--- -void am_memtracker_print() -{ - std::ostream &os = std::cerr; - - //g_amPointerTracker.print(std::cerr); - for (auto iter = g_amPointerTracker.readerLockBegin() ; iter != g_amPointerTracker.end(); iter++) { - os << " " << iter->first._basePointer << "..." << iter->first._endPointer << ":: "; - os << iter->second << std::endl; - } - - g_amPointerTracker.readerUnlock(); -} - - -//--- -void am_memtracker_sizeinfo(hc::accelerator acc, size_t *deviceMemSize, size_t *hostMemSize, size_t *userMemSize) -{ - *deviceMemSize = *hostMemSize = *userMemSize = 0; - for (auto iter = g_amPointerTracker.readerLockBegin() ; iter != g_amPointerTracker.end(); iter++) { - if (iter->second._acc == acc) { - size_t sizeBytes = iter->second._sizeBytes; - if (iter->second._isAmManaged) { - if (iter->second._isInDeviceMem) { - *deviceMemSize += sizeBytes; - } else { - *hostMemSize += sizeBytes; - } - } else { - *userMemSize += sizeBytes; - } - } - } - - g_amPointerTracker.readerUnlock(); -} - - -//--- -size_t am_memtracker_reset(hc::accelerator acc) -{ - return g_amPointerTracker.reset(acc); -} - - -} // end namespace hc. diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index 83083022c2..7a32f91747 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -42,21 +42,13 @@ THE SOFTWARE. #define USE_AM_TRACKER 0 /* >0 = use new AM memory tracker features. 
2=use HCC impl */ -#define USE_ROCR_V2 0 +#define USE_ROCR_V2 0 /* use the ROCR v2 async copy API with dst and src agents */ -#if ((USE_AM_TRACKER!=0) && (USE_AM_TRACKER!=2)) -#error (USE_AM_TRACKER must be 0 or 2) +#if (USE_AM_TRACKER) and (__hcc_workweek__ < 16074) +#error (USE_AM_TRACKER requries HCC version of 16074 or newer) #endif -#if USE_AM_TRACKER==1 -#include "hc_AM.cpp" -#define AM_ALLOC hc::AM_alloc -#define AM_FREE hc::AM_free -#else -#define AM_ALLOC hc::am_alloc -#define AM_FREE hc::am_free -#endif #define INLINE static inline @@ -247,9 +239,9 @@ ihipStream_t::ihipStream_t(unsigned device_index, hc::accelerator_view av, unsig { _signalPool.resize(HIP_STREAM_SIGNALS > 0 ? HIP_STREAM_SIGNALS : 1); - auto s = this; #if 0 + auto s = this; std::for_each(_signalPool.begin(), _signalPool.end(), [s](ihipSignal_t &iter) { printf (" stream:%p allocated hsa_signal=%lu\n", s, (iter._hsa_signal.handle)); @@ -1642,7 +1634,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) if (device) { const unsigned am_flags = 0; - *ptr = AM_ALLOC(sizeBytes, device->_acc, am_flags); + *ptr = hc::am_alloc(sizeBytes, device->_acc, am_flags); if (sizeBytes && (*ptr == NULL)) { hip_status = hipErrorMemoryAllocation; @@ -1669,7 +1661,7 @@ hipError_t hipMallocHost(void** ptr, size_t sizeBytes) auto device = ihipGetTlsDefaultDevice(); if (device) { - *ptr = AM_ALLOC(sizeBytes, device->_acc, am_flags); + *ptr = hc::am_alloc(sizeBytes, device->_acc, am_flags); if (sizeBytes && (*ptr == NULL)) { hip_status = hipErrorMemoryAllocation; } else { @@ -1715,7 +1707,7 @@ StagingBuffer::StagingBuffer(ihipDevice_t *device, size_t bufferSize, int numBuf for (int i=0; i<_numBuffers; i++) { // TODO - experiment with alignment here. 
- _pinnedStagingBuffer[i] = AM_ALLOC(_bufferSize, device->_acc, amHostPinned); + _pinnedStagingBuffer[i] = hc::am_alloc(_bufferSize, device->_acc, amHostPinned); if (_pinnedStagingBuffer[i] == NULL) { throw; } @@ -1728,7 +1720,7 @@ StagingBuffer::~StagingBuffer() { for (int i=0; i<_numBuffers; i++) { if (_pinnedStagingBuffer[i]) { - AM_FREE(_pinnedStagingBuffer[i]); + hc::am_free(_pinnedStagingBuffer[i]); _pinnedStagingBuffer[i] = NULL; } hsa_signal_destroy(_completion_signal[i]); @@ -2112,7 +2104,7 @@ hipError_t hipFree(void* ptr) ihipWaitAllStreams(ihipGetTlsDefaultDevice()); if (ptr) { - AM_FREE(ptr); + hc::am_free(ptr); } return ihipLogStatus(hipSuccess); @@ -2126,7 +2118,7 @@ hipError_t hipFreeHost(void* ptr) if (ptr) { tprintf (TRACE_MEM, " %s: %p\n", __func__, ptr); - AM_FREE(ptr); + hc::am_free(ptr); } return ihipLogStatus(hipSuccess); From f19c6d8342615e733274fc3d7b512f0a263bd9f9 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Wed, 17 Feb 2016 23:03:37 -0600 Subject: [PATCH 29/32] Enable Tracker and ROCR by default, verify with HCC [ROCm/clr commit: 400dcb8bcb33b9c93a1b0841452b7d3570bee8ad] --- projects/clr/hipamd/src/hip_hcc.cpp | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index 7a32f91747..f814f99ddf 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -41,14 +41,19 @@ THE SOFTWARE. -#define USE_AM_TRACKER 0 /* >0 = use new AM memory tracker features. 2=use HCC impl */ -#define USE_ROCR_V2 0 /* use the ROCR v2 async copy API with dst and src agents */ +#define USE_AM_TRACKER 1 /* >0 = use new AM memory tracker features. 
*/ +#define USE_ROCR_V2 1 /* use the ROCR v2 async copy API with dst and src agents */ #if (USE_AM_TRACKER) and (__hcc_workweek__ < 16074) #error (USE_AM_TRACKER requries HCC version of 16074 or newer) #endif +#if (USE_ROCR_V2) and (USE_AM_TRACKER == 0) +#error (USE_ROCR_V2 requires USE_AM_TRACKER>0) +#endif + + #define INLINE static inline @@ -1478,7 +1483,8 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) hipError_t e = hipSuccess; #if USE_AM_TRACKER - hc::AmPointerInfo amPointerInfo; + hc::accelerator acc; + hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0); am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, ptr); if (status == AM_SUCCESS) { @@ -1530,7 +1536,8 @@ hipError_t hipHostGetDevicePointer(void **devicePointer, void *hostPointer, unsi if (flags == 0) { e = hipErrorInvalidValue; } else { - hc::AmPointerInfo amPointerInfo; + hc::accelerator acc; + hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0); am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, hostPointer); if (status == AM_SUCCESS) { *devicePointer = amPointerInfo._devicePointer; @@ -1836,7 +1843,9 @@ void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeByte #if USE_AM_TRACKER void ihipSyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) { - hc::AmPointerInfo dstPtrInfo, srcPtrInfo; + hc::accelerator acc; + hc::AmPointerInfo dstPtrInfo(NULL, NULL, 0, acc, 0, 0); + hc::AmPointerInfo srcPtrInfo(NULL, NULL, 0, acc, 0, 0); bool dstNotTracked = (hc::am_memtracker_getinfo(&dstPtrInfo, dst) != AM_SUCCESS); bool srcNotTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) != AM_SUCCESS); From beae41517d43de7903e151e78bbfc0c8b5823b02 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 19 Feb 2016 01:56:17 -0600 Subject: [PATCH 30/32] Describe how to update HTML docs [ROCm/clr commit: 16ff0757a6b21d7c8257f38ef90f58f760da5755] --- 
.../clr/hipamd/tests/src/hipMemcpyAsync.cpp | 40 ++++++++++++++++--- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/projects/clr/hipamd/tests/src/hipMemcpyAsync.cpp b/projects/clr/hipamd/tests/src/hipMemcpyAsync.cpp index 6192940270..19f1a94761 100644 --- a/projects/clr/hipamd/tests/src/hipMemcpyAsync.cpp +++ b/projects/clr/hipamd/tests/src/hipMemcpyAsync.cpp @@ -20,7 +20,8 @@ void simpleNegTest() // Can't use default with async copy e = hipMemcpyAsync(A_pinned, A_d, Nbytes, hipMemcpyDefault, NULL); - HIPASSERT (e==hipErrorInvalidMemcpyDirection); + HIPASSERT (e==hipErrorInvalidMemcpyDirection); // TODO + HIPASSERT (e!= hipSuccess); // Not sure what happens here, the memory must be pinned. @@ -30,6 +31,33 @@ void simpleNegTest() //HIPASSERT (e==hipErrorInvalidValue); } + +//--- +//Send many async copies to the same stream. +//This requires runtime to keep track of many outstanding commands, and in the case of HCC requires growing/tracking the signal pool: +template +void test_manyCopies(int nElements, size_t numCopies, int nStreams) +{ + size_t Nbytes = nElements*sizeof(T); + printf ("Nbytes=%zu (%6.1f MB)\n", Nbytes, (double)(Nbytes)/1024.0/1024.0); + + int *A_d, *B_d, *C_d; + int *A_h, *B_h, *C_h; + + HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, true); + + size_t eachCopyBytes = Nbytes / numCopies; + + for (size_t i=0; i Date: Sat, 20 Feb 2016 11:01:43 -0600 Subject: [PATCH 31/32] Track last command to a stream. Passing simple tests. 
[ROCm/clr commit: d5c777268aa1968304a01ff96543bc4650c74cc7] --- .../hipamd/include/hcc_detail/hip_runtime.h | 9 +- projects/clr/hipamd/src/hip_hcc.cpp | 345 ++++++++++++------ .../clr/hipamd/tests/src/hipMemcpyAsync.cpp | 80 +++- projects/clr/hipamd/tests/src/test_common.h | 2 +- 4 files changed, 301 insertions(+), 135 deletions(-) diff --git a/projects/clr/hipamd/include/hcc_detail/hip_runtime.h b/projects/clr/hipamd/include/hcc_detail/hip_runtime.h index 7c5a2f2e36..aeed53348e 100644 --- a/projects/clr/hipamd/include/hcc_detail/hip_runtime.h +++ b/projects/clr/hipamd/include/hcc_detail/hip_runtime.h @@ -481,7 +481,8 @@ __device__ inline float __dsqrt_rz(double x) {return hc::fast_math::sqrt(x); }; #ifdef __HCC_CPP__ -hc::accelerator_view *ihipLaunchKernel(hipStream_t stream); +hipStream_t ihipPreLaunchKernel(hipStream_t stream, hc::accelerator_view **av); +void ihipPostLaunchKernel(hipStream_t stream, hc::completion_future &cf); #if not defined(DISABLE_GRID_LAUNCH) #define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) 
\ @@ -496,12 +497,13 @@ do {\ lp.groupMemBytes = _groupMemBytes;\ hc::completion_future cf;\ lp.cf = &cf; \ - lp.av = (ihipLaunchKernel(_stream)); \ + hipStream_t trueStream = (ihipPreLaunchKernel(_stream, &lp.av)); \ if (HIP_TRACE_API) {\ fprintf(stderr, "hiptrace1: launch '%s' gridDim:[%d.%d.%d] groupDim:[%d.%d.%d] groupMem:+%d stream=%p\n", \ #_kernelName, lp.gridDim.z, lp.gridDim.y, lp.gridDim.x, lp.groupDim.z, lp.groupDim.y, lp.groupDim.x, lp.groupMemBytes, (void*)(_stream));\ }\ _kernelName (lp, __VA_ARGS__);\ + ihipPostLaunchKernel(trueStream, cf);\ } while(0) #else @@ -519,12 +521,13 @@ do {\ lp.groupMemBytes = _groupMemBytes;\ hc::completion_future cf;\ lp.cf = &cf; \ - lp.av = (ihipLaunchKernel(_stream)); \ + hipStream_t trueStream = (ihipPreLaunchKernel(_stream, &lp.av)); \ if (HIP_TRACE_API) {\ fprintf(stderr, "hiptrace1: launch '%s' gridDim:[%d.%d.%d] groupDim:[%d.%d.%d] groupMem:+%d stream=%p\n", \ #_kernelName, lp.gridDim.z, lp.gridDim.y, lp.gridDim.x, lp.groupDim.z, lp.groupDim.y, lp.groupDim.x, lp.groupMemBytes, (void*)(_stream));\ }\ _kernelName (lp, __VA_ARGS__);\ + ihipPostLaunchKernel(trueStream, cf);\ } while(0) /*end hipLaunchKernel */ #endif diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index f814f99ddf..2d7650b6ed 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -31,6 +31,7 @@ THE SOFTWARE. #include #include #include +#include #include #include @@ -91,28 +92,35 @@ struct ihipDevice_t; enum ihipCommand_t { ihipCommandKernel, - ihipCommandData, + ihipCommandCopyH2D, + ihipCommandCopyD2H, +}; + +const char* ihipCommandName[] = { + "Kernel", "CopyH2D", "CopyD2H" }; // Small wrapper around signals. // Designed to be used from stream. struct ihipSignal_t { - hsa_signal_t _hsa_signal; - int _refCnt; + hsa_signal_t _hsa_signal; // hsa signal handle + int _ref_cnt; // reference count, 0 == signal is free. + uint64_t _seq_id; // unique sequentially increasig ID. 
- ihipSignal_t() : _refCnt(0) { + ihipSignal_t() : _ref_cnt(0), _seq_id(0) { if (hsa_signal_create(1, 0, NULL, &_hsa_signal) != HSA_STATUS_SUCCESS) { throw; } + tprintf (TRACE_SYNC, " allocated hsa_signal=%lu\n", (_hsa_signal.handle)); } ~ihipSignal_t() { if (hsa_signal_destroy(_hsa_signal) != HSA_STATUS_SUCCESS) { throw; } - // _refCnt should be 0, unless we are shutting down... - _refCnt = 0; + // _ref_cnt should be 0, unless we are shutting down... + _ref_cnt = 0; }; }; @@ -120,23 +128,35 @@ struct ihipSignal_t { // Internal stream structure. class ihipStream_t { public: - unsigned _device_index; - hc::accelerator_view _av; - unsigned _flags; - ihipCommand_t _last_command; - //ihipStream_t() : _av(){ }; ihipStream_t(unsigned device_index, hc::accelerator_view av, unsigned int flags); ~ihipStream_t(); + inline void wait(); + inline ihipDevice_t * getDevice() const; - ihipSignal_t * getSignal() ; + ihipSignal_t * getSignal() ; void releaseSignal(ihipSignal_t *signal) ; + inline bool preKernelCommand(); + inline void postKernelCommand(hc::completion_future &kernel_future); + inline int copyCommand(ihipSignal_t *lastCopy, hsa_signal_t *waitSignal, ihipCommand_t copyType); + + //--- + unsigned _device_index; + hc::accelerator_view _av; + unsigned _flags; private: + void enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal); + + uint64_t _seq_signal_id; // Monotonically increasing unique signal id. + ihipCommand_t _last_command_type; // type of the last command + ihipSignal_t *_last_copy_signal; // signal of last copy command sent to the stream. Copy can be either H2D or D2H. + hc::completion_future _last_kernel_future; // Completion future of last kernel command sent to GPU. 
+ int _signalCursor; - std::vector _signalPool; + std::deque _signalPool; }; @@ -239,22 +259,20 @@ unsigned g_deviceCnt; //================================================================================================= //--- ihipStream_t::ihipStream_t(unsigned device_index, hc::accelerator_view av, unsigned int flags) : - _device_index(device_index), _av(av), _flags(flags), _last_command(ihipCommandKernel), + _device_index(device_index), + _av(av), + _flags(flags), + _seq_signal_id(0), + _last_command_type(ihipCommandCopyH2D), + _last_copy_signal (NULL), _signalCursor(0) { + tprintf(TRACE_SYNC, " streamCreate: stream=%p\n", this); _signalPool.resize(HIP_STREAM_SIGNALS > 0 ? HIP_STREAM_SIGNALS : 1); - -#if 0 - auto s = this; - std::for_each(_signalPool.begin(), _signalPool.end(), - [s](ihipSignal_t &iter) { - printf (" stream:%p allocated hsa_signal=%lu\n", s, (iter._hsa_signal.handle)); - }); -#endif - }; + //--- ihipStream_t::~ihipStream_t() { @@ -262,6 +280,17 @@ ihipStream_t::~ihipStream_t() } +void ihipStream_t::wait() { + tprintf (TRACE_SYNC, "stream %p wait for queue-empty and lastCopy:#%lu...\n", this, _last_copy_signal ? _last_copy_signal->_seq_id: 0x0 ); + _av.wait(); + if (_last_copy_signal) { + hsa_signal_wait_acquire(_last_copy_signal->_hsa_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + // TODO-stream : reset ? 
+ } + +}; + + //--- inline ihipDevice_t * ihipStream_t::getDevice() const { @@ -280,8 +309,9 @@ ihipSignal_t *ihipStream_t::getSignal() _signalCursor = 0; } - if (_signalPool[thisCursor]._refCnt == 0) { - _signalPool[thisCursor]._refCnt ++; // allocate it + if (_signalPool[thisCursor]._ref_cnt == 0) { + _signalPool[thisCursor]._ref_cnt ++; // allocate it + _signalPool[thisCursor]._seq_id = ++_seq_signal_id; // allocate it return &_signalPool[thisCursor]; } @@ -291,19 +321,19 @@ ihipSignal_t *ihipStream_t::getSignal() assert(numToScan == 0); // Have to grow the pool: - printf ("Grow signal pool\n"); _signalCursor = _signalPool.size(); // set to the beginning of the new entries: _signalPool.resize(_signalPool.size() * 2); + tprintf (TRACE_SYNC, "grow signal pool to %zu entries, cursor=%d\n", _signalPool.size(), _signalCursor); return getSignal(); // try again, - // Shouldnever reach here. + // Should never reach here. assert(0); } void ihipStream_t::releaseSignal(ihipSignal_t *signal) { - if (--signal->_refCnt <= 0) { + if (--signal->_ref_cnt <= 0) { // restore signal to the initial value 1 hsa_signal_store_release(signal->_hsa_signal, 1); } @@ -714,7 +744,7 @@ static inline void ihipWaitAllStreams(ihipDevice_t *device) { tprintf(TRACE_SYNC, "waitAllStream\n"); for (auto streamI=device->_streams.begin(); streamI!=device->_streams.end(); streamI++) { - (*streamI)->_av.wait(); + (*streamI)->wait(); } } @@ -730,7 +760,7 @@ inline void ihipWaitNullStream(ihipDevice_t *device) if (!(stream->_flags & hipStreamNonBlocking)) { // TODO-hcc - use blocking or active wait here? 
// TODO-sync - cudaDeviceBlockingSync - stream->_av.wait(); + stream->wait(); } } } @@ -753,17 +783,9 @@ inline hipStream_t ihipSyncAndResolveStream(hipStream_t stream) } } -#if 0 -inline hsa_status_t -HSABarrier::enqueueBarrier(hsa_queue_t* queue) { - hsa_status_t status = HSA_STATUS_SUCCESS; - hc::completion_future marker = stream->_av.create_marker(); - - // Create a signal to wait for the barrier to finish. - std::pair ret = Kalmar::ctx.getSignal(); - signal = ret.first; - signalIndex = ret.second; +void +ihipStream_t::enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal) { // Obtain the write index for the command queue uint64_t index = hsa_queue_load_write_index_relaxed(queue); @@ -776,21 +798,20 @@ HSABarrier::enqueueBarrier(hsa_queue_t* queue) { // setup header uint16_t header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; header |= 1 << HSA_PACKET_HEADER_BARRIER; - header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; - header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; + //header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; + //header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; barrier->header = header; - barrier->completion_signal = signal; + barrier->dep_signal[0] = depSignal->_hsa_signal; + barrier->completion_signal.handle = 0; + + // TODO - check queue overflow, return error: // Increment write index and ring doorbell to dispatch the kernel hsa_queue_store_write_index_relaxed(queue, index+1); hsa_signal_store_relaxed(queue->doorbell_signal, index); - - isDispatched = true; - - return status; } -#endif + //-- //When the commands in a stream change types (ie kernel command follows a data command, @@ -798,36 +819,93 @@ HSABarrier::enqueueBarrier(hsa_queue_t* queue) { //into the stream to mimic CUDA stream semantics. (some hardware uses separate //queues for data commands and kernel commands, and no implicit ordering is provided). 
// -inline bool ihipCheckCommandSwitchSync(hipStream_t stream, ihipCommand_t new_command, hc::completion_future *marker) +inline bool ihipStream_t::preKernelCommand() { bool addedSync = false; // If switching command types, we need to add a barrier packet to synchronize things. - if (stream->_last_command != new_command) { - addedSync = true; - *marker = stream->_av.create_marker(); + if (_last_command_type != ihipCommandKernel) { + if (_last_copy_signal) { + addedSync = true; - tprintf (TRACE_SYNC, "stream %p switch %s to %s (barrier pkt inserted)\n", - (void*)stream, - stream->_last_command == ihipCommandKernel ? "Kernel" : "Data", - new_command == ihipCommandKernel ? "Kernel" : "Data"); - stream->_last_command = new_command; + hsa_queue_t * q = (hsa_queue_t*)_av.get_hsa_queue(); + this->enqueueBarrier(q, _last_copy_signal); + + tprintf (TRACE_SYNC, "stream %p switch %s to %s (barrier pkt inserted)\n", + this, + ihipCommandName[_last_command_type], ihipCommandName[ihipCommandKernel]) + } + _last_command_type = ihipCommandKernel; } return addedSync; } +//--- +inline void ihipStream_t::postKernelCommand(hc::completion_future &kernelFuture) +{ + _last_kernel_future = kernelFuture; +}; + + + +//--- +// Called whenever a copy command is set to the stream. +// Examines the last command sent to this stream and returns a signal to wait on, if required. +inline int ihipStream_t::copyCommand(ihipSignal_t *lastCopy, hsa_signal_t *waitSignal, ihipCommand_t copyType) +{ + int needSync = 0; + // If switching command types, we need to add a barrier packet to synchronize things. 
+ if (_last_command_type != copyType) { + needSync = 1; + + + if (_last_command_type == ihipCommandKernel) { + tprintf (TRACE_SYNC, "stream %p switch %s to %s (async copy dep on prev kernel)\n", + this, ihipCommandName[_last_command_type], ihipCommandName[copyType]); + hsa_signal_t *hsaSignal = (static_cast (_last_kernel_future.get_native_handle())); + if (hsaSignal) { + *waitSignal = * hsaSignal; + } + } else if (_last_copy_signal) { + tprintf (TRACE_SYNC, "stream %p switch %s to %s (async copy dep on other copy)\n", + this, ihipCommandName[_last_command_type], ihipCommandName[copyType]); + assert (_last_copy_signal->_ref_cnt > 0); + *waitSignal = _last_copy_signal->_hsa_signal; + } + + _last_command_type = copyType; + } + + _last_copy_signal = lastCopy; + + return needSync; +} + + + + + +// TODO - data-up to data-down: // Called just before a kernel is launched from hipLaunchKernel. // Allows runtime to track some information about the stream. -hc::accelerator_view *ihipLaunchKernel(hipStream_t stream) +hipStream_t ihipPreLaunchKernel(hipStream_t stream, hc::accelerator_view **av) { - stream = ihipSyncAndResolveStream(stream); - hc::completion_future marker; - ihipCheckCommandSwitchSync(stream, ihipCommandKernel, &marker); + stream->preKernelCommand(); - return &(stream->_av); + *av = &stream->_av; + + return (stream); +} + + +//--- +//Called after kernel finishes execution. 
+void ihipPostLaunchKernel(hipStream_t stream, hc::completion_future &kernelFuture) +{ + stream->postKernelCommand(kernelFuture); } @@ -1202,7 +1280,7 @@ hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int { // Super-conservative version of this - TODO - remove me: - stream->_av.wait(); + stream->wait(); e = hipSuccess; } @@ -1220,7 +1298,7 @@ hipError_t hipStreamSynchronize(hipStream_t stream) ihipDevice_t *device = ihipGetTlsDefaultDevice(); ihipWaitNullStream(device); } else { - stream->_av.wait(); + stream->wait(); e = hipSuccess; } @@ -1389,7 +1467,7 @@ void ihipSetTs(hipEvent_t e) // already recorded, done: return; } else { - // Test this code: + // TODO - use completion-future functions to obtain ticks and timestamps: hsa_signal_t *sig = static_cast (eh->_marker.get_native_handle()); if (sig) { if (hsa_signal_load_acquire(*sig) == 0) { @@ -1694,8 +1772,9 @@ hipError_t hipMemcpyToSymbol(const char* symbolName, const void *src, size_t cou } auto device = ihipGetTlsDefaultDevice(); - hc::completion_future marker; - ihipCheckCommandSwitchSync(device._null_stream, ihipCommandData, &marker); + //hsa_signal_t depSignal; + //int depSignalCnt = device._null_stream->copyCommand(NULL, &depSignal, ihipCommandCopyH2D); + assert(0); // Need to properly synchronize the copy - do something with depSignal if != NULL. 
device->_acc.memcpy_symbol(symbolName, (void*) src,count, offset); #endif @@ -1762,7 +1841,12 @@ void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeByte hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); #if USE_ROCR_V2 - hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _device->_hsa_agent, _pinnedStagingBuffer[bufferIndex], _device->_hsa_agent, theseBytes, 0, NULL, _completion_signal[bufferIndex]); + hsa_signal_t depSignal; + int depSignalCnt = 0; //stream->copyCommand(_completion_signal[bufferIndex], &depSignal, copyType); + + printf ("need sync\n"); + + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _device->_hsa_agent, _pinnedStagingBuffer[bufferIndex], _device->_hsa_agent, theseBytes, depSignalCnt, depSignalCnt ? &depSignal:0x0, _completion_signal[bufferIndex]); #else hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _pinnedStagingBuffer[bufferIndex], theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[bufferIndex]); #endif @@ -1807,7 +1891,13 @@ void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeByte tprintf (TRACE_COPY2, "D2H: async_copy %zu bytes src:%p to staging:%p\n", theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]); hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); #if USE_ROCR_V2 - hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _device->_hsa_agent, srcp0, _device->_hsa_agent, theseBytes, 0, NULL, _completion_signal[bufferIndex]); + hsa_signal_t depSignal; + // TODO + int depSignalCnt = 0; //stream->copyCommand(_completion_signal[bufferIndex], &depSignal, copyType); + + printf ("need sync\n"); + + hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _device->_hsa_agent, srcp0, _device->_hsa_agent, theseBytes, depSignalCnt, depSignalCnt ? 
&depSignal:0, _completion_signal[bufferIndex]); #else hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], srcp0, theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[bufferIndex]); #endif @@ -1867,6 +1957,14 @@ void ihipSyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t sizeB } } +#if 0 + + //TODO + hsa_signal_t depSignal; + int dep_signals = stream->commandCopy(&depSignal, ); + pass to CopyHostToDevice +#endif + if ((kind == hipMemcpyHostToDevice) && (srcNotTracked)) { if (useStagingBuffer) { std::lock_guard l (device->_copy_lock[0]); @@ -1918,7 +2016,7 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull); hc::completion_future marker; - ihipCheckCommandSwitchSync(stream, ihipCommandData, &marker); + hipError_t e = hipSuccess; @@ -1955,44 +2053,52 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcp hipError_t e = hipSuccess; - stream = ihipSyncAndResolveStream(stream); - - hc::completion_future marker; - ihipCheckCommandSwitchSync(stream, ihipCommandData, &marker); - - // Dispatch async memory copy to synchronize with items in the specified stream. - - // Async - need to set up dependency on the last command queued to the device? 
- + stream = ihipSyncAndResolveStream(stream); #if USE_AM_TRACKER - - hipStream_t s = ihipSyncAndResolveStream(stream); - - if (s) { - ihipDevice_t *device = s->getDevice(); + if (stream) { + ihipDevice_t *device = stream->getDevice(); if (kind == hipMemcpyDefault) { e = hipErrorInvalidMemcpyDirection; + + } else if (kind == hipMemcpyHostToHost) { + tprintf (TRACE_COPY2, "H2H copy with memcpy"); + + memcpy(dst, src, sizeBytes); + } else { - // Let HSA runtime handle it: - // TODO - need buffer pool for the signals rather than lock: ihipSignal_t *ihip_signal = stream->getSignal(); - //stream->saveLastSignal(ihipSignal); + ihipCommand_t copyType; + if ((kind == hipMemcpyHostToDevice) || (kind == hipMemcpyDeviceToDevice)) { + copyType = ihipCommandCopyH2D; + } else if (kind == hipMemcpyDeviceToHost) { + copyType = ihipCommandCopyD2H; + } else { + e = hipErrorInvalidMemcpyDirection; + copyType = ihipCommandCopyD2H; + } #if USE_ROCR_V2 - hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, device->_hsa_agent, src, device->_hsa_agent, sizeBytes, 0, NULL, ihip_signal->_hsa_signal); + hsa_signal_t depSignal; + int depSignalCnt = stream->copyCommand(ihip_signal, &depSignal, copyType); + + tprintf (TRACE_SYNC, " copy-async, waitFor=%d(%lu) completion=%lu\n", depSignalCnt, depSignal.handle, ihip_signal->_seq_id); + + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, device->_hsa_agent, src, device->_hsa_agent, sizeBytes, depSignalCnt, depSignalCnt ? 
&depSignal:0x0, ihip_signal->_hsa_signal); #else hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, src, sizeBytes, device->_hsa_agent, 0, NULL, ihip_signal->_hsa_signal); #endif - if (hsa_status == HSA_STATUS_SUCCESS) { + if (hsa_status == HSA_STATUS_SUCCESS) { if (HIP_LAUNCH_BLOCKING) { hsa_signal_wait_relaxed(ihip_signal->_hsa_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); stream->releaseSignal(ihip_signal); - } + } else { + //stream->releaseSignal(ihip_signal); + } } else { // This path can be hit if src or dst point to unpinned host memory. // TODO - does async-copy fall back to sync if input pointers are not pinned? @@ -2014,6 +2120,8 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcp // TODO-sync: function is async unless target is pinned host memory - then these are fully sync. +/** @return #hipErrorInvalidValue + */ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t stream ) { std::call_once(hip_initialized, ihipInit); @@ -2021,37 +2129,42 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s hipError_t e = hipSuccess; stream = ihipSyncAndResolveStream(stream); - hc::completion_future marker; - ihipCheckCommandSwitchSync(stream, ihipCommandData, &marker); + stream->preKernelCommand(); + if (stream) { - hc::completion_future cf ; + hc::completion_future cf ; - if ((sizeBytes & 0x3) == 0) { - // use a faster word-per-workitem copy: - try { - value = value & 0xff; - unsigned value32 = (value << 24) | (value << 16) | (value << 8) | (value) ; - cf = ihipMemsetKernel (stream, static_cast (dst), value32, sizeBytes/sizeof(unsigned)); - } - catch (std::exception &ex) { - e = hipErrorInvalidValue; + if ((sizeBytes & 0x3) == 0) { + // use a faster word-per-workitem copy: + try { + value = value & 0xff; + unsigned value32 = (value << 24) | (value << 16) | (value << 8) | (value) ; + cf = ihipMemsetKernel (stream, static_cast (dst), value32, 
sizeBytes/sizeof(unsigned)); + } + catch (std::exception &ex) { + e = hipErrorInvalidValue; + } + } else { + // use a slow byte-per-workitem copy: + try { + cf = ihipMemsetKernel (stream, static_cast (dst), value, sizeBytes); + } + catch (std::exception &ex) { + e = hipErrorInvalidValue; + } } + + stream->postKernelCommand(cf); + + + if (HIP_LAUNCH_BLOCKING) { + tprintf (TRACE_SYNC, "'%s' LAUNCH_BLOCKING wait for completion [stream:%p].\n", __func__, (void*)stream); + cf.wait(); + tprintf (TRACE_SYNC, "'%s' LAUNCH_BLOCKING completed [stream:%p].\n", __func__, (void*)stream); + } } else { - // use a slow byte-per-workitem copy: - try { - cf = ihipMemsetKernel (stream, static_cast (dst), value, sizeBytes); - } - catch (std::exception &ex) { - e = hipErrorInvalidValue; - } - } - - - if (HIP_LAUNCH_BLOCKING) { - tprintf (TRACE_SYNC, "'%s' LAUNCH_BLOCKING wait for completion [stream:%p].\n", __func__, (void*)stream); - cf.wait(); - tprintf (TRACE_SYNC, "'%s' LAUNCH_BLOCKING completed [stream:%p].\n", __func__, (void*)stream); + e = hipErrorInvalidValue; } diff --git a/projects/clr/hipamd/tests/src/hipMemcpyAsync.cpp b/projects/clr/hipamd/tests/src/hipMemcpyAsync.cpp index 19f1a94761..8669b986d8 100644 --- a/projects/clr/hipamd/tests/src/hipMemcpyAsync.cpp +++ b/projects/clr/hipamd/tests/src/hipMemcpyAsync.cpp @@ -36,24 +36,62 @@ void simpleNegTest() //Send many async copies to the same stream. 
//This requires runtime to keep track of many outstanding commands, and in the case of HCC requires growing/tracking the signal pool: template -void test_manyCopies(int nElements, size_t numCopies, int nStreams) +void test_manyCopies(int nElements, int numCopies) { size_t Nbytes = nElements*sizeof(T); - printf ("Nbytes=%zu (%6.1f MB)\n", Nbytes, (double)(Nbytes)/1024.0/1024.0); + size_t eachCopyElements = nElements / numCopies; + size_t eachCopyBytes = eachCopyElements * sizeof(T); - int *A_d, *B_d, *C_d; - int *A_h, *B_h, *C_h; + printf ("-----------------------------------------------------------------------------------------------\n"); + printf ("testing: %s Nbytes=%zu (%6.1f MB) numCopies=%d eachCopyElements=%zu eachCopyBytes=%zu\n", + __func__, Nbytes, (double)(Nbytes)/1024.0/1024.0, numCopies, eachCopyElements, eachCopyBytes); - HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, true); + T *A_d; + T *A_h1, *A_h2; - size_t eachCopyBytes = Nbytes / numCopies; - - for (size_t i=0; i (i); } - HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, true); + + hipStream_t stream; + HIPCHECK (hipStreamCreate(&stream)); + + //stream=0; // fixme TODO + + + for (int i=0; i(1024, 16); + test_manyCopies(1024, 4); + test_manyCopies(1024*4, 64); + } - test_chunkedAsyncExample(p_streams, true, true, true); // Easy sync version - test_chunkedAsyncExample(p_streams, false, true, true); // Easy sync version - test_chunkedAsyncExample(p_streams, false, false, true); // Some async - test_chunkedAsyncExample(p_streams, false, false, false); // All async + if (p_tests & 0x4) { + test_chunkedAsyncExample(p_streams, true, true, true); // Easy sync version + test_chunkedAsyncExample(p_streams, false, true, true); // Easy sync version + test_chunkedAsyncExample(p_streams, false, false, true); // Some async + test_chunkedAsyncExample(p_streams, false, false, false); // All async + } diff --git a/projects/clr/hipamd/tests/src/test_common.h 
b/projects/clr/hipamd/tests/src/test_common.h index 5b631d2c3a..f133696d78 100644 --- a/projects/clr/hipamd/tests/src/test_common.h +++ b/projects/clr/hipamd/tests/src/test_common.h @@ -25,7 +25,7 @@ printf (__VA_ARGS__);\ printf ("\n");\ printf ("error: TEST FAILED\n%s", KNRM );\ - exit(EXIT_FAILURE); + abort(); #define HIPCHECK(error) \ From da37035a9c3e4c5c89e5e0ee4fd568e56511e639 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 22 Feb 2016 23:15:24 -0600 Subject: [PATCH 32/32] Improve async copy implementation. - Add device-side signal waits when transitioning between command classes (Kernel, H2D copy, D2H copy). - Support waiting in staged memory copies as well. - Add several chicken bits to control implementation: - HIP_DISABLE_ENQ_BARRIER - HIP_DISABLE_BIDIR_MEMCPY - HIP_ONESHOT_COPY_DEP - Refactor signal pool to support efficient deallocation based on SIGSEQNUM. - Deallocate copy signals on eventSynchronize. - Improve copy tests, add pingpong. [ROCm/clr commit: 28990567fba77f0182183dda6ad1b175acc3ae0d] --- projects/clr/hipamd/src/hip_hcc.cpp | 524 +++++++++++------- .../clr/hipamd/tests/src/hipMemcpyAsync.cpp | 162 +++++- projects/clr/hipamd/tests/src/test_common.h | 2 +- 3 files changed, 462 insertions(+), 226 deletions(-) diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index 2d7650b6ed..3ba578d52c 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -65,6 +65,8 @@ THE SOFTWARE. 
//static const int debug = 0; static const int release = 1; +#define ENABLE_CHECKS 1 + int HIP_LAUNCH_BLOCKING = 0; int HIP_PRINT_ENV = 0; @@ -73,14 +75,25 @@ int HIP_STAGING_SIZE = 64; /* size of staging buffers, in KB */ int HIP_STAGING_BUFFERS = 2; int HIP_STREAM_SIGNALS = 2; /* number of signals to allocate at stream creation */ -#define TRACE_API 0x1 /* trace API calls and return values */ -#define TRACE_SYNC 0x2 /* trace synchronization pieces */ -#define TRACE_MEM 0x4 /* trace memory allocation / deallocation */ -#define TRACE_COPY2 0x8 /* trace memory copy commands. Detailed. */ + +//--- +// Chicken bits for disabling functionality to work around potential issues: +int HIP_DISABLE_ENQ_BARRIER = 1; +int HIP_DISABLE_BIDIR_MEMCPY = 1; +int HIP_ONESHOT_COPY_DEP = 1; // this is a good thing + + +//--- +//Debug flags: +#define TRACE_API 0x01 /* trace API calls and return values */ +#define TRACE_SYNC 0x02 /* trace synchronization pieces */ +#define TRACE_MEM 0x04 /* trace memory allocation / deallocation */ +#define TRACE_COPY2 0x08 /* trace memory copy commands. Detailed. */ +#define TRACE_SIGNAL 0x10 /* trace signal pool commands */ #define tprintf(trace_level, ...) {\ if (HIP_TRACE_API & trace_level) {\ - fprintf (stderr, "hiptrace%d: ", trace_level); \ + fprintf (stderr, "hiptrace%x: ", trace_level); \ fprintf (stderr, __VA_ARGS__);\ }\ } @@ -101,30 +114,28 @@ const char* ihipCommandName[] = { }; + +typedef uint64_t SIGSEQNUM; + +//--- // Small wrapper around signals. // Designed to be used from stream. +// TODO-someday refactor this class so it can be stored in a vector<> +// we already store the index here so we can use for garbage collection. struct ihipSignal_t { hsa_signal_t _hsa_signal; // hsa signal handle - int _ref_cnt; // reference count, 0 == signal is free. - uint64_t _seq_id; // unique sequentially increasig ID. + int _index; // Index in pool, used for garbage collection. + SIGSEQNUM _sig_id; // unique sequentially increasing ID. 
- ihipSignal_t() : _ref_cnt(0), _seq_id(0) { - if (hsa_signal_create(1, 0, NULL, &_hsa_signal) != HSA_STATUS_SUCCESS) { - throw; - } - tprintf (TRACE_SYNC, " allocated hsa_signal=%lu\n", (_hsa_signal.handle)); - } + ihipSignal_t(); + ~ihipSignal_t(); - ~ihipSignal_t() { - if (hsa_signal_destroy(_hsa_signal) != HSA_STATUS_SUCCESS) { - throw; - } - // _ref_cnt should be 0, unless we are shutting down... - _ref_cnt = 0; - }; + inline void release(); }; + + // Internal stream structure. class ihipStream_t { public: @@ -132,31 +143,38 @@ public: ihipStream_t(unsigned device_index, hc::accelerator_view av, unsigned int flags); ~ihipStream_t(); + inline void reclaimSignals(SIGSEQNUM sigNum); + inline void waitAndReclaimOlder(ihipSignal_t *signal); inline void wait(); inline ihipDevice_t * getDevice() const; ihipSignal_t * getSignal() ; - void releaseSignal(ihipSignal_t *signal) ; inline bool preKernelCommand(); inline void postKernelCommand(hc::completion_future &kernel_future); inline int copyCommand(ihipSignal_t *lastCopy, hsa_signal_t *waitSignal, ihipCommand_t copyType); + inline void resetToEmpty(); + + inline SIGSEQNUM lastCopySeqId() { return _last_copy_signal ? _last_copy_signal->_sig_id : 0; }; + //--- - unsigned _device_index; hc::accelerator_view _av; unsigned _flags; private: void enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal); - uint64_t _seq_signal_id; // Monotonically increasing unique signal id. + unsigned _device_index; ihipCommand_t _last_command_type; // type of the last command ihipSignal_t *_last_copy_signal; // signal of last copy command sent to the stream. Copy can be either H2D or D2H. hc::completion_future _last_kernel_future; // Completion future of last kernel command sent to GPU. int _signalCursor; - std::deque _signalPool; + + SIGSEQNUM _stream_sig_id; // Monotonically increasing unique signal id. + SIGSEQNUM _oldest_live_sig_id; // oldest live seq_id, anything < this can be allocated. 
+ std::deque _signalPool; // Pool of signals for use by this stream. }; @@ -180,6 +198,8 @@ struct ihipEvent_t { hc::completion_future _marker; uint64_t _timestamp; // store timestamp, may be set on host or by marker. + + SIGSEQNUM _copy_seq_id; } ; @@ -191,8 +211,8 @@ struct StagingBuffer { StagingBuffer(ihipDevice_t *device, size_t bufferSize, int numBuffers) ; ~StagingBuffer(); - void CopyDeviceToHost(void* dst, const void* src, size_t sizeBytes); - void CopyHostToDevice(void* dst, const void* src, size_t sizeBytes); + void CopyDeviceToHost(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); + void CopyHostToDevice(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); private: ihipDevice_t *_device; @@ -249,27 +269,57 @@ unsigned g_deviceCnt; //================================================================================================= +//================================================================================================= +//Forward Declarations: +//================================================================================================= +INLINE bool ihipIsValidDevice(unsigned deviceIndex); + //================================================================================================= // Implementation: //================================================================================================= +//================================================================================================= +// ihipSignal_t: +//================================================================================================= +// +//--- +ihipSignal_t::ihipSignal_t() : _sig_id(0) +{ + if (hsa_signal_create(0/*value*/, 0, NULL, &_hsa_signal) != HSA_STATUS_SUCCESS) { + throw; + } + tprintf (TRACE_SIGNAL, " allocated hsa_signal=%lu\n", (_hsa_signal.handle)); +} + +//--- +ihipSignal_t::~ihipSignal_t() +{ + tprintf (TRACE_SIGNAL, " destroy hsa_signal #%lu (#%lu)\n", (_hsa_signal.handle), _sig_id); + if 
(hsa_signal_destroy(_hsa_signal) != HSA_STATUS_SUCCESS) { + throw; // TODO + } +}; + + + //================================================================================================= // ihipStream_t: //================================================================================================= //--- ihipStream_t::ihipStream_t(unsigned device_index, hc::accelerator_view av, unsigned int flags) : - _device_index(device_index), _av(av), _flags(flags), - _seq_signal_id(0), - _last_command_type(ihipCommandCopyH2D), - _last_copy_signal (NULL), - _signalCursor(0) + _device_index(device_index), + _last_copy_signal(0), + _signalCursor(0), + _stream_sig_id(0), + _oldest_live_sig_id(1) { tprintf(TRACE_SYNC, " streamCreate: stream=%p\n", this); _signalPool.resize(HIP_STREAM_SIGNALS > 0 ? HIP_STREAM_SIGNALS : 1); + resetToEmpty(); }; @@ -280,26 +330,62 @@ ihipStream_t::~ihipStream_t() } -void ihipStream_t::wait() { - tprintf (TRACE_SYNC, "stream %p wait for queue-empty and lastCopy:#%lu...\n", this, _last_copy_signal ? _last_copy_signal->_seq_id: 0x0 ); +//--- +// Reset the stream to "empty" - next command will not set up an inpute dependency on any older signal. +void ihipStream_t::resetToEmpty() +{ + _last_command_type = ihipCommandCopyH2D; + _last_copy_signal = NULL; +} + +//--- +void ihipStream_t::reclaimSignals(SIGSEQNUM sigNum) +{ + tprintf(TRACE_SIGNAL, "reclaim signal #%lu\n", sigNum); + // Mark all signals older and including this one as available for + _oldest_live_sig_id = sigNum+1; +} + + +//--- +void ihipStream_t::waitAndReclaimOlder(ihipSignal_t *signal) +{ + hsa_signal_wait_acquire(_last_copy_signal->_hsa_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + + reclaimSignals(_last_copy_signal->_sig_id); + +} + + +//--- +//Wait for all queues kernels in the associated accelerator_view to complete. +void ihipStream_t::wait() +{ + tprintf (TRACE_SYNC, "stream %p wait for queue-empty and lastCopy:#%lu...\n", this, _last_copy_signal ? 
_last_copy_signal->_sig_id: 0x0 ); _av.wait(); if (_last_copy_signal) { - hsa_signal_wait_acquire(_last_copy_signal->_hsa_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - // TODO-stream : reset ? + this->waitAndReclaimOlder(_last_copy_signal); } + resetToEmpty(); }; //--- inline ihipDevice_t * ihipStream_t::getDevice() const { - return &g_devices[_device_index]; + if (ihipIsValidDevice(_device_index)) { + return &g_devices[_device_index]; + } else { + return NULL; + } }; +//--- // Allocate a new signal from the signal pool. -// Returned signals are initialized to a value of "1". +// Returned signals have value of 0. +// Signals are intended for use in this stream and are always reclaimed "in-order". ihipSignal_t *ihipStream_t::getSignal() { int numToScan = _signalPool.size(); @@ -309,21 +395,22 @@ ihipSignal_t *ihipStream_t::getSignal() _signalCursor = 0; } - if (_signalPool[thisCursor]._ref_cnt == 0) { - _signalPool[thisCursor]._ref_cnt ++; // allocate it - _signalPool[thisCursor]._seq_id = ++_seq_signal_id; // allocate it + if (_signalPool[thisCursor]._sig_id < _oldest_live_sig_id) { + _signalPool[thisCursor]._index = thisCursor; + _signalPool[thisCursor]._sig_id = ++_stream_sig_id; // allocate it. + + return &_signalPool[thisCursor]; } - numToScan--; - } while (numToScan) ; + } while (--numToScan) ; assert(numToScan == 0); // Have to grow the pool: _signalCursor = _signalPool.size(); // set to the beginning of the new entries: _signalPool.resize(_signalPool.size() * 2); - tprintf (TRACE_SYNC, "grow signal pool to %zu entries, cursor=%d\n", _signalPool.size(), _signalCursor); + tprintf (TRACE_SIGNAL, "grow signal pool to %zu entries, cursor=%d\n", _signalPool.size(), _signalCursor); return getSignal(); // try again, // Should never reach here. 
@@ -331,14 +418,113 @@ ihipSignal_t *ihipStream_t::getSignal() } -void ihipStream_t::releaseSignal(ihipSignal_t *signal) +//--- +void ihipStream_t::enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal) { - if (--signal->_ref_cnt <= 0) { - // restore signal to the initial value 1 - hsa_signal_store_release(signal->_hsa_signal, 1); - } + + // Obtain the write index for the command queue + uint64_t index = hsa_queue_load_write_index_relaxed(queue); + const uint32_t queueMask = queue->size - 1; + + // Define the barrier packet to be at the calculated queue index address + hsa_barrier_and_packet_t* barrier = &(((hsa_barrier_and_packet_t*)(queue->base_address))[index&queueMask]); + memset(barrier, 0, sizeof(hsa_barrier_and_packet_t)); + + // setup header + uint16_t header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; + header |= 1 << HSA_PACKET_HEADER_BARRIER; + //header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; + //header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; + barrier->header = header; + + barrier->dep_signal[0] = depSignal->_hsa_signal; + + barrier->completion_signal.handle = 0; + + // TODO - check queue overflow, return error: + // Increment write index and ring doorbell to dispatch the kernel + hsa_queue_store_write_index_relaxed(queue, index+1); + hsa_signal_store_relaxed(queue->doorbell_signal, index); } + +//-- +//When the commands in a stream change types (ie kernel command follows a data command, +//or data command follows a kernel command), then we need to add a barrier packet +//into the stream to mimic CUDA stream semantics. (some hardware uses separate +//queues for data commands and kernel commands, and no implicit ordering is provided). +// +inline bool ihipStream_t::preKernelCommand() +{ + bool addedSync = false; + // If switching command types, we need to add a barrier packet to synchronize things. 
+ if (_last_command_type != ihipCommandKernel) { + if (_last_copy_signal) { + addedSync = true; + + hsa_queue_t * q = (hsa_queue_t*)_av.get_hsa_queue(); + if (! HIP_DISABLE_ENQ_BARRIER) { + this->enqueueBarrier(q, _last_copy_signal); + tprintf (TRACE_SYNC, "stream %p switch %s to %s (barrier pkt inserted with wait on #%lu)\n", + this, ihipCommandName[_last_command_type], ihipCommandName[ihipCommandKernel], _last_copy_signal->_sig_id) + + } else { + tprintf (TRACE_SYNC, "stream %p switch %s to %s (wait for previous...)\n", + this, ihipCommandName[_last_command_type], ihipCommandName[ihipCommandKernel]); + this->waitAndReclaimOlder(_last_copy_signal); + } + } + _last_command_type = ihipCommandKernel; + } + + return addedSync; +} + + +//--- +inline void ihipStream_t::postKernelCommand(hc::completion_future &kernelFuture) +{ + _last_kernel_future = kernelFuture; +}; + + + +//--- +// Called whenever a copy command is set to the stream. +// Examines the last command sent to this stream and returns a signal to wait on, if required. +inline int ihipStream_t::copyCommand(ihipSignal_t *lastCopy, hsa_signal_t *waitSignal, ihipCommand_t copyType) +{ + int needSync = 0; + + waitSignal->handle = 0; + // If switching command types, we need to add a barrier packet to synchronize things. 
+ if (_last_command_type != copyType) { + + + if (_last_command_type == ihipCommandKernel) { + tprintf (TRACE_SYNC, "stream %p switch %s to %s (async copy dep on prev kernel)\n", + this, ihipCommandName[_last_command_type], ihipCommandName[copyType]); + needSync = 1; + hsa_signal_t *hsaSignal = (static_cast (_last_kernel_future.get_native_handle())); + if (hsaSignal) { + *waitSignal = * hsaSignal; + } + } else if (_last_copy_signal) { + needSync = 1; + tprintf (TRACE_SYNC, "stream %p switch %s to %s (async copy dep on other copy #%lu)\n", + this, ihipCommandName[_last_command_type], ihipCommandName[copyType], _last_copy_signal->_sig_id); + *waitSignal = _last_copy_signal->_hsa_signal; + } + + _last_command_type = copyType; + } + + _last_copy_signal = lastCopy; + + return needSync; +} + + //================================================================================================= // //Reset the device - this is called from hipDeviceReset. @@ -680,6 +866,10 @@ void ihipInit() READ_ENV_I(release, HIP_STAGING_BUFFERS, 0, "Number of staging buffers to use in each direction"); READ_ENV_I(release, HIP_STREAM_SIGNALS, 0, "Number of signals to allocate when new stream is created (signal pool will grow on demand)"); + READ_ENV_I(release, HIP_DISABLE_ENQ_BARRIER, 0, "Disable enqueue of barrier packet - instead wait for copy completion on host."); + READ_ENV_I(release, HIP_DISABLE_BIDIR_MEMCPY, 0, "Disable simultaneous H2D memcpy and D2H memcpy to same device"); + READ_ENV_I(release, HIP_ONESHOT_COPY_DEP, 0, "If set, only set the copy input dependency for the first copy command in a staged copy. If clear, set the dep for each copy."); + /* * Build a table of valid compute devices. 
*/ @@ -784,103 +974,6 @@ inline hipStream_t ihipSyncAndResolveStream(hipStream_t stream) } -void -ihipStream_t::enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal) { - - // Obtain the write index for the command queue - uint64_t index = hsa_queue_load_write_index_relaxed(queue); - const uint32_t queueMask = queue->size - 1; - - // Define the barrier packet to be at the calculated queue index address - hsa_barrier_and_packet_t* barrier = &(((hsa_barrier_and_packet_t*)(queue->base_address))[index&queueMask]); - memset(barrier, 0, sizeof(hsa_barrier_and_packet_t)); - - // setup header - uint16_t header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; - header |= 1 << HSA_PACKET_HEADER_BARRIER; - //header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; - //header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; - barrier->header = header; - - barrier->dep_signal[0] = depSignal->_hsa_signal; - - barrier->completion_signal.handle = 0; - - // TODO - check queue overflow, return error: - // Increment write index and ring doorbell to dispatch the kernel - hsa_queue_store_write_index_relaxed(queue, index+1); - hsa_signal_store_relaxed(queue->doorbell_signal, index); -} - - -//-- -//When the commands in a stream change types (ie kernel command follows a data command, -//or data command follows a kernel command), then we need to add a barrier packet -//into the stream to mimic CUDA stream semantics. (some hardware uses separate -//queues for data commands and kernel commands, and no implicit ordering is provided). -// -inline bool ihipStream_t::preKernelCommand() -{ - bool addedSync = false; - // If switching command types, we need to add a barrier packet to synchronize things. 
- if (_last_command_type != ihipCommandKernel) { - if (_last_copy_signal) { - addedSync = true; - - hsa_queue_t * q = (hsa_queue_t*)_av.get_hsa_queue(); - this->enqueueBarrier(q, _last_copy_signal); - - tprintf (TRACE_SYNC, "stream %p switch %s to %s (barrier pkt inserted)\n", - this, - ihipCommandName[_last_command_type], ihipCommandName[ihipCommandKernel]) - } - _last_command_type = ihipCommandKernel; - } - - return addedSync; -} - - -//--- -inline void ihipStream_t::postKernelCommand(hc::completion_future &kernelFuture) -{ - _last_kernel_future = kernelFuture; -}; - - - -//--- -// Called whenever a copy command is set to the stream. -// Examines the last command sent to this stream and returns a signal to wait on, if required. -inline int ihipStream_t::copyCommand(ihipSignal_t *lastCopy, hsa_signal_t *waitSignal, ihipCommand_t copyType) -{ - int needSync = 0; - // If switching command types, we need to add a barrier packet to synchronize things. - if (_last_command_type != copyType) { - needSync = 1; - - - if (_last_command_type == ihipCommandKernel) { - tprintf (TRACE_SYNC, "stream %p switch %s to %s (async copy dep on prev kernel)\n", - this, ihipCommandName[_last_command_type], ihipCommandName[copyType]); - hsa_signal_t *hsaSignal = (static_cast (_last_kernel_future.get_native_handle())); - if (hsaSignal) { - *waitSignal = * hsaSignal; - } - } else if (_last_copy_signal) { - tprintf (TRACE_SYNC, "stream %p switch %s to %s (async copy dep on other copy)\n", - this, ihipCommandName[_last_command_type], ihipCommandName[copyType]); - assert (_last_copy_signal->_ref_cnt > 0); - *waitSignal = _last_copy_signal->_hsa_signal; - } - - _last_command_type = copyType; - } - - _last_copy_signal = lastCopy; - - return needSync; -} @@ -906,6 +999,9 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, hc::accelerator_view **av) void ihipPostLaunchKernel(hipStream_t stream, hc::completion_future &kernelFuture) { stream->postKernelCommand(kernelFuture); + if 
(HIP_LAUNCH_BLOCKING) { + tprintf(TRACE_SYNC, " stream:%p LAUNCH_BLOCKING for kernel completion\n", stream); + } } @@ -1317,20 +1413,16 @@ hipError_t hipStreamDestroy(hipStream_t stream) hipError_t e = hipSuccess; - if (ihipIsValidDevice(stream->_device_index)) { - - ihipDevice_t *device = &g_devices[stream->_device_index]; + ihipDevice_t *device = stream->getDevice(); + if (device) { device->_streams.remove(stream); - delete stream; - - e = hipSuccess; } else { e = hipErrorInvalidResourceHandle; } - return ihipLogStatus(hipSuccess); + return ihipLogStatus(e); } @@ -1371,6 +1463,8 @@ hipError_t hipEventCreateWithFlags(hipEvent_t* event, unsigned flags) eh->_state = hipEventStatusCreated; eh->_stream = NULL; eh->_flags = flags; + eh->_timestamp = 0; + eh->_copy_seq_id = 0; } else { e = hipErrorInvalidValue; } @@ -1405,6 +1499,7 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream) // Clear timestamps eh->_timestamp = 0; eh->_marker = stream->_av.create_marker(); + eh->_copy_seq_id = stream->lastCopySeqId(); return ihipLogStatus(hipSuccess); } @@ -1452,6 +1547,8 @@ hipError_t hipEventSynchronize(hipEvent_t event) #else eh->_marker.wait(); #endif + eh->_stream->reclaimSignals(eh->_copy_seq_id); + return ihipLogStatus(hipSuccess); } } else { @@ -1636,7 +1733,7 @@ template hc::completion_future ihipMemcpyKernel(hipStream_t stream, T * c, const T * a, size_t sizeBytes) { - int wg = std::min((unsigned)8, g_devices[stream->_device_index]._compute_units); + int wg = std::min((unsigned)8, stream->getDevice()->_compute_units); const int threads_per_wg = 256; int threads = wg * threads_per_wg; @@ -1673,7 +1770,7 @@ template hc::completion_future ihipMemsetKernel(hipStream_t stream, T * ptr, T val, size_t sizeBytes) { - int wg = std::min((unsigned)8, g_devices[stream->_device_index]._compute_units); + int wg = std::min((unsigned)8, stream->getDevice()->_compute_units); const int threads_per_wg = 256; int threads = wg * threads_per_wg; @@ -1815,7 +1912,11 @@ 
StagingBuffer::~StagingBuffer() //--- -void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeBytes) +//Copies sizeBytes from src to dst, using either a copy to a staging buffer or a staged pin-in-place strategy +//IN: dst - dest pointer - must be accessible from host CPU. +//IN: src - src pointer for copy. Must be accessible from agent this buffer is associated with (via _device) +//IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency. +void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) { const char *srcp = static_cast (src); char *dstp = static_cast (dst); @@ -1830,10 +1931,10 @@ void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeByte size_t theseBytes = (bytesRemaining > _bufferSize) ? _bufferSize : bytesRemaining; - tprintf (TRACE_COPY2, "waiting... on completion signal\n"); + tprintf (TRACE_COPY2, "H2D: waiting... on completion signal handle=%lu\n", _completion_signal[bufferIndex].handle); hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - tprintf (TRACE_COPY2, "copy %zu bytes %p to stagingBuf[%d]:%p\n", theseBytes, srcp, bufferIndex, _pinnedStagingBuffer[bufferIndex]); + tprintf (TRACE_COPY2, "H2D: bytesRemaining=%zu: copy %zu bytes %p to stagingBuf[%d]:%p\n", bytesRemaining, theseBytes, srcp, bufferIndex, _pinnedStagingBuffer[bufferIndex]); // TODO - use uncached memcpy, someday. 
memcpy(_pinnedStagingBuffer[bufferIndex], srcp, theseBytes); @@ -1841,16 +1942,11 @@ void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeByte hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); #if USE_ROCR_V2 - hsa_signal_t depSignal; - int depSignalCnt = 0; //stream->copyCommand(_completion_signal[bufferIndex], &depSignal, copyType); - - printf ("need sync\n"); - - hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _device->_hsa_agent, _pinnedStagingBuffer[bufferIndex], _device->_hsa_agent, theseBytes, depSignalCnt, depSignalCnt ? &depSignal:0x0, _completion_signal[bufferIndex]); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _device->_hsa_agent, _pinnedStagingBuffer[bufferIndex], _device->_hsa_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); #else hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _pinnedStagingBuffer[bufferIndex], theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[bufferIndex]); #endif - tprintf (TRACE_COPY2, "async_copy %zu bytes %p to %p status=%x\n", theseBytes, _pinnedStagingBuffer[bufferIndex], dstp, hsa_status); + tprintf (TRACE_COPY2, "H2D: bytesRemaining=%zu: async_copy %zu bytes %p to %p status=%x\n", bytesRemaining, theseBytes, _pinnedStagingBuffer[bufferIndex], dstp, hsa_status); assert(hsa_status == HSA_STATUS_SUCCESS); // TODO - throw @@ -1859,6 +1955,10 @@ void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeByte if (++bufferIndex >= _numBuffers) { bufferIndex = 0; } + + if (HIP_ONESHOT_COPY_DEP) { + waitFor = NULL; // TODO - don't need dependency after first copy submitted? 
+ } } @@ -1868,7 +1968,11 @@ void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeByte } //--- -void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeBytes) +//Copies sizeBytes from src to dst, using either a copy to a staging buffer or a staged pin-in-place strategy +//IN: dst - dest pointer - must be accessible from agent this buffer is assocaited with (via _device). +//IN: src - src pointer for copy. Must be accessible from host CPU. +//IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency. +void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) { const char *srcp0 = static_cast (src); char *dstp1 = static_cast (dst); @@ -1888,22 +1992,21 @@ void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeByte size_t theseBytes = (bytesRemaining0 > _bufferSize) ? _bufferSize : bytesRemaining0; - tprintf (TRACE_COPY2, "D2H: async_copy %zu bytes src:%p to staging:%p\n", theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]); + tprintf (TRACE_COPY2, "D2H: bytesRemaining0=%zu async_copy %zu bytes src:%p to staging:%p\n", bytesRemaining0, theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]); hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); #if USE_ROCR_V2 - hsa_signal_t depSignal; - // TODO - int depSignalCnt = 0; //stream->copyCommand(_completion_signal[bufferIndex], &depSignal, copyType); - - printf ("need sync\n"); - - hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _device->_hsa_agent, srcp0, _device->_hsa_agent, theseBytes, depSignalCnt, depSignalCnt ? &depSignal:0, _completion_signal[bufferIndex]); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _device->_hsa_agent, srcp0, _device->_hsa_agent, theseBytes, waitFor ? 
1:0, waitFor, _completion_signal[bufferIndex]); #else hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], srcp0, theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[bufferIndex]); #endif assert(hsa_status == HSA_STATUS_SUCCESS); // TODO - throw srcp0 += theseBytes; + + + if (HIP_ONESHOT_COPY_DEP) { + waitFor = NULL; // TODO - don't need dependency after first copy submitted? + } } // Now unload the staging buffers: @@ -1914,7 +2017,7 @@ void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeByte tprintf (TRACE_COPY2, "D2H: wait_completion[%d] bytesRemaining=%zu\n", bufferIndex, bytesRemaining1); hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - tprintf (TRACE_COPY2, "D2H: copy %zu bytes stagingBuf[%d]:%p to dst:%p\n", theseBytes, bufferIndex, _pinnedStagingBuffer[bufferIndex], dstp1); + tprintf (TRACE_COPY2, "D2H: bytesRemaining1=%zu copy %zu bytes stagingBuf[%d]:%p to dst:%p\n", bytesRemaining1, theseBytes, bufferIndex, _pinnedStagingBuffer[bufferIndex], dstp1); memcpy(dstp1, _pinnedStagingBuffer[bufferIndex], theseBytes); dstp1 += theseBytes; @@ -1931,8 +2034,14 @@ void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeByte #if USE_AM_TRACKER -void ihipSyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) +void ihipSyncCopy(ihipStream_t *stream, void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) { + ihipDevice_t *device = stream->getDevice(); + + if (device == NULL) { + throw; + } + hc::accelerator acc; hc::AmPointerInfo dstPtrInfo(NULL, NULL, 0, acc, 0, 0); hc::AmPointerInfo srcPtrInfo(NULL, NULL, 0, acc, 0, 0); @@ -1940,9 +2049,9 @@ void ihipSyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t sizeB bool dstNotTracked = (hc::am_memtracker_getinfo(&dstPtrInfo, dst) != AM_SUCCESS); bool srcNotTracked = 
(hc::am_memtracker_getinfo(&srcPtrInfo, src) != AM_SUCCESS); - bool useStagingBuffer = true; // TODO - remove when new copy bakes a bit. + bool useStagingBuffer = true; - // Resolve default to a specific Kind, since we use different algorithms: + // Resolve default to a specific Kind so we know which algorithm to use: if (kind == hipMemcpyDefault) { bool dstIsHost = (dstNotTracked || !dstPtrInfo._isInDeviceMem); bool srcIsHost = (srcNotTracked || !srcPtrInfo._isInDeviceMem); @@ -1957,26 +2066,29 @@ void ihipSyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t sizeB } } -#if 0 - - //TODO - hsa_signal_t depSignal; - int dep_signals = stream->commandCopy(&depSignal, ); - pass to CopyHostToDevice -#endif if ((kind == hipMemcpyHostToDevice) && (srcNotTracked)) { if (useStagingBuffer) { std::lock_guard l (device->_copy_lock[0]); - device->_staging_buffer[0]->CopyHostToDevice(dst, src, sizeBytes); + //printf ("staged-copy- read dep signals\n"); + + hsa_signal_t depSignal; + int depSignalCnt = stream->copyCommand(NULL, &depSignal, ihipCommandCopyH2D); + device->_staging_buffer[0]->CopyHostToDevice(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); + + // The copy waits for inputs and then completes before returning. + stream->resetToEmpty(); } else { // TODO - remove, slow path. hc::am_copy(dst, src, sizeBytes); } } else if ((kind == hipMemcpyDeviceToHost) && (dstNotTracked)) { if (useStagingBuffer) { - std::lock_guard l (device->_copy_lock[1]); - device->_staging_buffer[1]->CopyDeviceToHost(dst, src, sizeBytes); + std::lock_guard l (device->_copy_lock[HIP_DISABLE_BIDIR_MEMCPY ? 0:1]); + //printf ("staged-copy- read dep signals\n"); + hsa_signal_t depSignal; + int depSignalCnt = stream->copyCommand(NULL, &depSignal, ihipCommandCopyD2H); + device->_staging_buffer[1]->CopyDeviceToHost(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); } else { // TODO - remove, slow path. 
hc::am_copy(dst, src, sizeBytes); @@ -1988,7 +2100,7 @@ void ihipSyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t sizeB // Let HSA runtime handle it: // TODO - need buffer pool for the signals: - device->_copy_lock[1].lock(); + device->_copy_lock[HIP_DISABLE_BIDIR_MEMCPY? 0:1].lock(); hsa_signal_store_relaxed(device->_copy_signal, 1); #if USE_ROCR_V2 @@ -2001,7 +2113,7 @@ void ihipSyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t sizeB hsa_signal_wait_relaxed(device->_copy_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); } - device->_copy_lock[1].unlock(); + device->_copy_lock[HIP_DISABLE_BIDIR_MEMCPY ? 0:1].unlock(); } } @@ -2017,17 +2129,13 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind hc::completion_future marker; - hipError_t e = hipSuccess; #if USE_AM_TRACKER - if (ihipIsValidDevice(stream->_device_index)) { - - ihipDevice_t *device = &g_devices[stream->_device_index]; - - ihipSyncCopy(device, dst, src, sizeBytes, kind); - - } else { + try { + ihipSyncCopy(stream, dst, src, sizeBytes, kind); + } + catch (...) { e = hipErrorInvalidResourceHandle; } @@ -2046,6 +2154,9 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind * @warning on HCC hipMemcpyAsync uses a synchronous copy. 
*/ #endif +/** + * @result #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidMemcpyDirection, #hipErrorInvalidValue + */ //--- hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream) { @@ -2059,7 +2170,10 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcp if (stream) { ihipDevice_t *device = stream->getDevice(); - if (kind == hipMemcpyDefault) { + if (device == NULL) { + e = hipErrorInvalidDevice; + + } else if (kind == hipMemcpyDefault) { e = hipErrorInvalidMemcpyDirection; } else if (kind == hipMemcpyHostToHost) { @@ -2069,6 +2183,7 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcp } else { ihipSignal_t *ihip_signal = stream->getSignal(); + hsa_signal_store_relaxed(ihip_signal->_hsa_signal, 1); ihipCommand_t copyType; if ((kind == hipMemcpyHostToDevice) || (kind == hipMemcpyDeviceToDevice)) { @@ -2084,7 +2199,7 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcp hsa_signal_t depSignal; int depSignalCnt = stream->copyCommand(ihip_signal, &depSignal, copyType); - tprintf (TRACE_SYNC, " copy-async, waitFor=%d(%lu) completion=%lu\n", depSignalCnt, depSignal.handle, ihip_signal->_seq_id); + tprintf (TRACE_SYNC, " copy-async, waitFor=%lu completion=#%lu(%lu)\n", depSignalCnt? depSignal.handle:0x0, ihip_signal->_sig_id, ihip_signal->_hsa_signal.handle); hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, device->_hsa_agent, src, device->_hsa_agent, sizeBytes, depSignalCnt, depSignalCnt ? &depSignal:0x0, ihip_signal->_hsa_signal); #else @@ -2093,15 +2208,14 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcp if (hsa_status == HSA_STATUS_SUCCESS) { + // TODO-stream - fix release-signal calls here. 
if (HIP_LAUNCH_BLOCKING) { - hsa_signal_wait_relaxed(ihip_signal->_hsa_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - stream->releaseSignal(ihip_signal); - } else { - //stream->releaseSignal(ihip_signal); - } + tprintf(TRACE_SYNC, "LAUNCH_BLOCKING for completion of hipMemcpyAsync(%zu)\n", sizeBytes); + stream->wait(); + } } else { // This path can be hit if src or dst point to unpinned host memory. - // TODO - does async-copy fall back to sync if input pointers are not pinned? + // TODO-stream - does async-copy fall back to sync if input pointers are not pinned? e = hipErrorInvalidValue; } } diff --git a/projects/clr/hipamd/tests/src/hipMemcpyAsync.cpp b/projects/clr/hipamd/tests/src/hipMemcpyAsync.cpp index 8669b986d8..4b92e2fc1e 100644 --- a/projects/clr/hipamd/tests/src/hipMemcpyAsync.cpp +++ b/projects/clr/hipamd/tests/src/hipMemcpyAsync.cpp @@ -31,15 +31,129 @@ void simpleNegTest() //HIPASSERT (e==hipErrorInvalidValue); } +class Pinned; +class Unpinned; + +template struct HostTraits; + +template<> +struct HostTraits +{ + static const char *Name() { return "Pinned"; } ; + + static void *Alloc(size_t sizeBytes) { + void *p; + HIPCHECK(hipMallocHost(&p, sizeBytes)); + return p; + }; +}; + + +template +__global__ void +addK (hipLaunchParm lp, T *A, T K, size_t numElements) +{ + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; + + for (size_t i=offset; i +void test_pingpong(hipStream_t stream, size_t numElements, int numInflight, int numPongs, bool doHostSide) +{ + HIPASSERT(numElements % numInflight == 0); // Must be evenly divisible. 
+ size_t Nbytes = numElements*sizeof(T); + size_t eachCopyElements = numElements / numInflight; + size_t eachCopyBytes = eachCopyElements * sizeof(T); + + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); + + printf ("-----------------------------------------------------------------------------------------------\n"); + printf ("testing: %s<%s> Nbytes=%zu (%6.1f MB) numPongs=%d numInflight=%d eachCopyElements=%zu eachCopyBytes=%zu\n", + __func__, HostTraits::Name(), Nbytes, (double)(Nbytes)/1024.0/1024.0, numPongs, numInflight, eachCopyElements, eachCopyBytes); + + T *A_h; + T *A_d; + + A_h = (T*)(HostTraits::Alloc(Nbytes)); + HIPCHECK(hipMalloc(&A_d, Nbytes)); + + // Initialize the host array: + const T initValue = 13; + const T deviceConst = 2; + const T hostConst = 10000; + for (size_t i=0; i, dim3(blocks), dim3(threadsPerBlock), 0, stream, A_d, 2, numElements); + + for (int i=0; i (i); } - hipStream_t stream; - HIPCHECK (hipStreamCreate(&stream)); - //stream=0; // fixme TODO for (int i=0; i(1024, 16); - test_manyCopies(1024, 4); - test_manyCopies(1024*4, 64); + if (p_tests & 0x02) { + hipStream_t stream; + HIPCHECK (hipStreamCreate(&stream)); + + test_manyInflightCopies(stream, 1024, 16, true); + test_manyInflightCopies(stream, 1024, 4, true); // verify we re-use the same entries instead of growing pool. 
+ test_manyInflightCopies(stream, 1024*8, 64, false); + + HIPCHECK(hipStreamDestroy(stream)); } - if (p_tests & 0x4) { + if (p_tests & 0x04) { test_chunkedAsyncExample(p_streams, true, true, true); // Easy sync version test_chunkedAsyncExample(p_streams, false, true, true); // Easy sync version test_chunkedAsyncExample(p_streams, false, false, true); // Some async test_chunkedAsyncExample(p_streams, false, false, false); // All async } + if (p_tests & 0x08) { + hipStream_t stream; + HIPCHECK (hipStreamCreate(&stream)); + + test_pingpong(stream, 1024*1024*32, 1, 1, false); + test_pingpong(stream, 1024*1024*32, 1, 10, false); + + HIPCHECK(hipStreamDestroy(stream)); + } passed(); diff --git a/projects/clr/hipamd/tests/src/test_common.h b/projects/clr/hipamd/tests/src/test_common.h index f133696d78..1bf89f1604 100644 --- a/projects/clr/hipamd/tests/src/test_common.h +++ b/projects/clr/hipamd/tests/src/test_common.h @@ -88,7 +88,7 @@ vectorADD(hipLaunchParm lp, size_t stride = hipBlockDim_x * hipGridDim_x ; for (size_t i=offset; i