diff --git a/include/hcc_detail/AM.h b/include/hcc_detail/AM.h index 1cfcf2dfb2..d41fed317a 100644 --- a/include/hcc_detail/AM.h +++ b/include/hcc_detail/AM.h @@ -15,22 +15,27 @@ namespace hc { // This is the data that is maintained for each pointer: struct AmPointerInfo { - bool _isDeviceMem; - void * _hostPointer; - void * _devicePointer; - size_t _sizeBytes; - hc::accelerator _acc; - unsigned _allocationFlags; + void * _hostPointer; ///< Host pointer. If host access is not allowed, NULL. + void * _devicePointer; ///< Device pointer. + size_t _sizeBytes; ///< Size of allocation. + hc::accelerator _acc; ///< Device / Accelerator to use. + bool _isInDeviceMem; ///< Memory is physically resident on a device (if false, memory is located on host) + bool _isAmManaged; ///< Memory was allocated by AM and should be freed when am_reset is called. + + int _appId; ///< App-specific storage. Used by HIP to store deviceID. + unsigned _appAllocationFlags; ///< App-specific allocation flags. Used by HIP to store allocation flags. AmPointerInfo() {}; - AmPointerInfo(bool isDeviceMem, void *hostPointer, void *devicePointer, size_t sizeBytes, hc::accelerator acc, unsigned allocationFlags) : - _isDeviceMem(isDeviceMem), + AmPointerInfo(void *hostPointer, void *devicePointer, size_t sizeBytes, hc::accelerator acc, bool isInDeviceMem, bool isAmManaged) : _hostPointer(hostPointer), _devicePointer(devicePointer), _sizeBytes(sizeBytes), _acc(acc), - _allocationFlags(allocationFlags) {}; + _isInDeviceMem(isInDeviceMem), + _isAmManaged(isAmManaged), + _appId(-1), + _appAllocationFlags(0) {}; }; } @@ -73,19 +78,46 @@ am_status_t AM_free(void* ptr); */ am_status_t AM_copy(void* dst, const void* src, size_t size); -am_status_t AM_get_pointer_info(hc::AmPointerInfo *info, void *ptr); +/** + * Return information about tracked pointer. + * + * AM tracks pointers when they are allocated or added to tracker with am_track_pointer. + * The tracker tracks the base pointer as well as the size of the allocation, and will + * find the information for a pointer anywhere in the tracked range. + * + * @returns AM_ERROR_MISC if pointer is not currently being tracked. + * @returns AM_SUCCESS if pointer is tracked and writes info to @p info. + * + * @see AM_memtracker_add, + */ +am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, void *ptr); +//TODO-doc +am_status_t am_memtracker_update(void* ptr, int appId, unsigned allocationFlags); -// TODO-implement these: -//am_status_t AM_track_pointer(void* ptr, size_t size, bool isDeviceMem=false, unsigned allocationFlags=0x0); -//am_status_t AM_untrack_pointer(void* ptr); +am_status_t am_memtracker_add(void* ptr, size_t sizeBytes, hc::accelerator acc, bool isDeviceMem=false); + +/** + * Remove the pointer from the tracker structure. + * + * @p ptr may be anywhere in a tracked memory range. + * + * @returns AM_ERROR_MISC if pointer is not found in tracker. + * @returns AM_SUCCESS if pointer is not found in tracker. + */ +am_status_t am_memtracker_remove(void* ptr); + +/** + * Remove all memory allocations associated with specified accelerator. + */ +size_t am_memtracker_reset(hc::accelerator acc); /** * Prints the contents of the memory tracker table to stderr * * Intended primarily for debug purposes. **/ -void AM_print_tracker(); +void am_memtracker_print(); }; // namespace hc diff --git a/src/hc_AM.cpp b/src/hc_AM.cpp index 87e29e4bcc..36c8abf193 100644 --- a/src/hc_AM.cpp +++ b/src/hc_AM.cpp @@ -5,7 +5,8 @@ #include "hcc_detail/AM.h" // TODO - Remove me. -#define DB_TRACKER 1 +#define DB_TRACKER 0 +#define MUTEX_LOCK 1 #if DB_TRACKER #define mprintf( ...) {\ @@ -43,14 +44,16 @@ struct AmMemoryRangeCompare { std::ostream &operator<<(std::ostream &os, const hc::AmPointerInfo &ap) { os << "hostPointer:" << ap._hostPointer << " devicePointer:"<< ap._devicePointer << " sizeBytes:" << ap._sizeBytes - << " isDeviceMem:" << ap._isDeviceMem << " allocFlags:" << ap._allocationFlags; + << " isInDeviceMem:" << ap._isInDeviceMem << " isAmManaged:" << ap._isAmManaged + << " appId:" << ap._appId << " appAllocFlags:" << ap._appAllocationFlags; return os; } - +//------------------------------------------------------------------------------------------------- // This structure tracks information for each pointer. -// Uses memory-range-based lookups - so pointers that exist anywhere in the range of hostPtr + size will find the associated AmPointerInfo. +// Uses memory-range-based lookups - so pointers that exist anywhere in the range of hostPtr + size +// will find the associated AmPointerInfo. // The insertions and lookups use a self-balancing binary tree and should support O(logN) lookup speed. // The structure is thread-safe - writers obtain a mutex before modifying the tree. Multiple simulatenous readers are supported. class AmPointerTracker { @@ -64,9 +67,18 @@ public: MapTrackerType::iterator end() { return _tracker.end(); }; + size_t reset (hc::accelerator acc); + std::ostream & print (std::ostream &os); private: + // TODO - use or remove. + inline void writeLock(); + inline void writeUnlock(); + inline void readLock(); + inline void readUnlock(); + MapTrackerType _tracker; + std::mutex _mutex; //std::shared_timed_mutex _mut; }; @@ -74,11 +86,10 @@ private: //--- void AmPointerTracker::insert (void *pointer, const hc::AmPointerInfo &p) { - // TODO-mutex - write lock. + std::lock_guard l (_mutex); + mprintf ("insert: %p + %zu\n", pointer, p._sizeBytes); _tracker.insert(std::make_pair(AmMemoryRange(pointer, p._sizeBytes), p)); - - } @@ -87,6 +98,7 @@ void AmPointerTracker::insert (void *pointer, const hc::AmPointerInfo &p) int AmPointerTracker::remove (void *pointer) { // TODO-mutex - write lock. + std::lock_guard l (_mutex); mprintf ("remove: %p\n", pointer); return _tracker.erase(AmMemoryRange(pointer,1)); } @@ -96,14 +108,17 @@ int AmPointerTracker::remove (void *pointer) AmPointerTracker::MapTrackerType::iterator AmPointerTracker::find (void *pointer) { // TODO-mutex- read lock + std::lock_guard l (_mutex); auto iter = _tracker.find(AmMemoryRange(pointer,1)); mprintf ("find: %p\n", pointer); return iter; } +//--- std::ostream & AmPointerTracker::print (std::ostream &os) { + std::lock_guard l (_mutex); for (auto iter = _tracker.begin() ; iter != _tracker.end(); iter++) { os << " " << iter->first._basePointer << "..." << iter->first._endPointer << ":: "; os << iter->second << std::endl; @@ -112,6 +127,65 @@ std::ostream & AmPointerTracker::print (std::ostream &os) return os; } +//--- +// Remove all tracked locations, and free the associated memory (if the range was originally allocated by AM). +// Returns count of ranges removed. +size_t AmPointerTracker::reset (hc::accelerator acc) +{ + std::lock_guard l (_mutex); + mprintf ("reset: \n"); + + size_t count = 0; + // relies on C++11 (erase returns iterator) + for (auto iter = _tracker.begin() ; iter != _tracker.end(); ) { + if (iter->second._acc == acc) { + if (iter->second._isAmManaged) { + hsa_memory_free(iter->first._basePointer); + } + count++; + + iter = _tracker.erase(iter); + } else { + iter++; + } + } + + return count; +} + + + +//--- +void AmPointerTracker::writeLock () +{ + _mutex.lock(); +} + + +//--- +void AmPointerTracker::writeUnlock () +{ + _mutex.unlock(); +} + + +//--- +// TODO - support multiple concurrent reader +void AmPointerTracker::readLock () +{ + _mutex.lock(); +} + + +//--- +// TODO - support multiple concurrent reader +void AmPointerTracker::readUnlock () +{ + _mutex.unlock(); +} + + + //========================================================================================================= // Global var defs: @@ -153,10 +227,10 @@ auto_voidp AM_alloc(size_t sizeBytes, hc::accelerator acc, unsigned flags) } else { if (flags & amHostPinned) { g_amPointerTracker.insert(ptr, - hc::AmPointerInfo(false/*isDevice*/, ptr/*hostPointer*/, ptr /*devicePointer*/, sizeBytes, acc, flags)); + hc::AmPointerInfo(ptr/*hostPointer*/, ptr /*devicePointer*/, sizeBytes, acc, false/*isDevice*/, true /*isAMManaged*/)); } else { g_amPointerTracker.insert(ptr, - hc::AmPointerInfo(true/*isDevice*/, NULL/*hostPointer*/, ptr /*devicePointer*/, sizeBytes, acc, flags)); + hc::AmPointerInfo(NULL/*hostPointer*/, ptr /*devicePointer*/, sizeBytes, acc, true/*isDevice*/, true /*isAMManaged*/)); } } } @@ -172,9 +246,10 @@ am_status_t AM_free(void* ptr) am_status_t status = AM_SUCCESS; if (ptr != NULL) { + // See also tracker::reset which can free memory. hsa_memory_free(ptr); - size_t numRemoved = g_amPointerTracker.remove(ptr) ; + int numRemoved = g_amPointerTracker.remove(ptr) ; if (numRemoved == 0) { status = AM_ERROR_MISC; } @@ -199,7 +274,7 @@ am_status_t AM_copy(void* dst, const void* src, size_t sizeBytes) } -am_status_t AM_get_pointer_info(hc::AmPointerInfo *info, void *ptr) +am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, void *ptr) { auto infoI = g_amPointerTracker.find(ptr); if (infoI != g_amPointerTracker.end()) { @@ -210,10 +285,46 @@ am_status_t AM_get_pointer_info(hc::AmPointerInfo *info, void *ptr) } } -void AM_print_tracker() + +am_status_t am_memtracker_update(void* ptr, int appId, unsigned allocationFlags); + + +am_status_t am_memtracker_add(void* ptr, size_t sizeBytes, hc::accelerator acc, bool isDeviceMem) +{ + if (isDeviceMem) { + g_amPointerTracker.insert(ptr, hc::AmPointerInfo(ptr/*hostPointer*/, ptr /*devicePointer*/, sizeBytes, acc, true/*isDevice*/, false /*isAMManaged*/)); + } else { + g_amPointerTracker.insert(ptr, hc::AmPointerInfo(NULL/*hostPointer*/, ptr /*devicePointer*/, sizeBytes, acc, false/*isDevice*/, false /*isAMManaged*/)); + } + + return AM_SUCCESS; +} + + +am_status_t am_memtracker_remove(void* ptr) +{ + am_status_t status = AM_SUCCESS; + + int numRemoved = g_amPointerTracker.remove(ptr) ; + if (numRemoved == 0) { + status = AM_ERROR_MISC; + } + + return status; +} + +//--- +void am_memtracker_print() { g_amPointerTracker.print(std::cerr); } +//--- +size_t am_memtracker_reset(hc::accelerator acc) +{ + return g_amPointerTracker.reset(acc); +} + + } // end namespace hc. diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index e9ee4c41dc..a4246dc9cb 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -44,8 +44,8 @@ THE SOFTWARE. #include "hc_AM.cpp" #define USE_PINNED_HOST (__hcc_workweek__ >= 1601) - #define USE_ASYNC_COPY 0 +#define USE_AM_TRACKER 1 /* use new AM memory tracker features */ #define INLINE static inline @@ -802,6 +802,13 @@ hipError_t hipDeviceReset(void) // It should destroy and clean up all resources allocated with the default device in the current process. // and needs to destroy all queues as well. // +#if USE_AM_TRACKER + // TODO - remove bug above. + ihipDevice_t *device = ihipGetTlsDefaultDevice(); + if (device) { + am_memtracker_reset(device->_acc); + } +#endif return ihipLogStatus(hipSuccess); } @@ -1281,14 +1288,14 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) hipError_t e = hipSuccess; hc::AmPointerInfo amPointerInfo; - am_status_t status = hc::AM_get_pointer_info(&amPointerInfo, ptr); + am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, ptr); if (status == AM_SUCCESS) { - attributes->memoryType = amPointerInfo._isDeviceMem ? hipMemoryTypeDevice: hipMemoryTypeHost; + attributes->memoryType = amPointerInfo._isInDeviceMem ? hipMemoryTypeDevice: hipMemoryTypeHost; attributes->hostPointer = amPointerInfo._hostPointer; attributes->devicePointer = amPointerInfo._devicePointer; attributes->isManaged = 0; - attributes->allocationFlags = amPointerInfo._allocationFlags; + attributes->allocationFlags = amPointerInfo._appAllocationFlags; attributes->device = -1; diff --git a/tests/src/hipPointerAttrib.cpp b/tests/src/hipPointerAttrib.cpp index 9d147d8183..93d503af65 100644 --- a/tests/src/hipPointerAttrib.cpp +++ b/tests/src/hipPointerAttrib.cpp @@ -75,7 +75,7 @@ void resetAttribs(hipPointerAttribute_t *attribs) }; -void printAttribs(hipPointerAttribute_t *attribs) +void printAttribs(const hipPointerAttribute_t *attribs) { printf ("hostPointer:%p devicePointer:%p memoryType:%s deviceId:%d isManaged:%d allocationFlags:%u\n", attribs->hostPointer, @@ -99,8 +99,13 @@ inline int zrand(int max) //================================================================================================= // //Run through a couple simple cases to test lookups and hostd pointer arithmetic: -void simpleTests() +void testSimple() { + printf ("\n"); + printf ("===========================================================================\n"); + printf ("Simple Tests\n"); + printf ("===========================================================================\n"); + char *A_d; char *A_Pinned_h; char *A_OSAlloc_h; @@ -179,8 +184,24 @@ void simpleTests() } +void resetTracker () +{ + if (p_verbose & 0x1) { + printf ("info: reset tracker for all devices in platform\n"); + } + + int numDevices; + HIPCHECK(hipGetDeviceCount(&numDevices)); + + // Clean up: + for (int i=0; i reference(numAllocs); HIPASSERT(minSize > 0); @@ -244,14 +264,15 @@ void clusterAllocs(int numAllocs, size_t minSize, size_t maxSize) reference[i]._attrib.memoryType = hipMemoryTypeHost; reference[i]._attrib.devicePointer = ptr; reference[i]._attrib.hostPointer = ptr; - reference[i]._attrib.allocationFlags = 1; // TODO-randomize these. + reference[i]._attrib.allocationFlags = 0; // TODO-randomize these. } reference[i]._pointer = ptr; } #ifdef __HIP_PLATFORM_HCC__ if (p_verbose & 0x2) { - hc::AM_print_tracker(); + printf ("Tracker after insertions:\n"); + hc::am_memtracker_print(); } #endif @@ -265,27 +286,143 @@ void clusterAllocs(int numAllocs, size_t minSize, size_t maxSize) checkPointer(ref, i, 2, (char *)ref._pointer + ref._sizeBytes-1); } + if (ref._attrib.memoryType == hipMemoryTypeDevice) { + hipFree(ref._pointer); + } else { + hipFreeHost(ref._pointer); + } + + } + + + +#ifdef __HIP_PLATFORM_HCC__ + if (p_verbose & 0x2) { + printf ("Tracker after cleanup:\n"); + hc::am_memtracker_print(); + } +#endif +} + + +void testMultiThreaded_1(bool serialize=false) +{ + printf ("\n===========================================================================\n"); + printf ("MultiThreaded_1\n"); + if (serialize) printf ("[SERIALIZE]\n"); + printf ("===========================================================================\n"); + std::thread t1(clusterAllocs, 1000, 101, 1000); + if (serialize) t1.join(); + + std::thread t2(clusterAllocs, 1000, 11, 100); + if (serialize) t2.join(); + + std::thread t3(clusterAllocs, 1000, 5, 10); + if (serialize) t3.join(); + + std::thread t4(clusterAllocs, 1000, 1, 4); + if (serialize) t4.join(); + + if (!serialize) { + t1.join(); + t2.join(); + t3.join(); + t4.join(); + } + + resetTracker(); +} + + +///================================================================================================ + + +// Add pointers to tracker very quickly. +void thread_query(void *ptr, const hipPointerAttribute_t *refAttrib) +{ + int count = 0; + + for (int count=0; count< 1000000; count++) { + hipPointerAttribute_t a; + hipError_t e = hipPointerGetAttributes(&a, ptr); + if ((e != hipSuccess) || (a!= *refAttrib)) { + printf("Test %d (err=%d)\n", count, e); + HIPCHECK(e); + + printf(" ref :: "); printAttribs(refAttrib); + printf(" getattr:: "); printAttribs(&a); + } } } -void testMultiThreaded() +enum Dir {Up, Down}; +void thread_noise_generator(int iters, size_t numBuffers, Dir addDir, Dir removeDir) { - std::thread t1(clusterAllocs, 1000, 101, 1000); - std::thread t2(clusterAllocs, 1000, 11, 100); - std::thread t3(clusterAllocs, 1000, 5, 10); - std::thread t4(clusterAllocs, 1000, 1, 4); + const size_t bufferSize = 16; + size_t maxSize = numBuffers*bufferSize; + HIPASSERT((maxSize % bufferSize) == 0); // loop logic assumes this is true + + + for (int i=0; i inflight(2); + + printf ("\n===========================================================================\n"); + printf ("MultiThreaded_2\n"); + printf ("===========================================================================\n"); + + hipSetDevice(0); + hipDeviceReset(); + + // Create some entries in the tracker: + for (int i=0; i<1000; i++) { + void *C_d; + HIPCHECK(hipMalloc(&C_d, 32)); + } + + + // Allocate a pointer that we will repeatedly lookup: + void *A_d; + HIPCHECK(hipMalloc(&A_d, 10000)); + hipPointerAttribute_t attrib1; + HIPCHECK(hipPointerGetAttributes(&attrib1, A_d)); + std::thread t1(thread_query, A_d, &attrib1); + + std::thread t2(thread_noise_generator, 10000, 1000, Up, Up); t1.join(); t2.join(); - t3.join(); - t4.join(); + + hipSetDevice(0); + hipDeviceReset(); } int main(int argc, char *argv[]) { - N= 1000000; HipTest::parseStandardArguments(argc, argv, true); @@ -296,22 +433,34 @@ int main(int argc, char *argv[]) printf ("N=%zu (%6.2f MB) device=%d\n", N, Nbytes/(1024.0*1024.0), p_gpuDevice); - if (p_tests & 0x1) { - simpleTests(); + if (p_tests & 0x01) { + testSimple(); } - if (p_tests & 0x2) { + if (p_tests & 0x02) { srand(0x100); + printf ("\n===========================================================================\n"); clusterAllocs(100, 1024*1, 1024*1024); + resetTracker(); } - if (p_tests & 0x4) { + if (p_tests & 0x04) { srand(0x200); + printf ("\n===========================================================================\n"); clusterAllocs(1000, 1, 10); // Many tiny allocations; + resetTracker(); } - if (p_tests & 0x8) { - testMultiThreaded(); + if (p_tests & 0x08) { + srand(0x300); + testMultiThreaded_1(true); + testMultiThreaded_1(false); + } + + if (p_tests & 0x10) { + srand(0x400); + testMultiThreaded_2(); + resetTracker(); } printf ("\n");