diff --git a/include/hcc_detail/hip_hcc.h b/include/hcc_detail/hip_hcc.h index 5c8aaf5659..05fa2818b6 100644 --- a/include/hcc_detail/hip_hcc.h +++ b/include/hcc_detail/hip_hcc.h @@ -519,9 +519,11 @@ public: // "Allocate" a stream ID: ihipStream_t::SeqNum_t incStreamId() { return _stream_id++; }; - void recomputePeerAgents(); - void addPeer(ihipDevice_t *peer); - void removePeer(ihipDevice_t *peer); + bool addPeer(ihipDevice_t *peer); + bool removePeer(ihipDevice_t *peer); + + uint32_t peerCnt() const { return _peerCnt; }; + uint32_t peerAgents() const { return _peerAgents; }; private: @@ -532,6 +534,8 @@ private: std::list _peers; // list of enabled peer devices. uint32_t _peerCnt; // number of enabled peers hsa_agent_t *_peerAgents; // efficient packed array of enabled agents (to use for allocations.) +private: + void recomputePeerAgents(); }; // Note Mutex selected based on DeviceMutex @@ -560,6 +564,8 @@ public: // Functions: void locked_waitAllStreams(); void locked_syncDefaultStream(bool waitOnSelf); + ihipDeviceCritical_t &criticalData() { return _criticalData; }; // TODO, move private. Fix P2P. + public: // Data, set at initialization: unsigned _device_index; // index into g_devices. diff --git a/include/hcc_detail/hip_runtime_api.h b/include/hcc_detail/hip_runtime_api.h index 56b3560427..f33895cd98 100644 --- a/include/hcc_detail/hip_runtime_api.h +++ b/include/hcc_detail/hip_runtime_api.h @@ -913,21 +913,21 @@ hipError_t hipDeviceCanAccessPeer (int* canAccessPeer, int deviceId, int peerDev /** - * @brief Disable registering memory on peerDevice for direct access from the current device. + * @brief Disable direct access from current device's virtual address space to memory allocations physically located on a peer device. * - * If there are any allocations on peerDevice which were registered in the current device using hipPeerRegister() then these allocations will be automatically unregistered. * Returns hipErrorPeerAccessNotEnabled if direct access to memory on peerDevice has not yet been enabled from the current device. * - * @param [in] peerDevice + * @param [in] peerDeviceId + * * TODO:cudaErrorPeerAccessNotEnabled and cudaErrorInvalidDevice error not supported in HIP, return hipErrorUnknown * Returns #hipSuccess, #hipErrorUnknown */ hipError_t hipDeviceDisablePeerAccess (int peerDeviceId); /** - * @brief Enable registering memory on peerDevice for direct access from the current device. + * @brief Enable direct access from current device's virtual address space to memory allocations physically located on a peer device. * - * @param [in] peerDevice + * @param [in] peerDeviceId * @param [in] flags * * TODO:cudaErrorInvalidDevice error not supported in HIP, return hipErrorUnknown @@ -939,14 +939,14 @@ hipError_t hipDeviceEnablePeerAccess (int peerDeviceId, unsigned int flags); * @brief Copies memory from one device to memory on another device. * * @param [out] dst - Destination device pointer. - * @param [in] dstDevice - Destination device + * @param [in] dstDeviceId - Destination device * @param [in] src - Source device pointer - * @param [in] srcDevice - Source device + * @param [in] srcDeviceId - Source device * @param [in] sizeBytes - Size of memory copy in bytes * * Returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDevice */ -hipError_t hipMemcpyPeer ( void* dst, int dstDevice, const void* src, int srcDevice, size_t sizeBytes ); +hipError_t hipMemcpyPeer (void* dst, int dstDeviceId, const void* src, int srcDeviceId, size_t sizeBytes); /** * @brief Copies memory from one device to memory on another device. @@ -961,7 +961,7 @@ hipError_t hipMemcpyPeer ( void* dst, int dstDevice, const void* src, int srcD * Returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDevice */ #if __cplusplus -hipError_t hipMemcpyPeerAsync ( void* dst, int dstDevice, const void* src, int srcDevice, size_t sizeBytes, hipStream_t stream=0 ); +hipError_t hipMemcpyPeerAsync ( void* dst, int dstDeviceId, const void* src, int srcDevice, size_t sizeBytes, hipStream_t stream=0 ); #else hipError_t hipMemcpyPeerAsync(void* dst, int dstDevice, const void* src, int srcDevice, size_t sizeBytes, hipStream_t stream); #endif diff --git a/include/hip_runtime_api.h b/include/hip_runtime_api.h index bef881d066..ca49ab5d13 100644 --- a/include/hip_runtime_api.h +++ b/include/hip_runtime_api.h @@ -138,7 +138,7 @@ typedef struct hipPointerAttribute_t { * @enum * @ingroup Enumerations */ -// Developer note - when updating these, update the hipErrorName and hipErrorString functions +// Developer note - when updating these, update the hipErrorName and hipErrorString functions in NVCC and HCC paths typedef enum hipError_t { hipSuccess = 0 ///< Successful completion. ,hipErrorMemoryAllocation ///< Memory allocation error. @@ -155,6 +155,8 @@ typedef enum hipError_t { ,hipErrorNoDevice ///< Call to hipGetDeviceCount returned 0 devices ,hipErrorNotReady ///< Indicates that asynchronous operations enqueued earlier are not ready. This is not actually an error, but is used to distinguish from hipSuccess (which indicates completion). APIs that return this error include hipEventQuery and hipStreamQuery. ,hipErrorUnknown ///< Unknown error. + ,hipErrorPeerAccessNotEnabled ///< Peer access was never enabled from the current device. + ,hipErrorPeerAccessAlreadyEnabled ///< Peer access was already enabled from the current device. ,hipErrorRuntimeMemory ///< HSA runtime memory call returned error. Typically not seen in production systems. ,hipErrorRuntimeOther ///< HSA runtime call other than memory returned error. Typically not seen in production systems. ,hipErrorTbd ///< Marker that more error codes are needed. diff --git a/src/hip_error.cpp b/src/hip_error.cpp index 655ab3b8c7..7c723b1aa2 100644 --- a/src/hip_error.cpp +++ b/src/hip_error.cpp @@ -65,7 +65,7 @@ const char *hipGetErrorName(hipError_t hip_error) //--- const char *hipGetErrorString(hipError_t hip_error) { - std::call_once(hip_initialized, ihipInit); + HIP_INIT_API(hip_error); // TODO - return a message explaining the error. // TODO - This should be set up to return the same string reported in the the doxygen comments, somehow. diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index bbf6538817..992dbca724 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -211,24 +211,31 @@ void ihipDeviceCriticalBase_t::recomputePeerAgents() template<> -void ihipDeviceCriticalBase_t::addPeer(ihipDevice_t *peer) +bool ihipDeviceCriticalBase_t::addPeer(ihipDevice_t *peer) { auto match = std::find(_peers.begin(), _peers.end(), peer); if (match != std::end(_peers)) { _peers.push_back(peer); recomputePeerAgents(); - return; + return true; } // If we get here - peer was already on list, silently ignore. + return false; } template<> -void ihipDeviceCriticalBase_t::removePeer(ihipDevice_t *peer) +bool ihipDeviceCriticalBase_t::removePeer(ihipDevice_t *peer) { - _peers.remove(peer); - recomputePeerAgents(); + auto match = std::find(_peers.begin(), _peers.end(), peer); + if (match != std::end(_peers)) { + _peers.remove(peer); + recomputePeerAgents(); + return true; + } else { + return false; + } } //------------------------------------------------------------------------------------------------- @@ -1073,22 +1080,25 @@ void ihipPostLaunchKernel(hipStream_t stream, hc::completion_future &kernelFutur const char *ihipErrorString(hipError_t hip_error) { switch (hip_error) { - case hipSuccess : return "hipSuccess"; - case hipErrorMemoryAllocation : return "hipErrorMemoryAllocation"; - case hipErrorMemoryFree : return "hipErrorMemoryFree"; - case hipErrorUnknownSymbol : return "hipErrorUnknownSymbol"; - case hipErrorOutOfResources : return "hipErrorOutOfResources"; - case hipErrorInvalidValue : return "hipErrorInvalidValue"; - case hipErrorInvalidResourceHandle : return "hipErrorInvalidResourceHandle"; - case hipErrorInvalidDevice : return "hipErrorInvalidDevice"; - case hipErrorInvalidMemcpyDirection : return "hipErrorInvalidMemcpyDirection"; - case hipErrorNoDevice : return "hipErrorNoDevice"; - case hipErrorNotReady : return "hipErrorNotReady"; - case hipErrorRuntimeMemory : return "hipErrorRuntimeMemory"; - case hipErrorRuntimeOther : return "hipErrorRuntimeOther"; - case hipErrorUnknown : return "hipErrorUnknown"; - case hipErrorTbd : return "hipErrorTbd"; - default : return "hipErrorUnknown"; + case hipSuccess : return "hipSuccess"; + case hipErrorMemoryAllocation : return "hipErrorMemoryAllocation"; + case hipErrorMemoryFree : return "hipErrorMemoryFree"; + case hipErrorUnknownSymbol : return "hipErrorUnknownSymbol"; + case hipErrorOutOfResources : return "hipErrorOutOfResources"; + case hipErrorInvalidValue : return "hipErrorInvalidValue"; + case hipErrorInvalidResourceHandle : return "hipErrorInvalidResourceHandle"; + case hipErrorInvalidDevice : return "hipErrorInvalidDevice"; + case hipErrorInvalidMemcpyDirection : return "hipErrorInvalidMemcpyDirection"; + case hipErrorNoDevice : return "hipErrorNoDevice"; + case hipErrorNotReady : return "hipErrorNotReady"; + case hipErrorPeerAccessNotEnabled : return "hipErrorPeerAccessNotEnabled"; + case hipErrorPeerAccessAlreadyEnabled : return "hipErrorPeerAccessAlreadyEnabled"; + + case hipErrorRuntimeMemory : return "hipErrorRuntimeMemory"; + case hipErrorRuntimeOther : return "hipErrorRuntimeOther"; + case hipErrorUnknown : return "hipErrorUnknown"; + case hipErrorTbd : return "hipErrorTbd"; + default : return "hipErrorUnknown"; }; }; diff --git a/src/hip_peer.cpp b/src/hip_peer.cpp index c4a8a1bcf3..de86688ede 100644 --- a/src/hip_peer.cpp +++ b/src/hip_peer.cpp @@ -26,7 +26,7 @@ THE SOFTWARE. * @warning HCC returns 0 in *canAccessPeer ; Need to update this function when RT supports P2P */ //--- -hipError_t hipDeviceCanAccessPeer ( int* canAccessPeer, int deviceId, int peerDeviceId) +hipError_t hipDeviceCanAccessPeer (int* canAccessPeer, int deviceId, int peerDeviceId) { HIP_INIT_API(canAccessPeer, deviceId, peerDeviceId); @@ -38,7 +38,7 @@ hipError_t hipDeviceCanAccessPeer ( int* canAccessPeer, int deviceId, int peerD if ((device != NULL) && (peerDevice != NULL)) { #if USE_PEER_TO_PEER==2 - *canAccessPeer = peerDevice->_acc.is_peer(device->_acc); + *canAccessPeer = peerDevice->_acc.get_is_peer(device->_acc); #else *canAccessPeer = 0; #endif @@ -56,16 +56,32 @@ hipError_t hipDeviceCanAccessPeer ( int* canAccessPeer, int deviceId, int peerD } -/** - * warning Need to update this function when RT supports P2P - */ //--- -hipError_t hipDeviceDisablePeerAccess ( int peerDevice ) +hipError_t hipDeviceDisablePeerAccess (int peerDeviceId) { - HIP_INIT_API(peerDevice); + HIP_INIT_API(peerDeviceId); - // TODO-p2p - return ihipLogStatus(hipSuccess); + hipError_t err = hipSuccess; +#if USE_PEER_TO_PEER + + auto thisDevice = ihipGetTlsDefaultDevice(); + auto peerDevice = ihipGetDevice(peerDeviceId); + if ((thisDevice != NULL) && (peerDevice != NULL)) { + LockedAccessor_DeviceCrit_t crit(thisDevice->criticalData()); + bool changed = crit->removePeer(peerDevice); + if (changed) { +#if USE_PEER_TO_PEER==2 + am_memtracker_update_peers(device->_acc, crit->peerCnt(), crit->peerAgents()); +#endif + } else { + err = hipErrorPeerAccessNotEnabled; // never enabled P2P access. + } + } else { + err = hipErrorInvalidDevice; + } +#endif + + return ihipLogStatus(err); }; @@ -74,18 +90,27 @@ hipError_t hipDeviceDisablePeerAccess ( int peerDevice ) */ //--- // Enable registering memory on peerDevice for direct access from the current device. -hipError_t hipDeviceEnablePeerAccess (int peerDeviceId, unsigned int flags) +hipError_t hipDeviceEnablePeerAccess (int peerDeviceId, unsigned int flags) { - std::call_once(hip_initialized, ihipInit); + HIP_INIT_API(peerDeviceId, flags); hipError_t err = hipSuccess; #if USE_PEER_TO_PEER if (flags != 0) { err = hipErrorInvalidValue; } else { + auto thisDevice = ihipGetTlsDefaultDevice(); auto peerDevice = ihipGetDevice(peerDeviceId); - if (peerDevice != NULL) { - + if ((thisDevice != NULL) && (peerDevice != NULL)) { + LockedAccessor_DeviceCrit_t crit(thisDevice->criticalData()); + bool isNewPeer = crit->addPeer(peerDevice); + if (isNewPeer) { +#if USE_PEER_TO_PEER==2 + am_memtracker_update_peers(device->_acc, crit->peerCnt(), crit->peerAgents()); +#endif + } else { + err = hipErrorPeerAccessAlreadyEnabled; + } } else { err = hipErrorInvalidDevice; } @@ -97,9 +122,10 @@ hipError_t hipDeviceEnablePeerAccess (int peerDeviceId, unsigned int flags) //--- -hipError_t hipMemcpyPeer ( void* dst, int dstDevice, const void* src, int srcDevice, size_t sizeBytes ) +hipError_t hipMemcpyPeer (void* dst, int dstDevice, const void* src, int srcDevice, size_t sizeBytes) { - std::call_once(hip_initialized, ihipInit); + HIP_INIT_API(dst, dstDevice, src, srcDevice, sizeBytes); + // HCC has a unified memory architecture so device specifiers are not required. return hipMemcpy(dst, src, sizeBytes, hipMemcpyDefault); }; @@ -109,9 +135,9 @@ hipError_t hipMemcpyPeer ( void* dst, int dstDevice, const void* src, int srcD * @bug This function uses a synchronous copy */ //--- -hipError_t hipMemcpyPeerAsync ( void* dst, int dstDevice, const void* src, int srcDevice, size_t sizeBytes, hipStream_t stream ) +hipError_t hipMemcpyPeerAsync (void* dst, int dstDevice, const void* src, int srcDevice, size_t sizeBytes, hipStream_t stream) { - std::call_once(hip_initialized, ihipInit); + HIP_INIT_API(dst, dstDevice, src, srcDevice, sizeBytes, stream); // HCC has a unified memory architecture so device specifiers are not required. return hipMemcpyAsync(dst, src, sizeBytes, hipMemcpyDefault, stream); };