Merge branch 'privatestaging' of https://github.com/AMDComputeLibraries/HIP-privatestaging into privatestaging
[ROCm/hip commit: bdd26bd1f1]
This commit is contained in:
+102
-58
@@ -1,45 +1,70 @@
|
||||
cmake_minimum_required(VERSION 2.6)
|
||||
cmake_minimum_required(VERSION 2.8.3)
|
||||
project(hip_hcc)
|
||||
|
||||
#############################
|
||||
# Setup version information
|
||||
#############################
|
||||
set(HIP_VERSION_MAJOR "0")
|
||||
set(HIP_VERSION_MINOR "84")
|
||||
set(HIP_VERSION_PATCH "0")
|
||||
|
||||
#############################
|
||||
# Configure variables
|
||||
#############################
|
||||
# Determine HIP_PLATFORM
|
||||
if(NOT DEFINED HIP_PLATFORM)
|
||||
if(NOT DEFINED ENV{HIP_PLATFORM})
|
||||
execute_process(COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/bin/hipconfig --platform
|
||||
OUTPUT_VARIABLE HIP_PLATFORM
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
else()
|
||||
set(HIP_PLATFORM $ENV{HIP_PLATFORM} CACHE STRING "HIP Platform")
|
||||
endif()
|
||||
endif()
|
||||
message(STATUS "HIP Platform: " ${HIP_PLATFORM})
|
||||
|
||||
# If HIP_PLATFORM is hcc, we need HCC_HOME and HSA_PATH to be defined
|
||||
if(HIP_PLATFORM STREQUAL "hcc")
|
||||
# Determine HCC_HOME
|
||||
if(NOT DEFINED HCC_HOME)
|
||||
if(NOT DEFINED ENV{HCC_HOME})
|
||||
set(HCC_HOME "/opt/hcc" CACHE PATH "Path to which HCC has been installed")
|
||||
else()
|
||||
set(HCC_HOME $ENV{HCC_HOME} CACHE PATH "Path to which HCC has been installed")
|
||||
endif()
|
||||
endif()
|
||||
if(IS_ABSOLUTE ${HCC_HOME} AND EXISTS ${HCC_HOME} AND IS_DIRECTORY ${HCC_HOME})
|
||||
message(STATUS "Looking for HCC in: " ${HCC_HOME})
|
||||
else()
|
||||
message(FATAL_ERROR "Don't know where to find HCC. Please specify abolute path using -DHCC_HOME")
|
||||
endif()
|
||||
|
||||
# Determine HSA_PATH
|
||||
if(NOT DEFINED HSA_PATH)
|
||||
if(NOT DEFINED ENV{HSA_PATH})
|
||||
set(HSA_PATH "/opt/hsa" CACHE PATH "Path to which HSA runtime has been installed")
|
||||
else()
|
||||
set(HSA_PATH $ENV{HSA_PATH} CACHE PATH "Path to which HSA runtime has been installed")
|
||||
endif()
|
||||
endif()
|
||||
if(IS_ABSOLUTE ${HSA_PATH} AND EXISTS ${HSA_PATH} AND IS_DIRECTORY ${HSA_PATH})
|
||||
message(STATUS "Looking for HSA runtime in: " ${HSA_PATH})
|
||||
else()
|
||||
message(FATAL_ERROR "Don't know where to find HSA runtime. Please specify absolute path using -DHSA_PATH")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Set default build type
|
||||
if(NOT CMAKE_BUILD_TYPE)
|
||||
set(CMAKE_BUILD_TYPE "Release")
|
||||
endif()
|
||||
|
||||
if(NOT DEFINED HCC_HOME)
|
||||
if(NOT DEFINED ENV{HCC_HOME})
|
||||
set(HCC_HOME "/opt/hcc" CACHE PATH "Path to which HCC has been installed")
|
||||
else()
|
||||
set(HCC_HOME $ENV{HCC_HOME} CACHE PATH "Path to which HCC has been installed")
|
||||
endif()
|
||||
endif()
|
||||
if(IS_ABSOLUTE ${HCC_HOME} AND EXISTS ${HCC_HOME} AND IS_DIRECTORY ${HCC_HOME})
|
||||
message(STATUS "Looking for HCC in: " ${HCC_HOME})
|
||||
else()
|
||||
message(FATAL_ERROR "Don't know where to find HCC. Please specify abolute path using -DHCC_HOME")
|
||||
endif()
|
||||
|
||||
if(NOT DEFINED HSA_PATH)
|
||||
if(NOT DEFINED ENV{HSA_PATH})
|
||||
set(HSA_PATH "/opt/hsa" CACHE PATH "Path to which HSA runtime has been installed")
|
||||
else()
|
||||
set(HSA_PATH $ENV{HSA_PATH} CACHE PATH "Path to which HSA runtime has been installed")
|
||||
endif()
|
||||
endif()
|
||||
if(IS_ABSOLUTE ${HSA_PATH} AND EXISTS ${HSA_PATH} AND IS_DIRECTORY ${HSA_PATH})
|
||||
message(STATUS "Looking for HSA runtime in: " ${HSA_PATH})
|
||||
else()
|
||||
message(FATAL_ERROR "Don't know where to find HSA runtime. Please specify absolute path using -DHSA_PATH")
|
||||
endif()
|
||||
|
||||
# Determine HIP install path
|
||||
if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT AND CMAKE_INSTALL_PREFIX MATCHES "/usr/local")
|
||||
if(CMAKE_BUILD_TYPE MATCHES Debug)
|
||||
set(CMAKE_INSTALL_PREFIX ${CMAKE_CURRENT_SOURCE_DIR} CACHE PATH "Installation path for HIP" FORCE)
|
||||
elseif(CMAKE_BUILD_TYPE MATCHES Release)
|
||||
set(CMAKE_INSTALL_PREFIX "/opt/hip" CACHE PATH "Installation path for HIP" FORCE)
|
||||
set(CMAKE_INSTALL_PREFIX "/opt/rocm/hip" CACHE PATH "Installation path for HIP" FORCE)
|
||||
else()
|
||||
message(FATAL_ERROR "Invalid CMAKE_BUILD_TYPE specified. Valid values are Debug and Release")
|
||||
endif()
|
||||
@@ -50,51 +75,70 @@ else()
|
||||
message(FATAL_ERROR "Don't know where to install HIP. Please specify absolute path using -DCMAKE_INSTALL_PREFIX")
|
||||
endif()
|
||||
|
||||
include_directories(${PROJECT_SOURCE_DIR}/include)
|
||||
|
||||
set(CMAKE_CXX_COMPILER "${HCC_HOME}/bin/hcc")
|
||||
set(CMAKE_C_COMPILER "${HCC_HOME}/bin/hcc")
|
||||
|
||||
set(CMAKE_CXX_FLAGS " -hc -I${HCC_HOME}/include -I${HSA_PATH}/include -stdlib=libc++ ")
|
||||
set(CMAKE_C_FLAGS " -hc -I${HCC_HOME}/include -I${HSA_PATH}/include -stdlib=libc++ ")
|
||||
|
||||
set(SOURCE_FILES src/device_util.cpp
|
||||
src/hip_hcc.cpp
|
||||
src/hip_device.cpp
|
||||
src/hip_error.cpp
|
||||
src/hip_event.cpp
|
||||
src/hip_memory.cpp
|
||||
src/hip_peer.cpp
|
||||
src/hip_stream.cpp
|
||||
src/staging_buffer.cpp)
|
||||
|
||||
# Set if we need to build shared or static library
|
||||
if(NOT DEFINED ENV{HIP_USE_SHARED_LIBRARY})
|
||||
set(HIP_USE_SHARED_LIBRARY 0)
|
||||
else()
|
||||
set(HIP_USE_SHARED_LIBRARY $ENV{HIP_USE_SHARED_LIBRARY})
|
||||
endif()
|
||||
|
||||
#add_library(hip_hcc STATIC ${SOURCE_FILES})
|
||||
if(${HIP_USE_SHARED_LIBRARY} EQUAL 1)
|
||||
add_library(hip_hcc SHARED ${SOURCE_FILES})
|
||||
else()
|
||||
add_library(hip_hcc OBJECT ${SOURCE_FILES})
|
||||
#############################
|
||||
# Build steps
|
||||
#############################
|
||||
# Build hip_hcc if platform is hcc
|
||||
if(HIP_PLATFORM STREQUAL "hcc")
|
||||
include_directories(${PROJECT_SOURCE_DIR}/include)
|
||||
|
||||
set(CMAKE_CXX_COMPILER "${HCC_HOME}/bin/hcc")
|
||||
set(CMAKE_C_COMPILER "${HCC_HOME}/bin/hcc")
|
||||
|
||||
set(CMAKE_CXX_FLAGS " -hc -I${HCC_HOME}/include -I${HSA_PATH}/include -stdlib=libc++ ")
|
||||
set(CMAKE_C_FLAGS " -hc -I${HCC_HOME}/include -I${HSA_PATH}/include -stdlib=libc++ ")
|
||||
|
||||
set(SOURCE_FILES src/device_util.cpp
|
||||
src/hip_hcc.cpp
|
||||
src/hip_device.cpp
|
||||
src/hip_error.cpp
|
||||
src/hip_event.cpp
|
||||
src/hip_memory.cpp
|
||||
src/hip_peer.cpp
|
||||
src/hip_stream.cpp
|
||||
src/staging_buffer.cpp)
|
||||
|
||||
if(${HIP_USE_SHARED_LIBRARY} EQUAL 1)
|
||||
add_library(hip_hcc SHARED ${SOURCE_FILES})
|
||||
else()
|
||||
#add_library(hip_hcc STATIC ${SOURCE_FILES})
|
||||
add_library(hip_hcc OBJECT ${SOURCE_FILES})
|
||||
endif()
|
||||
|
||||
endif()
|
||||
|
||||
#install(TARGETS hip_hcc DESTINATION lib)
|
||||
if(${HIP_USE_SHARED_LIBRARY} EQUAL 1)
|
||||
install(TARGETS hip_hcc DESTINATION lib)
|
||||
else()
|
||||
install(DIRECTORY ${PROJECT_BINARY_DIR}/CMakeFiles/hip_hcc.dir/src/ DESTINATION lib)
|
||||
#############################
|
||||
# Install steps
|
||||
#############################
|
||||
# Install hip_hcc if platform is hcc
|
||||
if(HIP_PLATFORM STREQUAL "hcc")
|
||||
if(${HIP_USE_SHARED_LIBRARY} EQUAL 1)
|
||||
install(TARGETS hip_hcc DESTINATION lib)
|
||||
else()
|
||||
#install(TARGETS hip_hcc DESTINATION lib)
|
||||
install(DIRECTORY ${PROJECT_BINARY_DIR}/CMakeFiles/hip_hcc.dir/src/ DESTINATION lib)
|
||||
endif()
|
||||
endif()
|
||||
set(EXECUTE_COMMAND test ${CMAKE_INSTALL_PREFIX} -ef ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
execute_process(COMMAND ${EXECUTE_COMMAND} RESULT_VARIABLE INSTALL_SOURCE)
|
||||
|
||||
# Install src, bin, include if necessary
|
||||
execute_process(COMMAND test ${CMAKE_INSTALL_PREFIX} -ef ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
RESULT_VARIABLE INSTALL_SOURCE)
|
||||
if(NOT ${INSTALL_SOURCE} EQUAL 0)
|
||||
install(DIRECTORY src DESTINATION .)
|
||||
install(DIRECTORY bin DESTINATION . USE_SOURCE_PERMISSIONS)
|
||||
install(DIRECTORY include DESTINATION .)
|
||||
endif()
|
||||
|
||||
#############################
|
||||
# Packaging steps
|
||||
#############################
|
||||
set(CPACK_SET_DESTDIR TRUE)
|
||||
set(CPACK_INSTALL_PREFIX "/opt/rocm/hip")
|
||||
set(CPACK_PACKAGE_NAME "hip")
|
||||
|
||||
@@ -55,8 +55,8 @@ When adding a new HIP feature, add a new unit test as well.
|
||||
See [tests/README.md](README.md) for more information.
|
||||
|
||||
## Development Flow
|
||||
The Unit testing environment automatically rebuilds libhip_hcc.a and the tests when a change it made to the HIP source, and this
|
||||
is a great place to develop new features alongside the associated test.
|
||||
It is recommended that developers set the flag HIP_BUILD_LOCAL=1 so that the unit testing environment automatically rebuilds libhip_hcc.a and the tests when a change it made to the HIP source.
|
||||
Directed tests provide a great place to develop new features alongside the associated test.
|
||||
|
||||
For applications and benchmarks outside the directed test environment, developments should use a two-step development flow:
|
||||
- #1. Compile, link, and install HCC. See [Installation](README.md#Installation) notes.
|
||||
|
||||
+14
-10
@@ -34,30 +34,34 @@ Make sure HIP_PATH is pointed to `/where/to/install/hip` and PATH includes `$HIP
|
||||
## How do I get set up?
|
||||
|
||||
### Prerequisites - Choose Your Platform
|
||||
HIP code can be developed either on AMD HSA or Boltzmann platform using hcc compiler, or a CUDA platform with nvcc installed:
|
||||
HIP code can be developed either on AMD ROCm platform using hcc compiler, or a CUDA platform with nvcc installed:
|
||||
|
||||
#### AMD (hcc):
|
||||
|
||||
* Install [hcc](https://bitbucket.org/multicoreware/hcc/wiki/Home) including supporting HSA kernel and runtime driver stack
|
||||
* By default HIP looks for hcc in /opt/hcc (can be overridden by setting HCC_HOME environment variable)
|
||||
* By default HIP looks for HSA in /opt/hsa (can be overridden by setting HSA_PATH environment variable)
|
||||
* By default HIP looks for hcc in /opt/rocm/hcc (can be overridden by setting HCC_HOME environment variable)
|
||||
* By default HIP looks for HSA in /opt/rocm/hsa (can be overridden by setting HSA_PATH environment variable)
|
||||
* Ensure that ROCR runtime is installed and added to LD_LIBRARY_PATH
|
||||
* Install HIP (from this GitHub repot). By default HIP is installed into /opt/rocm/hip (can be overridden by setting HIP_PATH environment variable).
|
||||
|
||||
* Optionally, consider adding /opt/rocm/bin to your path to make it easier to use the tools.
|
||||
|
||||
#### NVIDIA (nvcc)
|
||||
* Install CUDA SDK from manufacturer website
|
||||
* By default HIP looks for CUDA SDK in /usr/local/cuda (can be overriden by setting CUDA_PATH env variable)
|
||||
|
||||
### Add HIP/bin to your path.
|
||||
For example, if this repot is cloned to ~/HIP, and you are running bash:
|
||||
```
|
||||
> export PATH=$PATH:~/HIP/bin
|
||||
|
||||
#### Verify your installation
|
||||
Run hipconfig (instructions below assume default installation path) :
|
||||
```
|
||||
Verify your can find hipconfig (one of the hip tools in bin dir):
|
||||
```
|
||||
> hipconfig -pn
|
||||
/home/me/HIP
|
||||
> /opt/rocm/bin/hipconfig --full
|
||||
```
|
||||
|
||||
Compile and run the [square sample](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/tree/master/samples/0_Intro/square).
|
||||
|
||||
|
||||
|
||||
### HCC Options
|
||||
|
||||
#### Compiling CodeXL markers for HIP Functions
|
||||
|
||||
@@ -196,7 +196,7 @@ if ($needHipHcc) {
|
||||
$HIP_USE_SHARED_LIBRARY = $ENV{'HIP_USE_SHARED_LIBRARY'};
|
||||
$HIP_USE_SHARED_LIBRARY = 0 unless defined $HIP_USE_SHARED_LIBRARY;
|
||||
|
||||
#$HIPLDFLAGS .= " -L/opt/hip/lib -lhip_hcc" ;
|
||||
#$HIPLDFLAGS .= " -L/opt/rocm/hip/lib -lhip_hcc" ;
|
||||
if ($HIP_USE_SHARED_LIBRARY) {
|
||||
$HIPLDFLAGS .= " -L$HIP_PATH/lib -Wl,--rpath=$HIP_PATH/lib -lhip_hcc";
|
||||
} else {
|
||||
|
||||
@@ -107,9 +107,15 @@ HIP is a portable C++ language that supports a strong subset of the CUDA run-tim
|
||||
|
||||
A C++ dialect, hc is supported by the AMD HCC compiler. It provides C++ run time, C++ kernel-launch APIs (parallel_for_each), C++ kernel language, and several memory-management options, including pointers, arrays and array_view (with implicit data synchronization). It's intended to be a leading indicator of the ISO C++ standard.
|
||||
|
||||
|
||||
### HIP detected my platform (hcc vs nvcc) incorrectly - what should I do?
|
||||
|
||||
HIP will set the platform to HCC if it sees that the AMD graphics driver is installed and has detected an AMD GPU.
|
||||
Sometimes this isn't what you want - you can force HIP to recognize the platform by setting HIP_PLATFORM to hcc (or nvcc)
|
||||
```
|
||||
export HIP_PLATFORM=hcc
|
||||
|
||||
```
|
||||
One symptom of this problem is the message "error: 'unknown error'(11) at square.hipref.cpp:56". This can occur if you have a CUDA installation on an AMD platform, and HIP incorrectly detects the platform as nvcc. HIP may be able to compile the application using the nvcc tool-chain, but will generate this error at runtime since the platform does not have a CUDA device. The fix is to set HIP_PLATFORM=hcc and rebuild the issue.
|
||||
|
||||
If you see issues related to incorrect platform detection, please file an issue with the GitHub issue tracker so we can improve HIP's platform detection logic.
|
||||
|
||||
@@ -35,6 +35,14 @@ THE SOFTWARE.
|
||||
//Use the new HCC accelerator_view::copy instead of am_copy
|
||||
#define USE_AV_COPY 0
|
||||
|
||||
// Compile peer-to-peer support.
|
||||
// >= 2 : use HCC hc:accelerator::get_is_peer
|
||||
// >= 3 : use hc::am_memtracker_update_peers(...)
|
||||
#define USE_PEER_TO_PEER 0
|
||||
|
||||
// Use new lock API in HCC:
|
||||
#define USE_HCC_LOCK 0
|
||||
|
||||
//#define INLINE static inline
|
||||
|
||||
//---
|
||||
@@ -494,11 +502,23 @@ struct ihipEvent_t {
|
||||
// will lock the mutex on construction and unlock on destruction.
|
||||
//
|
||||
// MUTEX_TYPE is template argument so can easily convert to FakeMutex for performance or stress testing.
|
||||
template <typename MUTEX_TYPE>
|
||||
template <class MUTEX_TYPE>
|
||||
class ihipDeviceCriticalBase_t : LockedBase<MUTEX_TYPE>
|
||||
{
|
||||
public:
|
||||
ihipDeviceCriticalBase_t() : _stream_id(0) {};
|
||||
ihipDeviceCriticalBase_t() : _stream_id(0), _peerAgents(nullptr) {};
|
||||
|
||||
void init(unsigned deviceCnt) {
|
||||
assert(_peerAgents == nullptr);
|
||||
_peerAgents = new hsa_agent_t[deviceCnt];
|
||||
};
|
||||
|
||||
~ihipDeviceCriticalBase_t() {
|
||||
if (_peerAgents != nullptr) {
|
||||
delete _peerAgents;
|
||||
_peerAgents = nullptr;
|
||||
}
|
||||
}
|
||||
friend class LockedAccessor<ihipDeviceCriticalBase_t>;
|
||||
|
||||
std::list<ihipStream_t*> &streams() { return _streams; };
|
||||
@@ -507,10 +527,24 @@ public:
|
||||
// "Allocate" a stream ID:
|
||||
ihipStream_t::SeqNum_t incStreamId() { return _stream_id++; };
|
||||
|
||||
bool addPeer(ihipDevice_t *peer);
|
||||
bool removePeer(ihipDevice_t *peer);
|
||||
void resetPeers(ihipDevice_t *thisDevice);
|
||||
|
||||
uint32_t peerCnt() const { return _peerCnt; };
|
||||
hsa_agent_t *peerAgents() const { return _peerAgents; };
|
||||
|
||||
|
||||
private:
|
||||
std::list<ihipStream_t*> _streams; // streams associated with this device.
|
||||
ihipStream_t::SeqNum_t _stream_id;
|
||||
|
||||
// These reflect the currently Enabled set of peers for this GPU:
|
||||
std::list<ihipDevice_t*> _peers; // list of enabled peer devices.
|
||||
uint32_t _peerCnt; // number of enabled peers
|
||||
hsa_agent_t *_peerAgents; // efficient packed array of enabled agents (to use for allocations.)
|
||||
private:
|
||||
void recomputePeerAgents();
|
||||
};
|
||||
|
||||
// Note Mutex selected based on DeviceMutex
|
||||
@@ -530,7 +564,7 @@ class ihipDevice_t
|
||||
{
|
||||
public: // Functions:
|
||||
ihipDevice_t() {}; // note: calls constructor for _criticalData
|
||||
void init(unsigned device_index, hc::accelerator &acc, unsigned flags);
|
||||
void init(unsigned device_index, unsigned deviceCnt, hc::accelerator &acc, unsigned flags);
|
||||
~ihipDevice_t();
|
||||
|
||||
void locked_addStream(ihipStream_t *s);
|
||||
@@ -539,6 +573,8 @@ public: // Functions:
|
||||
void locked_waitAllStreams();
|
||||
void locked_syncDefaultStream(bool waitOnSelf);
|
||||
|
||||
ihipDeviceCritical_t &criticalData() { return _criticalData; }; // TODO, move private. Fix P2P.
|
||||
|
||||
public: // Data, set at initialization:
|
||||
unsigned _device_index; // index into g_devices.
|
||||
|
||||
|
||||
@@ -24,7 +24,9 @@ THE SOFTWARE.
|
||||
* @brief Contains definitions of APIs for HIP runtime.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
//#pragma once
|
||||
#ifndef HIP_RUNTIME_H
|
||||
#define HIP_RUNTIME_H
|
||||
|
||||
//---
|
||||
// Top part of file can be compiled with any compiler
|
||||
@@ -574,4 +576,4 @@ do {\
|
||||
*/
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -19,8 +19,9 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
//#pragma once
|
||||
#ifndef HIP_RUNTIME_API_H
|
||||
#define HIP_RUNTIME_API_H
|
||||
/**
|
||||
* @file hcc_detail/hip_runtime_api.h
|
||||
* @brief Contains C function APIs for HIP runtime. This file does not use any HCC builtin or special language extensions (-hc mode) ; those functions in hip_runtime.h.
|
||||
@@ -907,46 +908,58 @@ hipError_t hipMemGetInfo (size_t * free, size_t * total) ;
|
||||
*
|
||||
* Returns "1" in @p canAccessPeer if the specified @p device is capable
|
||||
* of directly accessing memory physically located on peerDevice , or "0" if not.
|
||||
*
|
||||
* Returns "0" in @p canAccessPeer if deviceId == peerDeviceId, and both are valid devices : a device is not a peer of itself.
|
||||
*
|
||||
*
|
||||
*
|
||||
* @returns #hipSuccess,
|
||||
* @returns #hipErrorInvalidDevice if deviceId or peerDeviceId are not valid devices
|
||||
*/
|
||||
hipError_t hipDeviceCanAccessPeer ( int* canAccessPeer, int device, int peerDevice );
|
||||
|
||||
hipError_t hipDeviceCanAccessPeer (int* canAccessPeer, int deviceId, int peerDeviceId);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Disables registering memory on peerDevice for direct access from the current device.
|
||||
* @brief Enable direct access from current device's virtual address space to memory allocations physically located on a peer device.
|
||||
*
|
||||
* If there are any allocations on peerDevice which were registered in the current device using hipPeerRegister() then these allocations will be automatically unregistered.
|
||||
* Returns hipErrorPeerAccessNotEnabled if direct access to memory on peerDevice has not yet been enabled from the current device.
|
||||
* Memory which already allocated on peer device will be mapped into the address space of the current device. In addition, all
|
||||
* future memory allocations on peerDeviceId will be mapped into the address space of the current device when the memory is allocated.
|
||||
* The peer memory remains accessible from the current device until a call to hipDeviceDisablePeerAccess or hipDeviceReset.
|
||||
*
|
||||
* @param [in] peerDevice
|
||||
* TODO:cudaErrorPeerAccessNotEnabled and cudaErrorInvalidDevice error not supported in HIP, return hipErrorUnknown
|
||||
* Returns #hipSuccess, #hipErrorUnknown
|
||||
*/
|
||||
hipError_t hipDeviceDisablePeerAccess ( int peerDevice );
|
||||
|
||||
/**
|
||||
* @brief Enables registering memory on peerDevice for direct access from the current device.
|
||||
*
|
||||
* @param [in] peerDevice
|
||||
* @param [in] peerDeviceId
|
||||
* @param [in] flags
|
||||
*
|
||||
* TODO:cudaErrorInvalidDevice error not supported in HIP, return hipErrorUnknown
|
||||
* Returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue, #hipErrorUnknown
|
||||
* Returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue,
|
||||
* @returns #hipErrorPeerAccessAlreadyEnabled if peer access is already enabled for this device.
|
||||
*/
|
||||
hipError_t hipDeviceEnablePeerAccess ( int peerDevice, unsigned int flags );
|
||||
hipError_t hipDeviceEnablePeerAccess (int peerDeviceId, unsigned int flags);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Disable direct access from current device's virtual address space to memory allocations physically located on a peer device.
|
||||
*
|
||||
* Returns hipErrorPeerAccessNotEnabled if direct access to memory on peerDevice has not yet been enabled from the current device.
|
||||
*
|
||||
* @param [in] peerDeviceId
|
||||
*
|
||||
* Returns #hipSuccess, #hipErrorPeerAccessNotEnabled
|
||||
*/
|
||||
hipError_t hipDeviceDisablePeerAccess (int peerDeviceId);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Copies memory from one device to memory on another device.
|
||||
*
|
||||
* @param [out] dst - Destination device pointer.
|
||||
* @param [in] dstDevice - Destination device
|
||||
* @param [in] dstDeviceId - Destination device
|
||||
* @param [in] src - Source device pointer
|
||||
* @param [in] srcDevice - Source device
|
||||
* @param [in] srcDeviceId - Source device
|
||||
* @param [in] sizeBytes - Size of memory copy in bytes
|
||||
*
|
||||
* Returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDevice
|
||||
*/
|
||||
hipError_t hipMemcpyPeer ( void* dst, int dstDevice, const void* src, int srcDevice, size_t sizeBytes );
|
||||
hipError_t hipMemcpyPeer (void* dst, int dstDeviceId, const void* src, int srcDeviceId, size_t sizeBytes);
|
||||
|
||||
/**
|
||||
* @brief Copies memory from one device to memory on another device.
|
||||
@@ -961,7 +974,7 @@ hipError_t hipMemcpyPeer ( void* dst, int dstDevice, const void* src, int srcD
|
||||
* Returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDevice
|
||||
*/
|
||||
#if __cplusplus
|
||||
hipError_t hipMemcpyPeerAsync ( void* dst, int dstDevice, const void* src, int srcDevice, size_t sizeBytes, hipStream_t stream=0 );
|
||||
hipError_t hipMemcpyPeerAsync ( void* dst, int dstDeviceId, const void* src, int srcDevice, size_t sizeBytes, hipStream_t stream=0 );
|
||||
#else
|
||||
hipError_t hipMemcpyPeerAsync(void* dst, int dstDevice, const void* src, int srcDevice, size_t sizeBytes, hipStream_t stream);
|
||||
#endif
|
||||
@@ -1053,3 +1066,5 @@ hipError_t hipDriverGetVersion(int *driverVersion) ;
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
|
||||
#endif
|
||||
|
||||
@@ -19,7 +19,12 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
//#pragma once
|
||||
|
||||
#ifndef HIP_TEXTURE_H
|
||||
#define HIP_TEXTURE_H
|
||||
|
||||
/**
|
||||
* @file hcc_detail/hip_texture.h
|
||||
* @brief HIP C++ Texture API for hcc compiler
|
||||
@@ -201,3 +206,6 @@ hipError_t hipUnbindTexture(struct texture<T, dim, readMode> *tex)
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@@ -25,6 +25,9 @@ THE SOFTWARE.
|
||||
* @brief Defines the different newt vector types for HIP runtime.
|
||||
*/
|
||||
|
||||
#ifndef HIP_VECTOR_TYPES_H
|
||||
#define HIP_VECTOR_TYPES_H
|
||||
|
||||
#if defined (__HCC__) && (__hcc_workweek__ < 16032)
|
||||
#error("This version of HIP requires a newer version of HCC.");
|
||||
#endif
|
||||
@@ -112,7 +115,7 @@ typedef hc::short_vector::double2 double2;
|
||||
typedef hc::short_vector::double3 double3;
|
||||
typedef hc::short_vector::double4 double4;
|
||||
|
||||
/*
|
||||
|
||||
///---
|
||||
// Inline functions for creating vector types from basic types
|
||||
#define ONE_COMPONENT_ACCESS(T, VT) inline VT make_ ##VT (T x) { VT t; t.x = x; return t; };
|
||||
@@ -195,4 +198,7 @@ ONE_COMPONENT_ACCESS (double, double1);
|
||||
TWO_COMPONENT_ACCESS (double, double2);
|
||||
THREE_COMPONENT_ACCESS(double, double3);
|
||||
FOUR_COMPONENT_ACCESS (double, double4);
|
||||
*/
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@@ -25,6 +25,9 @@ THE SOFTWARE.
|
||||
* @brief TODO-doc
|
||||
*/
|
||||
|
||||
#ifndef HOST_DEFINES_H
|
||||
#define HOST_DEFINES_H
|
||||
|
||||
#ifdef __HCC__
|
||||
/**
|
||||
* Function and kernel markers
|
||||
@@ -67,3 +70,5 @@ THE SOFTWARE.
|
||||
#define __constant__
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -17,7 +17,9 @@ OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
//#pragma once
|
||||
#ifndef STAGING_BUFFER_H
|
||||
#define STAGING_BUFFER_H
|
||||
|
||||
#include "hsa.h"
|
||||
|
||||
@@ -58,3 +60,5 @@ private:
|
||||
hsa_signal_t _completion_signal[_max_buffers];
|
||||
std::mutex _copy_lock; // provide thread-safe access
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
@@ -16,7 +16,10 @@ LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#pragma once
|
||||
//#pragma once
|
||||
|
||||
#ifndef TRACE_HELPER_H
|
||||
#define TRACE_HELPER_H
|
||||
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
@@ -116,3 +119,5 @@ inline std::string ToString(T first, Args... args)
|
||||
{
|
||||
return ToString(first) + ", " + ToString(args...) ;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
Symlink
+1
@@ -0,0 +1 @@
|
||||
../include
|
||||
@@ -138,7 +138,7 @@ typedef struct hipPointerAttribute_t {
|
||||
* @enum
|
||||
* @ingroup Enumerations
|
||||
*/
|
||||
// Developer note - when updating these, update the hipErrorName and hipErrorString functions
|
||||
// Developer note - when updating these, update the hipErrorName and hipErrorString functions in NVCC and HCC paths
|
||||
typedef enum hipError_t {
|
||||
hipSuccess = 0 ///< Successful completion.
|
||||
,hipErrorMemoryAllocation ///< Memory allocation error.
|
||||
@@ -155,6 +155,8 @@ typedef enum hipError_t {
|
||||
,hipErrorNoDevice ///< Call to hipGetDeviceCount returned 0 devices
|
||||
,hipErrorNotReady ///< Indicates that asynchronous operations enqueued earlier are not ready. This is not actually an error, but is used to distinguish from hipSuccess (which indicates completion). APIs that return this error include hipEventQuery and hipStreamQuery.
|
||||
,hipErrorUnknown ///< Unknown error.
|
||||
,hipErrorPeerAccessNotEnabled ///< Peer access was never enabled from the current device.
|
||||
,hipErrorPeerAccessAlreadyEnabled ///< Peer access was already enabled from the current device.
|
||||
,hipErrorRuntimeMemory ///< HSA runtime memory call returned error. Typically not seen in production systems.
|
||||
,hipErrorRuntimeOther ///< HSA runtime call other than memory returned error. Typically not seen in production systems.
|
||||
,hipErrorTbd ///< Marker that more error codes are needed.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#Dependencies : [MYHIP]/bin must be in user's path.
|
||||
|
||||
HIP_PATH=../../..
|
||||
HIP_PATH?=../../..
|
||||
HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --platform)
|
||||
HIPCC=$(HIP_PATH)/bin/hipcc
|
||||
|
||||
|
||||
@@ -24,11 +24,14 @@ THE SOFTWARE.
|
||||
#include <hip_runtime.h>
|
||||
|
||||
|
||||
#define CHECK(error) \
|
||||
#define CHECK(cmd) \
|
||||
{\
|
||||
hipError_t error = cmd;\
|
||||
if (error != hipSuccess) { \
|
||||
fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), error,__FILE__, __LINE__); \
|
||||
exit(EXIT_FAILURE);\
|
||||
}
|
||||
}\
|
||||
}
|
||||
|
||||
void __global__
|
||||
bit_extract_kernel(hipLaunchParm lp, uint32_t *C_d, const uint32_t *A_d, size_t N)
|
||||
|
||||
@@ -0,0 +1,66 @@
|
||||
HCC_HOME?=/opt/rocm/hcc
|
||||
HCC = $(HCC_HOME)/bin/hcc
|
||||
|
||||
HCC_CFLAGS= `$(HCC_HOME)/bin/hcc-config --cxxflags`
|
||||
HCC_LDFLAGS= `$(HCC_HOME)/bin/hcc-config --ldflags`
|
||||
|
||||
CPPAMP_CFLAGS= -std=c++amp -stdlib=libc++ -I/opt/hcc/include
|
||||
CPPAMP_LDFLAGS= -std=c++amp -L/opt/hcc/lib -Wl,--rpath=/opt/hcc/lib -lc++ -lc++abi -ldl -lpthread -Wl,--whole-archive -lmcwamp -Wl,--no-whole-archive
|
||||
|
||||
HIP_PATH?=/opt/rocm/hip
|
||||
HIPCC=$(HIP_PATH)/bin/hipcc
|
||||
HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --platform)
|
||||
|
||||
ifneq (${HIP_PLATFORM}, hcc)
|
||||
$(error hcc_dialects requires hcc compiler and only runs on hcc platform)
|
||||
endif
|
||||
|
||||
|
||||
TARGETS=vadd_hc_arrayview vadd_hc_array vadd_amp_arrayview vadd_hip
|
||||
|
||||
all: $(TARGETS)
|
||||
|
||||
clean:
|
||||
rm -f $(TARGETS) *.o
|
||||
|
||||
run: $(TARGETS)
|
||||
@for t in $(TARGETS); do\
|
||||
echo "Running $$t"; \
|
||||
./$$t; \
|
||||
done
|
||||
|
||||
|
||||
# HCC version:
|
||||
vadd_hc_arrayview.o: vadd_hc_arrayview.cpp
|
||||
$(HCC) $(HCC_CFLAGS) -c $< -o $@
|
||||
vadd_hc_arrayview: vadd_hc_arrayview.o
|
||||
$(HCC) $(HCC_LDFLAGS) $< -o $@
|
||||
|
||||
|
||||
# HCC version, using explicit arrays:
|
||||
vadd_hc_array.o: vadd_hc_array.cpp
|
||||
$(HCC) $(HCC_CFLAGS) -c $< -o $@
|
||||
vadd_hc_array: vadd_hc_array.o
|
||||
$(HCC) $(HCC_LDFLAGS) $< -o $@
|
||||
|
||||
|
||||
# HCC version, using AM (accelerator memory) pointer
|
||||
vadd_hc_am.o: vadd_hc_am.cpp
|
||||
$(HCC) $(HCC_CFLAGS) -c $< -o $@
|
||||
vadd_hc_am: vadd_hc_am.o
|
||||
$(HCC) $(HCC_LDFLAGS) $< -o $@
|
||||
|
||||
|
||||
|
||||
# HIP version:
|
||||
vadd_hip.o: vadd_hip.cpp
|
||||
$(HIPCC) -c $< -o $@
|
||||
vadd_hip: vadd_hip.o
|
||||
$(HIPCC) $< -o $@
|
||||
|
||||
|
||||
# AMP version:
|
||||
vadd_amp_arrayview.o: vadd_amp_arrayview.cpp
|
||||
$(HCC) $(CPPAMP_CFLAGS) -c $< -o $@
|
||||
vadd_amp_arrayview: vadd_amp_arrayview.o
|
||||
$(HCC) $(CPPAMP_LDFLAGS) $< -o $@
|
||||
@@ -0,0 +1,48 @@
|
||||
// Simple test showing how to use C++AMP syntax with array_view.
|
||||
// The code uses AMP's array_view class, which provides automatic data synchronization
|
||||
// of data between the host and the accelerator. As noted below, the HCC runtime
|
||||
// will automatically copy data to and from the host, without the user needing
|
||||
// to manually perform such copies. This is an excellent mode for developers
|
||||
// new to GPU programming and matches the memory models provided by recent systems where
|
||||
// CPU and GPU share the same memory pool. Advanced programmers may prefer
|
||||
// more explicit control over the data movement - shown in the other vadd_hc_array and
|
||||
// vadd_hc_am examples.
|
||||
// This example shows the similarity between C++AMP and and HC for simple cases where
|
||||
// implicit data transfer is used - really the only difference is the namespace.
|
||||
// Other examples show some of the more advanced controls.
|
||||
|
||||
#include <amp.h>
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int sizeElements = 1000000;
|
||||
|
||||
// Allocate auto-managed host/device views of data:
|
||||
concurrency::array_view<float> A(sizeElements);
|
||||
concurrency::array_view<float> B(sizeElements);
|
||||
concurrency::array_view<float> C(sizeElements);
|
||||
|
||||
// Initialize host data
|
||||
for (int i=0; i<sizeElements; i++) {
|
||||
A[i] = 1.618f * i;
|
||||
B[i] = 3.142f * i;
|
||||
}
|
||||
C.discard_data(); // tell runtime not to copy CPU host data.
|
||||
|
||||
|
||||
// Launch kernel onto default accelerator
|
||||
// The HCC runtime will ensure that A and B are available on the accelerator before launching the kernel.
|
||||
concurrency::parallel_for_each(concurrency::extent<1> (sizeElements),
|
||||
[=] (concurrency::index<1> idx) restrict(amp) {
|
||||
int i = idx[0];
|
||||
C[i] = A[i] + B[i];
|
||||
});
|
||||
|
||||
for (int i=0; i<sizeElements; i++) {
|
||||
float ref= 1.618f * i + 3.142f * i;
|
||||
// Because C is an array_view, the HCC runtime will copy C back to host at first access here:
|
||||
if (C[i] != ref) {
|
||||
printf ("error:%d computed=%6.2f, reference=%6.2f\n", i, C[i], ref);
|
||||
}
|
||||
};
|
||||
}
|
||||
@@ -0,0 +1,59 @@
|
||||
// Simple test showing how to use HC syntax with AM (accelerator memory).
|
||||
// AM provides a set of c-style memory management routines for allocating,
|
||||
// freeing, and copying memory. am_alloc returns a device pointer
|
||||
// which can only be used on the device. The programmer has full control
|
||||
// over when data is copied.
|
||||
|
||||
#include <hc.hpp>
|
||||
#include <hc_am.hpp>
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int sizeElements = 1000000;
|
||||
size_t sizeBytes = sizeElements * sizeof(float);
|
||||
|
||||
// Allocate host memory
|
||||
float *A_h = (float*)malloc(sizeBytes);
|
||||
float *B_h = (float*)malloc(sizeBytes);
|
||||
float *C_h = (float*)malloc(sizeBytes);
|
||||
|
||||
// Allocate device pointers:
|
||||
// Unlike array_view, these must be explicitly managed by user:
|
||||
hc::accelerator acc; // grab default accelerator where we want to allocate memory:
|
||||
hc::accelerator_view av = acc.get_default_view();
|
||||
|
||||
float *A_d, *B_d, *C_d;
|
||||
A_d = hc::am_alloc(sizeBytes, acc, 0);
|
||||
B_d = hc::am_alloc(sizeBytes, acc, 0);
|
||||
C_d = hc::am_alloc(sizeBytes, acc, 0);
|
||||
|
||||
// Initialize host data
|
||||
for (int i=0; i<sizeElements; i++) {
|
||||
A_h[i] = 1.618f * i;
|
||||
B_h[i] = 3.142f * i;
|
||||
}
|
||||
|
||||
av.copy(A_h, A_d); // C++ copy H2D
|
||||
av.copy(B_h, B_d); //C++ copy H2D
|
||||
|
||||
// Launch kernel onto AV.
|
||||
// Because the kernel PFE and the copies are submitted to same AV, they will execute in order
|
||||
// and we don't need additional synchronization to ensure the copies complete before the PFE begins.
|
||||
hc::parallel_for_each(av, hc::extent<1> (sizeElements),
|
||||
[&] (hc::index<1> idx) [[hc]] {
|
||||
int i = idx[0];
|
||||
C_d[i] = A_d[i] + B_d[i];
|
||||
});
|
||||
|
||||
|
||||
// This copy is in same AV as the kernel and thus will wait for the kernel to finish before executing.
|
||||
av.copy(C_d, C_h); // C++ copy D2H
|
||||
|
||||
|
||||
for (int i=0; i<sizeElements; i++) {
|
||||
float ref= 1.618f * i + 3.142f * i;
|
||||
if (C_h[i] != ref) {
|
||||
printf ("error:%d computed=%6.2f, reference=%6.2f\n", i, C_h[i], ref);
|
||||
}
|
||||
};
|
||||
}
|
||||
@@ -0,0 +1,53 @@
|
||||
// Simple test showing how to use HC syntax with array.
|
||||
// Array provides a type-safe C++ mechanism to allocate accelerator memory.
|
||||
// Like array_view, hc::array provides multi-dimensional indexing capability,
|
||||
// and is typed. However, unlike array_view, hc::array does not provide
|
||||
// automatic data management capabilities - instead the programmer
|
||||
// takes the reins and controls when copies are executed.
|
||||
|
||||
#include <hc.hpp>
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int sizeElements = 1000000;
|
||||
size_t sizeBytes = sizeElements * sizeof(float);
|
||||
|
||||
// Allocate host memory
|
||||
float *A_h = (float*)malloc(sizeBytes);
|
||||
float *B_h = (float*)malloc(sizeBytes);
|
||||
float *C_h = (float*)malloc(sizeBytes);
|
||||
|
||||
// Allocate device arrays<>
|
||||
// Unlike array_view, these must be explicitly managed by user:
|
||||
hc::array<float> A_d(sizeElements);
|
||||
hc::array<float> B_d(sizeElements);
|
||||
hc::array<float> C_d(sizeElements);
|
||||
|
||||
// Initialize host data
|
||||
for (int i=0; i<sizeElements; i++) {
|
||||
A_h[i] = 1.618f * i;
|
||||
B_h[i] = 3.142f * i;
|
||||
}
|
||||
|
||||
hc::copy(A_h, A_d); // C++ copy H2D
|
||||
hc::copy(B_h, B_d); // C++ copy H2D
|
||||
|
||||
// Launch kernel onto default accelerator:
|
||||
// array<> types are not implicitly copied, so we performed copies above.
|
||||
hc::parallel_for_each(hc::extent<1> (sizeElements),
|
||||
[&] (hc::index<1> idx) [[hc]] {
|
||||
int i = idx[0];
|
||||
C_d[i] = A_d[i] + B_d[i];
|
||||
});
|
||||
|
||||
// HCC runtime knows that C_d depends on previous PFE and will force the copy to wait for the PFE to complte.
|
||||
hc::copy(C_d, C_h); // C++ copy D2H
|
||||
|
||||
|
||||
for (int i=0; i<sizeElements; i++) {
|
||||
float ref= 1.618f * i + 3.142f * i;
|
||||
if (C_h[i] != ref) {
|
||||
printf ("error:%d computed=%6.2f, reference=%6.2f\n", i, C_h[i], ref);
|
||||
}
|
||||
};
|
||||
}
|
||||
@@ -0,0 +1,33 @@
|
||||
#include <hc.hpp>
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int size = 1000000;
|
||||
|
||||
// Allocate auto-managed host/device views of data:
|
||||
hc::array_view<float> A(size);
|
||||
hc::array_view<float> B(size);
|
||||
hc::array_view<float> C(size);
|
||||
|
||||
// Initialize host data
|
||||
for (int i=0; i<size; i++) {
|
||||
A[i] = 1.618f * i;
|
||||
B[i] = 3.142f * i;
|
||||
}
|
||||
C.discard_data(); // tell runtime not to copy CPU host data.
|
||||
|
||||
|
||||
// Launch kernel onto default accelerator:
|
||||
hc::parallel_for_each(hc::extent<1> (size),
|
||||
[=] (hc::index<1> idx) [[hc]] {
|
||||
int i = idx[0];
|
||||
C[i] = A[i] + B[i];
|
||||
});
|
||||
|
||||
for (int i=0; i<size; i++) {
|
||||
float ref= 1.618f * i + 3.142f * i;
|
||||
if (C[i] != ref) {
|
||||
printf ("error:%d computed=%6.2f, reference=%6.2f\n", i, C[i], ref);
|
||||
}
|
||||
};
|
||||
}
|
||||
@@ -0,0 +1,48 @@
|
||||
// Simple test showing how to use HC syntax with array_view.
|
||||
// The code uses AMP's array_view class, which provides automatic data synchronization
|
||||
// of data between the host and the accelerator. As noted below, the HCC runtime
|
||||
// will automatically copy data to and from the host, without the user needing
|
||||
// to manually perform such copies. This is an excellent mode for developers
|
||||
// new to GPU programming and matches the memory models provided by recent systems where
|
||||
// CPU and GPU share the same memory pool. Advanced programmers may prefer
|
||||
// more explicit control over the data movement - shown in the other vadd_hc_array and
|
||||
// vadd_hc_am examples.
|
||||
// This example shows the similarity between C++AMP and and HC for simple cases where
|
||||
// implicit data transfer is used - really the only difference is the namespace.
|
||||
// Other examples show some of the more advanced controls.
|
||||
|
||||
#include <hc.hpp>
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int sizeElements = 1000000;
|
||||
|
||||
// Allocate auto-managed host/device views of data:
|
||||
hc::array_view<float> A(sizeElements);
|
||||
hc::array_view<float> B(sizeElements);
|
||||
hc::array_view<float> C(sizeElements);
|
||||
|
||||
// Initialize host data
|
||||
for (int i=0; i<sizeElements; i++) {
|
||||
A[i] = 1.618f * i;
|
||||
B[i] = 3.142f * i;
|
||||
}
|
||||
C.discard_data(); // tell runtime not to copy CPU host data.
|
||||
|
||||
|
||||
// Launch kernel onto default accelerator:
|
||||
// The HCC runtime will ensure that A and B are available on the accelerator before launching the kernel.
|
||||
hc::parallel_for_each(hc::extent<1> (sizeElements),
|
||||
[=] (hc::index<1> idx) [[hc]] {
|
||||
int i = idx[0];
|
||||
C[i] = A[i] + B[i];
|
||||
});
|
||||
|
||||
for (int i=0; i<sizeElements; i++) {
|
||||
float ref= 1.618f * i + 3.142f * i;
|
||||
// Because C is an array_view, the HCC runtime will copy C back to host at first access here:
|
||||
if (C[i] != ref) {
|
||||
printf ("error:%d computed=%6.2f, reference=%6.2f\n", i, C[i], ref);
|
||||
}
|
||||
};
|
||||
}
|
||||
@@ -0,0 +1,51 @@
|
||||
#include <hip_runtime.h>
|
||||
|
||||
__global__ void vadd_hip(hipLaunchParm lp, const float *a, const float *b, float *c, int N)
|
||||
{
|
||||
int idx = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
|
||||
|
||||
if (idx < N) {
|
||||
c[idx] = a[idx] + b[idx];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int sizeElements = 1000000;
|
||||
size_t sizeBytes = sizeElements * sizeof(float);
|
||||
|
||||
// Allocate host memory
|
||||
float *A_h = (float*)malloc(sizeBytes);
|
||||
float *B_h = (float*)malloc(sizeBytes);
|
||||
float *C_h = (float*)malloc(sizeBytes);
|
||||
|
||||
// Allocate device memory:
|
||||
float *A_d, *B_d, *C_d;
|
||||
hipMalloc(&A_d, sizeBytes);
|
||||
hipMalloc(&B_d, sizeBytes);
|
||||
hipMalloc(&C_d, sizeBytes);
|
||||
|
||||
// Initialize host data
|
||||
for (int i=0; i<sizeElements; i++) {
|
||||
A_h[i] = 1.618f * i;
|
||||
B_h[i] = 3.142f * i;
|
||||
}
|
||||
|
||||
hipMemcpy(A_d, A_h, sizeBytes, hipMemcpyHostToDevice);
|
||||
hipMemcpy(B_d, B_h, sizeBytes, hipMemcpyHostToDevice);
|
||||
|
||||
// Launch kernel onto default accelerator:
|
||||
int blockSize = 256; // pick arbitrary block size
|
||||
int blocks = (sizeElements+blockSize-1)/blockSize; // round up to launch enough blocks
|
||||
hipLaunchKernel(vadd_hip, dim3(blocks), dim3(blockSize), 0, 0, A_d, B_d, C_d, sizeElements);
|
||||
|
||||
hipMemcpy(C_h, C_d, sizeBytes, hipMemcpyDeviceToHost);
|
||||
|
||||
for (int i=0; i<sizeElements; i++) {
|
||||
float ref= 1.618f * i + 3.142f * i;
|
||||
if (C_h[i] != ref) {
|
||||
printf ("error:%d computed=%6.2f, reference=%6.2f\n", i, C_h[i], ref);
|
||||
}
|
||||
};
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
HIP_PATH=../../..
|
||||
HIP_PATH?=../../..
|
||||
HIPCC=$(HIP_PATH)/bin/hipcc
|
||||
|
||||
all: square.hip.out
|
||||
|
||||
@@ -22,11 +22,14 @@ THE SOFTWARE.
|
||||
#include <stdio.h>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#define CHECK(error) \
|
||||
#define CHECK(cmd) \
|
||||
{\
|
||||
hipError_t error = cmd;\
|
||||
if (error != cudaSuccess) { \
|
||||
fprintf(stderr, "error: '%s'(%d) at %s:%d\n", cudaGetErrorString(error), error,__FILE__, __LINE__); \
|
||||
exit(EXIT_FAILURE);\
|
||||
}
|
||||
}\
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
|
||||
@@ -22,11 +22,14 @@ THE SOFTWARE.
|
||||
#include <stdio.h>
|
||||
#include <hip_runtime.h>
|
||||
|
||||
#define CHECK(error) \
|
||||
#define CHECK(cmd) \
|
||||
{\
|
||||
hipError_t error = cmd;\
|
||||
if (error != hipSuccess) { \
|
||||
fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), error,__FILE__, __LINE__); \
|
||||
exit(EXIT_FAILURE);\
|
||||
}
|
||||
}\
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
HIP_PATH=../../..
|
||||
HIP_PATH?=../../..
|
||||
HIPCC=$(HIP_PATH)/bin/hipcc
|
||||
|
||||
EXE=hipBusBandwidth
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
HIP_PATH=../../..
|
||||
HIP_PATH?=../../..
|
||||
HIPCC=$(HIP_PATH)/bin/hipcc
|
||||
|
||||
EXE=hipDispatchLatency
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
HIP_PATH=../../..
|
||||
HIP_PATH?=../../..
|
||||
HIPCC=$(HIP_PATH)/bin/hipcc
|
||||
|
||||
EXE=hipInfo
|
||||
|
||||
@@ -120,8 +120,25 @@ void printDeviceProp (int deviceId)
|
||||
cout << setw(w1) << "arch.hasSurfaceFuncs: " << props.arch.hasSurfaceFuncs << endl;
|
||||
cout << setw(w1) << "arch.has3dGrid: " << props.arch.has3dGrid << endl;
|
||||
cout << setw(w1) << "arch.hasDynamicParallelism: " << props.arch.hasDynamicParallelism << endl;
|
||||
|
||||
int deviceCnt;
|
||||
hipGetDeviceCount(&deviceCnt);
|
||||
cout << setw(w1) << "peers: ";
|
||||
for (int i=0; i<deviceCnt; i++) {
|
||||
int isPeer;
|
||||
hipDeviceCanAccessPeer(&isPeer, i, deviceId);
|
||||
if (isPeer) {
|
||||
cout << "device#" << i << " ";
|
||||
}
|
||||
}
|
||||
cout << endl;
|
||||
|
||||
|
||||
|
||||
|
||||
cout << endl;
|
||||
|
||||
|
||||
size_t free, total;
|
||||
hipMemGetInfo(&free, &total);
|
||||
|
||||
|
||||
@@ -174,6 +174,7 @@ hipError_t hipDeviceReset(void)
|
||||
if (device) {
|
||||
//---
|
||||
//Wait for pending activity to complete? TODO - check if this is required behavior:
|
||||
//TODO, also we have small window between wait and reset.
|
||||
|
||||
device->locked_waitAllStreams();
|
||||
|
||||
|
||||
@@ -65,7 +65,7 @@ const char *hipGetErrorName(hipError_t hip_error)
|
||||
//---
|
||||
const char *hipGetErrorString(hipError_t hip_error)
|
||||
{
|
||||
std::call_once(hip_initialized, ihipInit);
|
||||
HIP_INIT_API(hip_error);
|
||||
|
||||
// TODO - return a message explaining the error.
|
||||
// TODO - This should be set up to return the same string reported in the the doxygen comments, somehow.
|
||||
|
||||
@@ -198,6 +198,58 @@ void ihipStream_t::locked_wait(bool assertQueueEmpty)
|
||||
};
|
||||
|
||||
|
||||
// Recompute the peercnt and the packed _peerAgents whenever a peer is added or deleted.
|
||||
// The packed _peerAgents can efficiently be used on each memory allocation.
|
||||
template<>
|
||||
void ihipDeviceCriticalBase_t<DeviceMutex>::recomputePeerAgents()
|
||||
{
|
||||
_peerCnt = 0;
|
||||
std::for_each (_peers.begin(), _peers.end(), [this](ihipDevice_t* device) {
|
||||
_peerAgents[_peerCnt++] = device->_hsa_agent;
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
template<>
|
||||
bool ihipDeviceCriticalBase_t<DeviceMutex>::addPeer(ihipDevice_t *peer)
|
||||
{
|
||||
auto match = std::find(_peers.begin(), _peers.end(), peer);
|
||||
if (match == std::end(_peers)) {
|
||||
// Not already a peer, let's update the list:
|
||||
_peers.push_back(peer);
|
||||
recomputePeerAgents();
|
||||
return true;
|
||||
}
|
||||
|
||||
// If we get here - peer was already on list, silently ignore.
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
template<>
|
||||
bool ihipDeviceCriticalBase_t<DeviceMutex>::removePeer(ihipDevice_t *peer)
|
||||
{
|
||||
auto match = std::find(_peers.begin(), _peers.end(), peer);
|
||||
if (match != std::end(_peers)) {
|
||||
// Found a valid peer, let's remove it.
|
||||
_peers.remove(peer);
|
||||
recomputePeerAgents();
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<>
|
||||
void ihipDeviceCriticalBase_t<DeviceMutex>::resetPeers(ihipDevice_t *thisDevice)
|
||||
{
|
||||
_peers.clear();
|
||||
_peerCnt = 0;
|
||||
addPeer(thisDevice); // peer-list always contains self agent.
|
||||
}
|
||||
|
||||
//-------------------------------------------------------------------------------------------------
|
||||
|
||||
//---
|
||||
ihipDevice_t * ihipStream_t::getDevice() const
|
||||
@@ -401,14 +453,18 @@ void ihipDevice_t::locked_reset()
|
||||
// Reset and remove streams:
|
||||
crit->streams().clear();
|
||||
|
||||
// This resest peer list to just me:
|
||||
crit->resetPeers(this);
|
||||
|
||||
// Reset and release all memory stored in the tracker:
|
||||
// Reset will remove peer mapping so don't need to do this explicitly.
|
||||
am_memtracker_reset(_acc);
|
||||
|
||||
};
|
||||
|
||||
|
||||
//---
|
||||
void ihipDevice_t::init(unsigned device_index, hc::accelerator &acc, unsigned flags)
|
||||
void ihipDevice_t::init(unsigned device_index, unsigned deviceCnt, hc::accelerator &acc, unsigned flags)
|
||||
{
|
||||
_device_index = device_index;
|
||||
_device_flags = flags;
|
||||
@@ -428,8 +484,13 @@ void ihipDevice_t::init(unsigned device_index, hc::accelerator &acc, unsigned fl
|
||||
|
||||
getProperties(&_props);
|
||||
|
||||
_criticalData.init(deviceCnt);
|
||||
|
||||
locked_reset();
|
||||
|
||||
_default_stream = new ihipStream_t(device_index, acc.get_default_view(), hipStreamDefault);
|
||||
locked_addStream(_default_stream);
|
||||
|
||||
|
||||
tprintf(DB_SYNC, "created device with default_stream=%p\n", _default_stream);
|
||||
|
||||
@@ -904,7 +965,7 @@ void ihipInit()
|
||||
//If device is not in visible devices list, ignore
|
||||
continue;
|
||||
}
|
||||
g_devices[g_deviceCnt].init(g_deviceCnt, accs[i], hipDeviceMapHost);
|
||||
g_devices[g_deviceCnt].init(g_deviceCnt, deviceCnt, accs[i], hipDeviceMapHost);
|
||||
g_deviceCnt++;
|
||||
}
|
||||
}
|
||||
@@ -1029,22 +1090,25 @@ void ihipPostLaunchKernel(hipStream_t stream, hc::completion_future &kernelFutur
|
||||
const char *ihipErrorString(hipError_t hip_error)
|
||||
{
|
||||
switch (hip_error) {
|
||||
case hipSuccess : return "hipSuccess";
|
||||
case hipErrorMemoryAllocation : return "hipErrorMemoryAllocation";
|
||||
case hipErrorMemoryFree : return "hipErrorMemoryFree";
|
||||
case hipErrorUnknownSymbol : return "hipErrorUnknownSymbol";
|
||||
case hipErrorOutOfResources : return "hipErrorOutOfResources";
|
||||
case hipErrorInvalidValue : return "hipErrorInvalidValue";
|
||||
case hipErrorInvalidResourceHandle : return "hipErrorInvalidResourceHandle";
|
||||
case hipErrorInvalidDevice : return "hipErrorInvalidDevice";
|
||||
case hipErrorInvalidMemcpyDirection : return "hipErrorInvalidMemcpyDirection";
|
||||
case hipErrorNoDevice : return "hipErrorNoDevice";
|
||||
case hipErrorNotReady : return "hipErrorNotReady";
|
||||
case hipErrorRuntimeMemory : return "hipErrorRuntimeMemory";
|
||||
case hipErrorRuntimeOther : return "hipErrorRuntimeOther";
|
||||
case hipErrorUnknown : return "hipErrorUnknown";
|
||||
case hipErrorTbd : return "hipErrorTbd";
|
||||
default : return "hipErrorUnknown";
|
||||
case hipSuccess : return "hipSuccess";
|
||||
case hipErrorMemoryAllocation : return "hipErrorMemoryAllocation";
|
||||
case hipErrorMemoryFree : return "hipErrorMemoryFree";
|
||||
case hipErrorUnknownSymbol : return "hipErrorUnknownSymbol";
|
||||
case hipErrorOutOfResources : return "hipErrorOutOfResources";
|
||||
case hipErrorInvalidValue : return "hipErrorInvalidValue";
|
||||
case hipErrorInvalidResourceHandle : return "hipErrorInvalidResourceHandle";
|
||||
case hipErrorInvalidDevice : return "hipErrorInvalidDevice";
|
||||
case hipErrorInvalidMemcpyDirection : return "hipErrorInvalidMemcpyDirection";
|
||||
case hipErrorNoDevice : return "hipErrorNoDevice";
|
||||
case hipErrorNotReady : return "hipErrorNotReady";
|
||||
case hipErrorPeerAccessNotEnabled : return "hipErrorPeerAccessNotEnabled";
|
||||
case hipErrorPeerAccessAlreadyEnabled : return "hipErrorPeerAccessAlreadyEnabled";
|
||||
|
||||
case hipErrorRuntimeMemory : return "hipErrorRuntimeMemory";
|
||||
case hipErrorRuntimeOther : return "hipErrorRuntimeOther";
|
||||
case hipErrorUnknown : return "hipErrorUnknown";
|
||||
case hipErrorTbd : return "hipErrorTbd";
|
||||
default : return "hipErrorUnknown";
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
@@ -130,6 +130,15 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes)
|
||||
hip_status = hipErrorMemoryAllocation;
|
||||
} else {
|
||||
hc::am_memtracker_update(*ptr, device->_device_index, 0);
|
||||
{
|
||||
LockedAccessor_DeviceCrit_t crit(device->criticalData());
|
||||
if (crit->peerCnt() > 1) { // peerCnt includes self so only call allow_access if other peers involved:
|
||||
hsa_status_t hsa_status = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, *ptr);
|
||||
if (hsa_status != HSA_STATUS_SUCCESS) {
|
||||
hip_status = hipErrorMemoryAllocation;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
hip_status = hipErrorMemoryAllocation;
|
||||
@@ -139,29 +148,6 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes)
|
||||
}
|
||||
|
||||
|
||||
hipError_t hipMallocHost(void** ptr, size_t sizeBytes)
|
||||
{
|
||||
HIP_INIT_API(ptr, sizeBytes);
|
||||
|
||||
hipError_t hip_status = hipSuccess;
|
||||
|
||||
const unsigned am_flags = amHostPinned;
|
||||
auto device = ihipGetTlsDefaultDevice();
|
||||
|
||||
if (device) {
|
||||
*ptr = hc::am_alloc(sizeBytes, device->_acc, am_flags);
|
||||
if (sizeBytes && (*ptr == NULL)) {
|
||||
hip_status = hipErrorMemoryAllocation;
|
||||
} else {
|
||||
hc::am_memtracker_update(*ptr, device->_device_index, 0);
|
||||
}
|
||||
|
||||
tprintf (DB_MEM, " %s: pinned ptr=%p\n", __func__, *ptr);
|
||||
}
|
||||
|
||||
return ihipLogStatus(hip_status);
|
||||
}
|
||||
|
||||
|
||||
hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags)
|
||||
{
|
||||
@@ -186,6 +172,16 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags)
|
||||
hip_status = hipErrorMemoryAllocation;
|
||||
}else{
|
||||
hc::am_memtracker_update(*ptr, device->_device_index, flags);
|
||||
{
|
||||
// TODO - allow_access only works for device memory, need to change am_alloc to allocate host directly.
|
||||
LockedAccessor_DeviceCrit_t crit(device->criticalData());
|
||||
if (crit->peerCnt() > 1) { // peerCnt includes self so only call allow_access if other peers involved:
|
||||
hsa_status_t hsa_status = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, *ptr);
|
||||
if (hsa_status != HSA_STATUS_SUCCESS) {
|
||||
hip_status = hipErrorMemoryAllocation;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
tprintf(DB_MEM, " %s: pinned ptr=%p\n", __func__, *ptr);
|
||||
}
|
||||
@@ -194,6 +190,7 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags)
|
||||
}
|
||||
|
||||
|
||||
//---
|
||||
// TODO - remove me, this is deprecated.
|
||||
hipError_t hipHostAlloc(void** ptr, size_t sizeBytes, unsigned int flags)
|
||||
{
|
||||
@@ -201,6 +198,15 @@ hipError_t hipHostAlloc(void** ptr, size_t sizeBytes, unsigned int flags)
|
||||
};
|
||||
|
||||
|
||||
//---
|
||||
// TODO - remove me, this is deprecated.
|
||||
hipError_t hipMallocHost(void** ptr, size_t sizeBytes)
|
||||
{
|
||||
return hipHostMalloc(ptr, sizeBytes, 0);
|
||||
}
|
||||
|
||||
|
||||
//---
|
||||
hipError_t hipHostGetFlags(unsigned int* flagsPtr, void* hostPtr)
|
||||
{
|
||||
HIP_INIT_API(flagsPtr, hostPtr);
|
||||
@@ -225,6 +231,8 @@ hipError_t hipHostGetFlags(unsigned int* flagsPtr, void* hostPtr)
|
||||
return ihipLogStatus(hip_status);
|
||||
}
|
||||
|
||||
|
||||
//---
|
||||
hipError_t hipHostRegister(void *hostPtr, size_t sizeBytes, unsigned int flags)
|
||||
{
|
||||
HIP_INIT_API(hostPtr, sizeBytes, flags);
|
||||
@@ -238,7 +246,11 @@ hipError_t hipHostRegister(void *hostPtr, size_t sizeBytes, unsigned int flags)
|
||||
}
|
||||
if(device){
|
||||
if(flags == hipHostRegisterDefault){
|
||||
#if USE_HCC_LOCK
|
||||
am_status_t am_status = hc::am_memtracker_host_memory_lock(device->_acc, hostPtr, sizeBytes);
|
||||
#else
|
||||
am_status_t am_status = AM_ERROR_MISC;
|
||||
#endif
|
||||
// hsa_status_t hsa_status = hsa_amd_memory_lock(hostPtr, sizeBytes, &device->_hsa_agent, 1, &srcPtr);
|
||||
if(am_status == AM_SUCCESS){
|
||||
hip_status = hipSuccess;
|
||||
@@ -436,8 +448,6 @@ hipError_t hipMemGetInfo (size_t *free, size_t *total)
|
||||
// TODO - replace with kernel-level for reporting free memory:
|
||||
size_t deviceMemSize, hostMemSize, userMemSize;
|
||||
hc::am_memtracker_sizeinfo(hipDevice->_acc, &deviceMemSize, &hostMemSize, &userMemSize);
|
||||
printf ("deviceMemSize=%zu\n", deviceMemSize);
|
||||
|
||||
*free = hipDevice->_props.totalGlobalMem - deviceMemSize;
|
||||
}
|
||||
|
||||
|
||||
@@ -17,6 +17,8 @@ OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hc_am.hpp>
|
||||
|
||||
#include "hip_runtime.h"
|
||||
#include "hcc_detail/hip_hcc.h"
|
||||
#include "hcc_detail/trace_helper.h"
|
||||
@@ -25,44 +27,111 @@ THE SOFTWARE.
|
||||
* @warning HCC returns 0 in *canAccessPeer ; Need to update this function when RT supports P2P
|
||||
*/
|
||||
//---
|
||||
hipError_t hipDeviceCanAccessPeer ( int* canAccessPeer, int device, int peerDevice )
|
||||
hipError_t hipDeviceCanAccessPeer (int* canAccessPeer, int deviceId, int peerDeviceId)
|
||||
{
|
||||
HIP_INIT_API(canAccessPeer, device, peerDevice);
|
||||
HIP_INIT_API(canAccessPeer, deviceId, peerDeviceId);
|
||||
|
||||
*canAccessPeer = false;
|
||||
return ihipLogStatus(hipSuccess);
|
||||
hipError_t err = hipSuccess;
|
||||
|
||||
auto thisDevice = ihipGetDevice(deviceId);
|
||||
auto peerDevice = ihipGetDevice(peerDeviceId);
|
||||
|
||||
if ((thisDevice != NULL) && (peerDevice != NULL)) {
|
||||
if (deviceId == peerDeviceId) {
|
||||
*canAccessPeer = 0;
|
||||
} else {
|
||||
#if USE_PEER_TO_PEER>=2
|
||||
*canAccessPeer = peerDevice->_acc.get_is_peer(thisDevice->_acc);
|
||||
#else
|
||||
*canAccessPeer = 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
} else {
|
||||
*canAccessPeer = 0;
|
||||
err = hipErrorInvalidDevice;
|
||||
}
|
||||
|
||||
|
||||
return ihipLogStatus(err);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @warning Need to update this function when RT supports P2P
|
||||
*/
|
||||
//---
|
||||
hipError_t hipDeviceDisablePeerAccess ( int peerDevice )
|
||||
hipError_t hipDeviceDisablePeerAccess (int peerDeviceId)
|
||||
{
|
||||
HIP_INIT_API(peerDevice);
|
||||
HIP_INIT_API(peerDeviceId);
|
||||
|
||||
// TODO-p2p
|
||||
return ihipLogStatus(hipSuccess);
|
||||
hipError_t err = hipSuccess;
|
||||
|
||||
auto thisDevice = ihipGetTlsDefaultDevice();
|
||||
auto peerDevice = ihipGetDevice(peerDeviceId);
|
||||
if ((thisDevice != NULL) && (peerDevice != NULL)) {
|
||||
#if USE_PEER_TO_PEER>=2
|
||||
bool canAccessPeer = peerDevice->_acc.get_is_peer(thisDevice->_acc);
|
||||
#else
|
||||
bool canAccessPeer = 0;
|
||||
#endif
|
||||
if (! canAccessPeer) {
|
||||
err = hipErrorInvalidDevice; // P2P not allowed between these devices.
|
||||
} else if (thisDevice == peerDevice) {
|
||||
err = hipErrorInvalidDevice; // Can't disable peer access to self.
|
||||
} else {
|
||||
LockedAccessor_DeviceCrit_t crit(thisDevice->criticalData());
|
||||
bool changed = crit->removePeer(peerDevice);
|
||||
if (changed) {
|
||||
#if USE_PEER_TO_PEER>=3
|
||||
// Update the peers for all memory already saved in the tracker:
|
||||
am_memtracker_update_peers(thisDevice->_acc, crit->peerCnt(), crit->peerAgents());
|
||||
#endif
|
||||
} else {
|
||||
err = hipErrorPeerAccessNotEnabled; // never enabled P2P access.
|
||||
}
|
||||
}
|
||||
} else {
|
||||
err = hipErrorInvalidDevice;
|
||||
}
|
||||
|
||||
return ihipLogStatus(err);
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* @warning Need to update this function when RT supports P2P
|
||||
*/
|
||||
//---
|
||||
hipError_t hipDeviceEnablePeerAccess ( int peerDevice, unsigned int flags )
|
||||
// Enable registering memory on peerDevice for direct access from the current device.
|
||||
hipError_t hipDeviceEnablePeerAccess (int peerDeviceId, unsigned int flags)
|
||||
{
|
||||
std::call_once(hip_initialized, ihipInit);
|
||||
// TODO-p2p
|
||||
return ihipLogStatus(hipSuccess);
|
||||
HIP_INIT_API(peerDeviceId, flags);
|
||||
|
||||
hipError_t err = hipSuccess;
|
||||
if (flags != 0) {
|
||||
err = hipErrorInvalidValue;
|
||||
} else {
|
||||
auto thisDevice = ihipGetTlsDefaultDevice();
|
||||
auto peerDevice = ihipGetDevice(peerDeviceId);
|
||||
if ((thisDevice != NULL) && (peerDevice != NULL)) {
|
||||
LockedAccessor_DeviceCrit_t crit(thisDevice->criticalData());
|
||||
bool isNewPeer = crit->addPeer(peerDevice);
|
||||
if (isNewPeer) {
|
||||
#if USE_PEER_TO_PEER>=3
|
||||
am_memtracker_update_peers(thisDevice->_acc, crit->peerCnt(), crit->peerAgents());
|
||||
#endif
|
||||
} else {
|
||||
err = hipErrorPeerAccessAlreadyEnabled;
|
||||
}
|
||||
} else {
|
||||
err = hipErrorInvalidDevice;
|
||||
}
|
||||
}
|
||||
|
||||
return ihipLogStatus(err);
|
||||
}
|
||||
|
||||
|
||||
//---
|
||||
hipError_t hipMemcpyPeer ( void* dst, int dstDevice, const void* src, int srcDevice, size_t sizeBytes )
|
||||
hipError_t hipMemcpyPeer (void* dst, int dstDevice, const void* src, int srcDevice, size_t sizeBytes)
|
||||
{
|
||||
std::call_once(hip_initialized, ihipInit);
|
||||
HIP_INIT_API(dst, dstDevice, src, srcDevice, sizeBytes);
|
||||
|
||||
// HCC has a unified memory architecture so device specifiers are not required.
|
||||
return hipMemcpy(dst, src, sizeBytes, hipMemcpyDefault);
|
||||
};
|
||||
@@ -72,9 +141,9 @@ hipError_t hipMemcpyPeer ( void* dst, int dstDevice, const void* src, int srcD
|
||||
* @bug This function uses a synchronous copy
|
||||
*/
|
||||
//---
|
||||
hipError_t hipMemcpyPeerAsync ( void* dst, int dstDevice, const void* src, int srcDevice, size_t sizeBytes, hipStream_t stream )
|
||||
hipError_t hipMemcpyPeerAsync (void* dst, int dstDevice, const void* src, int srcDevice, size_t sizeBytes, hipStream_t stream)
|
||||
{
|
||||
std::call_once(hip_initialized, ihipInit);
|
||||
HIP_INIT_API(dst, dstDevice, src, srcDevice, sizeBytes, stream);
|
||||
// HCC has a unified memory architecture so device specifiers are not required.
|
||||
return hipMemcpyAsync(dst, src, sizeBytes, hipMemcpyDefault, stream);
|
||||
};
|
||||
|
||||
@@ -8,6 +8,10 @@ include_directories( ${PROJECT_SOURCE_DIR}/include )
|
||||
set (HIP_Unit_Test_VERSION_MAJOR 1)
|
||||
set (HIP_Unit_Test_VERSION_MINOR 0)
|
||||
|
||||
if(NOT DEFINED HIP_MULTI_GPU)
|
||||
set(HIP_MULTI_GPU 0 CACHE BOOL "Run tests requiring more than one GPU")
|
||||
endif()
|
||||
|
||||
if(NOT DEFINED HIP_BUILD_LOCAL)
|
||||
if(NOT DEFINED ENV{HIP_BUILD_LOCAL})
|
||||
set(HIP_BUILD_LOCAL 1 CACHE BOOL "Build HIP in local folder")
|
||||
@@ -156,20 +160,21 @@ make_hip_executable (hipSimpleAtomicsTest hipSimpleAtomicsTest.cpp)
|
||||
make_hip_executable (hipMathFunctionsHost hipMathFunctions.cpp hipSinglePrecisionMathHost.cpp hipDoublePrecisionMathHost.cpp)
|
||||
make_hip_executable (hipMathFunctionsDevice hipMathFunctions.cpp hipSinglePrecisionMathDevice.cpp hipDoublePrecisionMathDevice.cpp)
|
||||
make_hip_executable (hipIntrinsics hipMathFunctions.cpp hipSinglePrecisionIntrinsics.cpp hipDoublePrecisionIntrinsics.cpp hipIntegerIntrinsics.cpp)
|
||||
#TODO - re-enable. This uses the pointer add feature.
|
||||
make_hip_executable (hipPointerAttrib hipPointerAttrib.cpp)
|
||||
make_hip_executable (hipMultiThreadStreams1 hipMultiThreadStreams1.cpp)
|
||||
make_hip_executable (hipMultiThreadStreams2 hipMultiThreadStreams2.cpp)
|
||||
make_hip_executable (hipHostAlloc hipHostAlloc.cpp)
|
||||
make_hip_executable (hipStreamL5 hipStreamL5.cpp)
|
||||
make_hip_executable (hipHostGetFlags hipHostGetFlags.cpp)
|
||||
make_hip_executable (hipHostRegister hipHostRegister.cpp)
|
||||
#TODO - re-enable. This requires working hipHostRegister call, waiting on HCC feature.
|
||||
#make_hip_executable (hipHostRegister hipHostRegister.cpp)
|
||||
make_hip_executable (hipRandomMemcpyAsync hipRandomMemcpyAsync.cpp)
|
||||
make_hip_executable (hipMemoryAllocate hipMemoryAllocate.cpp)
|
||||
make_hip_executable (hipFuncSetDeviceFlags hipFuncSetDeviceFlags.cpp)
|
||||
make_hip_executable (hipFuncGetDevice hipFuncGetDevice.cpp)
|
||||
make_hip_executable (hipFuncSetDevice hipFuncSetDevice.cpp)
|
||||
make_hip_executable (hipFuncDeviceSynchronize hipFuncDeviceSynchronize.cpp)
|
||||
make_hip_executable (hipPeerToPeer_simple hipPeerToPeer_simple.cpp)
|
||||
|
||||
make_hip_executable (hipMultiThreadDevice hipMultiThreadDevice.cpp)
|
||||
|
||||
@@ -218,4 +223,11 @@ make_named_test (hipMultiThreadDevice "hipMultiThreadDevice-serial" --tests 0x1)
|
||||
make_named_test (hipMultiThreadDevice "hipMultiThreadDevice-pyramid" --tests 0x4)
|
||||
make_named_test (hipMultiThreadDevice "hipMultiThreadDevice-nearzero" --tests 0x10)
|
||||
|
||||
if (${HIP_MULTI_GPU})
|
||||
make_test(hipPeerToPeer_simple " ") # use current device for copy, this fails.
|
||||
make_test(hipPeerToPeer_simple --memcpyWithPeer)
|
||||
make_test(hipPeerToPeer_simple --mirrorPeers) # mirror mapping: test to ensure mirror doesn't destroy orig mapping.
|
||||
|
||||
endif()
|
||||
|
||||
make_hipify_test(specialFunc.cu )
|
||||
|
||||
@@ -116,12 +116,12 @@ int main(int argc, char *argv[])
|
||||
/*disable, this takess a while and if the next one works then no need to run serial*/
|
||||
if (1 && (p_tests & 0x2)) {
|
||||
printf ("\ntest 0x2 : serialized multiThread_pyramid(1) \n");
|
||||
multiThread_pyramid(true, 10);
|
||||
multiThread_pyramid(true, 3);
|
||||
}
|
||||
|
||||
if (p_tests & 0x4) {
|
||||
printf ("\ntest 0x4 : parallel multiThread_pyramid(1) \n");
|
||||
multiThread_pyramid(false, 10);
|
||||
multiThread_pyramid(false, 3);
|
||||
}
|
||||
|
||||
//if (p_tests & 0x8) {
|
||||
|
||||
@@ -0,0 +1,256 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
// Simple test for memset.
|
||||
// Also serves as a template for other tests.
|
||||
|
||||
#include "hip_runtime.h"
|
||||
#include "test_common.h"
|
||||
|
||||
bool p_memcpyWithPeer = false; // use the peer device for the P2P copy
|
||||
bool p_mirrorPeers = false; // in addition to mapping current to peer space, map peer to current space.
|
||||
int p_peerDevice = -1; // explicly specify which peer to use, else use p_gpuDevice + 1.
|
||||
|
||||
|
||||
int g_currentDevice;
|
||||
int g_peerDevice;
|
||||
|
||||
void parseMyArguments(int argc, char *argv[])
|
||||
{
|
||||
int more_argc = HipTest::parseStandardArguments(argc, argv, false);
|
||||
// parse args for this test:
|
||||
for (int i = 1; i < more_argc; i++) {
|
||||
const char *arg = argv[i];
|
||||
|
||||
if (!strcmp(arg, "--memcpyWithPeer")) {
|
||||
p_memcpyWithPeer = true;
|
||||
} else if (!strcmp(arg, "--mirrorPeers")) {
|
||||
p_mirrorPeers = true;
|
||||
} else if (!strcmp(arg, "--peerDevice")) {
|
||||
if (++i >= argc || !HipTest::parseInt(argv[i], &p_peerDevice)) {
|
||||
failed("Bad peerDevice argument");
|
||||
}
|
||||
} else {
|
||||
failed("Bad argument '%s'", arg);
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
// Sets globals g_currentDevice, g_peerDevice
|
||||
void setupPeerTests()
|
||||
{
|
||||
int deviceCnt;
|
||||
|
||||
HIPCHECK(hipGetDeviceCount(&deviceCnt));
|
||||
|
||||
g_currentDevice = p_gpuDevice;
|
||||
g_peerDevice = (p_peerDevice == -1) ? ((g_currentDevice + 1) % deviceCnt) : p_peerDevice;
|
||||
|
||||
printf ("N=%zu device=%d peerDevice=%d (%d devices total)\n", N, g_currentDevice, g_peerDevice, deviceCnt);
|
||||
|
||||
// Must be on a multi-gpu system:
|
||||
assert (g_currentDevice != g_peerDevice);
|
||||
|
||||
int canAccessPeer;
|
||||
HIPCHECK(hipDeviceCanAccessPeer(&canAccessPeer, g_currentDevice, g_peerDevice));
|
||||
printf ("dev#%d canAccessPeer:#%d=%d\n", g_currentDevice, g_peerDevice, canAccessPeer);
|
||||
|
||||
assert(canAccessPeer);
|
||||
|
||||
HIPCHECK (hipSetDevice(g_currentDevice));
|
||||
HIPCHECK(hipDeviceReset());
|
||||
HIPCHECK (hipSetDevice(g_peerDevice));
|
||||
HIPCHECK(hipDeviceReset());
|
||||
}
|
||||
|
||||
//---
|
||||
// Test which enables peer2peer first, then allocates the memory.
|
||||
void enablePeerFirst()
|
||||
{
|
||||
printf ("\n==testing: %s\n", __func__);
|
||||
|
||||
|
||||
HIPCHECK(hipSetDevice(g_currentDevice));
|
||||
HIPCHECK(hipDeviceEnablePeerAccess(g_peerDevice, 0));
|
||||
|
||||
if (p_mirrorPeers) {
|
||||
int canAccessPeer;
|
||||
HIPCHECK(hipDeviceCanAccessPeer(&canAccessPeer, g_peerDevice, g_currentDevice));
|
||||
assert(canAccessPeer);
|
||||
|
||||
HIPCHECK(hipSetDevice(g_peerDevice));
|
||||
HIPCHECK(hipDeviceEnablePeerAccess(g_currentDevice, 0));
|
||||
}
|
||||
|
||||
size_t Nbytes = N*sizeof(char);
|
||||
|
||||
char *A_d0, *A_d1;
|
||||
char *A_h;
|
||||
|
||||
A_h = (char*)malloc(Nbytes);
|
||||
|
||||
// allocate and initialize memory on device0
|
||||
HIPCHECK (hipSetDevice(g_currentDevice));
|
||||
HIPCHECK (hipMalloc(&A_d0, Nbytes) );
|
||||
HIPCHECK ( hipMemset(A_d0, memsetval, Nbytes) );
|
||||
|
||||
// allocate and initialize memory on peer device
|
||||
HIPCHECK (hipSetDevice(g_peerDevice));
|
||||
HIPCHECK (hipMalloc(&A_d1, Nbytes) );
|
||||
HIPCHECK ( hipMemset(A_d1, 0x13, Nbytes) );
|
||||
|
||||
|
||||
|
||||
// Device0 push to device1, using P2P:
|
||||
HIPCHECK (hipSetDevice(p_memcpyWithPeer ? g_peerDevice : g_currentDevice));
|
||||
HIPCHECK (hipMemcpy(A_d1, A_d0, Nbytes, hipMemcpyDefault));
|
||||
|
||||
// Copy data back to host:
|
||||
HIPCHECK (hipSetDevice(g_peerDevice));
|
||||
HIPCHECK (hipMemcpy(A_h, A_d1, Nbytes, hipMemcpyDeviceToHost));
|
||||
|
||||
// Check host data:
|
||||
for (int i=0; i<N; i++) {
|
||||
if (A_h[i] != memsetval) {
|
||||
failed("mismatch at index:%d computed:0x%02x, golden memsetval:0x%02x\n", i, (int)A_h[i], (int)memsetval);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//---
|
||||
// Test which allocated memory first, then enables peer2peer.
|
||||
// Enabling peer needs to scan all allocated memory and enable peer access.
|
||||
void allocMemoryFirst()
|
||||
{
|
||||
printf ("\n==testing: %s\n", __func__);
|
||||
|
||||
setupPeerTests();
|
||||
|
||||
size_t Nbytes = N*sizeof(char);
|
||||
|
||||
char *A_d0, *A_d1;
|
||||
char *A_h;
|
||||
|
||||
A_h = (char*)malloc(Nbytes);
|
||||
|
||||
//---
|
||||
// allocate and initialize memory on device0
|
||||
HIPCHECK (hipSetDevice(g_currentDevice));
|
||||
HIPCHECK (hipMalloc(&A_d0, Nbytes) );
|
||||
HIPCHECK ( hipMemset(A_d0, memsetval, Nbytes) );
|
||||
|
||||
// allocate and initialize memory on peer device
|
||||
HIPCHECK (hipSetDevice(g_peerDevice));
|
||||
HIPCHECK (hipMalloc(&A_d1, Nbytes) );
|
||||
HIPCHECK ( hipMemset(A_d1, 0x13, Nbytes) );
|
||||
|
||||
|
||||
//---
|
||||
//Enable peer access, for memory already allocated:
|
||||
HIPCHECK(hipSetDevice(g_currentDevice));
|
||||
HIPCHECK(hipDeviceEnablePeerAccess(g_peerDevice, 0));
|
||||
|
||||
if (p_mirrorPeers) {
|
||||
int canAccessPeer;
|
||||
HIPCHECK(hipDeviceCanAccessPeer(&canAccessPeer, g_peerDevice, g_currentDevice));
|
||||
assert(canAccessPeer);
|
||||
|
||||
HIPCHECK(hipSetDevice(g_peerDevice));
|
||||
HIPCHECK(hipDeviceEnablePeerAccess(g_currentDevice, 0));
|
||||
}
|
||||
|
||||
|
||||
//---
|
||||
// Copies to test functionality:
|
||||
// Device0 push to device1, using P2P:
|
||||
HIPCHECK (hipSetDevice(p_memcpyWithPeer ? g_peerDevice : g_currentDevice));
|
||||
HIPCHECK (hipMemcpy(A_d1, A_d0, Nbytes, hipMemcpyDefault));
|
||||
|
||||
// Copy data back to host:
|
||||
HIPCHECK (hipSetDevice(g_peerDevice));
|
||||
HIPCHECK (hipMemcpy(A_h, A_d1, Nbytes, hipMemcpyDeviceToHost));
|
||||
|
||||
|
||||
//---
|
||||
// Check host data:
|
||||
for (int i=0; i<N; i++) {
|
||||
if (A_h[i] != memsetval) {
|
||||
failed("mismatch at index:%d computed:0x%02x, golden memsetval:0x%02x\n", i, (int)A_h[i], (int)memsetval);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void simpleNegative()
|
||||
{
|
||||
printf ("\n==testing: %s\n", __func__);
|
||||
|
||||
setupPeerTests();
|
||||
|
||||
int deviceId;
|
||||
HIPCHECK (hipGetDevice(&deviceId));
|
||||
|
||||
//---
|
||||
//-- self is not a peer
|
||||
int canAccessPeer;
|
||||
hipError_t e = hipDeviceCanAccessPeer(&canAccessPeer, deviceId, deviceId);
|
||||
HIPASSERT( e == hipSuccess); // no error returned, it doesn't hurt to ask.
|
||||
HIPASSERT (canAccessPeer == 0); // but self is not a peer.
|
||||
|
||||
e = hipSuccess;
|
||||
//---
|
||||
// Enable same device twice in a row:
|
||||
HIPCHECK(hipSetDevice(g_currentDevice));
|
||||
HIPCHECK(hipDeviceEnablePeerAccess(g_peerDevice, 0));
|
||||
e =(hipDeviceEnablePeerAccess(g_peerDevice, 0));
|
||||
HIPASSERT (e == hipErrorPeerAccessAlreadyEnabled);
|
||||
|
||||
//---
|
||||
// try disabling twice in a row
|
||||
HIPCHECK(hipDeviceDisablePeerAccess(g_peerDevice));
|
||||
e =(hipDeviceDisablePeerAccess(g_peerDevice));
|
||||
HIPASSERT (e == hipErrorPeerAccessNotEnabled);
|
||||
|
||||
|
||||
// More tests here:
|
||||
}
|
||||
|
||||
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
parseMyArguments(argc, argv);
|
||||
|
||||
if (p_tests & 0x1) {
|
||||
enablePeerFirst();
|
||||
}
|
||||
|
||||
if (p_tests & 0x2) {
|
||||
allocMemoryFirst();
|
||||
}
|
||||
|
||||
if (p_tests & 0x4) {
|
||||
simpleNegative();
|
||||
}
|
||||
|
||||
passed();
|
||||
}
|
||||
@@ -111,7 +111,7 @@ int parseStandardArguments(int argc, char *argv[], bool failOnUndefinedArg)
|
||||
failed("Bad iterations argument");
|
||||
}
|
||||
|
||||
} else if (!strcmp(arg, "--gpu") || (!strcmp(arg, "-g"))) {
|
||||
} else if (!strcmp(arg, "--gpu") || (!strcmp(arg, "-gpuDevice")) || (!strcmp(arg, "-g"))) {
|
||||
if (++i >= argc || !HipTest::parseInt(argv[i], &p_gpuDevice)) {
|
||||
failed("Bad gpuDevice argument");
|
||||
}
|
||||
|
||||
@@ -158,6 +158,12 @@ syn keyword hipFunctionName hipStreamSynchronize
|
||||
syn keyword hipFunctionName hipThreadExit
|
||||
syn keyword hipFunctionName hipThreadSynchronize
|
||||
syn keyword hipFunctionName hipUnbindTexture
|
||||
syn keyword hipFunctionName hipDeviceCanAccessPeer
|
||||
syn keyword hipFunctionName hipDeviceEnablePeerAccess
|
||||
syn keyword hipFunctionName hipDeviceDisablePeerAccess
|
||||
syn keyword hipFunctionName hipMemcpyPeer
|
||||
syn keyword hipFunctionName hipMemcpyPeerAsync
|
||||
|
||||
|
||||
" HIP Flags
|
||||
syn keyword hipFlags hipFilterModePoint
|
||||
|
||||
Reference in New Issue
Block a user