hipHostMalloc allocations are mapped to all devices by default.

Support hipHostMallocPortable flag.
Default flags are hipHostMallocPortable | hipHostMallocMapped.

Also:
-refactor tests to move addCount and addCountReverse into HipTest
namespace.
-test multi-GPU host memory.
Этот коммит содержится в:
Ben Sander
2017-05-10 17:32:25 -05:00
родитель ae9fdf9bc1
Коммит ff9bed6535
7 изменённых файлов: 181 добавлений и 96 удалений
+9
Просмотреть файл
@@ -118,6 +118,7 @@ bool g_visible_device = false;
unsigned g_deviceCnt;
std::vector<int> g_hip_visible_devices;
hsa_agent_t g_cpu_agent;
hsa_agent_t *g_allAgents; // CPU agents + all the visible GPU agents.
unsigned g_numLogicalThreads;
std::atomic<int> g_lastShortTid(1);
@@ -1389,6 +1390,14 @@ void ihipInit()
g_deviceCnt++;
}
}
g_allAgents = static_cast<hsa_agent_t*> (malloc((g_deviceCnt+1) * sizeof(hsa_agent_t)));
g_allAgents[0] = g_cpu_agent;
for (int i=0; i<g_deviceCnt; i++) {
g_allAgents[i+1] = g_deviceArray[i]->_hsaAgent;
}
g_numLogicalThreads = std::thread::hardware_concurrency();
// If HIP_VISIBLE_DEVICES is not set, make sure all devices are initialized
+1
Просмотреть файл
@@ -826,6 +826,7 @@ private: // Critical data, protected with locked access:
extern std::once_flag hip_initialized;
extern unsigned g_deviceCnt;
extern hsa_agent_t g_cpu_agent ; // the CPU agent.
extern hsa_agent_t *g_allAgents; // CPU agents + all the visible GPU agents.
//=================================================================================================
// Extern functions:
+35 -23
Просмотреть файл
@@ -59,31 +59,40 @@ hipError_t memcpyAsync (void* dst, const void* src, size_t sizeBytes, hipMemcpyK
}
// return 0 on success or -1 on error:
int sharePtr(void *ptr, ihipCtx_t *ctx, unsigned hipFlags)
int sharePtr(void *ptr, ihipCtx_t *ctx, bool shareWithAll, unsigned hipFlags)
{
int ret = 0;
auto device = ctx->getWriteableDevice();
hc::am_memtracker_update(ptr, device->_deviceId, hipFlags);
int peerCnt=0;
{
LockedAccessor_CtxCrit_t crit(ctx->criticalData());
// the peerCnt always stores self so make sure the trace actually
peerCnt = crit->peerCnt();
tprintf(DB_MEM, " allow access to %d other peer(s)\n", peerCnt-1);
if (peerCnt > 1) {
//printf ("peer self access\n");
if (shareWithAll) {
hsa_status_t s = hsa_amd_agents_allow_access(g_deviceCnt+1, g_allAgents, NULL, ptr);
tprintf (DB_MEM, " allow access to CPU + all %d GPUs (shareWithAll)\n", g_deviceCnt);
if (s != HSA_STATUS_SUCCESS) {
ret = -1;
}
} else {
int peerCnt=0;
{
LockedAccessor_CtxCrit_t crit(ctx->criticalData());
// the peerCnt always stores self so make sure the trace actually
peerCnt = crit->peerCnt();
tprintf(DB_MEM, " allow access to %d other peer(s)\n", peerCnt-1);
if (peerCnt > 1) {
// TODOD - remove me:
for (auto iter = crit->_peers.begin(); iter!=crit->_peers.end(); iter++) {
tprintf (DB_MEM, " allow access to peer: %s%s\n", (*iter)->toString().c_str(), (iter == crit->_peers.begin()) ? " (self)":"");
};
//printf ("peer self access\n");
hsa_status_t s = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, ptr);
if (s != HSA_STATUS_SUCCESS) {
ret = -1;
// TODOD - remove me:
for (auto iter = crit->_peers.begin(); iter!=crit->_peers.end(); iter++) {
tprintf (DB_MEM, " allow access to peer: %s%s\n", (*iter)->toString().c_str(), (iter == crit->_peers.begin()) ? " (self)":"");
};
hsa_status_t s = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, ptr);
if (s != HSA_STATUS_SUCCESS) {
ret = -1;
}
}
}
}
@@ -96,7 +105,7 @@ int sharePtr(void *ptr, ihipCtx_t *ctx, unsigned hipFlags)
// Allocate a new pointer with am_alloc and share with all valid peers.
// Returns null-ptr if a memory error occurs (either allocation or sharing)
void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, unsigned amFlags, unsigned hipFlags)
void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, bool shareWithAll, unsigned amFlags, unsigned hipFlags)
{
void *ptr = nullptr;
@@ -108,7 +117,7 @@ void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, unsig
msg, ptr, static_cast<char*>(ptr)+sizeBytes, sizeBytes, device->_deviceId);
if (ptr != nullptr) {
int r = sharePtr(ptr, ctx, hipFlags);
int r = sharePtr(ptr, ctx, shareWithAll, hipFlags);
if (r != 0) {
ptr = nullptr;
}
@@ -220,7 +229,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes)
} else {
auto device = ctx->getWriteableDevice();
*ptr = hip_internal::allocAndSharePtr("device_mem", sizeBytes, ctx, 0/*amFlags*/, 0/*hipFlags*/);
*ptr = hip_internal::allocAndSharePtr("device_mem", sizeBytes, ctx, false/*shareWithAll*/, 0/*amFlags*/, 0/*hipFlags*/);
if(sizeBytes && (*ptr == NULL)){
hip_status = hipErrorMemoryAllocation;
@@ -253,7 +262,8 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags)
} else {
unsigned trueFlags = flags;
if (flags == hipHostMallocDefault) {
trueFlags = hipHostMallocMapped | hipHostMallocWriteCombined;
// HCC/ROCM provide a modern system with unified memory and should set both of these flags by default:
trueFlags = hipHostMallocMapped | hipHostMallocPortable;
}
const unsigned supportedFlags = hipHostMallocPortable | hipHostMallocMapped | hipHostMallocWriteCombined;
@@ -265,8 +275,10 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags)
auto device = ctx->getWriteableDevice();
unsigned amFlags = HIP_COHERENT_HOST_ALLOC ? amHostCoherent : amHostPinned;
*ptr = hip_internal::allocAndSharePtr(HIP_COHERENT_HOST_ALLOC ? "finegrained_host":"pinned_host",
sizeBytes, ctx, amFlags, flags);
sizeBytes, ctx, (trueFlags & hipHostMallocPortable) /*shareWithAll*/, amFlags, flags);
if(sizeBytes && (*ptr == NULL)){
hip_status = hipErrorMemoryAllocation;
}
@@ -314,7 +326,7 @@ hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height
auto device = ctx->getWriteableDevice();
const unsigned am_flags = 0;
*ptr = hip_internal::allocAndSharePtr("device_pitch", sizeBytes, ctx, am_flags, 0);
*ptr = hip_internal::allocAndSharePtr("device_pitch", sizeBytes, ctx, false/*shareWithAll*/, am_flags, 0);
if (sizeBytes && (*ptr == NULL)) {
hip_status = hipErrorMemoryAllocation;
@@ -373,7 +385,7 @@ hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc,
hip_status = hipErrorUnknown;
break;
}
*ptr = hip_internal::allocAndSharePtr("device_array", allocSize, ctx, am_flags, 0);
*ptr = hip_internal::allocAndSharePtr("device_array", allocSize, ctx, false/*shareWithAll*/, am_flags, 0);
if (size && (*ptr == NULL)) {
hip_status = hipErrorMemoryAllocation;
}
+1 -1
Просмотреть файл
@@ -99,7 +99,7 @@ inline int zrand(int max)
//=================================================================================================
// Functins to run tests
// Functions to run tests
//=================================================================================================
//--
//Run through a couple simple cases to test lookups and host pointer arithmetic:
+95 -34
Просмотреть файл
@@ -25,45 +25,106 @@ THE SOFTWARE.
#include"test_common.h"
#define SIZE 1024*1024*256
#define NUM_ELEMENTS 1024*1024*64
#define SIZE NUM_ELEMENTS*sizeof(int)
int main(){
float *Ad, *B, *Bd, *Bm, *C, *Cd, *ptr_0;
B = (float*)malloc(SIZE);
hipMalloc((void**)&Ad, SIZE);
hipHostMalloc((void**)&B, SIZE);
hipHostMalloc((void**)&Bd, SIZE, hipHostMallocDefault);
hipHostMalloc((void**)&Bm, SIZE, hipHostMallocMapped);
hipHostMalloc((void**)&C, SIZE, hipHostMallocMapped);
hipHostGetDevicePointer((void**)&Cd, C, 0/*flags*/);
HIPCHECK_API(hipMalloc((void**)&ptr_0,0), hipSuccess);
HIPCHECK_API(hipFree(Ad) , hipSuccess);
HIPCHECK_API(hipHostFree(Ad) , hipErrorInvalidValue);
HIPCHECK_API(hipFree(B) , hipErrorInvalidDevicePointer); // try to hipFree on malloced memory
HIPCHECK_API(hipFree(Bd) , hipErrorInvalidDevicePointer);
HIPCHECK_API(hipFree(Bm) , hipErrorInvalidDevicePointer);
HIPCHECK_API(hipFree(ptr_0) , hipSuccess);
HIPCHECK_API(hipHostFree(Bd) , hipSuccess);
HIPCHECK_API(hipHostFree(Bm) , hipSuccess);
HIPCHECK_API(hipFree(C) , hipErrorInvalidDevicePointer);
HIPCHECK_API(hipHostFree(C) , hipSuccess);
int p_count = 4;
HIPCHECK_API(hipFree(NULL) , hipSuccess);
HIPCHECK_API(hipHostFree(NULL) , hipSuccess);
void multiGpuHostAlloc(int allocDevice)
{
int numDevices;
HIPCHECK(hipGetDeviceCount(&numDevices));
assert(numDevices > 1);
printf ("info: trying multiGpuHostAlloc with allocDevice=%d numDevices=%d\n", allocDevice, numDevices);
HIPCHECK(hipSetDevice(allocDevice));
int *Ah, *Ch;
hipHostMalloc((void**)&Ah, SIZE);
hipHostMalloc((void**)&Ch, SIZE);
const int init = -1;
for (size_t i=0; i<NUM_ELEMENTS; i++) {
Ah[i] = init;
Ch[i] = -2;
}
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, NUM_ELEMENTS);
// The host memory allocations should be visible on all of the devices - verify by launching a kernel here that reads those devices:
for (int i=0; i<numDevices; i++) {
HIPCHECK(hipSetDevice(i));
hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, 0/*_stream*/, Ah, Ch, NUM_ELEMENTS, p_count);
HIPCHECK(hipDeviceSynchronize());
};
int expected = init + p_count;
for (size_t i=0; i<NUM_ELEMENTS; i++) {
if (Ch[i] != expected) {
failed("for Ch[%zu] (%d) != expected(%d)\n", i, Ch[i], expected);
}
}
}
int main(int argc, char *argv[])
{
int more_argc = HipTest::parseStandardArguments(argc, argv, false);
//assert(more_argc == 0);
{
// Some negative testing - request a too-big allocation and verify it fails:
// Someday when we support virtual memory may need to refactor these:
size_t tooBig = 128LL*1024*1024*1024*1024; // 128 TB;
void *p;
HIPCHECK_API ( hipMalloc(&p, tooBig), hipErrorMemoryAllocation );
HIPCHECK_API ( hipHostMalloc(&p, tooBig), hipErrorMemoryAllocation );
float *Ad, *B, *Bd, *Bm, *C, *Cd, *ptr_0;
B = (float*)malloc(SIZE);
hipMalloc((void**)&Ad, SIZE);
hipHostMalloc((void**)&B, SIZE);
hipHostMalloc((void**)&Bd, SIZE, hipHostMallocDefault);
hipHostMalloc((void**)&Bm, SIZE, hipHostMallocMapped);
hipHostMalloc((void**)&C, SIZE, hipHostMallocMapped);
hipHostGetDevicePointer((void**)&Cd, C, 0/*flags*/);
HIPCHECK_API(hipMalloc((void**)&ptr_0,0), hipSuccess);
HIPCHECK_API(hipFree(Ad) , hipSuccess);
HIPCHECK_API(hipHostFree(Ad) , hipErrorInvalidValue);
HIPCHECK_API(hipFree(B) , hipErrorInvalidDevicePointer); // try to hipFree on malloced memory
HIPCHECK_API(hipFree(Bd) , hipErrorInvalidDevicePointer);
HIPCHECK_API(hipFree(Bm) , hipErrorInvalidDevicePointer);
HIPCHECK_API(hipFree(ptr_0) , hipSuccess);
HIPCHECK_API(hipHostFree(Bd) , hipSuccess);
HIPCHECK_API(hipHostFree(Bm) , hipSuccess);
HIPCHECK_API(hipFree(C) , hipErrorInvalidDevicePointer);
HIPCHECK_API(hipHostFree(C) , hipSuccess);
HIPCHECK_API(hipFree(NULL) , hipSuccess);
HIPCHECK_API(hipHostFree(NULL) , hipSuccess);
{
// Some negative testing - request a too-big allocation and verify it fails:
// Someday when we support virtual memory may need to refactor these:
size_t tooBig = 128LL*1024*1024*1024*1024; // 128 TB;
void *p;
HIPCHECK_API ( hipMalloc(&p, tooBig), hipErrorMemoryAllocation );
HIPCHECK_API ( hipHostMalloc(&p, tooBig), hipErrorMemoryAllocation );
}
}
{
int numDevices;
HIPCHECK(hipGetDeviceCount(&numDevices));
assert(numDevices > 1);
multiGpuHostAlloc(0);
multiGpuHostAlloc(1);
}
passed();
+2 -38
Просмотреть файл
@@ -35,42 +35,6 @@ unsigned p_count = 100;
// Kernel: each thread stores C_d[k] = A_d[k] + count for the indices it owns.
// A thread begins at hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x and moves
// forward by hipBlockDim_x * hipGridDim_x until it reaches NELEM.
//
// A_d   : input array, read at every visited index.
// C_d   : output array, written at every visited index.
// NELEM : exclusive upper bound on the indices visited.
// count : value added to each element, and also the number of times the whole
//         sweep is repeated.
template <typename T>
__global__ void
addCount(const T *A_d, T *C_d, size_t NELEM, int count)
{
    const size_t firstIdx = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
    const size_t step     = hipBlockDim_x * hipGridDim_x;

    // The sweep is intentionally repeated `count` times: the redundant work is
    // there only to make the kernel run longer.
    int pass = 0;
    while (pass < count) {
        size_t idx = firstIdx;
        while (idx < NELEM) {
            C_d[idx] = A_d[idx] + static_cast<T>(count);
            idx += step;
        }
        ++pass;
    }
}
// Kernel: same element-wise update as addCount (C_d[k] = A_d[k] + count), but
// each thread walks its indices from high to low. A thread begins at
// NELEM - stride + offset, where offset is
// hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x and stride is
// hipBlockDim_x * hipGridDim_x, and steps down by stride while the index is
// non-negative.
//
// A_d   : input array, read at every visited index.
// C_d   : output array, written at every visited index.
// NELEM : element count used to derive the starting index.
// count : value added to each element, and also the number of times the whole
//         sweep is repeated.
template <typename T>
__global__ void
addCountReverse(const T *A_d, T *C_d, int64_t NELEM, int count)
{
    const size_t firstOff = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
    const size_t step     = hipBlockDim_x * hipGridDim_x;

    // The sweep is intentionally repeated `count` times: the redundant work is
    // there only to make the kernel run longer.
    int pass = 0;
    while (pass < count) {
        // NOTE(review): the start index and the decrement mix int64_t and
        // size_t operands and are converted back to int64_t; the operand
        // types are kept exactly as they were.
        int64_t idx = NELEM - step + firstOff;
        while (idx >= 0) {
            C_d[idx] = A_d[idx] + static_cast<T>(count);
            idx -= step;
        }
        ++pass;
    }
}
//------
@@ -171,9 +135,9 @@ void Streamer<T>::runAsyncAfter(Streamer<T> *depStreamer, bool waitSameStream)
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements);
if (_reverse) {
hipLaunchKernelGGL(addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count);
hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count);
} else {
hipLaunchKernelGGL(addCount, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count);
hipLaunchKernelGGL(HipTest::addCount, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count);
}
HIPCHECK(hipEventRecord(_event, _stream));
+38
Просмотреть файл
@@ -146,6 +146,44 @@ vectorADD(hipLaunchParm lp,
}
// Kernel: each thread stores C_d[k] = A_d[k] + count for the indices it owns.
// A thread begins at hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x and moves
// forward by hipBlockDim_x * hipGridDim_x until it reaches NELEM.
//
// A_d   : input array, read at every visited index.
// C_d   : output array, written at every visited index.
// NELEM : exclusive upper bound on the indices visited.
// count : value added to each element, and also the number of times the whole
//         sweep is repeated.
template <typename T>
__global__ void
addCount(const T *A_d, T *C_d, size_t NELEM, int count)
{
    const size_t firstIdx = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
    const size_t step     = hipBlockDim_x * hipGridDim_x;

    // The sweep is intentionally repeated `count` times: the redundant work is
    // there only to make the kernel run longer.
    int pass = 0;
    while (pass < count) {
        size_t idx = firstIdx;
        while (idx < NELEM) {
            C_d[idx] = A_d[idx] + static_cast<T>(count);
            idx += step;
        }
        ++pass;
    }
}
// Kernel: same element-wise update as addCount (C_d[k] = A_d[k] + count), but
// each thread walks its indices from high to low. A thread begins at
// NELEM - stride + offset, where offset is
// hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x and stride is
// hipBlockDim_x * hipGridDim_x, and steps down by stride while the index is
// non-negative.
//
// A_d   : input array, read at every visited index.
// C_d   : output array, written at every visited index.
// NELEM : element count used to derive the starting index.
// count : value added to each element, and also the number of times the whole
//         sweep is repeated.
template <typename T>
__global__ void
addCountReverse(const T *A_d, T *C_d, int64_t NELEM, int count)
{
    const size_t firstOff = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
    const size_t step     = hipBlockDim_x * hipGridDim_x;

    // The sweep is intentionally repeated `count` times: the redundant work is
    // there only to make the kernel run longer.
    int pass = 0;
    while (pass < count) {
        // NOTE(review): the start index and the decrement mix int64_t and
        // size_t operands and are converted back to int64_t; the operand
        // types are kept exactly as they were.
        int64_t idx = NELEM - step + firstOff;
        while (idx >= 0) {
            C_d[idx] = A_d[idx] + static_cast<T>(count);
            idx -= step;
        }
        ++pass;
    }
}
template <typename T>
void initArraysForHost(T **A_h, T **B_h, T **C_h,
size_t N, bool usePinnedHost=false)