hipHostMalloc allocation are mapped to all devices by default.
Support hipHostMallocPortable flag. Default flags are hipHostMallocPortable | hipHostMallocMapped. Also: -refactor tests to move addCount and addCountReverse into HipTest namespace. -test multi-GPU host memory.
Этот коммит содержится в:
@@ -118,6 +118,7 @@ bool g_visible_device = false;
|
||||
unsigned g_deviceCnt;
|
||||
std::vector<int> g_hip_visible_devices;
|
||||
hsa_agent_t g_cpu_agent;
|
||||
hsa_agent_t *g_allAgents; // CPU agents + all the visible GPU agents.
|
||||
unsigned g_numLogicalThreads;
|
||||
|
||||
std::atomic<int> g_lastShortTid(1);
|
||||
@@ -1389,6 +1390,14 @@ void ihipInit()
|
||||
g_deviceCnt++;
|
||||
}
|
||||
}
|
||||
|
||||
g_allAgents = static_cast<hsa_agent_t*> (malloc((g_deviceCnt+1) * sizeof(hsa_agent_t)));
|
||||
g_allAgents[0] = g_cpu_agent;
|
||||
for (int i=0; i<g_deviceCnt; i++) {
|
||||
g_allAgents[i+1] = g_deviceArray[i]->_hsaAgent;
|
||||
}
|
||||
|
||||
|
||||
g_numLogicalThreads = std::thread::hardware_concurrency();
|
||||
|
||||
// If HIP_VISIBLE_DEVICES is not set, make sure all devices are initialized
|
||||
|
||||
@@ -826,6 +826,7 @@ private: // Critical data, protected with locked access:
|
||||
extern std::once_flag hip_initialized;
|
||||
extern unsigned g_deviceCnt;
|
||||
extern hsa_agent_t g_cpu_agent ; // the CPU agent.
|
||||
extern hsa_agent_t *g_allAgents; // CPU agents + all the visible GPU agents.
|
||||
|
||||
//=================================================================================================
|
||||
// Extern functions:
|
||||
|
||||
@@ -59,31 +59,40 @@ hipError_t memcpyAsync (void* dst, const void* src, size_t sizeBytes, hipMemcpyK
|
||||
}
|
||||
|
||||
// return 0 on success or -1 on error:
|
||||
int sharePtr(void *ptr, ihipCtx_t *ctx, unsigned hipFlags)
|
||||
int sharePtr(void *ptr, ihipCtx_t *ctx, bool shareWithAll, unsigned hipFlags)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
auto device = ctx->getWriteableDevice();
|
||||
|
||||
hc::am_memtracker_update(ptr, device->_deviceId, hipFlags);
|
||||
int peerCnt=0;
|
||||
{
|
||||
LockedAccessor_CtxCrit_t crit(ctx->criticalData());
|
||||
// the peerCnt always stores self so make sure the trace actually
|
||||
peerCnt = crit->peerCnt();
|
||||
tprintf(DB_MEM, " allow access to %d other peer(s)\n", peerCnt-1);
|
||||
if (peerCnt > 1) {
|
||||
|
||||
//printf ("peer self access\n");
|
||||
if (shareWithAll) {
|
||||
hsa_status_t s = hsa_amd_agents_allow_access(g_deviceCnt+1, g_allAgents, NULL, ptr);
|
||||
tprintf (DB_MEM, " allow access to CPU + all %d GPUs (shareWithAll)\n", g_deviceCnt);
|
||||
if (s != HSA_STATUS_SUCCESS) {
|
||||
ret = -1;
|
||||
}
|
||||
} else {
|
||||
int peerCnt=0;
|
||||
{
|
||||
LockedAccessor_CtxCrit_t crit(ctx->criticalData());
|
||||
// the peerCnt always stores self so make sure the trace actually
|
||||
peerCnt = crit->peerCnt();
|
||||
tprintf(DB_MEM, " allow access to %d other peer(s)\n", peerCnt-1);
|
||||
if (peerCnt > 1) {
|
||||
|
||||
// TODOD - remove me:
|
||||
for (auto iter = crit->_peers.begin(); iter!=crit->_peers.end(); iter++) {
|
||||
tprintf (DB_MEM, " allow access to peer: %s%s\n", (*iter)->toString().c_str(), (iter == crit->_peers.begin()) ? " (self)":"");
|
||||
};
|
||||
//printf ("peer self access\n");
|
||||
|
||||
hsa_status_t s = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, ptr);
|
||||
if (s != HSA_STATUS_SUCCESS) {
|
||||
ret = -1;
|
||||
// TODOD - remove me:
|
||||
for (auto iter = crit->_peers.begin(); iter!=crit->_peers.end(); iter++) {
|
||||
tprintf (DB_MEM, " allow access to peer: %s%s\n", (*iter)->toString().c_str(), (iter == crit->_peers.begin()) ? " (self)":"");
|
||||
};
|
||||
|
||||
hsa_status_t s = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, ptr);
|
||||
if (s != HSA_STATUS_SUCCESS) {
|
||||
ret = -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -96,7 +105,7 @@ int sharePtr(void *ptr, ihipCtx_t *ctx, unsigned hipFlags)
|
||||
|
||||
// Allocate a new pointer with am_alloc and share with all valid peers.
|
||||
// Returns null-ptr if a memory error occurs (either allocation or sharing)
|
||||
void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, unsigned amFlags, unsigned hipFlags)
|
||||
void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, bool shareWithAll, unsigned amFlags, unsigned hipFlags)
|
||||
{
|
||||
|
||||
void *ptr = nullptr;
|
||||
@@ -108,7 +117,7 @@ void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, unsig
|
||||
msg, ptr, static_cast<char*>(ptr)+sizeBytes, sizeBytes, device->_deviceId);
|
||||
|
||||
if (ptr != nullptr) {
|
||||
int r = sharePtr(ptr, ctx, hipFlags);
|
||||
int r = sharePtr(ptr, ctx, shareWithAll, hipFlags);
|
||||
if (r != 0) {
|
||||
ptr = nullptr;
|
||||
}
|
||||
@@ -220,7 +229,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes)
|
||||
|
||||
} else {
|
||||
auto device = ctx->getWriteableDevice();
|
||||
*ptr = hip_internal::allocAndSharePtr("device_mem", sizeBytes, ctx, 0/*amFlags*/, 0/*hipFlags*/);
|
||||
*ptr = hip_internal::allocAndSharePtr("device_mem", sizeBytes, ctx, false/*shareWithAll*/, 0/*amFlags*/, 0/*hipFlags*/);
|
||||
|
||||
if(sizeBytes && (*ptr == NULL)){
|
||||
hip_status = hipErrorMemoryAllocation;
|
||||
@@ -253,7 +262,8 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags)
|
||||
} else {
|
||||
unsigned trueFlags = flags;
|
||||
if (flags == hipHostMallocDefault) {
|
||||
trueFlags = hipHostMallocMapped | hipHostMallocWriteCombined;
|
||||
// HCC/ROCM provide a modern system with unified memory and should set both of these flags by default:
|
||||
trueFlags = hipHostMallocMapped | hipHostMallocPortable;
|
||||
}
|
||||
|
||||
const unsigned supportedFlags = hipHostMallocPortable | hipHostMallocMapped | hipHostMallocWriteCombined;
|
||||
@@ -265,8 +275,10 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags)
|
||||
auto device = ctx->getWriteableDevice();
|
||||
unsigned amFlags = HIP_COHERENT_HOST_ALLOC ? amHostCoherent : amHostPinned;
|
||||
|
||||
|
||||
*ptr = hip_internal::allocAndSharePtr(HIP_COHERENT_HOST_ALLOC ? "finegrained_host":"pinned_host",
|
||||
sizeBytes, ctx, amFlags, flags);
|
||||
sizeBytes, ctx, (trueFlags & hipHostMallocPortable) /*shareWithAll*/, amFlags, flags);
|
||||
|
||||
if(sizeBytes && (*ptr == NULL)){
|
||||
hip_status = hipErrorMemoryAllocation;
|
||||
}
|
||||
@@ -314,7 +326,7 @@ hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height
|
||||
auto device = ctx->getWriteableDevice();
|
||||
|
||||
const unsigned am_flags = 0;
|
||||
*ptr = hip_internal::allocAndSharePtr("device_pitch", sizeBytes, ctx, am_flags, 0);
|
||||
*ptr = hip_internal::allocAndSharePtr("device_pitch", sizeBytes, ctx, false/*shareWithAll*/, am_flags, 0);
|
||||
|
||||
if (sizeBytes && (*ptr == NULL)) {
|
||||
hip_status = hipErrorMemoryAllocation;
|
||||
@@ -373,7 +385,7 @@ hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc,
|
||||
hip_status = hipErrorUnknown;
|
||||
break;
|
||||
}
|
||||
*ptr = hip_internal::allocAndSharePtr("device_array", allocSize, ctx, am_flags, 0);
|
||||
*ptr = hip_internal::allocAndSharePtr("device_array", allocSize, ctx, false/*shareWithAll*/, am_flags, 0);
|
||||
if (size && (*ptr == NULL)) {
|
||||
hip_status = hipErrorMemoryAllocation;
|
||||
}
|
||||
|
||||
@@ -99,7 +99,7 @@ inline int zrand(int max)
|
||||
|
||||
|
||||
//=================================================================================================
|
||||
// Functins to run tests
|
||||
// Functions to run tests
|
||||
//=================================================================================================
|
||||
//--
|
||||
//Run through a couple simple cases to test lookups and host pointer arithmetic:
|
||||
|
||||
@@ -25,45 +25,106 @@ THE SOFTWARE.
|
||||
|
||||
#include"test_common.h"
|
||||
|
||||
#define SIZE 1024*1024*256
|
||||
#define NUM_ELEMENTS 1024*1024*64
|
||||
#define SIZE NUM_ELEMENTS*sizeof(int)
|
||||
|
||||
int main(){
|
||||
float *Ad, *B, *Bd, *Bm, *C, *Cd, *ptr_0;
|
||||
B = (float*)malloc(SIZE);
|
||||
hipMalloc((void**)&Ad, SIZE);
|
||||
hipHostMalloc((void**)&B, SIZE);
|
||||
hipHostMalloc((void**)&Bd, SIZE, hipHostMallocDefault);
|
||||
hipHostMalloc((void**)&Bm, SIZE, hipHostMallocMapped);
|
||||
hipHostMalloc((void**)&C, SIZE, hipHostMallocMapped);
|
||||
|
||||
hipHostGetDevicePointer((void**)&Cd, C, 0/*flags*/);
|
||||
|
||||
HIPCHECK_API(hipMalloc((void**)&ptr_0,0), hipSuccess);
|
||||
|
||||
HIPCHECK_API(hipFree(Ad) , hipSuccess);
|
||||
HIPCHECK_API(hipHostFree(Ad) , hipErrorInvalidValue);
|
||||
|
||||
HIPCHECK_API(hipFree(B) , hipErrorInvalidDevicePointer); // try to hipFree on malloced memory
|
||||
HIPCHECK_API(hipFree(Bd) , hipErrorInvalidDevicePointer);
|
||||
HIPCHECK_API(hipFree(Bm) , hipErrorInvalidDevicePointer);
|
||||
HIPCHECK_API(hipFree(ptr_0) , hipSuccess);
|
||||
HIPCHECK_API(hipHostFree(Bd) , hipSuccess);
|
||||
HIPCHECK_API(hipHostFree(Bm) , hipSuccess);
|
||||
|
||||
HIPCHECK_API(hipFree(C) , hipErrorInvalidDevicePointer);
|
||||
HIPCHECK_API(hipHostFree(C) , hipSuccess);
|
||||
int p_count = 4;
|
||||
|
||||
|
||||
HIPCHECK_API(hipFree(NULL) , hipSuccess);
|
||||
HIPCHECK_API(hipHostFree(NULL) , hipSuccess);
|
||||
void multiGpuHostAlloc(int allocDevice)
|
||||
{
|
||||
|
||||
int numDevices;
|
||||
HIPCHECK(hipGetDeviceCount(&numDevices));
|
||||
assert(numDevices > 1);
|
||||
|
||||
printf ("info: trying multiGpuHostAlloc with allocDevice=%d numDevices=%d\n", allocDevice, numDevices);
|
||||
|
||||
|
||||
HIPCHECK(hipSetDevice(allocDevice));
|
||||
|
||||
int *Ah, *Ch;
|
||||
hipHostMalloc((void**)&Ah, SIZE);
|
||||
hipHostMalloc((void**)&Ch, SIZE);
|
||||
|
||||
const int init = -1;
|
||||
for (size_t i=0; i<NUM_ELEMENTS; i++) {
|
||||
Ah[i] = init;
|
||||
Ch[i] = -2;
|
||||
}
|
||||
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, NUM_ELEMENTS);
|
||||
|
||||
// The host memory allocations should be visible on all of the devices - verify by launching a kernel here that reads those devices:
|
||||
for (int i=0; i<numDevices; i++) {
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, 0/*_stream*/, Ah, Ch, NUM_ELEMENTS, p_count);
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
};
|
||||
|
||||
|
||||
int expected = init + p_count;
|
||||
for (size_t i=0; i<NUM_ELEMENTS; i++) {
|
||||
if (Ch[i] != expected) {
|
||||
failed("for Ch[%zu] (%d) != expected(%d)\n", i, Ch[i], expected);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int more_argc = HipTest::parseStandardArguments(argc, argv, false);
|
||||
//assert(more_argc == 0);
|
||||
|
||||
{
|
||||
// Some negative testing - request a too-big allocation and verify it fails:
|
||||
// Someday when we support virtual memory may need to refactor these:
|
||||
size_t tooBig = 128LL*1024*1024*1024*1024; // 128 TB;
|
||||
void *p;
|
||||
HIPCHECK_API ( hipMalloc(&p, tooBig), hipErrorMemoryAllocation );
|
||||
HIPCHECK_API ( hipHostMalloc(&p, tooBig), hipErrorMemoryAllocation );
|
||||
float *Ad, *B, *Bd, *Bm, *C, *Cd, *ptr_0;
|
||||
B = (float*)malloc(SIZE);
|
||||
hipMalloc((void**)&Ad, SIZE);
|
||||
hipHostMalloc((void**)&B, SIZE);
|
||||
hipHostMalloc((void**)&Bd, SIZE, hipHostMallocDefault);
|
||||
hipHostMalloc((void**)&Bm, SIZE, hipHostMallocMapped);
|
||||
hipHostMalloc((void**)&C, SIZE, hipHostMallocMapped);
|
||||
|
||||
hipHostGetDevicePointer((void**)&Cd, C, 0/*flags*/);
|
||||
|
||||
HIPCHECK_API(hipMalloc((void**)&ptr_0,0), hipSuccess);
|
||||
|
||||
HIPCHECK_API(hipFree(Ad) , hipSuccess);
|
||||
HIPCHECK_API(hipHostFree(Ad) , hipErrorInvalidValue);
|
||||
|
||||
HIPCHECK_API(hipFree(B) , hipErrorInvalidDevicePointer); // try to hipFree on malloced memory
|
||||
HIPCHECK_API(hipFree(Bd) , hipErrorInvalidDevicePointer);
|
||||
HIPCHECK_API(hipFree(Bm) , hipErrorInvalidDevicePointer);
|
||||
HIPCHECK_API(hipFree(ptr_0) , hipSuccess);
|
||||
HIPCHECK_API(hipHostFree(Bd) , hipSuccess);
|
||||
HIPCHECK_API(hipHostFree(Bm) , hipSuccess);
|
||||
|
||||
HIPCHECK_API(hipFree(C) , hipErrorInvalidDevicePointer);
|
||||
HIPCHECK_API(hipHostFree(C) , hipSuccess);
|
||||
|
||||
|
||||
HIPCHECK_API(hipFree(NULL) , hipSuccess);
|
||||
HIPCHECK_API(hipHostFree(NULL) , hipSuccess);
|
||||
|
||||
{
|
||||
// Some negative testing - request a too-big allocation and verify it fails:
|
||||
// Someday when we support virtual memory may need to refactor these:
|
||||
size_t tooBig = 128LL*1024*1024*1024*1024; // 128 TB;
|
||||
void *p;
|
||||
HIPCHECK_API ( hipMalloc(&p, tooBig), hipErrorMemoryAllocation );
|
||||
HIPCHECK_API ( hipHostMalloc(&p, tooBig), hipErrorMemoryAllocation );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
{
|
||||
int numDevices;
|
||||
HIPCHECK(hipGetDeviceCount(&numDevices));
|
||||
assert(numDevices > 1);
|
||||
|
||||
multiGpuHostAlloc(0);
|
||||
multiGpuHostAlloc(1);
|
||||
}
|
||||
|
||||
passed();
|
||||
|
||||
@@ -35,42 +35,6 @@ unsigned p_count = 100;
|
||||
|
||||
|
||||
|
||||
template <typename T>
|
||||
__global__ void
|
||||
addCount( const T *A_d,
|
||||
T *C_d,
|
||||
size_t NELEM,
|
||||
int count)
|
||||
{
|
||||
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
|
||||
size_t stride = hipBlockDim_x * hipGridDim_x ;
|
||||
|
||||
// Deliberately do this in an inefficient way to increase kernel runtime
|
||||
for (int i=0; i<count; i++) {
|
||||
for (size_t i=offset; i<NELEM; i+=stride) {
|
||||
C_d[i] = A_d[i] + (T)count;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
__global__ void
|
||||
addCountReverse( const T *A_d,
|
||||
T *C_d,
|
||||
int64_t NELEM,
|
||||
int count)
|
||||
{
|
||||
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
|
||||
size_t stride = hipBlockDim_x * hipGridDim_x ;
|
||||
|
||||
// Deliberately do this in an inefficient way to increase kernel runtime
|
||||
for (int i=0; i<count; i++) {
|
||||
for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) {
|
||||
C_d[i] = A_d[i] + (T)count;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//------
|
||||
@@ -171,9 +135,9 @@ void Streamer<T>::runAsyncAfter(Streamer<T> *depStreamer, bool waitSameStream)
|
||||
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements);
|
||||
if (_reverse) {
|
||||
hipLaunchKernelGGL(addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count);
|
||||
hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count);
|
||||
} else {
|
||||
hipLaunchKernelGGL(addCount, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count);
|
||||
hipLaunchKernelGGL(HipTest::addCount, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count);
|
||||
}
|
||||
HIPCHECK(hipEventRecord(_event, _stream));
|
||||
|
||||
|
||||
@@ -146,6 +146,44 @@ vectorADD(hipLaunchParm lp,
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
__global__ void
|
||||
addCount( const T *A_d,
|
||||
T *C_d,
|
||||
size_t NELEM,
|
||||
int count)
|
||||
{
|
||||
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
|
||||
size_t stride = hipBlockDim_x * hipGridDim_x ;
|
||||
|
||||
// Deliberately do this in an inefficient way to increase kernel runtime
|
||||
for (int i=0; i<count; i++) {
|
||||
for (size_t i=offset; i<NELEM; i+=stride) {
|
||||
C_d[i] = A_d[i] + (T)count;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
__global__ void
|
||||
addCountReverse( const T *A_d,
|
||||
T *C_d,
|
||||
int64_t NELEM,
|
||||
int count)
|
||||
{
|
||||
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
|
||||
size_t stride = hipBlockDim_x * hipGridDim_x ;
|
||||
|
||||
// Deliberately do this in an inefficient way to increase kernel runtime
|
||||
for (int i=0; i<count; i++) {
|
||||
for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) {
|
||||
C_d[i] = A_d[i] + (T)count;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
void initArraysForHost(T **A_h, T **B_h, T **C_h,
|
||||
size_t N, bool usePinnedHost=false)
|
||||
|
||||
Ссылка в новой задаче
Block a user