hipHostMalloc allocation are mapped to all devices by default.

Support hipHostMallocPortable flag. Default flags are hipHostMallocPortable | hipHostMallocMapped. Also: -refactor tests to move addCount and addCountReverse into HipTest namespace. -test multi-GPU host memory.
2017-05-10 17:32:25 -05:00
@@ -118,6 +118,7 @@ bool g_visible_device = false;
 unsigned g_deviceCnt;
 std::vector<int> g_hip_visible_devices;
 hsa_agent_t g_cpu_agent;
+hsa_agent_t *g_allAgents; // CPU agents + all the visible GPU agents.
 unsigned g_numLogicalThreads;

 std::atomic<int> g_lastShortTid(1);
@@ -1389,6 +1390,14 @@ void ihipInit()
            g_deviceCnt++;
        }
    }
+
+    g_allAgents = static_cast<hsa_agent_t*> (malloc((g_deviceCnt+1) * sizeof(hsa_agent_t)));
+    g_allAgents[0] = g_cpu_agent;
+    for (int i=0; i<g_deviceCnt; i++) {
+        g_allAgents[i+1] = g_deviceArray[i]->_hsaAgent;
+    }
+
+
    g_numLogicalThreads = std::thread::hardware_concurrency();

    // If HIP_VISIBLE_DEVICES is not set, make sure all devices are initialized
@@ -826,6 +826,7 @@ private:  // Critical data, protected with locked access:
 extern std::once_flag hip_initialized;
 extern unsigned g_deviceCnt;
 extern hsa_agent_t g_cpu_agent ;   // the CPU agent.
+extern hsa_agent_t *g_allAgents; // CPU agents + all the visible GPU agents.

 //=================================================================================================
 // Extern functions:
@@ -59,31 +59,40 @@ hipError_t memcpyAsync (void* dst, const void* src, size_t sizeBytes, hipMemcpyK
 }

 // return 0 on success or -1 on error:
-int sharePtr(void *ptr, ihipCtx_t *ctx, unsigned hipFlags)
+int sharePtr(void *ptr, ihipCtx_t *ctx, bool shareWithAll, unsigned hipFlags)
 {
    int ret = 0;

    auto device = ctx->getWriteableDevice();

    hc::am_memtracker_update(ptr, device->_deviceId, hipFlags);
-    int peerCnt=0;
-    {
-        LockedAccessor_CtxCrit_t crit(ctx->criticalData());
-        // the peerCnt always stores self so make sure the trace actually
-        peerCnt = crit->peerCnt();
-        tprintf(DB_MEM, "  allow access to %d other peer(s)\n", peerCnt-1);
-        if (peerCnt > 1) {

-            //printf ("peer self access\n");
+    if (shareWithAll) {
+        hsa_status_t s = hsa_amd_agents_allow_access(g_deviceCnt+1, g_allAgents, NULL, ptr);
+        tprintf (DB_MEM, "    allow access to CPU + all %d GPUs (shareWithAll)\n", g_deviceCnt); 
+        if (s != HSA_STATUS_SUCCESS) {
+            ret = -1;
+        }
+    } else {
+        int peerCnt=0;
+        {
+            LockedAccessor_CtxCrit_t crit(ctx->criticalData());
+            // the peerCnt always stores self so make sure the trace actually
+            peerCnt = crit->peerCnt();
+            tprintf(DB_MEM, "  allow access to %d other peer(s)\n", peerCnt-1);
+            if (peerCnt > 1) {

-            // TODOD - remove me:
-            for (auto iter = crit->_peers.begin(); iter!=crit->_peers.end(); iter++) {
-                tprintf (DB_MEM, "    allow access to peer: %s%s\n", (*iter)->toString().c_str(), (iter == crit->_peers.begin()) ? " (self)":"");
-            };
+                //printf ("peer self access\n");

-            hsa_status_t s = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, ptr);
-            if (s != HSA_STATUS_SUCCESS) {
-                ret = -1;
+                // TODOD - remove me:
+                for (auto iter = crit->_peers.begin(); iter!=crit->_peers.end(); iter++) {
+                    tprintf (DB_MEM, "    allow access to peer: %s%s\n", (*iter)->toString().c_str(), (iter == crit->_peers.begin()) ? " (self)":"");
+                };
+
+                hsa_status_t s = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, ptr);
+                if (s != HSA_STATUS_SUCCESS) {
+                    ret = -1;
+                }
            }
        }
    }
@@ -96,7 +105,7 @@ int sharePtr(void *ptr, ihipCtx_t *ctx, unsigned hipFlags)

 // Allocate a new pointer with am_alloc and share with all valid peers.
 // Returns null-ptr if a memory error occurs (either allocation or sharing)
-void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, unsigned amFlags, unsigned hipFlags)
+void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, bool shareWithAll, unsigned amFlags, unsigned hipFlags)
 {

    void *ptr = nullptr;
@@ -108,7 +117,7 @@ void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, unsig
            msg, ptr, static_cast<char*>(ptr)+sizeBytes, sizeBytes, device->_deviceId);

    if (ptr != nullptr) {
-        int r = sharePtr(ptr, ctx, hipFlags);
+        int r = sharePtr(ptr, ctx, shareWithAll, hipFlags);
        if (r != 0) {
            ptr = nullptr;
        }
@@ -220,7 +229,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes)

    } else {
        auto device = ctx->getWriteableDevice();
-        *ptr = hip_internal::allocAndSharePtr("device_mem", sizeBytes, ctx,  0/*amFlags*/, 0/*hipFlags*/);
+        *ptr = hip_internal::allocAndSharePtr("device_mem", sizeBytes, ctx,  false/*shareWithAll*/, 0/*amFlags*/, 0/*hipFlags*/);

        if(sizeBytes  && (*ptr == NULL)){
            hip_status = hipErrorMemoryAllocation;
@@ -253,7 +262,8 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags)
    } else {
        unsigned trueFlags = flags;
        if (flags == hipHostMallocDefault) {
-            trueFlags = hipHostMallocMapped | hipHostMallocWriteCombined;
+            // HCC/ROCM provide a modern system with unified memory and should set both of these flags by default:
+            trueFlags = hipHostMallocMapped | hipHostMallocPortable;
        }

        const unsigned supportedFlags = hipHostMallocPortable | hipHostMallocMapped | hipHostMallocWriteCombined;
@@ -265,8 +275,10 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags)
            auto device = ctx->getWriteableDevice();
            unsigned amFlags = HIP_COHERENT_HOST_ALLOC ? amHostCoherent : amHostPinned;

+
            *ptr = hip_internal::allocAndSharePtr(HIP_COHERENT_HOST_ALLOC ? "finegrained_host":"pinned_host",
-                                                  sizeBytes, ctx, amFlags, flags);
+                                                  sizeBytes, ctx, (trueFlags & hipHostMallocPortable) /*shareWithAll*/, amFlags, flags);
+
            if(sizeBytes  && (*ptr == NULL)){
                hip_status = hipErrorMemoryAllocation;
            }
@@ -314,7 +326,7 @@ hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height
        auto device = ctx->getWriteableDevice();

        const unsigned am_flags = 0;
-        *ptr = hip_internal::allocAndSharePtr("device_pitch", sizeBytes, ctx, am_flags, 0);
+        *ptr = hip_internal::allocAndSharePtr("device_pitch", sizeBytes, ctx, false/*shareWithAll*/, am_flags, 0);

        if (sizeBytes && (*ptr == NULL)) {
            hip_status = hipErrorMemoryAllocation;
@@ -373,7 +385,7 @@ hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc,
                hip_status = hipErrorUnknown;
                break;
        }
-        *ptr = hip_internal::allocAndSharePtr("device_array", allocSize, ctx, am_flags, 0);
+        *ptr = hip_internal::allocAndSharePtr("device_array", allocSize, ctx, false/*shareWithAll*/, am_flags, 0);
        if (size && (*ptr == NULL)) {
            hip_status = hipErrorMemoryAllocation;
        }
@@ -99,7 +99,7 @@ inline int zrand(int max)


 //=================================================================================================
-// Functins to run tests
+// Functions to run tests
 //=================================================================================================
 //--
 //Run through a couple simple cases to test lookups and host pointer arithmetic:
@@ -25,45 +25,106 @@ THE SOFTWARE.

 #include"test_common.h"

-#define SIZE 1024*1024*256
+#define NUM_ELEMENTS 1024*1024*64
+#define SIZE NUM_ELEMENTS*sizeof(int)

-int main(){
-    float *Ad, *B, *Bd, *Bm, *C, *Cd, *ptr_0;
-    B = (float*)malloc(SIZE);
-    hipMalloc((void**)&Ad, SIZE);
-    hipHostMalloc((void**)&B, SIZE);
-    hipHostMalloc((void**)&Bd, SIZE, hipHostMallocDefault);
-    hipHostMalloc((void**)&Bm, SIZE, hipHostMallocMapped);
-    hipHostMalloc((void**)&C, SIZE, hipHostMallocMapped);
-
-    hipHostGetDevicePointer((void**)&Cd, C, 0/*flags*/);
-
-    HIPCHECK_API(hipMalloc((void**)&ptr_0,0), hipSuccess);
-
-    HIPCHECK_API(hipFree(Ad) ,  hipSuccess);
-    HIPCHECK_API(hipHostFree(Ad) , hipErrorInvalidValue);
-
-    HIPCHECK_API(hipFree(B)  , hipErrorInvalidDevicePointer); // try to hipFree on malloced memory
-    HIPCHECK_API(hipFree(Bd) , hipErrorInvalidDevicePointer);
-    HIPCHECK_API(hipFree(Bm) , hipErrorInvalidDevicePointer);
-    HIPCHECK_API(hipFree(ptr_0) , hipSuccess);
-    HIPCHECK_API(hipHostFree(Bd) , hipSuccess);
-    HIPCHECK_API(hipHostFree(Bm) , hipSuccess);
-
-    HIPCHECK_API(hipFree(C) , hipErrorInvalidDevicePointer);
-    HIPCHECK_API(hipHostFree(C) , hipSuccess);
+int p_count = 4;


-    HIPCHECK_API(hipFree(NULL) , hipSuccess);
-    HIPCHECK_API(hipHostFree(NULL) , hipSuccess);
+void multiGpuHostAlloc(int allocDevice)
+{
+
+    int numDevices;
+    HIPCHECK(hipGetDeviceCount(&numDevices));
+    assert(numDevices > 1);
+
+    printf ("info: trying multiGpuHostAlloc with allocDevice=%d numDevices=%d\n", allocDevice, numDevices);
+
+
+    HIPCHECK(hipSetDevice(allocDevice));
+
+    int *Ah, *Ch;
+    hipHostMalloc((void**)&Ah, SIZE);
+    hipHostMalloc((void**)&Ch, SIZE);
+
+    const int init = -1;
+    for (size_t i=0; i<NUM_ELEMENTS; i++) {
+        Ah[i] = init;
+        Ch[i] = -2;
+    }
+
+    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, NUM_ELEMENTS);
+
+    // The host memory allocations should be visible on all of the devices - verify by launching a kernel here that reads those devices:
+    for (int i=0; i<numDevices; i++)  {
+        HIPCHECK(hipSetDevice(i));
+        hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, 0/*_stream*/,    Ah, Ch, NUM_ELEMENTS, p_count);
+        HIPCHECK(hipDeviceSynchronize());
+    };
+
+
+    int expected = init + p_count;
+    for (size_t i=0; i<NUM_ELEMENTS; i++) {
+        if (Ch[i] != expected) {
+            failed("for Ch[%zu] (%d)  !=  expected(%d)\n", i, Ch[i], expected);
+        }
+    }
+}
+
+
+int main(int argc, char *argv[])
+{
+    int more_argc = HipTest::parseStandardArguments(argc, argv, false);
+    //assert(more_argc == 0);

    {
-        // Some negative testing - request a too-big allocation and verify it fails:
-        // Someday when we support virtual memory may need to refactor these:
-        size_t tooBig = 128LL*1024*1024*1024*1024;  // 128 TB;
-        void *p;
-        HIPCHECK_API ( hipMalloc(&p, tooBig),  hipErrorMemoryAllocation );
-        HIPCHECK_API ( hipHostMalloc(&p, tooBig),  hipErrorMemoryAllocation );
+        float *Ad, *B, *Bd, *Bm, *C, *Cd, *ptr_0;
+        B = (float*)malloc(SIZE);
+        hipMalloc((void**)&Ad, SIZE);
+        hipHostMalloc((void**)&B, SIZE);
+        hipHostMalloc((void**)&Bd, SIZE, hipHostMallocDefault);
+        hipHostMalloc((void**)&Bm, SIZE, hipHostMallocMapped);
+        hipHostMalloc((void**)&C, SIZE, hipHostMallocMapped);
+
+        hipHostGetDevicePointer((void**)&Cd, C, 0/*flags*/);
+
+        HIPCHECK_API(hipMalloc((void**)&ptr_0,0), hipSuccess);
+
+        HIPCHECK_API(hipFree(Ad) ,  hipSuccess);
+        HIPCHECK_API(hipHostFree(Ad) , hipErrorInvalidValue);
+
+        HIPCHECK_API(hipFree(B)  , hipErrorInvalidDevicePointer); // try to hipFree on malloced memory
+        HIPCHECK_API(hipFree(Bd) , hipErrorInvalidDevicePointer);
+        HIPCHECK_API(hipFree(Bm) , hipErrorInvalidDevicePointer);
+        HIPCHECK_API(hipFree(ptr_0) , hipSuccess);
+        HIPCHECK_API(hipHostFree(Bd) , hipSuccess);
+        HIPCHECK_API(hipHostFree(Bm) , hipSuccess);
+
+        HIPCHECK_API(hipFree(C) , hipErrorInvalidDevicePointer);
+        HIPCHECK_API(hipHostFree(C) , hipSuccess);
+
+
+        HIPCHECK_API(hipFree(NULL) , hipSuccess);
+        HIPCHECK_API(hipHostFree(NULL) , hipSuccess);
+
+        {
+            // Some negative testing - request a too-big allocation and verify it fails:
+            // Someday when we support virtual memory may need to refactor these:
+            size_t tooBig = 128LL*1024*1024*1024*1024;  // 128 TB;
+            void *p;
+            HIPCHECK_API ( hipMalloc(&p, tooBig),  hipErrorMemoryAllocation );
+            HIPCHECK_API ( hipHostMalloc(&p, tooBig),  hipErrorMemoryAllocation );
+        }
+    }
+
+
+    {
+        int numDevices;
+        HIPCHECK(hipGetDeviceCount(&numDevices));
+        assert(numDevices > 1);
+
+        multiGpuHostAlloc(0);
+        multiGpuHostAlloc(1);
    }

    passed();
@@ -35,42 +35,6 @@ unsigned p_count = 100;



-template <typename T>
-__global__ void
-addCount( const T *A_d,
-        T *C_d,
-        size_t NELEM,
-        int count)
-{
-    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
-    size_t stride = hipBlockDim_x * hipGridDim_x ;
-
-    // Deliberately do this in an inefficient way to increase kernel runtime
-    for (int i=0; i<count; i++) {
-        for (size_t i=offset; i<NELEM; i+=stride) {
-            C_d[i] = A_d[i] + (T)count;
-        }
-    }
-}
-
-
-template <typename T>
-__global__ void
-addCountReverse( const T *A_d,
-        T *C_d,
-        int64_t NELEM,
-        int count)
-{
-    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
-    size_t stride = hipBlockDim_x * hipGridDim_x ;
-
-    // Deliberately do this in an inefficient way to increase kernel runtime
-    for (int i=0; i<count; i++) {
-        for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) {
-            C_d[i] = A_d[i] + (T)count;
-        }
-    }
-}


 //------
@@ -171,9 +135,9 @@ void Streamer<T>::runAsyncAfter(Streamer<T> *depStreamer, bool waitSameStream)

    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements);
    if (_reverse) {
-        hipLaunchKernelGGL(addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream,    _A_d, _C_d, _numElements, p_count);
+        hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream,    _A_d, _C_d, _numElements, p_count);
    } else {
-        hipLaunchKernelGGL(addCount,         dim3(blocks), dim3(threadsPerBlock), 0, _stream,    _A_d, _C_d, _numElements, p_count);
+        hipLaunchKernelGGL(HipTest::addCount,         dim3(blocks), dim3(threadsPerBlock), 0, _stream,    _A_d, _C_d, _numElements, p_count);
    }
    HIPCHECK(hipEventRecord(_event, _stream));

@@ -146,6 +146,44 @@ vectorADD(hipLaunchParm lp,
 }


+template <typename T>
+__global__ void
+addCount( const T *A_d,
+        T *C_d,
+        size_t NELEM,
+        int count)
+{
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;
+
+    // Deliberately do this in an inefficient way to increase kernel runtime
+    for (int i=0; i<count; i++) {
+        for (size_t i=offset; i<NELEM; i+=stride) {
+            C_d[i] = A_d[i] + (T)count;
+        }
+    }
+}
+
+
+template <typename T>
+__global__ void
+addCountReverse( const T *A_d,
+        T *C_d,
+        int64_t NELEM,
+        int count)
+{
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;
+
+    // Deliberately do this in an inefficient way to increase kernel runtime
+    for (int i=0; i<count; i++) {
+        for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) {
+            C_d[i] = A_d[i] + (T)count;
+        }
+    }
+}
+
+
 template <typename T>
 void initArraysForHost(T **A_h, T **B_h, T **C_h,
                size_t N, bool usePinnedHost=false)