From ff9bed653584a34cf5fafa3545a1ad1f22c74ca2 Mon Sep 17 00:00:00 2001
From: Ben Sander <ben.sander@amd.com>
Date: Wed, 10 May 2017 17:32:25 -0500
Subject: [PATCH] hipHostMalloc allocation are mapped to all devices by
 default.

Support hipHostMallocPortable flag.
Default flags are hipHostMallocPortable | hipHostMallocMapped.

Also:
-refactor tests to move addCount and addCountReverse into HipTest
namespace.
-test multi-GPU host memory.
---
 hipamd/src/hip_hcc.cpp                        |   9 ++
 hipamd/src/hip_hcc_internal.h                 |   1 +
 hipamd/src/hip_memory.cpp                     |  58 ++++----
 hipamd/tests/src/hipPointerAttrib.cpp         |   2 +-
 .../runtimeApi/memory/hipMemoryAllocate.cpp   | 129 +++++++++++++-----
 .../runtimeApi/stream/hipStreamWaitEvent.cpp  |  40 +-----
 hipamd/tests/src/test_common.h                |  38 ++++++
 7 files changed, 181 insertions(+), 96 deletions(-)
diff --git a/hipamd/src/hip_hcc.cpp b/hipamd/src/hip_hcc.cpp
index 71d947488d..81a2079b5b 100644
--- a/hipamd/src/hip_hcc.cpp
+++ b/hipamd/src/hip_hcc.cpp
@@ -118,6 +118,7 @@ bool g_visible_device = false;
 unsigned g_deviceCnt;
 std::vector<int> g_hip_visible_devices;
 hsa_agent_t g_cpu_agent;
+hsa_agent_t *g_allAgents; // CPU agents + all the visible GPU agents.
 unsigned g_numLogicalThreads;
 
 std::atomic<int> g_lastShortTid(1);
@@ -1389,6 +1390,14 @@ void ihipInit()
             g_deviceCnt++;
         }
     }
+
+    g_allAgents = static_cast<hsa_agent_t*> (malloc((g_deviceCnt+1) * sizeof(hsa_agent_t)));
+    g_allAgents[0] = g_cpu_agent;
+    for (int i=0; i<g_deviceCnt; i++) {
+        g_allAgents[i+1] = g_deviceArray[i]->_hsaAgent;
+    }
+
+
     g_numLogicalThreads = std::thread::hardware_concurrency();
 
     // If HIP_VISIBLE_DEVICES is not set, make sure all devices are initialized
diff --git a/hipamd/src/hip_hcc_internal.h b/hipamd/src/hip_hcc_internal.h
index 9c17c6e98c..132f099ce8 100644
--- a/hipamd/src/hip_hcc_internal.h
+++ b/hipamd/src/hip_hcc_internal.h
@@ -826,6 +826,7 @@ private:  // Critical data, protected with locked access:
 extern std::once_flag hip_initialized;
 extern unsigned g_deviceCnt;
 extern hsa_agent_t g_cpu_agent ;   // the CPU agent.
+extern hsa_agent_t *g_allAgents; // CPU agents + all the visible GPU agents.
 
 //=================================================================================================
 // Extern functions:
diff --git a/hipamd/src/hip_memory.cpp b/hipamd/src/hip_memory.cpp
index 1ba698f461..c4f0f64e50 100644
--- a/hipamd/src/hip_memory.cpp
+++ b/hipamd/src/hip_memory.cpp
@@ -59,31 +59,40 @@ hipError_t memcpyAsync (void* dst, const void* src, size_t sizeBytes, hipMemcpyK
 }
 
 // return 0 on success or -1 on error:
-int sharePtr(void *ptr, ihipCtx_t *ctx, unsigned hipFlags)
+int sharePtr(void *ptr, ihipCtx_t *ctx, bool shareWithAll, unsigned hipFlags)
 {
     int ret = 0;
 
     auto device = ctx->getWriteableDevice();
 
     hc::am_memtracker_update(ptr, device->_deviceId, hipFlags);
-    int peerCnt=0;
-    {
-        LockedAccessor_CtxCrit_t crit(ctx->criticalData());
-        // the peerCnt always stores self so make sure the trace actually
-        peerCnt = crit->peerCnt();
-        tprintf(DB_MEM, "  allow access to %d other peer(s)\n", peerCnt-1);
-        if (peerCnt > 1) {
 
-            //printf ("peer self access\n");
+    if (shareWithAll) {
+        hsa_status_t s = hsa_amd_agents_allow_access(g_deviceCnt+1, g_allAgents, NULL, ptr);
+        tprintf (DB_MEM, "    allow access to CPU + all %d GPUs (shareWithAll)\n", g_deviceCnt); 
+        if (s != HSA_STATUS_SUCCESS) {
+            ret = -1;
+        }
+    } else {
+        int peerCnt=0;
+        {
+            LockedAccessor_CtxCrit_t crit(ctx->criticalData());
+            // the peerCnt always stores self so make sure the trace actually
+            peerCnt = crit->peerCnt();
+            tprintf(DB_MEM, "  allow access to %d other peer(s)\n", peerCnt-1);
+            if (peerCnt > 1) {
 
-            // TODOD - remove me:
-            for (auto iter = crit->_peers.begin(); iter!=crit->_peers.end(); iter++) {
-                tprintf (DB_MEM, "    allow access to peer: %s%s\n", (*iter)->toString().c_str(), (iter == crit->_peers.begin()) ? " (self)":"");
-            };
+                //printf ("peer self access\n");
 
-            hsa_status_t s = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, ptr);
-            if (s != HSA_STATUS_SUCCESS) {
-                ret = -1;
+                // TODOD - remove me:
+                for (auto iter = crit->_peers.begin(); iter!=crit->_peers.end(); iter++) {
+                    tprintf (DB_MEM, "    allow access to peer: %s%s\n", (*iter)->toString().c_str(), (iter == crit->_peers.begin()) ? " (self)":"");
+                };
+
+                hsa_status_t s = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, ptr);
+                if (s != HSA_STATUS_SUCCESS) {
+                    ret = -1;
+                }
             }
         }
     }
@@ -96,7 +105,7 @@ int sharePtr(void *ptr, ihipCtx_t *ctx, unsigned hipFlags)
 
 // Allocate a new pointer with am_alloc and share with all valid peers.
 // Returns null-ptr if a memory error occurs (either allocation or sharing)
-void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, unsigned amFlags, unsigned hipFlags)
+void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, bool shareWithAll, unsigned amFlags, unsigned hipFlags)
 {
 
     void *ptr = nullptr;
@@ -108,7 +117,7 @@ void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, unsig
             msg, ptr, static_cast<char*>(ptr)+sizeBytes, sizeBytes, device->_deviceId);
 
     if (ptr != nullptr) {
-        int r = sharePtr(ptr, ctx, hipFlags);
+        int r = sharePtr(ptr, ctx, shareWithAll, hipFlags);
         if (r != 0) {
             ptr = nullptr;
         }
@@ -220,7 +229,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes)
 
     } else {
         auto device = ctx->getWriteableDevice();
-        *ptr = hip_internal::allocAndSharePtr("device_mem", sizeBytes, ctx,  0/*amFlags*/, 0/*hipFlags*/);
+        *ptr = hip_internal::allocAndSharePtr("device_mem", sizeBytes, ctx,  false/*shareWithAll*/, 0/*amFlags*/, 0/*hipFlags*/);
 
         if(sizeBytes  && (*ptr == NULL)){
             hip_status = hipErrorMemoryAllocation;
@@ -253,7 +262,8 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags)
     } else {
         unsigned trueFlags = flags;
         if (flags == hipHostMallocDefault) {
-            trueFlags = hipHostMallocMapped | hipHostMallocWriteCombined;
+            // HCC/ROCM provide a modern system with unified memory and should set both of these flags by default:
+            trueFlags = hipHostMallocMapped | hipHostMallocPortable;
         }
 
         const unsigned supportedFlags = hipHostMallocPortable | hipHostMallocMapped | hipHostMallocWriteCombined;
@@ -265,8 +275,10 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags)
             auto device = ctx->getWriteableDevice();
             unsigned amFlags = HIP_COHERENT_HOST_ALLOC ? amHostCoherent : amHostPinned;
 
+
             *ptr = hip_internal::allocAndSharePtr(HIP_COHERENT_HOST_ALLOC ? "finegrained_host":"pinned_host",
-                                                  sizeBytes, ctx, amFlags, flags);
+                                                  sizeBytes, ctx, (trueFlags & hipHostMallocPortable) /*shareWithAll*/, amFlags, flags);
+
             if(sizeBytes  && (*ptr == NULL)){
                 hip_status = hipErrorMemoryAllocation;
             }
@@ -314,7 +326,7 @@ hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height
         auto device = ctx->getWriteableDevice();
 
         const unsigned am_flags = 0;
-        *ptr = hip_internal::allocAndSharePtr("device_pitch", sizeBytes, ctx, am_flags, 0);
+        *ptr = hip_internal::allocAndSharePtr("device_pitch", sizeBytes, ctx, false/*shareWithAll*/, am_flags, 0);
 
         if (sizeBytes && (*ptr == NULL)) {
             hip_status = hipErrorMemoryAllocation;
@@ -373,7 +385,7 @@ hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc,
                 hip_status = hipErrorUnknown;
                 break;
         }
-        *ptr = hip_internal::allocAndSharePtr("device_array", allocSize, ctx, am_flags, 0);
+        *ptr = hip_internal::allocAndSharePtr("device_array", allocSize, ctx, false/*shareWithAll*/, am_flags, 0);
         if (size && (*ptr == NULL)) {
             hip_status = hipErrorMemoryAllocation;
         }
diff --git a/hipamd/tests/src/hipPointerAttrib.cpp b/hipamd/tests/src/hipPointerAttrib.cpp
index fb7832d9a6..7a2ab64bea 100644
--- a/hipamd/tests/src/hipPointerAttrib.cpp
+++ b/hipamd/tests/src/hipPointerAttrib.cpp
@@ -99,7 +99,7 @@ inline int zrand(int max)
 
 
 //=================================================================================================
-// Functins to run tests
+// Functions to run tests
 //=================================================================================================
 //--
 //Run through a couple simple cases to test lookups and host pointer arithmetic:
diff --git a/hipamd/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp b/hipamd/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp
index 0a256d6362..1ee5cbc9bb 100644
--- a/hipamd/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp
+++ b/hipamd/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp
@@ -25,45 +25,106 @@ THE SOFTWARE.
 
 #include"test_common.h"
 
-#define SIZE 1024*1024*256
+#define NUM_ELEMENTS 1024*1024*64
+#define SIZE NUM_ELEMENTS*sizeof(int)
 
-int main(){
-    float *Ad, *B, *Bd, *Bm, *C, *Cd, *ptr_0;
-    B = (float*)malloc(SIZE);
-    hipMalloc((void**)&Ad, SIZE);
-    hipHostMalloc((void**)&B, SIZE);
-    hipHostMalloc((void**)&Bd, SIZE, hipHostMallocDefault);
-    hipHostMalloc((void**)&Bm, SIZE, hipHostMallocMapped);
-    hipHostMalloc((void**)&C, SIZE, hipHostMallocMapped);
-
-    hipHostGetDevicePointer((void**)&Cd, C, 0/*flags*/);
-
-    HIPCHECK_API(hipMalloc((void**)&ptr_0,0), hipSuccess);
-
-    HIPCHECK_API(hipFree(Ad) ,  hipSuccess);
-    HIPCHECK_API(hipHostFree(Ad) , hipErrorInvalidValue);
-
-    HIPCHECK_API(hipFree(B)  , hipErrorInvalidDevicePointer); // try to hipFree on malloced memory
-    HIPCHECK_API(hipFree(Bd) , hipErrorInvalidDevicePointer);
-    HIPCHECK_API(hipFree(Bm) , hipErrorInvalidDevicePointer);
-    HIPCHECK_API(hipFree(ptr_0) , hipSuccess);
-    HIPCHECK_API(hipHostFree(Bd) , hipSuccess);
-    HIPCHECK_API(hipHostFree(Bm) , hipSuccess);
-
-    HIPCHECK_API(hipFree(C) , hipErrorInvalidDevicePointer);
-    HIPCHECK_API(hipHostFree(C) , hipSuccess);
+int p_count = 4;
 
 
-    HIPCHECK_API(hipFree(NULL) , hipSuccess);
-    HIPCHECK_API(hipHostFree(NULL) , hipSuccess);
+void multiGpuHostAlloc(int allocDevice)
+{
+
+    int numDevices;
+    HIPCHECK(hipGetDeviceCount(&numDevices));
+    assert(numDevices > 1);
+
+    printf ("info: trying multiGpuHostAlloc with allocDevice=%d numDevices=%d\n", allocDevice, numDevices);
+
+
+    HIPCHECK(hipSetDevice(allocDevice));
+
+    int *Ah, *Ch;
+    hipHostMalloc((void**)&Ah, SIZE);
+    hipHostMalloc((void**)&Ch, SIZE);
+
+    const int init = -1;
+    for (size_t i=0; i<NUM_ELEMENTS; i++) {
+        Ah[i] = init;
+        Ch[i] = -2;
+    }
+
+    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, NUM_ELEMENTS);
+
+    // The host memory allocations should be visible on all of the devices - verify by launching a kernel here that reads those devices:
+    for (int i=0; i<numDevices; i++)  {
+        HIPCHECK(hipSetDevice(i));
+        hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, 0/*_stream*/,    Ah, Ch, NUM_ELEMENTS, p_count);
+        HIPCHECK(hipDeviceSynchronize());
+    };
+
+
+    int expected = init + p_count;
+    for (size_t i=0; i<NUM_ELEMENTS; i++) {
+        if (Ch[i] != expected) {
+            failed("for Ch[%zu] (%d)  !=  expected(%d)\n", i, Ch[i], expected);
+        }
+    }
+}
+
+
+int main(int argc, char *argv[])
+{
+    int more_argc = HipTest::parseStandardArguments(argc, argv, false);
+    //assert(more_argc == 0);
 
     {
-        // Some negative testing - request a too-big allocation and verify it fails:
-        // Someday when we support virtual memory may need to refactor these:
-        size_t tooBig = 128LL*1024*1024*1024*1024;  // 128 TB;
-        void *p;
-        HIPCHECK_API ( hipMalloc(&p, tooBig),  hipErrorMemoryAllocation );
-        HIPCHECK_API ( hipHostMalloc(&p, tooBig),  hipErrorMemoryAllocation );
+        float *Ad, *B, *Bd, *Bm, *C, *Cd, *ptr_0;
+        B = (float*)malloc(SIZE);
+        hipMalloc((void**)&Ad, SIZE);
+        hipHostMalloc((void**)&B, SIZE);
+        hipHostMalloc((void**)&Bd, SIZE, hipHostMallocDefault);
+        hipHostMalloc((void**)&Bm, SIZE, hipHostMallocMapped);
+        hipHostMalloc((void**)&C, SIZE, hipHostMallocMapped);
+
+        hipHostGetDevicePointer((void**)&Cd, C, 0/*flags*/);
+
+        HIPCHECK_API(hipMalloc((void**)&ptr_0,0), hipSuccess);
+
+        HIPCHECK_API(hipFree(Ad) ,  hipSuccess);
+        HIPCHECK_API(hipHostFree(Ad) , hipErrorInvalidValue);
+
+        HIPCHECK_API(hipFree(B)  , hipErrorInvalidDevicePointer); // try to hipFree on malloced memory
+        HIPCHECK_API(hipFree(Bd) , hipErrorInvalidDevicePointer);
+        HIPCHECK_API(hipFree(Bm) , hipErrorInvalidDevicePointer);
+        HIPCHECK_API(hipFree(ptr_0) , hipSuccess);
+        HIPCHECK_API(hipHostFree(Bd) , hipSuccess);
+        HIPCHECK_API(hipHostFree(Bm) , hipSuccess);
+
+        HIPCHECK_API(hipFree(C) , hipErrorInvalidDevicePointer);
+        HIPCHECK_API(hipHostFree(C) , hipSuccess);
+
+
+        HIPCHECK_API(hipFree(NULL) , hipSuccess);
+        HIPCHECK_API(hipHostFree(NULL) , hipSuccess);
+
+        {
+            // Some negative testing - request a too-big allocation and verify it fails:
+            // Someday when we support virtual memory may need to refactor these:
+            size_t tooBig = 128LL*1024*1024*1024*1024;  // 128 TB;
+            void *p;
+            HIPCHECK_API ( hipMalloc(&p, tooBig),  hipErrorMemoryAllocation );
+            HIPCHECK_API ( hipHostMalloc(&p, tooBig),  hipErrorMemoryAllocation );
+        }
+    }
+
+
+    {
+        int numDevices;
+        HIPCHECK(hipGetDeviceCount(&numDevices));
+        assert(numDevices > 1);
+
+        multiGpuHostAlloc(0);
+        multiGpuHostAlloc(1);
     }
 
     passed();
diff --git a/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp b/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp
index adf0d4af0c..80ff7ad98d 100644
--- a/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp
+++ b/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp
@@ -35,42 +35,6 @@ unsigned p_count = 100;
 
 
 
-template <typename T>
-__global__ void
-addCount( const T *A_d,
-        T *C_d,
-        size_t NELEM,
-        int count)
-{
-    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
-    size_t stride = hipBlockDim_x * hipGridDim_x ;
-
-    // Deliberately do this in an inefficient way to increase kernel runtime
-    for (int i=0; i<count; i++) {
-        for (size_t i=offset; i<NELEM; i+=stride) {
-            C_d[i] = A_d[i] + (T)count;
-        }
-    }
-}
-
-
-template <typename T>
-__global__ void
-addCountReverse( const T *A_d,
-        T *C_d,
-        int64_t NELEM,
-        int count)
-{
-    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
-    size_t stride = hipBlockDim_x * hipGridDim_x ;
-
-    // Deliberately do this in an inefficient way to increase kernel runtime
-    for (int i=0; i<count; i++) {
-        for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) {
-            C_d[i] = A_d[i] + (T)count;
-        }
-    }
-}
 
 
 //------
@@ -171,9 +135,9 @@ void Streamer<T>::runAsyncAfter(Streamer<T> *depStreamer, bool waitSameStream)
 
     unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements);
     if (_reverse) {
-        hipLaunchKernelGGL(addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream,    _A_d, _C_d, _numElements, p_count);
+        hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream,    _A_d, _C_d, _numElements, p_count);
     } else {
-        hipLaunchKernelGGL(addCount,         dim3(blocks), dim3(threadsPerBlock), 0, _stream,    _A_d, _C_d, _numElements, p_count);
+        hipLaunchKernelGGL(HipTest::addCount,         dim3(blocks), dim3(threadsPerBlock), 0, _stream,    _A_d, _C_d, _numElements, p_count);
     }
     HIPCHECK(hipEventRecord(_event, _stream));
 
diff --git a/hipamd/tests/src/test_common.h b/hipamd/tests/src/test_common.h
index 1250de4801..633ee6f825 100644
--- a/hipamd/tests/src/test_common.h
+++ b/hipamd/tests/src/test_common.h
@@ -146,6 +146,44 @@ vectorADD(hipLaunchParm lp,
 }
 
 
+template <typename T>
+__global__ void
+addCount( const T *A_d,
+        T *C_d,
+        size_t NELEM,
+        int count)
+{
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;
+
+    // Deliberately do this in an inefficient way to increase kernel runtime
+    for (int i=0; i<count; i++) {
+        for (size_t i=offset; i<NELEM; i+=stride) {
+            C_d[i] = A_d[i] + (T)count;
+        }
+    }
+}
+
+
+template <typename T>
+__global__ void
+addCountReverse( const T *A_d,
+        T *C_d,
+        int64_t NELEM,
+        int count)
+{
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;
+
+    // Deliberately do this in an inefficient way to increase kernel runtime
+    for (int i=0; i<count; i++) {
+        for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) {
+            C_d[i] = A_d[i] + (T)count;
+        }
+    }
+}
+
+
 template <typename T>
 void initArraysForHost(T **A_h, T **B_h, T **C_h,
                 size_t N, bool usePinnedHost=false)