P4 to Git Change 2026152 by cpaquot@cpaquot-ocl-lc-lnx on 2019/11/06 17:50:08

SWDEV-206239 - [HIP] RCCL: finegrain VRAM does not work
	Implemented fine grained VRAM allocation via ATOMICS.

Affected files ...

... //depot/stg/opencl/drivers/opencl/api/hip/hip_memory.cpp#84 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#138 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.hpp#43 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocmemory.cpp#45 edit


[ROCm/clr commit: 2506752348]
This commit is contained in:
foreman
2019-11-06 18:00:44 -05:00
parent a1ebb70b22
commit 8a2d0028ba
3 changed files with 22 additions and 7 deletions
@@ -155,6 +155,7 @@ Device::Device(hsa_agent_t bkendDevice)
system_segment_.handle = 0;
system_coarse_segment_.handle = 0;
gpuvm_segment_.handle = 0;
gpu_fine_grained_segment_.handle = 0;
}
Device::~Device() {
@@ -861,7 +862,18 @@ hsa_status_t Device::iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, vo
switch (segment_type) {
case HSA_REGION_SEGMENT_GLOBAL: {
if (dev->settings().enableLocalMemory_) {
dev->gpuvm_segment_ = pool;
uint32_t global_flag = 0;
hsa_status_t stat =
hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag);
if (stat != HSA_STATUS_SUCCESS) {
return stat;
}
if ((global_flag & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) != 0) {
dev->gpu_fine_grained_segment_ = pool;
} else {
dev->gpuvm_segment_ = pool;
}
}
break;
}
@@ -1677,13 +1689,15 @@ void* Device::hostAlloc(size_t size, size_t alignment, bool atomics) const {
void Device::hostFree(void* ptr, size_t size) const { memFree(ptr, size); }
void* Device::deviceLocalAlloc(size_t size) const {
if (gpuvm_segment_.handle == 0 || gpuvm_segment_max_alloc_ == 0) {
void* Device::deviceLocalAlloc(size_t size, bool atomics) const {
const hsa_amd_memory_pool_t& pool = (atomics)? gpu_fine_grained_segment_ : gpuvm_segment_;
if (pool.handle == 0 || gpuvm_segment_max_alloc_ == 0) {
return nullptr;
}
void* ptr = nullptr;
hsa_status_t stat = hsa_amd_memory_pool_allocate(gpuvm_segment_, size, 0, &ptr);
hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, 0, &ptr);
if (stat != HSA_STATUS_SUCCESS) {
LogError("Fail allocation local memory");
return nullptr;
@@ -1692,7 +1706,7 @@ void* Device::deviceLocalAlloc(size_t size) const {
if (p2pAgents().size() > 0) {
stat = hsa_amd_agents_allow_access(p2pAgents().size(), p2pAgents().data(), nullptr, ptr);
if (stat != HSA_STATUS_SUCCESS) {
LogError("Allow p2p acces for memory allocation");
LogError("Allow p2p access for memory allocation");
memFree(ptr, size);
return nullptr;
}
@@ -344,7 +344,7 @@ class Device : public NullDevice {
virtual void hostFree(void* ptr, size_t size = 0) const;
void* deviceLocalAlloc(size_t size) const;
void* deviceLocalAlloc(size_t size, bool atomics = false) const;
void memFree(void* ptr, size_t size) const;
@@ -437,6 +437,7 @@ class Device : public NullDevice {
hsa_amd_memory_pool_t system_segment_;
hsa_amd_memory_pool_t system_coarse_segment_;
hsa_amd_memory_pool_t gpuvm_segment_;
hsa_amd_memory_pool_t gpu_fine_grained_segment_;
size_t gpuvm_segment_max_alloc_;
size_t alloc_granularity_;
static const bool offlineDevice_;
@@ -705,7 +705,7 @@ bool Buffer::create() {
}
flags_ |= HostMemoryDirectAccess;
} else {
deviceMemory_ = dev().deviceLocalAlloc(size());
deviceMemory_ = dev().deviceLocalAlloc(size(), (memFlags & CL_MEM_SVM_ATOMICS) != 0);
}
owner()->setSvmPtr(deviceMemory_);
} else {