From 03910f3663d663cecf2c2f5fe6d67ee40dd175e1 Mon Sep 17 00:00:00 2001 From: foreman Date: Tue, 5 Jun 2018 15:48:18 -0400 Subject: [PATCH] P4 to Git Change 1564298 by gandryey@gera-w8 on 2018/06/05 15:43:17 SWDEV-79445 - OCL generic changes and code clean-up - Add reallocation logic for memory dependency tracking. The SVM path can pass more SVM pointers than the maximum number of kernel arguments Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#420 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#105 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#54 edit [ROCm/clr commit: 1130565901258cb44898acd76719898166cdced4] --- .../rocclr/runtime/device/gpu/gpuvirtual.cpp | 29 +++++++++++++----- .../rocclr/runtime/device/pal/palvirtual.cpp | 28 +++++++++++++---- .../rocclr/runtime/device/rocm/rocvirtual.cpp | 30 ++++++++++++++----- 3 files changed, 65 insertions(+), 22 deletions(-) diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp index 65b188c4b8..38c67e5906 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp @@ -80,7 +80,7 @@ void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memor } // Did we reach the limit? 
- if (maxMemObjectsInQueue_ <= (numMemObjectsInQueue_ + 1)) { + if (maxMemObjectsInQueue_ <= numMemObjectsInQueue_) { flushL1Cache = true; } @@ -109,14 +109,27 @@ void VirtualGPU::MemoryDependency::clear(bool all) { endMemObjectsInQueue_ = numMemObjectsInQueue_; } - // Preserve all objects from the current kernel - for (i = 0, j = endMemObjectsInQueue_; j < numMemObjectsInQueue_; i++, j++) { - memObjectsInQueue_[i].start_ = memObjectsInQueue_[j].start_; - memObjectsInQueue_[i].end_ = memObjectsInQueue_[j].end_; - memObjectsInQueue_[i].readOnly_ = memObjectsInQueue_[j].readOnly_; + // If the current launch didn't start from the beginning, then move the data + if (0 != endMemObjectsInQueue_) { + // Preserve all objects from the current kernel + for (i = 0, j = endMemObjectsInQueue_; j < numMemObjectsInQueue_; i++, j++) { + memObjectsInQueue_[i].start_ = memObjectsInQueue_[j].start_; + memObjectsInQueue_[i].end_ = memObjectsInQueue_[j].end_; + memObjectsInQueue_[i].readOnly_ = memObjectsInQueue_[j].readOnly_; + } + } else if (numMemObjectsInQueue_ >= maxMemObjectsInQueue_) { + // note: The array growth shouldn't occur under the normal conditions, + // but in a case when SVM path sends the amount of SVM ptrs over + // the max size of kernel arguments + MemoryState* ptr = new MemoryState[maxMemObjectsInQueue_ << 1]; + if (nullptr == ptr) { + numMemObjectsInQueue_ = 0; + return; + } + maxMemObjectsInQueue_ <<= 1; + memcpy(ptr, memObjectsInQueue_, sizeof(MemoryState) * numMemObjectsInQueue_); + memObjectsInQueue_= ptr; } - // Clear all objects except current kernel - memset(&memObjectsInQueue_[i], 0, sizeof(amd::Memory*) * numMemObjectsInQueue_); numMemObjectsInQueue_ -= endMemObjectsInQueue_; endMemObjectsInQueue_ = 0; } diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp index 80443ef28a..1f0756342d 100644 --- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp +++ 
b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp @@ -454,7 +454,7 @@ void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memor } // Did we reach the limit? - if (maxMemObjectsInQueue_ <= (numMemObjectsInQueue_ + 1)) { + if (maxMemObjectsInQueue_ <= numMemObjectsInQueue_) { flushL1Cache = true; } @@ -485,12 +485,28 @@ void VirtualGPU::MemoryDependency::clear(bool all) { endMemObjectsInQueue_ = numMemObjectsInQueue_; } - // Preserve all objects from the current kernel - for (i = 0, j = endMemObjectsInQueue_; j < numMemObjectsInQueue_; i++, j++) { - memObjectsInQueue_[i].start_ = memObjectsInQueue_[j].start_; - memObjectsInQueue_[i].end_ = memObjectsInQueue_[j].end_; - memObjectsInQueue_[i].readOnly_ = memObjectsInQueue_[j].readOnly_; + // If the current launch didn't start from the beginning, then move the data + if (0 != endMemObjectsInQueue_) { + // Preserve all objects from the current kernel + for (i = 0, j = endMemObjectsInQueue_; j < numMemObjectsInQueue_; i++, j++) { + memObjectsInQueue_[i].start_ = memObjectsInQueue_[j].start_; + memObjectsInQueue_[i].end_ = memObjectsInQueue_[j].end_; + memObjectsInQueue_[i].readOnly_ = memObjectsInQueue_[j].readOnly_; + } + } else if (numMemObjectsInQueue_ >= maxMemObjectsInQueue_) { + // note: The array growth shouldn't occur under the normal conditions, + // but in a case when SVM path sends the amount of SVM ptrs over + // the max size of kernel arguments + MemoryState* ptr = new MemoryState[maxMemObjectsInQueue_ << 1]; + if (nullptr == ptr) { + numMemObjectsInQueue_ = 0; + return; + } + maxMemObjectsInQueue_ <<= 1; + memcpy(ptr, memObjectsInQueue_, sizeof(MemoryState) * numMemObjectsInQueue_); + memObjectsInQueue_= ptr; } + // Adjust the number of active objects numMemObjectsInQueue_ -= endMemObjectsInQueue_; endMemObjectsInQueue_ = 0; diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp index 924761a897..e147071c78 
100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp @@ -128,7 +128,7 @@ void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memor } // Did we reach the limit? - if (maxMemObjectsInQueue_ <= (numMemObjectsInQueue_ + 1)) { + if (maxMemObjectsInQueue_ <= numMemObjectsInQueue_) { flushL1Cache = true; } @@ -157,14 +157,28 @@ void VirtualGPU::MemoryDependency::clear(bool all) { endMemObjectsInQueue_ = numMemObjectsInQueue_; } - // Preserve all objects from the current kernel - for (i = 0, j = endMemObjectsInQueue_; j < numMemObjectsInQueue_; i++, j++) { - memObjectsInQueue_[i].start_ = memObjectsInQueue_[j].start_; - memObjectsInQueue_[i].end_ = memObjectsInQueue_[j].end_; - memObjectsInQueue_[i].readOnly_ = memObjectsInQueue_[j].readOnly_; + // If the current launch didn't start from the beginning, then move the data + if (0 != endMemObjectsInQueue_) { + // Preserve all objects from the current kernel + for (i = 0, j = endMemObjectsInQueue_; j < numMemObjectsInQueue_; i++, j++) { + memObjectsInQueue_[i].start_ = memObjectsInQueue_[j].start_; + memObjectsInQueue_[i].end_ = memObjectsInQueue_[j].end_; + memObjectsInQueue_[i].readOnly_ = memObjectsInQueue_[j].readOnly_; + } + } else if (numMemObjectsInQueue_ >= maxMemObjectsInQueue_) { + // note: The array growth shouldn't occur under the normal conditions, + // but in a case when SVM path sends the amount of SVM ptrs over + // the max size of kernel arguments + MemoryState* ptr = new MemoryState[maxMemObjectsInQueue_ << 1]; + if (nullptr == ptr) { + numMemObjectsInQueue_ = 0; + return; + } + maxMemObjectsInQueue_ <<= 1; + memcpy(ptr, memObjectsInQueue_, sizeof(MemoryState) * numMemObjectsInQueue_); + memObjectsInQueue_= ptr; } - // Clear all objects except current kernel - memset(&memObjectsInQueue_[i], 0, sizeof(amd::Memory*) * numMemObjectsInQueue_); + numMemObjectsInQueue_ -= endMemObjectsInQueue_; endMemObjectsInQueue_ 
= 0; }