P4 to Git Change 1591041 by vsytchen@vsytchen-win10 on 2018/08/08 18:46:17

SWDEV-159881 - [OCL][ROCm] Add SVM coarse-grain buffer support with device memory (Part 1)

	1. Implement submitSvmFree/Copy/FillMemory.
	2. Add macro IS_HIP that determines if the client is HIP.
	3. Add setting enableCoarseGrainSVM that allows the use of device memory for coarse grain SVM allocations.
	4. Set enableCoarseGrainSVM to be true only for HIP.

	ReviewBoardURL = http://ocltc.amd.com/reviews/r/15597/diff/

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#93 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocmemory.cpp#37 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.cpp#35 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.hpp#15 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#61 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#18 edit
... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#294 edit
... //depot/stg/opencl/drivers/opencl/runtime/utils/macros.hpp#10 edit


[ROCm/clr commit: b33c52ed6f]
This commit is contained in:
foreman
2018-08-08 18:58:03 -04:00
parent b641645bbf
commit 7ad3f2f33a
8 changed files with 293 additions and 109 deletions
@@ -1532,43 +1532,37 @@ void Device::updateFreeMemory(size_t size, bool free) {
void* Device::svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags,
void* svmPtr) const {
amd::Memory* mem = nullptr;
if (nullptr == svmPtr) {
bool atomics = (flags & CL_MEM_SVM_ATOMICS) != 0;
void* ptr = hostAlloc(size, alignment, atomics);
if (ptr != nullptr) {
// Copy paste from ORCA code.
// create a hidden buffer, which will allocated on the device later
mem = new (context) amd::Buffer(context, (CL_MEM_USE_HOST_PTR | (flags & 0xFFFF0000)), size, ptr);
if (mem == nullptr) {
LogError("failed to create a svm mem object!");
return nullptr;
}
if (!mem->create(ptr)) {
LogError("failed to create a svm hidden buffer!");
mem->release();
return nullptr;
}
// add the information to context so that we can use it later.
amd::MemObjMap::AddMemObj(ptr, mem);
return ptr;
} else {
// create a hidden buffer, which will be allocated on the device later
mem = new (context) amd::Buffer(context, flags, size, reinterpret_cast<void*>(1));
if (mem == nullptr) {
LogError("failed to create a svm mem object!");
return nullptr;
}
if (!mem->create(nullptr)) {
LogError("failed to create a svm hidden buffer!");
mem->release();
return nullptr;
}
// if the device supports SVM FGS, return the committed CPU address directly.
Memory* gpuMem = getRocMemory(mem);
// add the information to context so that we can use it later.
amd::MemObjMap::AddMemObj(mem->getSvmPtr(), mem);
svmPtr = mem->getSvmPtr();
} else {
// Copy paste from ORCA code.
// Find the existing amd::mem object
mem = amd::MemObjMap::FindMemObj(svmPtr);
if (nullptr == mem) {
return nullptr;
}
return svmPtr;
svmPtr = mem->getSvmPtr();
}
return svmPtr;
}
void Device::svmFree(void* ptr) const {
@@ -1577,7 +1571,6 @@ void Device::svmFree(void* ptr) const {
if (nullptr != svmMem) {
svmMem->release();
amd::MemObjMap::RemoveMemObj(ptr);
hostFree(ptr);
}
}
@@ -581,7 +581,27 @@ void Buffer::destroy() {
return;
}
const cl_mem_flags memFlags = owner()->getMemFlags();
cl_mem_flags memFlags = owner()->getMemFlags();
if (owner()->getSvmPtr() != nullptr) {
if (!dev().settings().enableCoarseGrainSVM_) {
memFlags |= CL_MEM_SVM_FINE_GRAIN_BUFFER;
}
const bool isFineGrain = memFlags & CL_MEM_SVM_FINE_GRAIN_BUFFER;
if (isFineGrain) {
dev().hostFree(deviceMemory_, size());
} else {
dev().memFree(deviceMemory_, size());
}
if (dev().settings().apuSystem_ || !isFineGrain) {
const_cast<Device&>(dev()).updateFreeMemory(size(), true);
}
return;
}
#ifdef WITH_AMDGPU_PRO
if ((memFlags & CL_MEM_USE_PERSISTENT_MEM_AMD) && dev().ProEna()) {
dev().iPro().FreeDmaBuffer(deviceMemory_);
@@ -628,6 +648,34 @@ bool Buffer::create() {
return false;
}
// Allocate backing storage in device local memory unless UHP or AHP are set
cl_mem_flags memFlags = owner()->getMemFlags();
if (owner()->getSvmPtr() != nullptr) {
if (!dev().settings().enableCoarseGrainSVM_) {
memFlags |= CL_MEM_SVM_FINE_GRAIN_BUFFER;
flags_ |= HostMemoryDirectAccess;
}
const bool isFineGrain = memFlags & CL_MEM_SVM_FINE_GRAIN_BUFFER;
if (owner()->getSvmPtr() == reinterpret_cast<void*>(1)) {
if (isFineGrain) {
deviceMemory_ = dev().hostAlloc(size(), 1, false);
} else {
deviceMemory_ = dev().deviceLocalAlloc(size());
}
owner()->setSvmPtr(deviceMemory_);
} else {
deviceMemory_ = owner()->getSvmPtr();
}
if (dev().settings().apuSystem_ || !isFineGrain) {
const_cast<Device&>(dev()).updateFreeMemory(size(), false);
}
return deviceMemory_ != nullptr;
}
// Interop buffer
if (owner()->isInterop()) return createInteropBuffer(GL_ARRAY_BUFFER, 0);
@@ -658,9 +706,6 @@ bool Buffer::create() {
return true;
}
// Allocate backing storage in device local memory unless UHP or AHP are set
const cl_mem_flags memFlags = owner()->getMemFlags();
#ifdef WITH_AMDGPU_PRO
if ((memFlags & CL_MEM_USE_PERSISTENT_MEM_AMD) && dev().ProEna()) {
void* host_ptr = nullptr;
@@ -20,6 +20,7 @@ Settings::Settings() {
pollCompletion_ = ENVVAR_HSA_POLL_KERNEL_COMPLETION;
enableLocalMemory_ = HSA_LOCAL_MEMORY_ENABLE;
enableCoarseGrainSVM_ = HSA_ENABLE_COARSE_GRAIN_SVM;
maxWorkGroupSize_ = 1024;
preferredWorkGroupSize_ = 256;
@@ -22,6 +22,7 @@ class Settings : public device::Settings {
uint doublePrecision_ : 1; //!< Enables double precision support
uint pollCompletion_ : 1; //!< Enables polling in HSA
uint enableLocalMemory_ : 1; //!< Enable GPUVM memory
uint enableCoarseGrainSVM_ : 1; //!< Enable device memory for coarse grain SVM allocations
uint enableNCMode_ : 1; //!< Enable Non Coherent mode for system memory
uint enablePartialDispatch_ : 1; //!< Enable support for Partial Dispatch
uint imageDMA_ : 1; //!< Enable direct image DMA transfers
@@ -29,7 +30,7 @@ class Settings : public device::Settings {
uint stagedXferWrite_ : 1; //!< Uses a staged buffer write
uint singleFpDenorm_ : 1; //!< Support Single FP Denorm
uint apuSystem_ : 1; //!< APU system
uint reserved_ : 20;
uint reserved_ : 21;
};
uint value_;
};
@@ -1109,94 +1109,70 @@ void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd) {
profilingEnd(cmd);
}
// Pre-change handler for an SVM copy command: waits for earlier GPU work, then
// forwards the whole copy to amd::SvmBuffer::memFill inside a profiling bracket.
void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) {
// in-order semantics: previous commands need to be done before we start
releaseGpuMemoryFence();
profilingBegin(cmd);
// The copy is expressed as a fill: the source buffer is handed over in the
// pattern position, srcSize() in the pattern-size position, and the repeat
// count is 1 (argument roles assumed from the fill handler's call to the same
// helper - confirm against amd::SvmBuffer::memFill).
amd::SvmBuffer::memFill(cmd.dst(), cmd.src(), cmd.srcSize(), 1);
profilingEnd(cmd);
}
// Pre-change handler for an SVM fill command: waits for earlier GPU work, then
// forwards destination, pattern, pattern size and repeat count to
// amd::SvmBuffer::memFill inside a profiling bracket.
void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd) {
// in-order semantics: previous commands need to be done before we start
releaseGpuMemoryFence();
profilingBegin(cmd);
// No device memory object is looked up here; the raw SVM pointer from the
// command is passed straight through.
amd::SvmBuffer::memFill(cmd.dst(), cmd.pattern(), cmd.patternSize(), cmd.times());
profilingEnd(cmd);
}
void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand& cmd) {
// Wait on a kernel if one is outstanding
releaseGpuMemoryFence();
profilingBegin(cmd);
Memory* srcDevMem = dev().getRocMemory(&cmd.source());
Memory* dstDevMem = dev().getRocMemory(&cmd.destination());
bool VirtualGPU::copyMemory(cl_command_type type, amd::Memory& srcMem, amd::Memory& dstMem,
bool entire, const amd::Coord3D& srcOrigin,
const amd::Coord3D& dstOrigin, const amd::Coord3D& size,
const amd::BufferRect& srcRect, const amd::BufferRect& dstRect) {
Memory* srcDevMem = dev().getRocMemory(&srcMem);
Memory* dstDevMem = dev().getRocMemory(&dstMem);
// Synchronize source and destination memory
device::Memory::SyncFlags syncFlags;
syncFlags.skipEntire_ = cmd.isEntireMemory();
syncFlags.skipEntire_ = entire;
dstDevMem->syncCacheFromHost(*this, syncFlags);
srcDevMem->syncCacheFromHost(*this);
amd::Coord3D size = cmd.size();
cl_command_type type = cmd.type();
bool result = false;
bool srcImageBuffer = false;
bool dstImageBuffer = false;
// Force buffer copy for IMAGE1D_BUFFER
if (cmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
if (srcMem.getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
srcImageBuffer = true;
type = CL_COMMAND_COPY_BUFFER;
}
if (cmd.destination().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
if (dstMem.getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
dstImageBuffer = true;
type = CL_COMMAND_COPY_BUFFER;
}
switch (cmd.type()) {
switch (type) {
case CL_COMMAND_SVM_MEMCPY:
case CL_COMMAND_COPY_BUFFER: {
amd::Coord3D srcOrigin(cmd.srcOrigin()[0]);
amd::Coord3D dstOrigin(cmd.dstOrigin()[0]);
amd::Coord3D realSrcOrigin(srcOrigin[0]);
amd::Coord3D realDstOrigin(dstOrigin[0]);
amd::Coord3D realSize(size.c[0], size.c[1], size.c[2]);
if (srcImageBuffer) {
const size_t elemSize = cmd.source().asImage()->getImageFormat().getElementSize();
srcOrigin.c[0] *= elemSize;
const size_t elemSize = srcMem.asImage()->getImageFormat().getElementSize();
realSrcOrigin.c[0] *= elemSize;
if (dstImageBuffer) {
dstOrigin.c[0] *= elemSize;
realDstOrigin.c[0] *= elemSize;
}
size.c[0] *= elemSize;
realSize.c[0] *= elemSize;
} else if (dstImageBuffer) {
const size_t elemSize = cmd.destination().asImage()->getImageFormat().getElementSize();
dstOrigin.c[0] *= elemSize;
size.c[0] *= elemSize;
const size_t elemSize = dstMem.asImage()->getImageFormat().getElementSize();
realDstOrigin.c[0] *= elemSize;
realSize.c[0] *= elemSize;
}
result = blitMgr().copyBuffer(*srcDevMem, *dstDevMem, srcOrigin, dstOrigin, size,
cmd.isEntireMemory());
result = blitMgr().copyBuffer(*srcDevMem, *dstDevMem, realSrcOrigin, realDstOrigin, realSize, entire);
break;
}
case CL_COMMAND_COPY_BUFFER_RECT: {
result = blitMgr().copyBufferRect(*srcDevMem, *dstDevMem, cmd.srcRect(), cmd.dstRect(), size,
cmd.isEntireMemory());
result = blitMgr().copyBufferRect(*srcDevMem, *dstDevMem, srcRect, dstRect, size, entire);
break;
}
case CL_COMMAND_COPY_IMAGE: {
result = blitMgr().copyImage(*srcDevMem, *dstDevMem, cmd.srcOrigin(), cmd.dstOrigin(), size,
cmd.isEntireMemory());
result = blitMgr().copyImage(*srcDevMem, *dstDevMem, srcOrigin, dstOrigin, size, entire);
break;
}
case CL_COMMAND_COPY_IMAGE_TO_BUFFER: {
result = blitMgr().copyImageToBuffer(*srcDevMem, *dstDevMem, cmd.srcOrigin(), cmd.dstOrigin(),
size, cmd.isEntireMemory());
result = blitMgr().copyImageToBuffer(*srcDevMem, *dstDevMem, srcOrigin, dstOrigin, size, entire);
break;
}
case CL_COMMAND_COPY_BUFFER_TO_IMAGE: {
result = blitMgr().copyBufferToImage(*srcDevMem, *dstDevMem, cmd.srcOrigin(), cmd.dstOrigin(),
size, cmd.isEntireMemory());
result = blitMgr().copyBufferToImage(*srcDevMem, *dstDevMem, srcOrigin, dstOrigin, size, entire);
break;
}
default:
@@ -1206,11 +1182,103 @@ void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand& cmd) {
if (!result) {
LogError("submitCopyMemory failed!");
cmd.setStatus(CL_OUT_OF_RESOURCES);
return false;
}
cmd.destination().signalWrite(&dev());
// Mark this as the most-recently written cache of the destination
dstMem.signalWrite(&dev());
return true;
}
// Handles a memory-object copy command by unpacking it and delegating to the
// shared copyMemory() helper. A failed copy is reported on the command as
// CL_INVALID_OPERATION.
void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand& cmd) {
  // Wait on a kernel if one is outstanding
  releaseGpuMemoryFence();

  profilingBegin(cmd);

  const cl_command_type commandType = cmd.type();
  const bool wholeObject = cmd.isEntireMemory();

  const bool copied = copyMemory(commandType, cmd.source(), cmd.destination(), wholeObject,
                                 cmd.srcOrigin(), cmd.dstOrigin(), cmd.size(), cmd.srcRect(),
                                 cmd.dstRect());
  if (!copied) {
    cmd.setStatus(CL_INVALID_OPERATION);
  }

  profilingEnd(cmd);
}
// Handles an SVM memcpy command.
//
// When the device is not a fine-grained system and coarse-grain SVM is enabled,
// each pointer is looked up in amd::MemObjMap. A pointer that resolves to a
// memory object is treated as an offset into that object and the transfer goes
// through the blit manager (write, read or buffer-to-buffer copy); if neither
// pointer resolves, a plain host memcpy is used. In every other configuration
// the copy is forwarded to amd::SvmBuffer::memFill with a repeat count of 1.
//
// Any failure, including a region that does not fit inside its memory object,
// sets CL_INVALID_OPERATION on the command. profilingEnd() is reached on every
// path so that it always pairs with profilingBegin().
void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) {
  // in-order semantics: previous commands need to be done before we start
  releaseGpuMemoryFence();

  profilingBegin(cmd);

  // Coarse-grain SVM path: the allocations may live in device memory
  if (!dev().isFineGrainedSystem() && dev().settings().enableCoarseGrainSVM_) {
    amd::Coord3D srcOrigin(0, 0, 0);
    amd::Coord3D dstOrigin(0, 0, 0);
    amd::Coord3D size(cmd.srcSize(), 1, 1);
    amd::BufferRect srcRect;
    amd::BufferRect dstRect;

    bool result = false;
    bool regionsValid = true;
    amd::Memory* srcMem = amd::MemObjMap::FindMemObj(cmd.src());
    amd::Memory* dstMem = amd::MemObjMap::FindMemObj(cmd.dst());

    device::Memory::SyncFlags syncFlags;
    if (nullptr != srcMem) {
      srcMem->commitSvmMemory();
      // Byte offset of the source pointer inside its SVM allocation
      srcOrigin.c[0] =
          static_cast<const_address>(cmd.src()) - static_cast<address>(srcMem->getSvmPtr());
      regionsValid = srcMem->validateRegion(srcOrigin, size);
    }
    // The destination is only examined if the source region was acceptable
    if (regionsValid && (nullptr != dstMem)) {
      dstMem->commitSvmMemory();
      // Byte offset of the destination pointer inside its SVM allocation
      dstOrigin.c[0] =
          static_cast<const_address>(cmd.dst()) - static_cast<address>(dstMem->getSvmPtr());
      regionsValid = dstMem->validateRegion(dstOrigin, size);
    }

    if (!regionsValid) {
      // Nothing is transferred; result stays false and is reported below
    } else if (nullptr == srcMem && nullptr == dstMem) {  // both not in svm space
      amd::Os::fastMemcpy(cmd.dst(), cmd.src(), cmd.srcSize());
      result = true;
    } else if (nullptr == srcMem && nullptr != dstMem) {  // src not in svm space
      Memory* memory = dev().getRocMemory(dstMem);
      // Synchronize source and destination memory
      syncFlags.skipEntire_ = dstMem->isEntirelyCovered(dstOrigin, size);
      memory->syncCacheFromHost(*this, syncFlags);

      result = blitMgr().writeBuffer(cmd.src(), *memory, dstOrigin, size,
                                     dstMem->isEntirelyCovered(dstOrigin, size));
      // Mark this as the most-recently written cache of the destination
      dstMem->signalWrite(&dev());
    } else if (nullptr != srcMem && nullptr == dstMem) {  // dst not in svm space
      Memory* memory = dev().getRocMemory(srcMem);
      // Synchronize source and destination memory
      memory->syncCacheFromHost(*this);

      result = blitMgr().readBuffer(*memory, cmd.dst(), srcOrigin, size,
                                    srcMem->isEntirelyCovered(srcOrigin, size));
    } else {  // both in svm space
      bool entire =
          srcMem->isEntirelyCovered(srcOrigin, size) && dstMem->isEntirelyCovered(dstOrigin, size);
      result = copyMemory(cmd.type(), *srcMem, *dstMem, entire, srcOrigin, dstOrigin, size,
                          srcRect, dstRect);
    }

    if (!result) {
      cmd.setStatus(CL_INVALID_OPERATION);
    }
  } else {
    // direct memcpy for FGS enabled system
    amd::SvmBuffer::memFill(cmd.dst(), cmd.src(), cmd.srcSize(), 1);
  }

  profilingEnd(cmd);
}
@@ -1480,53 +1548,48 @@ void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& cmd) {
profilingEnd(cmd);
}
void VirtualGPU::submitFillMemory(amd::FillMemoryCommand& cmd) {
// Wait on a kernel if one is outstanding
releaseGpuMemoryFence();
bool VirtualGPU::fillMemory(cl_command_type type, amd::Memory* amdMemory, const void* pattern,
size_t patternSize, const amd::Coord3D& origin,
const amd::Coord3D& size) {
Memory* memory = dev().getRocMemory(amdMemory);
profilingBegin(cmd);
Memory* memory = dev().getRocMemory(&cmd.memory());
bool entire = cmd.isEntireMemory();
bool entire = amdMemory->isEntirelyCovered(origin, size);
// Synchronize memory from host if necessary
device::Memory::SyncFlags syncFlags;
syncFlags.skipEntire_ = entire;
memory->syncCacheFromHost(*this, syncFlags);
cl_command_type type = cmd.type();
bool result = false;
bool imageBuffer = false;
float fillValue[4];
// Force fill buffer for IMAGE1D_BUFFER
if ((type == CL_COMMAND_FILL_IMAGE) && (cmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
if ((type == CL_COMMAND_FILL_IMAGE) && (amdMemory->getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
type = CL_COMMAND_FILL_BUFFER;
imageBuffer = true;
}
// Find the right fill operation
switch (type) {
case CL_COMMAND_SVM_MEMFILL:
case CL_COMMAND_FILL_BUFFER: {
const void* pattern = cmd.pattern();
size_t patternSize = cmd.patternSize();
amd::Coord3D origin(cmd.origin()[0]);
amd::Coord3D size(cmd.size()[0]);
amd::Coord3D realOrigin(origin[0]);
amd::Coord3D realSize(size[0]);
// Reprogram fill parameters if it's an IMAGE1D_BUFFER object
if (imageBuffer) {
size_t elemSize = cmd.memory().asImage()->getImageFormat().getElementSize();
origin.c[0] *= elemSize;
size.c[0] *= elemSize;
size_t elemSize = amdMemory->asImage()->getImageFormat().getElementSize();
realOrigin.c[0] *= elemSize;
realSize.c[0] *= elemSize;
memset(fillValue, 0, sizeof(fillValue));
cmd.memory().asImage()->getImageFormat().formatColor(pattern, fillValue);
amdMemory->asImage()->getImageFormat().formatColor(pattern, fillValue);
pattern = fillValue;
patternSize = elemSize;
}
result = blitMgr().fillBuffer(*memory, pattern, patternSize, origin, size, entire);
result = blitMgr().fillBuffer(*memory, pattern, patternSize, realOrigin, realSize, entire);
break;
}
case CL_COMMAND_FILL_IMAGE: {
result = blitMgr().fillImage(*memory, cmd.pattern(), cmd.origin(), cmd.size(), entire);
result = blitMgr().fillImage(*memory, pattern, origin, size, entire);
break;
}
default:
@@ -1536,10 +1599,60 @@ void VirtualGPU::submitFillMemory(amd::FillMemoryCommand& cmd) {
if (!result) {
LogError("submitFillMemory failed!");
cmd.setStatus(CL_OUT_OF_RESOURCES);
}
cmd.memory().signalWrite(&dev());
amdMemory->signalWrite(&dev());
return true;
}
// Handles a memory-object fill command by delegating to the shared fillMemory()
// helper. A failed fill is reported on the command as CL_INVALID_OPERATION.
void VirtualGPU::submitFillMemory(amd::FillMemoryCommand& cmd) {
  // Wait on a kernel if one is outstanding
  releaseGpuMemoryFence();

  profilingBegin(cmd);

  const bool filled = fillMemory(cmd.type(), &cmd.memory(), cmd.pattern(), cmd.patternSize(),
                                 cmd.origin(), cmd.size());
  if (!filled) {
    cmd.setStatus(CL_INVALID_OPERATION);
  }

  profilingEnd(cmd);
}
// Handles an SVM fill command.
//
// When the device is not a fine-grained system and coarse-grain SVM is enabled,
// the destination pointer is resolved to its memory object through
// amd::MemObjMap and the fill is issued through fillMemory() at the matching
// byte offset. In every other configuration the fill is forwarded to
// amd::SvmBuffer::memFill.
//
// The destination lookup and the region check keep their debug asserts, but are
// also checked at run time: with NDEBUG defined the asserts vanish, and a failed
// lookup or an oversized fill must not reach the dereference or the device
// fill. Both failures set CL_INVALID_OPERATION on the command.
void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd) {
  // in-order semantics: previous commands need to be done before we start
  releaseGpuMemoryFence();

  profilingBegin(cmd);

  if (!dev().isFineGrainedSystem() && dev().settings().enableCoarseGrainSVM_) {
    size_t patternSize = cmd.patternSize();
    size_t fillSize = patternSize * cmd.times();

    amd::Memory* dstMemory = amd::MemObjMap::FindMemObj(cmd.dst());
    assert(dstMemory && "No svm Buffer to fill with!");

    if (dstMemory == nullptr) {
      LogError("No svm Buffer to fill with!");
      cmd.setStatus(CL_INVALID_OPERATION);
    } else {
      // Byte offset of the destination pointer inside its SVM allocation
      size_t offset = reinterpret_cast<uintptr_t>(cmd.dst()) -
          reinterpret_cast<uintptr_t>(dstMemory->getSvmPtr());

      Memory* memory = dev().getRocMemory(dstMemory);

      amd::Coord3D origin(offset, 0, 0);
      amd::Coord3D size(fillSize, 1, 1);

      const bool regionValid = dstMemory->validateRegion(origin, size);
      assert(regionValid && "The incorrect fill size!");

      if (!regionValid) {
        LogError("The incorrect fill size!");
        cmd.setStatus(CL_INVALID_OPERATION);
      } else {
        // Synchronize memory from host if necessary
        device::Memory::SyncFlags syncFlags;
        syncFlags.skipEntire_ = dstMemory->isEntirelyCovered(origin, size);
        memory->syncCacheFromHost(*this, syncFlags);

        if (!fillMemory(cmd.type(), dstMemory, cmd.pattern(), patternSize, origin, size)) {
          cmd.setStatus(CL_INVALID_OPERATION);
        }
        // Mark this as the most-recently written cache of the destination
        dstMemory->signalWrite(&dev());
      }
    }
  } else {
    // for FGS capable device, fill CPU memory directly
    amd::SvmBuffer::memFill(cmd.dst(), cmd.pattern(), cmd.patternSize(), cmd.times());
  }

  profilingEnd(cmd);
}
@@ -179,6 +179,10 @@ class VirtualGPU : public device::VirtualDevice {
void submitFillMemory(amd::FillMemoryCommand& cmd);
void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd);
void submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd);
void submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd);
void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd);
// { roc OpenCL integration
// Added these stub (no-ops) implementation of pure virtual methods,
// when integrating HSA and OpenCL branches.
@@ -187,9 +191,6 @@ class VirtualGPU : public device::VirtualDevice {
virtual void submitSignal(amd::SignalCommand& cmd) {}
virtual void submitMakeBuffersResident(amd::MakeBuffersResidentCommand& cmd) {}
virtual void submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd);
virtual void submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd);
virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd);
virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd);
virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd);
virtual void submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd);
@@ -262,6 +263,27 @@ class VirtualGPU : public device::VirtualDevice {
//! Returns TRUE if the virtual queue was successfully allocated
bool createVirtualQueue(uint deviceQueueSize);
//! Common fill routine shared by the SVM and non-SVM fill command handlers
bool fillMemory(cl_command_type type, //!< the command type (selects buffer or image fill)
amd::Memory* amdMemory, //!< memory object to fill
const void* pattern, //!< pattern to fill the memory with
size_t patternSize, //!< pattern size
const amd::Coord3D& origin, //!< origin of the fill within the memory object
const amd::Coord3D& size //!< size of the region to fill
);
//! Common copy routine shared by the SVM and non-SVM copy command handlers.
//! Returns false when the underlying blit operation fails, true otherwise.
bool copyMemory(cl_command_type type, //!< the command type
amd::Memory& srcMem, //!< source memory object
amd::Memory& dstMem, //!< destination memory object
bool entire, //!< flag of entire memory copy
const amd::Coord3D& srcOrigin, //!< source memory origin
const amd::Coord3D& dstOrigin, //!< destination memory origin
const amd::Coord3D& size, //!< copy size
const amd::BufferRect& srcRect, //!< region of source for copy
const amd::BufferRect& dstRect //!< region of destination for copy
);
//! Updates the AQL header for the upcoming dispatch
void setAqlHeader(uint16_t header) { aqlHeader_ = header; }
@@ -145,6 +145,8 @@ release(uint, HSA_SIGNAL_POOL_SIZE, 16, \
"Signal object pool size") \
release(bool, HSA_ENABLE_ATOMICS_32B, false, \
"1 = Enable SVM atomics in 32 bits (HSA backend-only). Any other value keeps then disabled.") \
release(bool, HSA_ENABLE_COARSE_GRAIN_SVM, IS_HIP, \
"Enable device memory for coarse grain SVM allocations") \
release(bool, GPU_IFH_MODE, false, \
"1 = Enable GPU IFH (infinitely fast hardware) mode. Any other value keeps setting disabled.") \
release(bool, GPU_MIPMAP, true, \
@@ -107,6 +107,10 @@
#define NOT_LIGHTNING(x) x
#endif /* !WITH_LIGHTNING_COMPILER */
// Builds that define BUILD_HIP see IS_HIP as true
#ifdef BUILD_HIP
#define IS_HIP true
#endif
#ifndef IS_LINUX
#define IS_LINUX false
#endif
@@ -119,6 +123,9 @@
#ifndef IS_LIGHTNING
#define IS_LIGHTNING false
#endif
// Fall back to false when IS_HIP has not been defined earlier
#ifndef IS_HIP
#define IS_HIP false
#endif
#define IF_LEFT_true(x) x
#define IF_LEFT_false(x)