P4 to Git Change 2058803 by gandryey@gera-win10 on 2020/01/17 15:47:42

SWDEV-219901 - [OCL-ROCr]Add pitch workaround for Navi10
	- Add pitch workaround. Allocate a native image as the backing store and perform a double copy when necessary

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#149 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.hpp#48 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocmemory.cpp#46 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocmemory.hpp#16 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.cpp#47 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.hpp#19 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#95 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#31 edit


[ROCm/clr commit: 3c137e7b19]
This commit is contained in:
foreman
2020-01-17 15:51:03 -05:00
parent 0f31d637fd
commit bd191b9d2e
8 changed files with 119 additions and 2 deletions
@@ -969,6 +969,10 @@ Sampler::~Sampler() {
hsa_ext_sampler_destroy(dev_.getBackendDevice(), hsa_sampler);
}
//! Returns the device memory object that \p mem holds for this device,
//! downcast to the ROCm memory type.
Memory* Device::getGpuMemory(amd::Memory* mem) const {
  auto* deviceMemory = mem->getDeviceMemory(*this);
  return static_cast<roc::Memory*>(deviceMemory);
}
bool Device::populateOCLDeviceConstants() {
info_.available_ = true;
@@ -430,6 +430,9 @@ class Device : public NullDevice {
virtual bool findLinkTypeAndHopCount(amd::Device* other_device, uint32_t* link_type,
uint32_t* hop_count);
//! Returns a GPU memory object from AMD memory object
roc::Memory* getGpuMemory(amd::Memory* mem //!< Pointer to AMD memory object
) const;
private:
static hsa_ven_amd_loader_1_00_pfn_t amd_loader_ext_table;
@@ -1022,7 +1022,10 @@ bool Image::createInteropImage() {
}
bool Image::create() {
if (owner()->parent()) {
if (owner()->parent() != nullptr) {
if (!ValidateMemory()) {
return false;
}
// Image view creation
roc::Memory* parent = static_cast<roc::Memory*>(owner()->parent()->getDeviceMemory(dev_));
@@ -1219,6 +1222,8 @@ void* Image::allocMapTarget(const amd::Coord3D& origin, const amd::Coord3D& regi
//! Destructor: all cleanup is delegated to destroy()
Image::~Image() { destroy(); }
void Image::destroy() {
delete copyImageBuffer_;
if (hsaImageObject_.handle != 0) {
hsa_status_t status = hsa_ext_image_destroy(dev().getBackendDevice(), hsaImageObject_);
assert(status == HSA_STATUS_SUCCESS);
@@ -1241,5 +1246,31 @@ void Image::destroy() {
const_cast<Device&>(dev()).updateFreeMemory(size(), true);
}
}
//! Validates the image allocation and sets up the image-buffer pitch workaround.
//!
//! When the device setting imageBufferWar_ is enabled, the image is a view whose
//! ancestor chain contains a buffer, and the packed row size (width * element
//! size) is smaller than the row pitch, a 2D image without custom pitch is
//! allocated and stored in copyImageBuffer_.
//!
//! \return false only if the backing-store image could not be allocated or
//!         created; true in every other case, including when no workaround
//!         is required.
bool Image::ValidateMemory() {
  // Without an owner or a parent there is no ancestor chain to inspect.
  // Check this before the first dereference rather than after it.
  if ((owner() == nullptr) || (owner()->parent() == nullptr)) {
    return true;
  }

  // Detect image view from buffer to distinguish linear paths from tiled.
  amd::Memory* ancestor = owner()->parent();
  while ((ancestor->asBuffer() == nullptr) && (ancestor->parent() != nullptr)) {
    ancestor = ancestor->parent();
  }
  const bool linearLayout = (ancestor->asBuffer() != nullptr);

  if (!dev().settings().imageBufferWar_ || !linearLayout) {
    return true;
  }

  amd::Image* img = owner()->asImage();
  // A custom pitch is in use when the packed row is shorter than the row pitch
  const bool customPitch =
      (img->getWidth() * img->getImageFormat().getElementSize()) < img->getRowPitch();
  if (!customPitch) {
    return true;
  }

  // Create a native image without pitch for validation
  copyImageBuffer_ =
      new (dev().context()) amd::Image(dev().context(), CL_MEM_OBJECT_IMAGE2D, img->getMemFlags(),
                                       img->getImageFormat(), img->getWidth(), img->getHeight(),
                                       1, 0, 0);
  if ((copyImageBuffer_ == nullptr) || !copyImageBuffer_->create()) {
    return false;
  }
  return true;
}
}
#endif // WITHOUT_HSA_BACKEND
@@ -93,6 +93,9 @@ class Memory : public device::Memory {
virtual void IpcCreate (size_t offset, size_t* mem_size, void* handle) const;
//! Validates allocated memory for possible workarounds.
//! This default implementation performs no checks and always returns true.
virtual bool ValidateMemory() { return true; }
protected:
bool allocateMapMemory(size_t allocationSize);
@@ -179,6 +182,11 @@ class Image : public roc::Memory {
virtual const address cpuSrd() const { return reinterpret_cast<const address>(getHsaImageObject().handle); }
//! Validates allocated memory for possible workarounds
bool ValidateMemory() final;
//! Returns the current value of copyImageBuffer_ (may be nullptr)
amd::Image* CopyImageBuffer() const { return copyImageBuffer_; }
private:
//! Disable copy constructor
Image(const Buffer&);
@@ -200,6 +208,7 @@ class Image : public roc::Memory {
hsa_ext_image_t hsaImageObject_;
void* originalDeviceMemory_;
amd::Image* copyImageBuffer_ = nullptr;
};
}
#endif
@@ -70,6 +70,7 @@ Settings::Settings() {
useLightning_ = (!flagIsDefault(GPU_ENABLE_LC)) ? GPU_ENABLE_LC : true;
lcWavefrontSize64_ = true;
imageBufferWar_ = false;
}
bool Settings::create(bool fullProfile, int gfxipVersion) {
@@ -128,6 +129,10 @@ bool Settings::create(bool fullProfile, int gfxipVersion) {
if (gfxipVersion >= 1000) {
enableWave32Mode_ = true;
enableWgpMode_ = GPU_ENABLE_WGP_MODE;
if (gfxipVersion == 1001) {
// GFX10.1 HW doesn't support custom pitch. Enable double copy workaround
imageBufferWar_ = GPU_IMAGE_BUFFER_WAR;
}
}
if (!flagIsDefault(GPU_ENABLE_WAVE32_MODE)) {
enableWave32Mode_ = GPU_ENABLE_WAVE32_MODE;
@@ -26,7 +26,8 @@ class Settings : public device::Settings {
uint imageDMA_ : 1; //!< Enable direct image DMA transfers
uint stagedXferRead_ : 1; //!< Uses a staged buffer read
uint stagedXferWrite_ : 1; //!< Uses a staged buffer write
uint reserved_ : 25;
uint imageBufferWar_ : 1; //!< Image buffer workaround for Gfx10
uint reserved_ : 24;
};
uint value_;
};
@@ -330,6 +330,30 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para
const uint64_t image_srd = image->getHsaImageObject().handle;
assert(amd::isMultipleOf(image_srd, sizeof(image_srd)));
WriteAqlArgAt(const_cast<address>(params), &image_srd, sizeof(image_srd), desc.offset_);
// Check if synchronization has to be performed
if (image->CopyImageBuffer() != nullptr) {
Memory* devBuf = dev().getGpuMemory(mem->parent());
amd::Coord3D offs(0);
Image* devCpImg = static_cast<Image*>(dev().getGpuMemory(image->CopyImageBuffer()));
amd::Image* img = mem->asImage();
// Copy memory from the original image buffer into the backing store image
bool result = blitMgr().copyBufferToImage(
*devBuf, *devCpImg, offs, offs, img->getRegion(), true,
img->getRowPitch(), img->getSlicePitch());
// Make sure the copy operation is done
setAqlHeader(dispatchPacketHeader_);
// Use backing store SRD as the replacement
const uint64_t srd = devCpImg->getHsaImageObject().handle;
WriteAqlArgAt(const_cast<address>(params), &srd, sizeof(srd), desc.offset_);
// If it's not a read only resource, then runtime has to write back
if (!desc.info_.readOnly_) {
wrtBackImageBuffer_.push_back(devCpImg);
imageBufferWrtBack_ = true;
}
}
}
}
}
@@ -947,6 +971,23 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& cmd) {
break;
}
case CL_COMMAND_READ_IMAGE: {
if ((cmd.source().parent() != nullptr) && (cmd.source().parent()->getType() == CL_MEM_OBJECT_BUFFER)) {
Image* imageBuffer = static_cast<Image*>(devMem);
// Check if synchronization has to be performed
if (nullptr != imageBuffer->CopyImageBuffer()) {
amd::Memory* memory = imageBuffer->CopyImageBuffer();
devMem = dev().getGpuMemory(memory);
if (nullptr == imageBuffer->owner()->getLastWriter()) {
Memory* buffer = dev().getGpuMemory(imageBuffer->owner()->parent());
amd::Image* image = imageBuffer->owner()->asImage();
amd::Coord3D offs(0);
// Copy memory from the original image buffer into the backing store image
result = blitMgr().copyBufferToImage(*buffer, *devMem, offs,
offs, image->getRegion(), true,
image->getRowPitch(), image->getSlicePitch());
}
}
}
if (hostMemory != nullptr) {
// Accelerated image to buffer transfer without pinning
amd::Coord3D dstOrigin(offset);
@@ -2202,6 +2243,25 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
getVQVirtualAddress(), schedulerParam_, schedulerQueue_, schedulerSignal_, schedulerThreads_);
}
// Check if image buffer write back is required
if (imageBufferWrtBack_) {
// Avoid recursive write back
imageBufferWrtBack_ = false;
// Make sure the original kernel execution is done
releaseGpuMemoryFence();
for (const auto imageBuffer : wrtBackImageBuffer_) {
Memory* buffer = dev().getGpuMemory(imageBuffer->owner()->parent());
amd::Image* image = imageBuffer->owner()->asImage();
Image* devImage = static_cast<Image*>(dev().getGpuMemory(imageBuffer->owner()));
Memory* cpyImage = dev().getGpuMemory(devImage->CopyImageBuffer());
amd::Coord3D offs(0);
// Copy memory from the backing store image into the original buffer
bool result = blitMgr().copyImageToBuffer(*cpyImage, *buffer, offs,
offs, image->getRegion(), true,
image->getRowPitch(), image->getSlicePitch());
}
wrtBackImageBuffer_.clear();
}
return true;
}
/**
@@ -300,6 +300,10 @@ class VirtualGPU : public device::VirtualDevice {
* used to synchronized on kernel outputs.
*/
bool hasPendingDispatch_;
bool imageBufferWrtBack_; //!< Enable image buffer write back
std::vector<device::Memory*> wrtBackImageBuffer_; //!< Array of images for write back
Timestamp* timestamp_;
hsa_agent_t gpu_device_; //!< Physical device
hsa_queue_t* gpu_queue_; //!< Queue associated with a gpu