P4 to Git Change 2058803 by gandryey@gera-win10 on 2020/01/17 15:47:42
SWDEV-219901 - [OCL-ROCr]Add pitch workaround for Navi10
- Add pitch workaround. Allocate a native image as the backing store and perform a double copy when necessary
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#149 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.hpp#48 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocmemory.cpp#46 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocmemory.hpp#16 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.cpp#47 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.hpp#19 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#95 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#31 edit
[ROCm/clr commit: 3c137e7b19]
This commit is contained in:
@@ -969,6 +969,10 @@ Sampler::~Sampler() {
|
||||
hsa_ext_sampler_destroy(dev_.getBackendDevice(), hsa_sampler);
|
||||
}
|
||||
|
||||
//! Returns the device memory object that mem->getDeviceMemory() reports for this device,
//! downcast to the ROCm memory type
Memory* Device::getGpuMemory(amd::Memory* mem) const {
  auto* const deviceMemory = mem->getDeviceMemory(*this);
  return static_cast<roc::Memory*>(deviceMemory);
}
|
||||
|
||||
bool Device::populateOCLDeviceConstants() {
|
||||
info_.available_ = true;
|
||||
|
||||
|
||||
@@ -430,6 +430,9 @@ class Device : public NullDevice {
|
||||
virtual bool findLinkTypeAndHopCount(amd::Device* other_device, uint32_t* link_type,
|
||||
uint32_t* hop_count);
|
||||
|
||||
//! Returns a GPU memory object from AMD memory object
|
||||
roc::Memory* getGpuMemory(amd::Memory* mem //!< Pointer to AMD memory object
|
||||
) const;
|
||||
private:
|
||||
static hsa_ven_amd_loader_1_00_pfn_t amd_loader_ext_table;
|
||||
|
||||
|
||||
@@ -1022,7 +1022,10 @@ bool Image::createInteropImage() {
|
||||
}
|
||||
|
||||
bool Image::create() {
|
||||
if (owner()->parent()) {
|
||||
if (owner()->parent() != nullptr) {
|
||||
if (!ValidateMemory()) {
|
||||
return false;
|
||||
}
|
||||
// Image view creation
|
||||
roc::Memory* parent = static_cast<roc::Memory*>(owner()->parent()->getDeviceMemory(dev_));
|
||||
|
||||
@@ -1219,6 +1222,8 @@ void* Image::allocMapTarget(const amd::Coord3D& origin, const amd::Coord3D& regi
|
||||
//! Destructor; all cleanup is delegated to destroy()
Image::~Image() {
  destroy();
}
|
||||
|
||||
void Image::destroy() {
|
||||
delete copyImageBuffer_;
|
||||
|
||||
if (hsaImageObject_.handle != 0) {
|
||||
hsa_status_t status = hsa_ext_image_destroy(dev().getBackendDevice(), hsaImageObject_);
|
||||
assert(status == HSA_STATUS_SUCCESS);
|
||||
@@ -1241,5 +1246,31 @@ void Image::destroy() {
|
||||
const_cast<Device&>(dev()).updateFreeMemory(size(), true);
|
||||
}
|
||||
}
|
||||
|
||||
bool Image::ValidateMemory() {
|
||||
// Detect image view from buffer to distinguish linear paths from tiled.
|
||||
amd::Memory* ancestor = owner()->parent();
|
||||
while ((ancestor->asBuffer() == nullptr) && (ancestor->parent() != nullptr)) {
|
||||
ancestor = ancestor->parent();
|
||||
}
|
||||
bool linearLayout = (ancestor->asBuffer() != nullptr);
|
||||
|
||||
if (dev().settings().imageBufferWar_ && linearLayout && (owner() != nullptr) &&
|
||||
((owner()->asImage()->getWidth() * owner()->asImage()->getImageFormat().getElementSize()) <
|
||||
owner()->asImage()->getRowPitch())) {
|
||||
constexpr bool ForceLinear = true;
|
||||
amd::Image* img = owner()->asImage();
|
||||
// Create a native image without pitch for validation
|
||||
copyImageBuffer_ =
|
||||
new (dev().context()) amd::Image(dev().context(), CL_MEM_OBJECT_IMAGE2D, img->getMemFlags(),
|
||||
img->getImageFormat(), img->getWidth(), img->getHeight(), 1, 0, 0);
|
||||
|
||||
if ((copyImageBuffer_ == nullptr) || !copyImageBuffer_->create()) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
#endif // WITHOUT_HSA_BACKEND
|
||||
|
||||
@@ -93,6 +93,9 @@ class Memory : public device::Memory {
|
||||
|
||||
virtual void IpcCreate (size_t offset, size_t* mem_size, void* handle) const;
|
||||
|
||||
//! Validates allocated memory for possible workarounds
|
||||
//! Base implementation: performs no checks and always reports success
virtual bool ValidateMemory() { return true; }
|
||||
|
||||
protected:
|
||||
bool allocateMapMemory(size_t allocationSize);
|
||||
|
||||
@@ -179,6 +182,11 @@ class Image : public roc::Memory {
|
||||
|
||||
virtual const address cpuSrd() const { return reinterpret_cast<const address>(getHsaImageObject().handle); }
|
||||
|
||||
//! Validates allocated memory for possible workarounds
|
||||
bool ValidateMemory() final;
|
||||
|
||||
//! Returns the pitch-free image stored in copyImageBuffer_ (set up by ValidateMemory()),
//! or nullptr if the workaround image was never allocated
amd::Image* CopyImageBuffer() const { return copyImageBuffer_; }
|
||||
|
||||
private:
|
||||
//! Disable copy constructor
|
||||
Image(const Buffer&);
|
||||
@@ -200,6 +208,7 @@ class Image : public roc::Memory {
|
||||
hsa_ext_image_t hsaImageObject_;
|
||||
|
||||
void* originalDeviceMemory_;
|
||||
amd::Image* copyImageBuffer_ = nullptr;
|
||||
};
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -70,6 +70,7 @@ Settings::Settings() {
|
||||
useLightning_ = (!flagIsDefault(GPU_ENABLE_LC)) ? GPU_ENABLE_LC : true;
|
||||
|
||||
lcWavefrontSize64_ = true;
|
||||
imageBufferWar_ = false;
|
||||
}
|
||||
|
||||
bool Settings::create(bool fullProfile, int gfxipVersion) {
|
||||
@@ -128,6 +129,10 @@ bool Settings::create(bool fullProfile, int gfxipVersion) {
|
||||
if (gfxipVersion >= 1000) {
|
||||
enableWave32Mode_ = true;
|
||||
enableWgpMode_ = GPU_ENABLE_WGP_MODE;
|
||||
if (gfxipVersion == 1001) {
|
||||
// GFX10.1 HW doesn't support custom pitch. Enable double copy workaround
|
||||
imageBufferWar_ = GPU_IMAGE_BUFFER_WAR;
|
||||
}
|
||||
}
|
||||
if (!flagIsDefault(GPU_ENABLE_WAVE32_MODE)) {
|
||||
enableWave32Mode_ = GPU_ENABLE_WAVE32_MODE;
|
||||
|
||||
@@ -26,7 +26,8 @@ class Settings : public device::Settings {
|
||||
uint imageDMA_ : 1; //!< Enable direct image DMA transfers
|
||||
uint stagedXferRead_ : 1; //!< Uses a staged buffer read
|
||||
uint stagedXferWrite_ : 1; //!< Uses a staged buffer write
|
||||
uint reserved_ : 25;
|
||||
uint imageBufferWar_ : 1; //!< Image buffer workaround for Gfx10
|
||||
uint reserved_ : 24;
|
||||
};
|
||||
uint value_;
|
||||
};
|
||||
|
||||
@@ -330,6 +330,30 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para
|
||||
const uint64_t image_srd = image->getHsaImageObject().handle;
|
||||
assert(amd::isMultipleOf(image_srd, sizeof(image_srd)));
|
||||
WriteAqlArgAt(const_cast<address>(params), &image_srd, sizeof(image_srd), desc.offset_);
|
||||
|
||||
// Check if synchronization has to be performed
|
||||
if (image->CopyImageBuffer() != nullptr) {
|
||||
Memory* devBuf = dev().getGpuMemory(mem->parent());
|
||||
amd::Coord3D offs(0);
|
||||
Image* devCpImg = static_cast<Image*>(dev().getGpuMemory(image->CopyImageBuffer()));
|
||||
amd::Image* img = mem->asImage();
|
||||
|
||||
// Copy memory from the original image buffer into the backing store image
|
||||
bool result = blitMgr().copyBufferToImage(
|
||||
*devBuf, *devCpImg, offs, offs, img->getRegion(), true,
|
||||
img->getRowPitch(), img->getSlicePitch());
|
||||
// Make sure the copy operation is done
|
||||
setAqlHeader(dispatchPacketHeader_);
|
||||
// Use backing store SRD as the replacement
|
||||
const uint64_t srd = devCpImg->getHsaImageObject().handle;
|
||||
WriteAqlArgAt(const_cast<address>(params), &srd, sizeof(srd), desc.offset_);
|
||||
|
||||
// If it's not a read only resource, then runtime has to write back
|
||||
if (!desc.info_.readOnly_) {
|
||||
wrtBackImageBuffer_.push_back(devCpImg);
|
||||
imageBufferWrtBack_ = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -947,6 +971,23 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& cmd) {
|
||||
break;
|
||||
}
|
||||
case CL_COMMAND_READ_IMAGE: {
|
||||
if ((cmd.source().parent() != nullptr) && (cmd.source().parent()->getType() == CL_MEM_OBJECT_BUFFER)) {
|
||||
Image* imageBuffer = static_cast<Image*>(devMem);
|
||||
// Check if synchronization has to be performed
|
||||
if (nullptr != imageBuffer->CopyImageBuffer()) {
|
||||
amd::Memory* memory = imageBuffer->CopyImageBuffer();
|
||||
devMem = dev().getGpuMemory(memory);
|
||||
if (nullptr == imageBuffer->owner()->getLastWriter()) {
|
||||
Memory* buffer = dev().getGpuMemory(imageBuffer->owner()->parent());
|
||||
amd::Image* image = imageBuffer->owner()->asImage();
|
||||
amd::Coord3D offs(0);
|
||||
// Copy memory from the original image buffer into the backing store image
|
||||
result = blitMgr().copyBufferToImage(*buffer, *devMem, offs,
|
||||
offs, image->getRegion(), true,
|
||||
image->getRowPitch(), image->getSlicePitch());
|
||||
}
|
||||
}
|
||||
}
|
||||
if (hostMemory != nullptr) {
|
||||
// Accelerated image to buffer transfer without pinning
|
||||
amd::Coord3D dstOrigin(offset);
|
||||
@@ -2202,6 +2243,25 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
getVQVirtualAddress(), schedulerParam_, schedulerQueue_, schedulerSignal_, schedulerThreads_);
|
||||
}
|
||||
|
||||
// Check if image buffer write back is required
|
||||
if (imageBufferWrtBack_) {
|
||||
// Avoid recursive write back
|
||||
imageBufferWrtBack_ = false;
|
||||
// Make sure the original kernel execution is done
|
||||
releaseGpuMemoryFence();
|
||||
for (const auto imageBuffer : wrtBackImageBuffer_) {
|
||||
Memory* buffer = dev().getGpuMemory(imageBuffer->owner()->parent());
|
||||
amd::Image* image = imageBuffer->owner()->asImage();
|
||||
Image* devImage = static_cast<Image*>(dev().getGpuMemory(imageBuffer->owner()));
|
||||
Memory* cpyImage = dev().getGpuMemory(devImage->CopyImageBuffer());
|
||||
amd::Coord3D offs(0);
|
||||
// Copy memory from the backing store image into the original buffer
|
||||
bool result = blitMgr().copyImageToBuffer(*cpyImage, *buffer, offs,
|
||||
offs, image->getRegion(), true,
|
||||
image->getRowPitch(), image->getSlicePitch());
|
||||
}
|
||||
wrtBackImageBuffer_.clear();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
/**
|
||||
|
||||
@@ -300,6 +300,10 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
* used to synchronized on kernel outputs.
|
||||
*/
|
||||
bool hasPendingDispatch_;
|
||||
|
||||
bool imageBufferWrtBack_; //!< Enable image buffer write back
|
||||
std::vector<device::Memory*> wrtBackImageBuffer_; //!< Array of images for write back
|
||||
|
||||
Timestamp* timestamp_;
|
||||
hsa_agent_t gpu_device_; //!< Physical device
|
||||
hsa_queue_t* gpu_queue_; //!< Queue associated with a gpu
|
||||
|
||||
Reference in New Issue
Block a user