P4 to Git Change 1527848 by gandryey@gera-w8 on 2018/03/15 17:11:43

SWDEV-79445 - OCL generic changes and code clean-up
	- Add suballocation support for local (invisible) memory. It should significantly improve the memory footprint and TLB usage with 2MB pages.
	- The implementation uses the BuddyAllocator provided in PAL.
	- The chunk allocation size is 64MB; the minimum suballocation is 4KB and the maximum is 4MB. GPU_MAX_SUBALLOC_SIZE controls the maximum size in KB.

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldefs.hpp#33 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#76 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.hpp#24 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#56 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#51 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.hpp#17 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.cpp#45 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.hpp#16 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#77 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#42 edit
... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#285 edit
Tento commit je obsažen v:
foreman
2018-03-15 17:26:25 -04:00
rodič c4a81872f3
revize 7ae94da05b
11 změnil soubory, kde provedl 947 přidání a 664 odebrání
+1
Zobrazit soubor
@@ -8,6 +8,7 @@
#include "palGpuMemory.h"
#include "palImage.h"
#include "palFormatInfo.h"
#include "util/palSysMemory.h"
//
/// Memory Object Type
+13 -12
Zobrazit soubor
@@ -54,6 +54,10 @@ void PalDeviceUnload() { pal::Device::tearDown(); }
namespace pal {
Util::GenericAllocator NullDevice::allocator_;
char* Device::platformObj_;
Pal::IPlatform* Device::platform_;
NullDevice::Compiler* NullDevice::compiler_;
AppProfile Device::appProfile_;
@@ -183,6 +187,7 @@ bool NullDevice::init() {
return true;
}
bool NullDevice::create(Pal::AsicRevision asicRevision, Pal::GfxIpLevel ipLevel,
uint xNACKSupported) {
online_ = false;
@@ -736,7 +741,7 @@ bool Device::create(Pal::IDevice* device) {
if (!amd::Device::create()) {
return false;
}
resourceList_ = new std::list<GpuMemoryReference*>();
resourceList_ = new std::list<Resource*>();
if (nullptr == resourceList_) {
return false;
}
@@ -865,7 +870,7 @@ bool Device::create(Pal::IDevice* device) {
size_t resourceCacheSize = settings().resourceCacheSize_;
// Create resource cache.
// \note Cache must be created before any resource creation to avoid nullptr check
resourceCache_ = new ResourceCache(resourceCacheSize);
resourceCache_ = new ResourceCache(this, resourceCacheSize);
if (nullptr == resourceCache_) {
return false;
}
@@ -925,8 +930,6 @@ bool Device::create(Pal::IDevice* device) {
return true;
}
static Pal::IPlatform* platform;
bool Device::initializeHeapResources() {
amd::ScopedLock k(lockForInitHeap_);
if (!heapInitComplete_) {
@@ -998,7 +1001,7 @@ bool Device::initializeHeapResources() {
xferQueue_->enableSyncedBlit();
// Create RGP capture manager
rgpCaptureMgr_ = RgpCaptureMgr::Create(platform, *this);
rgpCaptureMgr_ = RgpCaptureMgr::Create(platform_, *this);
}
return true;
}
@@ -1096,8 +1099,6 @@ static int reportHook(int reportType, char* message, int* returnValue) {
}
#endif // _WIN32 & DEBUG
static char* platformObj;
bool Device::init() {
uint32_t numDevices = 0;
bool useDeviceList = false;
@@ -1123,7 +1124,7 @@ bool Device::init() {
#endif // !defined(WITH_LIGHTNING_COMPILER)
size_t size = Pal::GetPlatformSize();
platformObj = new char[size];
platformObj_ = new char[size];
Pal::PlatformCreateInfo info = {};
info.flags.disableGpuTimeout = true;
#if !defined(PAL_BUILD_DTIF)
@@ -1138,14 +1139,14 @@ bool Device::init() {
info.maxSvmSize = static_cast<Pal::gpusize>(OCL_SET_SVM_SIZE * Mi);
// PAL init
if (Pal::Result::Success != Pal::CreatePlatform(info, platformObj, &platform)) {
if (Pal::Result::Success != Pal::CreatePlatform(info, platformObj_, &platform_)) {
return false;
}
// Get the total number of active devices
// Count up all the devices in the system.
Pal::IDevice* deviceList[Pal::MaxDevices] = {};
platform->EnumerateDevices(&numDevices, &deviceList[0]);
platform_->EnumerateDevices(&numDevices, &deviceList[0]);
uint ordinal = 0;
const char* selectDeviceByName = nullptr;
@@ -1175,8 +1176,8 @@ bool Device::init() {
}
void Device::tearDown() {
platform->Destroy();
delete platformObj;
// Destroy the PAL platform object before releasing the storage it lives in
platform_->Destroy();
// The backing storage was obtained with new char[size], so it must be
// released with the array form of delete; scalar delete is undefined behaviour
delete[] platformObj_;
#if !defined(WITH_LIGHTNING_COMPILER)
if (compiler_ != nullptr) {
+18 -7
Zobrazit soubor
@@ -120,7 +120,12 @@ class NullDevice : public amd::Device {
amd::CacheCompilation* cacheCompilation() const { return cacheCompilation_.get(); }
#endif
//! Allocates memory through the static PAL generic allocator
void* Alloc(const Util::AllocInfo& allocInfo) {
  return allocator_.Alloc(allocInfo);
}
//! Releases memory through the static PAL generic allocator
void Free(const Util::FreeInfo& freeInfo) {
  allocator_.Free(freeInfo);
}
protected:
static Util::GenericAllocator allocator_; //!< Generic memory allocator in PAL
Pal::AsicRevision asicRevision_; //!< ASIC revision
Pal::GfxIpLevel ipLevel_; //!< Device IP level
const AMDDeviceInfo* hwInfo_; //!< Device HW info structure
@@ -464,6 +469,9 @@ class Device : public NullDevice {
//! Returns PAL device properties
const Pal::DeviceProperties& properties() const { return properties_; }
//! Returns the PAL platform interface (a static object shared by all devices)
Pal::IPlatform* iPlat() const {
  return platform_;
}
//! Returns PAL device interface
Pal::IDevice* iDev() const { return device_; }
@@ -496,19 +504,19 @@ class Device : public NullDevice {
bool resGLFree(void* GLplatformContext, void* mbResHandle, uint type) const;
//! Adds a resource to the global list
void addResource(GpuMemoryReference* mem) const {
void addResource(Resource* res) const {
amd::ScopedLock lock(lockResources());
auto findIt = std::find(resourceList_->begin(), resourceList_->end(), mem);
mem->events_.resize(numOfVgpus());
auto findIt = std::find(resourceList_->begin(), resourceList_->end(), res);
res->resizeGpuEvents(numOfVgpus() - 1);
if (resourceList_->end() == findIt) {
resourceList_->push_back(mem);
resourceList_->push_back(res);
}
}
//! Removes a resource from the global list
void removeResource(GpuMemoryReference* mem) const {
void removeResource(Resource* res) const {
amd::ScopedLock lock(lockResources());
resourceList_->remove(mem);
resourceList_->remove(res);
}
//! Resizes global resource list to accumulate a new queue
@@ -566,6 +574,9 @@ class Device : public NullDevice {
bool glAssociate(void* GLplatformContext, void* GLdeviceContext) const;
bool glDissociate(void* GLplatformContext, void* GLdeviceContext) const;
static char* platformObj_; //!< Memory allocated for PAL platform object
static Pal::IPlatform* platform_; //!< Pointer to the PAL platform object
amd::Context* context_; //!< A dummy context for internal allocations
amd::Monitor* lockAsyncOps_; //!< Lock to serialise all async ops on this device
amd::Monitor*
@@ -592,7 +603,7 @@ class Device : public NullDevice {
Pal::IDevice* device_; //!< PAL device object
std::atomic<Pal::gpusize> freeMem[Pal::GpuHeap::GpuHeapCount]; //!< Free memory counter
amd::Monitor* lockResourceOps_; //!< Lock to serialise resource access
std::list<GpuMemoryReference*>* resourceList_; //!< Active resource list
std::list<Resource*>* resourceList_; //!< Active resource list
RgpCaptureMgr* rgpCaptureMgr_; //!< RGP capture manager
};
+2 -2
Zobrazit soubor
@@ -89,14 +89,14 @@ void Segment::copy(size_t offset, const void* src, size_t size) {
amd::ScopedLock k(gpuAccess_->dev().xferMgr().lockXfer());
VirtualGPU& gpu = *gpuAccess_->dev().xferQueue();
Memory& xferBuf = gpuAccess_->dev().xferWrite().acquire();
size_t tmpSize = std::min(static_cast<size_t>(xferBuf.vmSize()), size);
size_t tmpSize = std::min(static_cast<size_t>(xferBuf.size()), size);
size_t srcOffs = 0;
while (size != 0) {
xferBuf.hostWrite(&gpu, reinterpret_cast<const_address>(src) + srcOffs, 0, tmpSize);
xferBuf.partialMemCopyTo(gpu, 0, (offset + srcOffs), tmpSize, *gpuAccess_, false, true);
size -= tmpSize;
srcOffs += tmpSize;
tmpSize = std::min(static_cast<size_t>(xferBuf.vmSize()), size);
tmpSize = std::min(static_cast<size_t>(xferBuf.size()), size);
}
gpu.waitAllEngines();
}
Rozdílový obsah nebyl zobrazen, protože je příliš veliký Načíst rozdílové porovnání
+79 -22
Zobrazit soubor
@@ -6,6 +6,7 @@
#include "platform/command.hpp"
#include "platform/program.hpp"
#include "device/pal/paldefs.hpp"
#include "util/palBuddyAllocatorImpl.h"
//! \namespace pal PAL Resource Implementation
namespace pal {
@@ -16,7 +17,6 @@ class VirtualGPU;
/*! \addtogroup PAL PAL Resource Implementation
* @{
*/
class GpuMemoryReference : public amd::ReferenceCountedObject {
public:
static GpuMemoryReference* Create(const Device& dev, const Pal::GpuMemoryCreateInfo& createInfo);
@@ -36,12 +36,6 @@ class GpuMemoryReference : public amd::ReferenceCountedObject {
//! Default constructor
GpuMemoryReference(const Device& dev);
//! Resizes the events array to account the new queue
void resizeGpuEvents(uint index) { events_.resize(index + 1); }
//! Erase an entry in the array for provided queue index
void eraseGpuEvents(uint index) { events_.erase(events_.begin() + index); }
//! Get PAL memory object
Pal::IGpuMemory* iMem() const { return gpuMem_; }
@@ -50,7 +44,6 @@ class GpuMemoryReference : public amd::ReferenceCountedObject {
const Device& device_; //!< GPU device
//! @note: This field is necessary for the thread safe release only
VirtualGPU* gpu_; //!< Resource will be used only on this queue
std::vector<GpuEvent> events_; //!< GPU events associated with the resource
protected:
//! Default destructor
@@ -64,6 +57,8 @@ class GpuMemoryReference : public amd::ReferenceCountedObject {
GpuMemoryReference& operator=(const GpuMemoryReference&);
};
static constexpr Pal::gpusize MaxGpuAlignment = 4 * Ki;
//! GPU resource
class Resource : public amd::HeapObject {
public:
@@ -178,7 +173,7 @@ class Resource : public amd::HeapObject {
uint imageArray_ : 1; //!< PAL resource is an array of images
uint buffer_ : 1; //!< PAL resource is a buffer
uint tiled_ : 1; //!< PAL resource is tiled
uint SVMRes_ : 1; //!< SVM flag to the cal resource
uint SVMRes_ : 1; //!< SVM flag to the pal resource
uint scratch_ : 1; //!< Scratch buffer
uint isAllocExecute_ : 1; //!< SVM resource allocation attribute for shader\cmdbuf
uint isDoppTexture_ : 1; //!< PAL resource is for a DOPP desktop texture
@@ -205,9 +200,9 @@ class Resource : public amd::HeapObject {
//! Destructor of the resource
virtual ~Resource();
/*! \brief Creates a CAL object, associated with the resource
/*! \brief Creates a PAL object, associated with the resource
*
* \return True if we succesfully created a CAL resource
* \return True if we successfully created a PAL resource
*/
virtual bool create(MemoryType memType, //!< memory type
CreateParams* params = 0 //!< special parameters for resource allocation
@@ -263,7 +258,7 @@ class Resource : public amd::HeapObject {
uint64_t vmAddress() const { return iMem()->Desc().gpuVirtAddr + offset_; }
//! Returns global memory size
uint64_t vmSize() const { return iMem()->Desc().size - offset_; }
uint64_t vmSize() const { return desc_.width_ * elementSize(); }
//! Returns true if the resource has more than one mip level
bool mipMapped() const { return desc().mipLevels_ > 1; }
@@ -290,7 +285,7 @@ class Resource : public amd::HeapObject {
//! Marks the resource as busy
void setBusy(VirtualGPU& gpu, //!< Virtual GPU device object
GpuEvent calEvent //!< CAL event
GpuEvent calEvent //!< PAL event
) const;
//! Wait for the resource
@@ -326,7 +321,7 @@ class Resource : public amd::HeapObject {
//! Get the mapped address of this resource
address data() const { return reinterpret_cast<address>(address_); }
//! Frees all allocated CAL memories and resources,
//! Frees all allocated PAL memories and resources,
//! associated with this objects. And also destroys all rename structures
//! Note: doesn't destroy the object itself
void free();
@@ -360,7 +355,42 @@ class Resource : public amd::HeapObject {
//! Returns GPU event associated with this resource and specified queue
GpuEvent* getGpuEvent(const VirtualGPU& gpu) const;
//! Resizes the events array to (index + 1) entries to account for the new queue
void resizeGpuEvents(uint index) {
  events_.resize(index + 1);
}
//! Removes the events array entry that belongs to the provided queue index
void eraseGpuEvents(uint index) {
  auto entry = events_.begin() + index;
  events_.erase(entry);
}
protected:
/*! \brief Creates a PAL image object, associated with the resource
*
* \return True if we successfully created a PAL resource
*/
bool CreateImage(CreateParams* params //!< special parameters for resource allocation
);
/*! \brief Creates a PAL interop object, associated with the resource
*
* \return True if we successfully created a PAL interop resource
*/
bool CreateInterop(CreateParams* params //!< special parameters for resource allocation
);
/*! \brief Creates a PAL pinned object, associated with the resource
*
* \return True if we successfully created a PAL pinned resource
*/
bool CreatePinned(CreateParams* params //!< special parameters for resource allocation
);
/*! \brief Creates a PAL SVM object, associated with the resource
*
* \return True if we successfully created a PAL SVM resource
*/
bool CreateSvm(CreateParams* params, //!< special parameters for resource allocation
Pal::gpusize svmPtr //!< NOTE(review): presumably the SVM GPU address to use - confirm in the implementation
);
uint elementSize_; //!< Size of a single element in bytes
private:
@@ -424,6 +454,7 @@ class Resource : public amd::HeapObject {
uint32_t curRename_; //!< Current active rename in the list
RenameList renames_; //!< Rename resource list
GpuMemoryReference* memRef_; //!< PAL resource reference
Pal::gpusize subOffset_; //!< GPU memory offset in the original resource
const Resource* viewOwner_; //!< GPU resource, which owns this view
void* glInteropMbRes_; //!< Mb Res handle
uint32_t glType_; //!< GL interop type
@@ -438,26 +469,50 @@ class Resource : public amd::HeapObject {
uint32_t* hwState_; //!< HW state for image object
uint64_t hwSrd_; //!< GPU pointer to HW SRD
//! Note: Access to the events are thread safe.
mutable std::vector<GpuEvent> events_; //!< GPU events associated with the resource
};
typedef Util::BuddyAllocator<Device> MemBuddyAllocator;
//! Suballocator of GPU memory that keeps one PAL buddy allocator per
//! GPU memory reference
class MemorySubAllocator : public amd::HeapObject {
 public:
  //! Binds the suballocator to the device it serves
  MemorySubAllocator(Device* device) : device_(device) {}

  //! Destructor (defined out of line)
  ~MemorySubAllocator();

  //! Requests a suballocation with the given size and alignment.
  //! NOTE(review): presumably returns the backing memory reference and writes
  //! the placement inside it to \a offset - confirm in the implementation
  GpuMemoryReference* Allocate(Pal::gpusize size,
                               Pal::gpusize alignment,
                               Pal::gpusize* offset);

  //! Releases the suballocation identified by the memory reference and offset.
  //! NOTE(review): the meaning of the boolean result is defined by the
  //! implementation - confirm there
  bool Free(GpuMemoryReference* ref, Pal::gpusize offset);

 private:
  Device* device_;  //!< Device provided at construction
  //! Buddy allocator associated with every GPU memory reference
  std::map<GpuMemoryReference*, MemBuddyAllocator*> mem_heap_;
};
class ResourceCache : public amd::HeapObject {
public:
//! Default constructor
ResourceCache(size_t cacheSizeLimit)
: lockCacheOps_("PAL resource cache", true), cacheSize_(0), cacheSizeLimit_(cacheSizeLimit) {}
ResourceCache(Device* device, size_t cacheSizeLimit)
: lockCacheOps_("PAL resource cache", true)
, cacheSize_(0)
, cacheSizeLimit_(cacheSizeLimit)
, memSubAllocLocal_(device) {}
//! Default destructor
~ResourceCache();
//! Adds a CAL resource to the cache
bool addGpuMemory(Resource::Descriptor* desc, //!< Resource descriptor - cache key
GpuMemoryReference* ref //!< Resource reference
//! Adds a PAL resource to the cache
bool addGpuMemory(Resource::Descriptor* desc, //!< Resource descriptor - cache key
GpuMemoryReference* ref, //!< Resource reference
Pal::gpusize offset //!< Original resource offset
);
//! Finds a CAL resource from the cache
//! Finds a PAL resource from the cache
GpuMemoryReference* findGpuMemory(
Resource::Descriptor* desc, //!< Resource descriptor - cache key
Pal::gpusize size, Pal::gpusize alignment);
Pal::gpusize size, Pal::gpusize alignment, Pal::gpusize* offset);
//! Destroys cache
bool free(size_t minCacheEntries = 0);
@@ -477,8 +532,10 @@ class ResourceCache : public amd::HeapObject {
size_t cacheSize_; //!< Current cache size in bytes
const size_t cacheSizeLimit_; //!< Cache size limit in bytes
//! CAL resource cache
//! PAL resource cache
std::list<std::pair<Resource::Descriptor*, GpuMemoryReference*> > resCache_;
MemorySubAllocator memSubAllocLocal_; //!< Allocator for suballocations in Local
};
/*@}*/} // namespace pal
+6
Zobrazit soubor
@@ -138,6 +138,12 @@ Settings::Settings() {
rgpSqttDispCount_ = PAL_RGP_DISP_COUNT;
rgpSqttWaitIdle_ = true;
rgpSqttForceDisable_ = false;
// Sub allocation parameters
subAllocationMinSize_ = 4 * Ki;
subAllocationChunkSize_ = 64 * Mi;
subAllocationMaxSize_ =
std::min(static_cast<uint64_t>(GPU_MAX_SUBALLOC_SIZE) * Ki, subAllocationChunkSize_);
}
bool Settings::create(const Pal::DeviceProperties& palProp,
+4
Zobrazit soubor
@@ -98,6 +98,10 @@ class Settings : public device::Settings {
uint64_t maxAllocSize_; //!< Maximum single allocation size
uint rgpSqttDispCount_; //!< The number of dispatches captured in SQTT
uint64_t subAllocationMinSize_; //!< Minimum size allowed for suballocations
uint64_t subAllocationMaxSize_; //!< Maximum size allowed with suballocations
uint64_t subAllocationChunkSize_; //!< Chunk size for suballocations
amd::LibrarySelector libSelector_; //!< Select linking libraries for compiler
//! Default constructor
+9 -5
Zobrazit soubor
@@ -409,7 +409,7 @@ void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memor
}
uint64_t curStart = memory->vmAddress();
uint64_t curEnd = curStart + memory->vmSize();
uint64_t curEnd = curStart + memory->size();
// Loop through all memory objects in the queue and find dependency
// @note don't include objects from the current kernel
@@ -1974,6 +1974,7 @@ void VirtualGPU::PostDeviceEnqueue(
uint64_t vmParentWrap,
GpuEvent* gpuEvent)
{
uint32_t id = gpuEvent->id;
amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev());
// Ensure exclusive access to the device queue
@@ -2055,6 +2056,9 @@ void VirtualGPU::PostDeviceEnqueue(
iCmd()->CmdVirtualQueueHandshake(vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
vmParentWrap + offsetof(AmdAqlWrap, child_counter),
signalAddr, dev().settings().useDeviceQueue_);
if (id != gpuEvent->id) {
LogError("Something is wrong. ID mismatch!\n");
}
eventEnd(MainEngine, *gpuEvent);
}
@@ -2203,6 +2207,9 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
if (profiling() || state_.profileEnabled_) {
addBarrier();
}
if (id != gpuEvent.id) {
LogError("Something is wrong. ID mismatch!\n");
}
eventEnd(MainEngine, gpuEvent);
// Execute scheduler for device enqueue
@@ -2210,9 +2217,6 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
PostDeviceEnqueue(kernel, hsaKernel, gpuDefQueue, vmDefQueue, vmParentWrap, &gpuEvent);
}
if (id != gpuEvent.id) {
LogError("Something is wrong. ID mismatch!\n");
}
// Update the global GPU event
setGpuEvent(gpuEvent, needFlush);
@@ -2266,7 +2270,7 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
}
}
void VirtualGPU::releaseMemory(GpuMemoryReference* mem, GpuEvent* event) {
void VirtualGPU::releaseMemory(GpuMemoryReference* mem) {
queues_[MainEngine]->removeCmdMemRef(mem);
queues_[SdmaEngine]->removeCmdMemRef(mem);
}
+1 -1
Zobrazit soubor
@@ -314,7 +314,7 @@ class VirtualGPU : public device::VirtualDevice {
virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd);
virtual void submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd);
void releaseMemory(GpuMemoryReference* mem, GpuEvent* event);
void releaseMemory(GpuMemoryReference* mem);
void flush(amd::Command* list = nullptr, bool wait = false);
bool terminate() { return true; }
+2
Zobrazit soubor
@@ -86,6 +86,8 @@ release(size_t, GPU_PINNED_MIN_XFER_SIZE, 512, \
"The minimal buffer size for pinned read/write transfers in KBytes") \
release(size_t, GPU_RESOURCE_CACHE_SIZE, 64, \
"The resource cache size in MB") \
release(size_t, GPU_MAX_SUBALLOC_SIZE, 4096, \
"The maximum size accepted for suballocaitons in KB") \
release(uint, GPU_ASYNC_MEM_COPY, 0, \
"Enables async memory transfers with DRM engine") \
release(bool, GPU_FORCE_64BIT_PTR, 0, \